#!/usr/bin/env python
"""Parse HTML files and encode email addresses to hide them from spammers.

This program uses numerical HTML entities, either in decimal (default) or 
hexadecimal encoding.

Author: Andreas Neudecker
Created: 2003-07-30
Changed: 2003-09-28
Version: 0.0.2

Copyright (c) 2003 by Andreas Neudecker.
Licensed under the GNU General Public License (GPL), 
see http://www.gnu.org/copyleft/ for details.

"""

# Changelog:
# (Add change notes chronologically. Include name, email, date)
# [2003-09-28] Added terrific new feature. E. Xample (e.xample@some-email.net)


# Comment conventions:
# [???] = Problem to be solved
# [!!!] = Needs testing/editing
# [2do] = Needs to be implemented/changed/optimised
# You should add a useful explanation.

# Execute always --------------------------------------------------------------#

# Use Python JIT Psyco to speed up execution. (http://psyco.sourceforge.net/)
#
# Needs to be imported BEFORE other modules to allow them to be optimised at
# runtime. Makes most sense to import and run Psyco at the start of your
# program's main module!
#   Set 'usePsyco' to false while testing. Leave it on for finished code.
# It is NOT necessary to switch off (False) on systems without Psyco installed.
usePsyco = False
# set to true, if you want to use it AND have verbosity.py
#useVerbosity = True

# Psyco is an optional speed-up: only try to load it when this file is run as
# a script and the switch above is on.
if usePsyco and __name__ == '__main__':
	try:
		import psyco
	except ImportError:
		# Only a missing module means "not installed". Anything else (Ctrl-C,
		# a failure inside Psyco itself) must not be reported as "not found".
		print ('Psyco not found, ignoring it')
	else:
		psyco.profile ()



# Modules ---------------------------------------------------------------------#
#from optik import *
#import string
import re
import random as R


# Globals ---------------------------------------------------------------------#
# Debug switch; nothing in the visible part of this file reads it yet.
verboseTesting = True

# Classes ---------------------------------------------------------------------#

# Functions -------------------------------------------------------------------#

def obfuscate (string, mode = 'decimal'):
	"""Return 'string' with every character written as a numeric HTML entity.

	'mode' selects the encoder:
	    'decimal',     'dec', 'd': decimal entities (default)
	    'hexadecimal', 'hex', 'h': hexadecimal entities
	    'mixed',       'mix', 'm': random mix of both
	Any other value (e.g. 'none', 'n', None) returns 'string' unchanged.
	"""
	# Each encoder is reachable under a long, a short and a one-letter alias.
	aliasGroups = (
		(('decimal', 'dec', 'd'), decHtmlEnt),
		(('hexadecimal', 'hex', 'h'), hexHtmlEnt),
		(('mixed', 'mix', 'm'), mixHtmlEnt),
	)
	encoders = {}
	for aliases, encoder in aliasGroups:
		for alias in aliases:
			encoders [alias] = encoder

	# Unknown or empty mode: "don't encode".
	if not mode or mode not in encoders.keys ():
		return string

	encode = encoders [mode]
	return ''.join ([encode (char) for char in string])

	
def decHtmlEnt (string):
	"""Return 'string' with each character as a decimal entity '&#<nn>;'."""
	entities = []
	for char in string:
		entities.append ('&#%d;' % ord (char))
	return ''.join (entities)


def hexHtmlEnt (string):
	"""Return 'string' with each character as a hex entity '&#x<hh>;'."""
	entities = []
	for char in string:
		# lower-case hex digits, no padding
		entities.append ('&#x%x;' % ord (char))
	return ''.join (entities)


def mixHtmlEnt (string):
	"""Return 'string' as entities, choosing decimal or hex form per character.

	The choice is made at random for every single character.
	"""
	def encodeOne (char):
		# posRandInt (1) yields 0 or 1: 1 selects decimal, 0 selects hex.
		if posRandInt (1):
			return decHtmlEnt (char)
		return hexHtmlEnt (char)

	return ''.join (map (encodeOne, string))


def noEncoding (string):
	"""Identity encoder: hand 'string' back exactly as it was given."""
	return string


def posRandInt (n, count = None):
	"""Pick a random integer from the closed range [0, n], uniformly.

		n:      upper bound (inclusive); truncated to an integer.
		count:  if given, return a list of 'count' independent picks
		        (an empty list for 0); if None, return a single integer.

	Raises ValueError if 'n' is negative.
	"""
	# randint () gives every value in [0, n] the same chance. Rounding
	# random () * n would make 0 and n only half as likely as the rest.
	top = int (n)
	if count is None:
		return R.randint (0, top)
	return [R.randint (0, top) for i in range (count)]


# Patterns used by encLine (), compiled once. The named group 'encode' marks
# the part of a match that replExpr () turns into entities; whatever else the
# pattern looks at (quotes, delimiters) is left untouched.

# URI part of a mailto link, kept inside one pair of double quotes so a match
# can never run across markup.
_mailtoRegx = re.compile (r'"(?P<encode>mailto:[^"]*@[^"]*)"')

# Address in running text, or as the visible text of a link.
_emailRegx = re.compile (r"""
	(?: ^ | (?<=[\s>(]) )            # start of line, or after whitespace, '>' or '('
	(?P<encode>
		[^\s()<>?*"!/{}&%$=+]*?      # local part
		@
		[^\s()<>?*"!/{}&%$=+]*?      # domain, as short as possible
	)
	(?= [\s<)&,:!?+/|]               # stop in front of a delimiter ...
	  | \. (?: \s | $ )              # ... or a sentence-ending full stop ...
	  | $ )                          # ... or the end of the line
	""", re.VERBOSE)


def encLine (line, mode = 'dec'):
	"""Encode email addresses and mailto URIs in a line as desired.

		line:   a 'string', usually one line of a HTML file.
		mode:   the email address string is encoded using numerical HTML
		        entities ('&#<nn>;' or '&#x<hh>;'). Allowed values are:
		            'decimal',     'dec', 'd': decimal encoding (default)
		            'hexadecimal', 'hex', 'h': hexadecimal encoding
		            'mixed',       'mix', 'm': mixed hex and dec encoding
		        For all other values, including None, the original string
		        is returned.

	Returns the line with every recognised address replaced by entities.
	"""
	# mailto URIs first. Once encoded they no longer contain a literal '@',
	# so the second pass cannot pick them up again.
	line = replExpr (_mailtoRegx, line, mode)
	line = replExpr (_emailRegx, line, mode)
	return line


def replExpr (regx, line, mode = 'decimal'):
	"""Obfuscate what the compiled expression 'regx' finds in 'line'.

		regx:   compiled regular expression. If it defines a named group
		        'encode', that group is what gets encoded. Otherwise each
		        match is expected to carry one delimiter character at either
		        end, and only the text between them is encoded.
		line:   the string to work on.
		mode:   encoding mode, handed on to obfuscate ().

	Returns the rebuilt line.
	"""
	useGroup = 'encode' in regx.groupindex
	parts = []
	pos = 0
	for match in regx.finditer (line):
		if useGroup:
			start, end = match.span ('encode')
			if start < 0:
				# the group took no part in this match
				continue
		else:
			start, end = match.span ()
			start = start + 1
			end = end - 1
		# Never step backwards: a match shorter than two characters has
		# nothing between its delimiters and must not be copied twice.
		start = max (start, pos)
		end = max (end, start)
		parts.append (line [pos:start])
		if end > start:
			parts.append (obfuscate (line [start:end], mode))
		pos = end
	parts.append (line [pos:])

	# Reconstruct line
	return ''.join (parts)


# main () ---------------------------------------------------------------------#
def __main ():
	"""Demo run: obfuscate one sample line and print the result."""
	sample = 'a@b.c, def (<a href="mailto:d@e.f">d@e.f</a>), ghi (g@h.i)'
	encoded = encLine (sample)
	print (encoded)


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
	__main ()

# EOF -------------------------------------------------------------------------#


