In [1]:
import re
In [6]:
# Pattern for one table cell of the sample page.
#   group(1): the text in front of the " <;" separator (lazy match)
#   group(2): the obfuscated address, which must contain "(at)" and "(dot)"
# NOTE(review): "<\;" and ">\;" match the literal text "<;" and ">;". This may
# have been an HTML entity before the notebook was exported -- confirm against
# the sample file before changing it.
regex = re.compile(
    r'<td>(.+?) <\;'            # cell start, group(1), separator
    r'(.+\(at\).+\(dot\).+)'    # group(2): obfuscated address
    r'>\;</td>'                 # closing marker and end of cell
)
In [11]:
# Load the sample page as a list of lines.
# Reading everything inside a `with` block closes the handle straight away, and
# binding the resulting list to `file` means the page can be iterated more than
# once (an open file object is exhausted after a single pass).
# Each list item keeps its trailing newline, exactly as when iterating the file.
# NOTE: the name `file` is kept because later cells read it.
with open('email_sample.html', 'r') as sample_page:
    file = sample_page.readlines()
In [12]:
# Collect one (group(1), group(2)) pair for every line that matches `regex`:
# the text before the separator and the still-obfuscated address.
email = []
for line in file:
    m = regex.search(line)
    # search() returns None when the line does not match; skip those lines
    # explicitly instead of hiding every possible error behind a bare except.
    if m is None:
        continue
    email.append((m.group(1), m.group(2)))

# Single-argument print(...) is valid on both Python 2 and Python 3 and
# produces the same text as the original two-item print statements.
print('%s %s' % ('Total Email Extracted: ', len(email)))
if email:
    # Only show a sample when something matched; email[0] on an empty list
    # would raise IndexError.
    print('%s %s' % ('sample Email :', email[0]))
In [8]:
# Peek at the second extracted record. Evaluates to None (no cell output) when
# fewer than two records were extracted, instead of raising IndexError.
email[1] if len(email) > 1 else None
Out[8]:
In [17]:
# Smoke test on the first extracted record: turn the "(dot)" and "(at)"
# placeholders of its address back into "." and "@", then print the result.
replaceAt = re.compile(r'\(at\)')
replaceDot = re.compile(r'\(dot\)')
temp = replaceDot.sub('.', email[0][1])  # first pass: "(dot)" -> "."
print(replaceAt.sub('@', temp))          # second pass: "(at)" -> "@"
In [21]:
# Build the final list of (name, address) pairs: for every extracted record,
# replace "(dot)" with "." and "(at)" with "@" in the address part.
replaceDot = re.compile(r'\(dot\)')
replaceAt = re.compile(r'\(at\)')
PureEmail_data = [
    (rec[0], replaceAt.sub('@', replaceDot.sub('.', rec[1])))
    for rec in email
]
if PureEmail_data:
    # Only show a sample when there is one; indexing an empty list would
    # raise IndexError.
    print('%s %s' % ("sample result :", PureEmail_data[0]))
In [23]:
# Report how many converted records were produced.
print('%s %s' % ('Total Harvested Email :', len(PureEmail_data)))
# Remove the raw extraction list from the namespace. Any cell that reads
# `email` will raise NameError after this until the extraction cell is re-run.
del email
In [ ]: