In [4]:
f = open('raw_content.html', 'rb')
raw_content = f.read()
f.close()
In [8]:
import readability
#import random
#import numpy

# Re-run the extraction several times on the same raw HTML to check
# whether Document.summary() is deterministic.
text_outputs2 = []
#raw_content = bad_etos[0]['raw_content']
#import readability
for x in range(10):
    # Commented lines below document earlier debugging attempts
    # (module reloads, RNG seeding, encoding checks).
    #reload(lxml.etree)
    #reload(lxml.html)
    #reload(readability.cleaners)
    #reload(readability.encoding)
    #reload(readability)
    #reload(readability.htmls)
    #readability.htmls = lxml.html.HTMLParser(encoding='utf-8')
    #print readability.cleaners.html_cleaner
    #print readability.htmls.utf8_parser
    #random.seed(12345)
    #numpy.random.seed(12345)
    #print isinstance(raw_content, unicode)
    f = open('raw_content.html', 'rb')
    raw_content = f.read()
    f.close()
    text_outputs2.append(readability.Document(raw_content).summary())
print len(text_outputs2)
print len(set(text_outputs2))
In [6]:
import hashlib
# md5 each extracted summary so the repeated runs can be compared at a glance
[hashlib.md5(out_text).hexdigest() for out_text in text_outputs2]
Out[6]:
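For reference, the same determinism check can be packaged as a small helper. This is a minimal sketch assuming the readability-lxml package and the Python 2 session used above; the function name summary_digests and the run count are illustrative, and the md5 call simply mirrors the In [6] cell.

import hashlib
import readability

def summary_digests(path, runs=10):
    # Read the raw HTML bytes once, then extract the summary repeatedly
    # and record an md5 digest of each result.
    with open(path, 'rb') as f:
        raw = f.read()
    digests = []
    for _ in range(runs):
        summary = readability.Document(raw).summary()
        digests.append(hashlib.md5(summary).hexdigest())
    return digests

digests = summary_digests('raw_content.html')
# If the extraction is deterministic, every digest is identical,
# so the second count should be 1.
print len(digests), len(set(digests))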