This notebook contains our experiments with optimizing the internal heuristics of the python-readability module.
We forked python-readability and replaced its hard-coded constants with instance variables that we could then tweak to find their optimal values. (The forked repo is available at https://github.com/dlarochelle/python-readability .)
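For example, once a constant such as RETRY_LENGTH becomes an instance variable, it can be tweaked per document. A minimal sketch (assuming the fork is importable; the sample HTML string is made up):
import readability  # our fork, which exposes the former hard-coded constants as instance variables
sample_html = u"<html><body><p>" + u"Words of article text. " * 40 + u"</p></body></html>"  # made-up input
doc = readability.Document( sample_html )
doc.RETRY_LENGTH          # raises AttributeError if the fork does not expose this constant
doc.RETRY_LENGTH = 500    # override the former hard-coded value for this extraction only
print doc.summary()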
We experimented with two approaches: a manual, binary-search-style sweep over individual parameters, and SciPy's automated optimization routines.
We randomly split our data into training and test sets. Then, using a binary-search-like sweep, we found six parameters that could each be modified to improve the F1 score on the training set by more than 0.001: LOW_WEIGHT_LINK_DENSITY_THRESHOLD, MIN_SIBLING_SCORE_THRESHOLD, BEST_SCORE_MULTIPLIER_THRESHOLD, CONTENT_SCORE_DIV_BONUS, CLASS_WEIGHT_NEGATIVE_RE_PENALTY, and CLASS_WEIGHT_POSITVE_RE_BONUS. Unfortunately, our tests showed that these parameters are not independent: rerunning the analysis on the training data with all of the significantly improved parameters set to their optimized values yielded an improvement of only 0.000805, which is smaller than the improvement we obtained from tweaking each parameter individually.
We also tried evaluating the test set while modifying only the parameter that produced the largest improvement on the training set (CONTENT_SCORE_DIV_BONUS, a 0.005 improvement). However, accuracy on the test set actually decreased compared to using the default values.
SciPy includes routines that attempt automated parameter optimization. We experimented with them briefly, but found that they are mainly designed for functions in which even small changes to the parameters change the result. They therefore tended to accept the initial values as optimal, because small tweaks (e.g. +/- 0.001) had no effect on our objective.
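To illustrate the problem with a toy objective (not our extractor): our F1 surface is piecewise constant, so the numerical gradient around the starting point is zero and a local optimizer stops immediately.
import math
import scipy.optimize
# Toy objective that, like our F1 score, does not change under tiny parameter tweaks.
flat_step = lambda p : 1 - math.floor( p[0] * 10 ) / 10.0
result = scipy.optimize.minimize( flat_step, [ 0.23 ], method='L-BFGS-B', bounds=[ ( 0, 1 ) ] )
print result.x   # stays at 0.23: larger moves would improve the objective, but every small step looks flat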
SciPy also contains a basinhopping optimization function ( http://docs.scipy.org/doc/scipy-dev/reference/generated/scipy.optimize.basinhopping.html ), which automates selecting initial parameter values that are then passed to another optimization routine. However, it is slow: even with only a few parameters to optimize, it did not complete after running overnight. (It is slow because evaluating a single set of parameter values on our test set takes over a minute.) Still, if we want to attempt global optimization over multiple parameters, basinhopping is probably the right tool.
At the moment, we have decided to simply use the default version of python-readability instead of investing more development time in parameter optimization.
Although it would be interesting to explore basinhopping in more detail, that would require more development time and CPU resources. Additionally, there is the danger that by optimizing over multiple parameters, we would overfit the training data.
Internal Redmine issue for reference: https://cyber.law.harvard.edu/projectmanagement/issues/10722
In [1]:
import cPickle
import os.path
api_key = cPickle.load( file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'r' ) )
In [2]:
import cPickle
import os.path
cPickle.dump( api_key, file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) )
In [3]:
#import sys
#sys.path.append('../../foreign_modules/python/')
In [4]:
loc_key = 'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52'
In [5]:
import subprocess
import tempfile
import codecs
import time
import sys
In [6]:
import operator
In [7]:
def lines_to_comparable_text( lines ):
text = u"\n\n".join([ clean_for_comparison(line) for line in lines ])
if text == '':
text = u''
return text
def html_to_comparable_text( html_text ):
text = clean_for_comparison( html_text )
if text == '' or text == None:
text = u''
return text
In [8]:
import lxml
import html2text
def html_strip( str ):
if str.isspace() or str == '':
return u' '
if str == '<':
return u' '
try:
h = html2text.HTML2Text()
h.ignore_links = True
return h.handle( str )
#return lxml.html.fromstring(str).text_content()
except:
print "Unexpected error on string '" + str + "'" , sys.exc_info()[0]
raise
return u''
def clean_for_comparison( str ):
if len(str) > 0:
ret = html_strip( str )
else:
return str
return ret
In [9]:
import difflib
from IPython.display import HTML
from collections import Counter
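# ro_compare_base does a word-level diff between the expected (gold) text and the extracted text
# using difflib's Ratcliff/Obershelp-style matcher: words common to both sequences count as true
# positives, words only in the extraction as false positives, and words only in the expected text
# as false negatives.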
def ro_compare_base( actual_text, expected_text ):
words_expected = expected_text.split()
words_crf = actual_text.split()
differ = difflib.Differ( )
#print words_crf[:10]
#print words_expected[:10]
list( differ.compare( words_crf , words_expected ) )
counts = Counter([ d[0] for d in differ.compare( words_expected, words_crf ) ])
tp = counts[' ']
fp = counts['+']
fn = counts['-']
return { 'tp': tp, 'fp': fp, 'fn': fn }
def precision_recall_f1( tp, fp, fn ):
if float(tp+fp) == 0:
precision = 0.0
else:
precision = tp/float(tp+fp)
if float( tp + fn ) == 0:
recall = 0
else:
recall = tp/float( tp + fn )
if ( precision + recall ) > 0:
f1 = 2*(precision*recall)/( precision + recall )
else:
f1 = 0
ret = { 'precision': precision,
'recall': recall,
'f1': f1
}
return ret
def ratcliff_obershelp_compare( actual_text, expected_text ):
comp_results = ro_compare_base( actual_text, expected_text )
tp = comp_results[ 'tp' ]
fp = comp_results['fp']
fn = comp_results['fn']
ret = precision_recall_f1( tp, fp, fn )
return ret
def compare_with_expected( extractor_name, actual_text, actual_html, expected_text, story ):
#actual_text = lines_to_comparable_text( actual_lines )
#expected_text = lines_to_comparable_text( expected_lines )
ret = {}
ret[ extractor_name ] = ratcliff_obershelp_compare( actual_text, expected_text )
if compare_deduplicated:
dedup_text = remove_duplicate_sentences( actual_html, story )
ret[ extractor_name + "_dedup" ] = ratcliff_obershelp_compare( dedup_text, expected_text )
return ret
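# Quick sanity check of the scoring with made-up counts: 80 shared words, 20 extra words in the
# extraction, and 20 missing words give precision = recall = 0.8 and therefore F1 = 0.8.
print precision_recall_f1( tp=80, fp=20, fn=20 )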
In [10]:
def python_readability_results( eto, readability_options):
#readability_options['debug'] = True
raw_content = eto['raw_content']
extract_res = { 'extracted_html': extract_with_python_readability( raw_content , readability_options) }
if 'extracted_text' not in extract_res:
extract_res['extracted_text'] = html_to_comparable_text( extract_res['extracted_html' ] )
expected_text = eto['expected_text']
story = eto['story']
return ro_compare_base( actual_text=extract_res['extracted_text' ] , expected_text=expected_text )
In [11]:
regenerate_extractor_training_objects = True
regenerate_media_id_media_map = False
regenerate_comps_downloads = True
compare_deduplicated = False
In [12]:
extractor_training_objects = cPickle.load( file(
os.path.expanduser( '~/Dropbox/mc/extractor_test/extractor_training_objects.pickle' ), "rb" ) )
#cPickle.load( open( "extractor_traning_objects.pickle", "rb") )
print len( extractor_training_objects )
In [13]:
import math
import random
import numpy
def python_readability_f1_mean( extractor_training_objects, py_readability_options = {}):
#reload( difflib )
#reload( readability )
#reload( lxml )
#reload( html2text )
#dreload( difflib )
#dreload( readability )
#dreload( lxml )
#random.seed(12345)
#numpy.random.seed( 12345 )
print 'python_readability_f1_mean', py_readability_options
#py_readability_options = {}
#py_readability_options['retry_length'] = retry_length
#py_readability_options['min_text_length'] = min_text_length
download_results = {}
comp_res = []
for eto in extractor_training_objects:
comp_result = python_readability_results( eto, py_readability_options )
#comp_result = python_readability_results( eto, {} )
comp_res.append( comp_result )
download_results[ eto['downloads_id'] ] = comp_result
#return download_results
#comp_res = [ python_readability_results( eto, py_readability_options ) for eto in extractor_training_objects ]
#print comp_res
#f1_s = [ res['python_readability']['f1'] for res in comp_res ]
#ret = math.fsum( f1_s ) / len( f1_s )
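# Micro-average: sum tp/fp/fn across all documents and compute a single precision/recall/F1,
# rather than averaging the per-document F1 scores (the commented-out approach above).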
tp = sum( [ x['tp'] for x in comp_res ] )
fp = sum( [ x['fp'] for x in comp_res ] )
fn = sum( [ x['fn'] for x in comp_res ] )
comp_stats = precision_recall_f1( tp=tp, fp=fp, fn=fn )
print 'comp_stats', comp_stats['f1']
return comp_stats['f1' ]
#print 'python_readability_f1_mean', 'retry_length', retry_length, 'min_text_length', min_text_length, 'return', ret
#return ret
In [43]:
sys.path = ['/home/dlarochelle/dev_scratch/python-readability-non-determinism/'] + sys.path
import readability
COMMA_COUNT = 10
P_TO_INPUT_RATIO = 3
MIN_EMBED_COMMENT_LENGTH = 75
def extract_with_python_readability( raw_content, readability_options=None ):
if readability_options == None:
readability_options = {}
#readability.htmls = lxml.html.HTMLParser(encoding='utf-8')
doc = readability.Document( raw_content, **readability_options )
if 'LONG_NODE_LENGTH' in readability_options:
doc.LONG_NODE_LENGTH # ensure class varaible has been declared
doc.LONG_NODE_LENGTH = readability_options['LONG_NODE_LENGTH']
if 'P_TO_INPUT_RATIO' in readability_options:
doc.P_TO_INPUT_RATIO # ensure class varaible has been declared
doc.P_TO_INPUT_RATIO = readability_options['P_TO_INPUT_RATIO']
if 'LOW_WEIGHT_LINK_DENSITY_THRESHOLD' in readability_options:
doc.LOW_WEIGHT_LINK_DENSITY_THRESHOLD # ensure class varaible has been declared
doc.LOW_WEIGHT_LINK_DENSITY_THRESHOLD = readability_options['LOW_WEIGHT_LINK_DENSITY_THRESHOLD']
if 'HEADER_LINK_DENSITY_THRESHOLD' in readability_options:
doc.HEADER_LINK_DENSITY_THRESHOLD # ensure class varaible has been declared
doc.HEADER_LINK_DENSITY_THRESHOLD = readability_options['HEADER_LINK_DENSITY_THRESHOLD']
if 'HIGH_WEIGHT_LINK_DENSITY_THRESHOLD' in readability_options:
doc.HIGH_WEIGHT_LINK_DENSITY_THRESHOLD # ensure class varaible has been declared
doc.HIGH_WEIGHT_LINK_DENSITY_THRESHOLD = readability_options['HIGH_WEIGHT_LINK_DENSITY_THRESHOLD']
if 'MIN_SIBLING_SCORE_THRESHOLD' in readability_options:
doc.MIN_SIBLING_SCORE_THRESHOLD # ensure class varaible has been declared
doc.MIN_SIBLING_SCORE_THRESHOLD = readability_options['MIN_SIBLING_SCORE_THRESHOLD']
if 'BEST_SCORE_MULTIPLIER_THRESHOLD' in readability_options:
doc.BEST_SCORE_MULTIPLIER_THRESHOLD # ensure class varaible has been declared
doc.BEST_SCORE_MULTIPLIER_THRESHOLD = readability_options['BEST_SCORE_MULTIPLIER_THRESHOLD']
if 'LONG_NODE_LINK_DENSITY_THRESHOLD' in readability_options:
doc.LONG_NODE_LINK_DENSITY_THRESHOLD # ensure class varaible has been declared
doc.LONG_NODE_LINK_DENSITY_THRESHOLD = readability_options['LONG_NODE_LINK_DENSITY_THRESHOLD']
if 'COMMA_COUNT' in readability_options:
doc.COMMA_COUNT # ensure class varaible has been declared
doc.COMMA_COUNT = readability_options['COMMA_COUNT']
if 'MIN_EMBED_COMMENT_LENGTH' in readability_options:
doc.MIN_EMBED_COMMENT_LENGTH # ensure class varaible has been declared
doc.MIN_EMBED_COMMENT_LENGTH = readability_options['MIN_EMBED_COMMENT_LENGTH']
if 'TEXT_LENGTH_THRESHOLD' in readability_options:
doc.TEXT_LENGTH_THRESHOLD # ensure class varaible has been declared
doc.TEXT_LENGTH_THRESHOLD = readability_options['TEXT_LENGTH_THRESHOLD']
if 'RETRY_LENGTH' in readability_options:
doc.RETRY_LENGTH # ensure class varaible has been declared
doc.RETRY_LENGTH = readability_options['RETRY_LENGTH']
if 'SIBLING_CONTENT_LENGTH_SUM' in readability_options:
doc.SIBLING_CONTENT_LENGTH_SUM # ensure class varaible has been declared
doc.SIBLING_CONTENT_LENGTH_SUM = readability_options['SIBLING_CONTENT_LENGTH_SUM']
if 'CONTENT_SCORE_DIV_BONUS' in readability_options:
doc.CONTENT_SCORE_DIV_BONUS # ensure class varaible has been declared
doc.CONTENT_SCORE_DIV_BONUS = readability_options['CONTENT_SCORE_DIV_BONUS']
if 'CONTENT_SCORE_PRE_TD_BONUS' in readability_options:
doc.CONTENT_SCORE_PRE_TD_BONUS # ensure class varaible has been declared
doc.CONTENT_SCORE_PRE_TD_BONUS = readability_options['CONTENT_SCORE_PRE_TD_BONUS']
if 'CONTENT_SCORE_ADDRESS_OL_PENALTY' in readability_options:
doc.CONTENT_SCORE_ADDRESS_OL_PENALTY # ensure class varaible has been declared
doc.CONTENT_SCORE_ADDRESS_OL_PENALTY = readability_options['CONTENT_SCORE_ADDRESS_OL_PENALTY']
if 'CONTENT_SCORE_HEADER_PENALTY' in readability_options:
doc.CONTENT_SCORE_HEADER_PENALTY # ensure class varaible has been declared
doc.CONTENT_SCORE_HEADER_PENALTY = readability_options['CONTENT_SCORE_HEADER_PENALTY']
if 'CLASS_WEIGHT_NEGATIVE_RE_PENALTY' in readability_options:
doc.CLASS_WEIGHT_NEGATIVE_RE_PENALTY # ensure class varaible has been declared
doc.CLASS_WEIGHT_NEGATIVE_RE_PENALTY = readability_options['CLASS_WEIGHT_NEGATIVE_RE_PENALTY']
if 'CLASS_WEIGHT_POSITVE_RE_BONUS' in readability_options:
doc.CLASS_WEIGHT_POSITVE_RE_BONUS # ensure class varaible has been declared
doc.CLASS_WEIGHT_POSITVE_RE_BONUS = readability_options['CLASS_WEIGHT_POSITVE_RE_BONUS']
if 'CONTENT_SCORE_START' in readability_options:
doc.CONTENT_SCORE_START # ensure class varaible has been declared
doc.CONTENT_SCORE_START = readability_options['CONTENT_SCORE_START']
if 'CONTENT_SCORE_INNER_TEXT_MIN_BONUS' in readability_options:
doc.CONTENT_SCORE_INNER_TEXT_MIN_BONUS # ensure class varaible has been declared
doc.CONTENT_SCORE_INNER_TEXT_MIN_BONUS = readability_options['CONTENT_SCORE_INNER_TEXT_MIN_BONUS']
if 'LI_COUNT_REDUCTION' in readability_options:
doc.LI_COUNT_REDUCTION # ensure class varaible has been declared
doc.LI_COUNT_REDUCTION = readability_options['LI_COUNT_REDUCTION']
valid_options = ['LONG_NODE_LENGTH', 'P_TO_INPUT_RATIO', 'LOW_WEIGHT_LINK_DENSITY_THRESHOLD',
'HEADER_LINK_DENSITY_THRESHOLD', 'HIGH_WEIGHT_LINK_DENSITY_THRESHOLD', 'MIN_SIBLING_SCORE_THRESHOLD',
'BEST_SCORE_MULTIPLIER_THRESHOLD', 'LONG_NODE_LINK_DENSITY_THRESHOLD', 'COMMA_COUNT',
'MIN_EMBED_COMMENT_LENGTH', 'TEXT_LENGTH_THRESHOLD', 'RETRY_LENGTH', 'SIBLING_CONTENT_LENGTH_SUM',
'CONTENT_SCORE_DIV_BONUS', 'CONTENT_SCORE_PRE_TD_BONUS', 'CONTENT_SCORE_ADDRESS_OL_PENALTY',
'CONTENT_SCORE_HEADER_PENALTY', 'CLASS_WEIGHT_NEGATIVE_RE_PENALTY', 'CLASS_WEIGHT_POSITVE_RE_BONUS',
'CONTENT_SCORE_START', 'CONTENT_SCORE_INNER_TEXT_MIN_BONUS', 'LI_COUNT_REDUCTION']
for key in readability_options.keys():
#print key
assert key in valid_options, "invalid key " + key
#doc.RETRY_LENGTH = 100000
#doc.TEXT_LENGTH_THRESHOLD = 0
#doc.MAX_SIBLING_P_LINK_DENSITY
#doc.MAX_SIBLING_P_LINK_DENSITY = 2.0
title = doc.short_title()
summary = doc.summary()
ret = title + "\n\n" + summary
return ret
In [15]:
#dreload( readability )
In [16]:
python_readability_results( extractor_training_objects[0], {} )
Out[16]:
In [17]:
import datetime
test_sizes = [ 10, 100, 1000 ]
if False:
f1_expected = {}
start_time = datetime.datetime.now()
for test_size in test_sizes:
f1_expected[ test_size ] = python_readability_f1_mean( extractor_training_objects[ : test_size ], 250, 25 )
current_time = datetime.datetime.now()
print test_size, "total time", current_time - start_time
cPickle.dump( f1_expected,
file( os.path.expanduser( '~/Dropbox/mc/extractor_test/python_reability_expected.pickle'), "wb") )
f1_expected
In [18]:
f1_expected = cPickle.load(
file( os.path.expanduser( '~/Dropbox/mc/extractor_test/python_reability_expected.pickle'), "rb") )
f1_expected
Out[18]:
In [19]:
reload(readability)
test_sizes = [ 10, 100, 1000 ]
f1_actual = {}
start_time = datetime.datetime.now()
for test_size in test_sizes:
f1_actual[ test_size ] = python_readability_f1_mean( extractor_training_objects[ : test_size ],
# {'SIBLING_CONTENT_LENGTH_SUM': 0} )
{} )
current_time = datetime.datetime.now()
print test_size, "total time", current_time - start_time
print 'result for sample size ', test_size, f1_expected[ test_size ] == f1_actual[ test_size ]
In [20]:
run1 = python_readability_f1_mean( extractor_training_objects[:], 250, 25 )
In [ ]:
print "foo"
opt_function = lambda p : 1 - python_readability_f1_mean( extractor_training_objects[:50], { 'LOW_WEIGHT_LINK_DENSITY_THRESHOLD': p[0], 'HEADER_LINK_DENSITY_THRESHOLD': p[1], 'HIGH_WEIGHT_LINK_DENSITY_THRESHOLD': p[2] } )
opt_function( [ LOW_WEIGHT_LINK_DENSITY_THRESHOLD, HEADER_LINK_DENSITY_THRESHOLD, HIGH_WEIGHT_LINK_DENSITY_THRESHOLD ] )
In [ ]:
import scipy.optimize
opt_result = scipy.optimize.minimize( opt_function,
[ LOW_WEIGHT_LINK_DENSITY_THRESHOLD, HEADER_LINK_DENSITY_THRESHOLD, HIGH_WEIGHT_LINK_DENSITY_THRESHOLD ],
method='TNC', bounds=[ [0,1],[0,1],[0,1]],
options={ 'maxiter': 1, 'disp': True} )
opt_result
In [ ]:
import numpy as np
# Accept test for basinhopping: reject any proposed step that falls outside [0, 1] on any axis.
class MyBounds(object):
    def __init__(self, xmax=[1,1,1], xmin=[0,0,0] ):
        self.xmax = np.array(xmax)
        self.xmin = np.array(xmin)
    def __call__(self, **kwargs):
        x = kwargs["x_new"]
        tmax = bool(np.all(x <= self.xmax))
        tmin = bool(np.all(x >= self.xmin))
        return tmax and tmin
In [ ]:
import scipy.optimize
mybounds = MyBounds()
opt_result = scipy.optimize.basinhopping( opt_function,
[ LOW_WEIGHT_LINK_DENSITY_THRESHOLD, HEADER_LINK_DENSITY_THRESHOLD, HIGH_WEIGHT_LINK_DENSITY_THRESHOLD ],
minimizer_kwargs={ 'method':"L-BFGS-B", 'bounds': [ [0,1],[0,1],[0,1]] },
accept_test = mybounds, disp=True )
opt_result
In [21]:
# Default values of the python-readability constants, used below as the starting points for optimization.
LOW_WEIGHT_LINK_DENSITY_THRESHOLD = 0.2
HEADER_LINK_DENSITY_THRESHOLD = 0.33
HIGH_WEIGHT_LINK_DENSITY_THRESHOLD = 0.5
MIN_SIBLING_SCORE_THRESHOLD = 10
BEST_SCORE_MULTIPLIER_THRESHOLD = 0.2
LONG_NODE_LINK_DENSITY_THRESHOLD = 0.25
LONG_NODE_LENGTH = 80
COMMA_COUNT = 10
P_TO_INPUT_RATIO = 3
MIN_EMBED_COMMENT_LENGTH = 75
type( COMMA_COUNT )
In [38]:
params_to_optimize = [
{'param_name':'LONG_NODE_LENGTH', 'start_value': 80},
{'param_name':'P_TO_INPUT_RATIO', 'start_value': 3},
{'param_name':'LOW_WEIGHT_LINK_DENSITY_THRESHOLD', 'start_value': 0.2},
{'param_name':'HEADER_LINK_DENSITY_THRESHOLD', 'start_value': 0.33},
{'param_name':'HIGH_WEIGHT_LINK_DENSITY_THRESHOLD', 'start_value': 0.5},
{'param_name':'MIN_SIBLING_SCORE_THRESHOLD', 'start_value': 10},
{'param_name':'BEST_SCORE_MULTIPLIER_THRESHOLD', 'start_value': 0.2},
{'param_name':'LONG_NODE_LINK_DENSITY_THRESHOLD', 'start_value': 0.25},
{'param_name':'COMMA_COUNT', 'start_value': 10},
{'param_name':'MIN_EMBED_COMMENT_LENGTH', 'start_value': 75},
{'param_name':'TEXT_LENGTH_THRESHOLD', 'start_value': 25},
{'param_name':'RETRY_LENGTH', 'start_value': 250} ,
{'param_name': 'SIBLING_CONTENT_LENGTH_SUM', 'start_value': 1000},
{'param_name': 'CONTENT_SCORE_DIV_BONUS', 'start_value': 5},
{'param_name': 'CONTENT_SCORE_PRE_TD_BONUS', 'start_value': 3},
{'param_name': 'CONTENT_SCORE_ADDRESS_OL_PENALTY', 'start_value': 3},
{'param_name': 'CONTENT_SCORE_HEADER_PENALTY', 'start_value': 5 },
{'param_name': 'CLASS_WEIGHT_NEGATIVE_RE_PENALTY', 'start_value': 25},
{'param_name': 'CLASS_WEIGHT_POSITVE_RE_BONUS', 'start_value': 25 },
{'param_name': 'CONTENT_SCORE_START', 'start_value': 1},
{'param_name': 'CONTENT_SCORE_INNER_TEXT_MIN_BONUS', 'start_value': 3},
#{'param_name': 'CONTENT_SCORE_GRAND_PARENT_BONUS_FACTOR', 'start_value': 2.0, 'non_zero': True},
{'param_name': 'LI_COUNT_REDUCTION', 'start_value': 100}
]
for value_dict in params_to_optimize:
value_dict[ 'make_int'] = type( value_dict['start_value']) == int
print len( params_to_optimize )
params_to_optimize
Out[38]:
In [39]:
for param_to_opt in params_to_optimize:
param_name = param_to_opt['param_name']
print "if '" + param_name + "' in readability_options:"
print ' doc.' + param_name +' # ensure class varaible has been declared'
print ' doc.' + param_name + " = readability_options['" + param_name + "']"
print
param_names = [ param_to_opt['param_name'] for param_to_opt in params_to_optimize ]
print 'valid_options = ',
print param_names
In [62]:
def adjusted_mean( a, b, make_int ):
ret = ( a + b ) / 2
if make_int:
ret = int ( ret )
else:
ret = round( ret, 2 )
return ret
import random
random.seed( 12345 )
extractor_training_subset = extractor_training_objects[:]
random.shuffle( extractor_training_subset )
#extractor_training_subset = extractor_training_subset[ : ( len( extractor_training_objects)/2 ) ]
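# binary_search_opt_param: starting from the parameter's default value, evaluate F1 at the current
# value, at half the value, and at double the value; if the better of the two neighbors beats the
# current value, jump to it, otherwise bisect between the current value and that neighbor. Every 5
# iterations, bisect between the two best values seen so far, and stop once they are within
# stop_delta of each other (about 1 for integer parameters, about 0.01 for floats).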
def binary_search_opt_param( value_to_optimize, start_value, make_int):
current = start_value
prev = {}
iteration = 0
if make_int:
stop_delta = 1.01
else:
stop_delta = 0.011
max_iterations = 100
funct = lambda param : python_readability_f1_mean( extractor_training_subset, { value_to_optimize: param } )
while True:
iteration += 1
if iteration > max_iterations:
break
print 'iteration', iteration
if make_int:
current = int( current )
print 'current', current
if current not in prev:
prev[ current ] = funct( current )
if iteration % 5 == 0 and iteration > 0 :
sorted_keys = list(reversed(sorted( prev.keys() )))
best_keys = sorted( sorted_keys, key = lambda k : prev[ k ] )
best_keys.reverse()
print 'best_keys', [ ( k, prev[k] ) for k in best_keys ]
if abs( best_keys[1] - best_keys[0] ) <= stop_delta:
print 'stopping for small delta'
print current
break
current = adjusted_mean(best_keys[0], best_keys[1], make_int )
if current in prev:
current = adjusted_mean( best_keys[0], current, make_int )
# just pick a point between the current best and the next closest value
if current in prev:
print 'falling back in heuristic'
best_index = sorted_keys.index( best_keys[ 0 ] )
if best_index == 0:
comp_index = 1
else:
comp_index = best_index - 1
current = adjusted_mean( sorted_keys[ best_index ], sorted_keys[ comp_index ], make_int )
if current in prev:
assert abs( best_keys[ 0 ] - current ) <= stop_delta
print "stopping for small delta", best_keys[0], current
break
print 'continue'
continue
lower = round( current/2.0, 2 )
higher = round( current*2.0, 2)
if make_int:
lower = int( lower )
higher = int( higher )
if lower not in prev:
prev[lower] = funct( lower )
if higher not in prev:
prev[higher] = funct( higher )
if prev[lower] >= prev[higher]:
compare_point = lower
else:
compare_point = higher
if prev[ current ] > prev[ compare_point ]:
current = adjusted_mean(current, compare_point, make_int )
else:
current = compare_point
ret = { 'start_value': start_value,
'start_result': prev[start_value],
'opt_value': best_keys[0],
'opt_result': prev[ best_keys[0] ],
'param_name': value_to_optimize
}
return ret
In [60]:
len( extractor_training_subset )
Out[60]:
In [45]:
python_readability_f1_mean( extractor_training_subset, {} )
Out[45]:
In [61]:
opt_results = []
start_time = datetime.datetime.now()
for param_info in params_to_optimize:
print param_info
opt_results.append( binary_search_opt_param( param_info['param_name'], param_info['start_value'], param_info['make_int'] ))
for opt_result in opt_results:
if opt_result['opt_result'] > opt_result['start_result']:
improvement = opt_result['opt_result'] - opt_result['start_result']
if improvement < 0.001:
print opt_result['param_name'], " - SMALL OPT - start", opt_result['start_value'], opt_result['start_result'], "opt to", opt_result['opt_value'],
print opt_result['opt_result'], 'improvement', improvement
else:
print opt_result['param_name'], "- LARGE OPT - start", opt_result['start_value'], opt_result['start_result'], "opt to", opt_result['opt_value'],
print opt_result['opt_result'], 'improvement', improvement
else:
print opt_result['param_name'], "no improvement - start", opt_result['start_value'], opt_result['start_result'], "opt to", opt_result['opt_value'], opt_result['opt_result']
end_time = datetime.datetime.now()
print 'total time', end_time - start_time
In [73]:
default_f1 = python_readability_f1_mean( extractor_training_subset[ ( len( extractor_training_objects)/2 ) : ], {} )
print 'F1 with default params', default_f1
In [67]:
opt_params = {
'TEXT_LENGTH_THRESHOLD': 48,
'CONTENT_SCORE_DIV_BONUS': 18,
'CLASS_WEIGHT_POSITVE_RE_BONUS': 6
}
opt_params = {
'LOW_WEIGHT_LINK_DENSITY_THRESHOLD': 0.69,
'MIN_SIBLING_SCORE_THRESHOLD': 30 ,
'BEST_SCORE_MULTIPLIER_THRESHOLD': 0.01,
'CONTENT_SCORE_DIV_BONUS': 41,
'CLASS_WEIGHT_NEGATIVE_RE_PENALTY': 36,
'CLASS_WEIGHT_POSITVE_RE_BONUS': 7}
opt_f1 = python_readability_f1_mean(extractor_training_subset[ ( len( extractor_training_objects)/2 ) : ], opt_params )
print 'F1 with optimized params', opt_f1
In [74]:
opt_params = {
'CONTENT_SCORE_DIV_BONUS': 41,
}
opt_f1 = python_readability_f1_mean( extractor_training_subset[ ( len( extractor_training_objects)/2 ) : ], opt_params )
print 'F1 with optimized params', opt_f1
In [27]:
make_int = True
value_to_optimize = 'min_text_length'
start_value = 25
binary_search_opt_param( value_to_optimize, start_value, make_int )
Out[27]:
In [19]:
#random.seed(12345)
#numpy.random.seed( 12345 )
#reload( difflib )
#reload( readability )
#reload( lxml )
#dreload( difflib )
#dreload( readability )
#dreload( lxml )
run2 = python_readability_f1_mean( extractor_training_objects[:], 250, 25 )
In [20]:
print run1 == run2
print run1.keys() == run2.keys()
for k in run1.keys():
if run1[ k] != run2[k]:
print k, run1[k], run2[k]
In [ ]:
#random.seed(12345)
#numpy.random.seed( 12345 )
#reload( difflib )
#reload( readability )
#reload( lxml )
#dreload( difflib )
#dreload( readability )
#dreload( lxml )
run3 = python_readability_f1_mean( extractor_training_objects, 250, 25 )
In [ ]:
print run1 == run3
print run1.keys() == run3.keys()
for k in run1.keys():
if run1[ k] != run3[k]:
print k, run1[k], run3[k]
In [ ]:
bad_etos = [ eto for eto in extractor_training_objects if eto['downloads_id'] == 590957745 ]
for x in range( 10):
print python_readability_f1_mean( bad_etos, 250, 25 )
In [ ]:
bad_etos = [ eto for eto in extractor_training_objects if eto['downloads_id'] == 590957745 ]
for x in range( 10):
print python_readability_results( bad_etos[0], {} )
In [ ]:
text_outputs = []
for x in range( 10):
text_outputs.append( extract_with_python_readability( bad_etos[0]['raw_content'] ) )
print len( text_outputs )
print len(set( text_outputs ) )
In [ ]:
isinstance( raw_content, unicode )
import hashlib
[ hashlib.md5( out_text ).hexdigest() for out_text in text_outputs2 ]
In [ ]:
text_outputs2 = []
raw_content = bad_etos[0]['raw_content']
#import readability
for x in range( 10):
reload( lxml.etree )
reload ( lxml.html )
reload( readability.cleaners )
reload( readability.encoding )
reload( readability)
reload( readability.htmls )
#readability.htmls = lxml.html.HTMLParser(encoding='utf-8')
print readability.cleaners.html_cleaner
print readability.htmls.utf8_parser
random.seed(12345)
numpy.random.seed( 12345 )
print isinstance( raw_content, unicode )
text_outputs2.append( readability.Document( raw_content ).summary() )
print len( text_outputs2 )
print len(set( text_outputs2 ) )
In [ ]:
set( text_outputs )
In [ ]:
sys.path
In [ ]:
readability.readability.total_siblings = 0
readability.readability.total_eval_siblings = 0
readability.readability.total_candidate_siblings = 0
readability.readability.num_articles = 0
readability.readability.articles_with_siblings = 0
for eto in extractor_training_objects:
extract_with_python_readability( eto['raw_content'] )
print readability.readability.total_siblings
print readability.readability.num_articles
print float( readability.readability.total_siblings ) / readability.readability.num_articles
print readability.readability.articles_with_siblings
print readability.readability.total_candidate_siblings
print float( readability.readability.total_candidate_siblings ) / readability.readability.num_articles
print
print readability.readability.total_eval_siblings
print float( readability.readability.total_eval_siblings ) / readability.readability.num_articles
In [ ]:
#comp_res = [ comp_extractors( eto ) for eto in extractor_training_objects[ :10] ]
#print comp_res
#f1_s = [ res['python_readibilty']['f1'] for res in comp_res ]
#print f1_s
#print numpy.mean( f1_s )
print python_readability_f1_mean( extractor_training_objects, 100000, 25 )
print python_readability_f1_mean( extractor_training_objects, -10, 25 )
print python_readability_f1_mean( extractor_training_objects, 10, 15 )
In [ ]:
opt_fun = lambda p : 1 - python_readability_f1_mean( extractor_training_objects[:400], retry_length=p[0], min_text_length=p[1])
#print opt_fun( [0, 250] )
#print opt_fun( [1000000000, 250] )
In [ ]:
from operator import itemgetter, attrgetter, methodcaller
defaults = { 'retry_length': 250,
'min_text_length':25
}
current_min_text_length = defaults['min_text_length']
range = [0, 100]
prev_values = []
eval_with_param = lambda param : { 'param': param, 'result': opt_fun( [param, defaults['min_text_length'], param ]) }
prev_values = [ eval_with_param( param ) for param in [ 0, 300]]
#print prev_values
#prev_values.sort( key=itemgetter('result') )
#print prev_values
#prev_values.pop()
#print prev_values
old_value = opt_fun( [ defaults['retry_length'], current_min_text_length] )
while True:
prev_values.sort( key=itemgetter('result') )
if prev_values[0]['param'] == prev_values[1]['param']:
prev_values.pop()
break
if abs(prev_values[0]['param'] - prev_values[1]['param']) == 1:
prev_values.pop()
break
print prev_values
new_param = int ( ( prev_values[0]['param'] + prev_values[1]['param'])/2 )
prev_values.pop()
if new_param == prev_values[0]['param']:
break
prev_values.append( eval_with_param( new_param ) )
print prev_values
In [18]:
python_readability_cache = {}
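# Memoize F1 results keyed on the training-object list (by id()) and the integer-truncated
# (retry_length, min_text_length) pair, since a single evaluation takes over a minute and the
# scipy optimizers repeatedly probe nearly identical parameter values.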
def cached_readability_f1( extractor_training_objects, p ):
print 'cached_readability_f1( extractor_training_objects', p
if id( extractor_training_objects ) not in python_readability_cache:
python_readability_cache[ id( extractor_training_objects ) ] = {}
retry_length = max( -1, int( p[0] ) )
min_text_length = max( -1, int(p[1] ) )
if retry_length not in python_readability_cache[ id( extractor_training_objects ) ]:
python_readability_cache[ id( extractor_training_objects ) ][ retry_length ] = {}
if min_text_length not in python_readability_cache[ id( extractor_training_objects ) ][ retry_length ]:
print 'cached_readability_f1 recalculating ', retry_length, min_text_length
ret = python_readability_f1_mean( extractor_training_objects,
retry_length=retry_length, min_text_length=min_text_length)
python_readability_cache[ id( extractor_training_objects ) ][ retry_length ][min_text_length] = ret
else:
print 'cached_readability_f1 returning from cache', retry_length, min_text_length
ret = python_readability_cache[ id( extractor_training_objects ) ][ retry_length ][min_text_length]
print 'f1', ret
return ret
In [19]:
#id(extractor_training_objects)
cached_readability_f1( extractor_training_objects, [250, 25 ] )
Out[19]:
In [20]:
import scipy.optimize
#f = lambda var : opt_fun( [defaults['retry_length'], var])
#f( 2 )
opt_result = scipy.optimize.minimize( lambda p : 1- cached_readability_f1( extractor_training_objects, p ),
[ 150, 20 ], method='SLSQP', bounds=[ [1,500],[0,50]],
options={ 'maxiter': 1, 'disp': True} )
print opt_result
opt_result
#opt_result.values()
Out[20]:
In [21]:
import scipy.optimize
#f = lambda var : opt_fun( [defaults['retry_length'], var])
#f( 2 )
opt_result = scipy.optimize.minimize_scalar(
lambda min_text_length : 1-
cached_readability_f1( extractor_training_objects, [ 250, min_text_length] ),
bounds=[ 0, 50 ], method='Brent',
options={ 'maxiter': 10, 'disp': True} )
print opt_result
opt_result
#opt_result.values()
Out[21]:
In [22]:
import scipy.optimize
opt_result = scipy.optimize.brute( lambda min_text_length : 1 -
cached_readability_f1( extractor_training_objects, [ 250, min_text_length] ),
ranges=[ slice( 0, 50, 1 )], disp=True, full_output=True)
print opt_result
opt_result
#opt_result = scipy.optimize.minimize( opt_fun, [ 0, 0 ], method='L-BFGS-B', bounds=[ [0,400],[0,100]],options={ 'maxiter': 2000} )
#print opt_result
#opt_result.values()
Out[22]:
In [23]:
import pandas as pd
zip( opt_result[2], [ 1 - x for x in opt_result[3]])
Out[23]:
In [24]:
print opt_fun( [10, 0] )
print opt_fun( [250, 25] )
opt_result
In [ ]:
#opt_result = scipy.optimize.brute( opt_fun, ranges=[ slice( 0,400, 1 ),slice( 0,1000, 1 )])
opt_result = scipy.optimize.brute( opt_fun, ranges=[ slice( 249,251, 1 ),slice( 22,27, 1 )], disp=True, full_output=True)
print opt_result
opt_result
In [ ]:
start_time = datetime.datetime.now()
batch = extractor_training_objects[:400]
print python_readability_f1_mean( batch, 250, 25 )
end_time = datetime.datetime.now()
print "Total_time", end_time - start_time
print "Time per download", (end_time - start_time)/ (len(batch) )
In [ ]:
import datetime
if regenerate_comps_downloads:
comps_downloads = []
processed = 0
skipped = 0
start_time = datetime.datetime.now()
e=None
for extractor_training_object in extractor_training_objects[:100]:
print 'processed ', processed
print 'skipped ', skipped
print extractor_training_object[ 'downloads_id']
try:
res = comp_extractors( extractor_training_object )
#print res
comps_downloads.append( res )
processed += 1
except Exception, e:
print "error on download{}".format( extractor_training_object[ 'downloads_id'] )
e = sys.exc_info()
import traceback
traceback.print_exc()
print e
#raise e
skipped += 1
end_time = datetime.datetime.now()
print "Total_time", end_time - start_time
print "Time per download", (end_time - start_time)/ (processed + skipped )
#cPickle.dump( comps_downloads, file(
# os.path.expanduser( "~/Dropbox/mc/extractor_test/comps_downloads.pickle"), "wb"))
e
#extractor_training_objects
In [ ]:
#comps_downloads = cPickle.load( file(
# os.path.expanduser( "~/Dropbox/mc/extractor_test/comps_downloads.pickle"), "rb") )
In [ ]:
comps_downloads[0]
In [ ]:
df = get_data_frame_from_comparision_objects( comps_downloads )
print_results_by_measurement_type( df )
In [ ]:
non_spidered_downloads = remove_spidered_downloads( comps_downloads )
df = get_data_frame_from_comparision_objects( non_spidered_downloads )
print_results_by_measurement_type( df )
In [ ]:
print "spidered"
df = get_data_frame_from_comparision_objects( only_spidered_downloads( comps_downloads ) )
print_results_by_measurement_type( df )
In [ ]:
regional = { 2453107 }
print "region / pew knight study / 245107 "
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, regional ) )
print_results_by_measurement_type( df )
ap_english_us_top_25 = { 8875027 }
print "ap_english_us_top25 / 8875027 "
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, ap_english_us_top_25 ) )
print_results_by_measurement_type( df )
political_blogs = { 125 }
print "political blogs / 125"
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, political_blogs ) )
print_results_by_measurement_type( df )
russian = { 7796878 }
print 'russian'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, russian ) )
print_results_by_measurement_type( df )
print 'brazil'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, {8877968, 8877969, 8877973, 8877970 } ) )
print_results_by_measurement_type( df )
arabic = { 8878255 }
print 'arabic'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, arabic ) )
print_results_by_measurement_type( df )
In [ ]:
boiler_pipe_extractor_training_objects = cPickle.load( open( "boiler_pipe_google_news_extractor_training_objects.pickle", "rb") )
#eto = extractor_training_objects[ 0 ]
#eto.keys()
#print eto['expected_text']
#get_extraction_results( eto )
#comp_extractors ( eto )
comps_downloads_boiler_pipe = []
processed = 0
skipped = 0
start_time = datetime.datetime.now()
e=None
for extractor_training_object in boiler_pipe_extractor_training_objects[:]:
try:
res = comp_extractors( extractor_training_object )
#print res
comps_downloads_boiler_pipe.append( res )
processed += 1
except Exception, e:
print "error on download{}".format( extractor_training_object[ 'downloads_id'] )
e = sys.exc_info()
import traceback
traceback.print_exc()
print e
#raise e
skipped += 1
print 'processed', processed, 'skipped', skipped
#extraction_results.append( er )
end_time = datetime.datetime.now()
print "Total_time", end_time - start_time
print "Time per download", (end_time - start_time)/ (processed + skipped )
res.keys()
In [ ]:
df = get_data_frame_from_comparision_objects( comps_downloads_boiler_pipe )
print_results_by_measurement_type( df )