This notebook evaluates both Media Cloud's internal extractors and third-party FLOSS extractor libraries across a corpus of hand-annotated articles.
Readers may wish to skip to the results section at the end.
In [1]:
import cPickle
import os.path

api_key = cPickle.load( file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) )
In [2]:
import cPickle
import os.path
cPickle.dump( api_key, file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) )
In [3]:
import sys
sys.path.append('../../foreign_modules/python/')
In [4]:
loc_key = 'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52'
In [5]:
import mediacloud, requests, csv, sys, os, json, cPickle

def get_download( downloads_id ):
    download = requests.get( 'https://api.mediacloud.org/api/v2/downloads/single/' + str( downloads_id ) + '?key=' + api_key )
    return download.json()[0]

def extract_story( preprocessed_lines, title, description, extractor_method ):
    extract_params = { 'key': loc_key, 'preprocessed_lines': preprocessed_lines,
                       'story_title': title, 'story_description': description,
                       'extractor_method': extractor_method }
    extract_result = requests.put( 'http://0:3000/api/v2/extractlines/extract',
                                   data=json.dumps( extract_params ),
                                   headers={ 'Content-type': 'application/json' } )
    extract_result.raise_for_status()
    return extract_result.json()

def get_story_lines( raw_content ):
    story_lines_params = { 'key': loc_key, 'body_html': raw_content }
    headers = { 'Content-type': 'application/json' }
    story_lines = requests.put( 'http://0:3000/api/v2/extractlines/story_lines',
                                data=json.dumps( story_lines_params ),
                                params={ 'key': loc_key }, headers=headers )
    story_lines.raise_for_status()
    return story_lines
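For orientation, the helpers above chain together roughly as follows. This is a sketch only: the downloads_id is hypothetical and the local extractor API must be running on port 3000.

download = get_download( 12345 )  # hypothetical downloads_id
story_lines = get_story_lines( download[u'raw_content'] )
result = extract_story( story_lines.json(), u'story title', u'story description', 'HeuristicExtractor' )
print result['extracted_html'][:200]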
In [6]:
import subprocess
import tempfile
import codecs
from lxml import html, etree

def extract_with_boilerpipe( raw_content ):
    # Write the raw HTML to a temp file, run the boilerpipe jar over it, and
    # collect the spans that boilerpipe marked as article content.
    with tempfile.NamedTemporaryFile( suffix='.html', delete=False ) as t:
        UTF8Writer = codecs.getwriter( 'utf8' )
        t.file = UTF8Writer( t.file )
        t.file.write( raw_content )

    input_file = t.name
    output_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )
    output_file = output_tmp.name

    subprocess.check_output( [ 'java', '-jar',
        '/home/dlarochelle/dev_scratch/boilerpipe_test/out/artifacts/boilerpipe_test_jar/boilerpipe_test.jar',
        input_file, output_file ] )

    f = open( output_file, 'rb' )
    annotated_file_str = f.read()
    f.close()
    output_tmp.close()

    tree = html.fromstring( annotated_file_str )
    spans = tree.xpath( '//span[@class="x-boilerpipe-mark1"]' )
    boiler_pipe_lines = [ etree.tostring( s ) for s in spans ]
    return { 'extracted_html': "\n\n".join( boiler_pipe_lines ) }
In [7]:
#f = open( '/tmp/tmp01CV6F.html' )
#annotated_file_str = f.read()
#tree = html.fromstring( annotated_file_str )
#spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
#span = spans[0]
#etree.tostring( span )
In [8]:
import readability

def extract_with_python_readability( raw_content ):
    # doc.summary() returns the cleaned article as HTML; the title is
    # prepended as plain text and everything is reduced to text later.
    doc = readability.Document( raw_content )
    return doc.short_title() + "\n\n" + doc.summary()
In [9]:
import goose

def extract_with_python_goose( raw_content ):
    g = goose.Goose()
    r = g.extract( raw_html=raw_content )
    return r.title + "\n\n" + r.cleaned_text
In [10]:
import justext

def extract_with_justext( raw_content ):
    ret = []
    paragraphs = justext.justext( raw_content, justext.get_stoplist( 'English' ) )
    for p in paragraphs:
        if not p.is_boilerplate:
            ret.append( p.text )
    return "\n\n".join( ret )
In [11]:
import operator

def get_extractor_training_text( downloads_id, preprocessed_lines ):
    extractor_training_lines_result = requests.get(
        'https://api.mediacloud.org/api/v2/extractlines/extractor_training_lines/' + str( downloads_id ),
        headers={ 'Content-type': 'application/json' },
        params={ 'key': api_key } )
    extractor_training_lines_result.raise_for_status()
    extractor_training_lines_result = extractor_training_lines_result.json()
    line_numbers = sorted( [ x['line_number'] for x in extractor_training_lines_result ] )
    return operator.itemgetter( *line_numbers )( preprocessed_lines )

def get_extracted_text( extractor_results ):
    included_line_numbers = extractor_results['included_line_numbers']
    download_lines = extractor_results['download_lines']
    if len( included_line_numbers ) == 0:
        return []
    else:
        return operator.itemgetter( *included_line_numbers )( download_lines )
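One subtlety with operator.itemgetter: given a single index it returns the bare item rather than a one-element tuple, so a story with a single annotated line comes back as a string rather than a sequence. A quick illustration:

import operator
lines = [ 'a', 'b', 'c' ]
print operator.itemgetter( 0 )( lines )     # 'a' -- a bare item, not ('a',)
print operator.itemgetter( 0, 2 )( lines )  # ('a', 'c')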
In [12]:
def lines_to_comparable_text( lines ):
    return u"\n\n".join( [ clean_for_comparison( line ) for line in lines ] )

def html_to_comparable_text( html_text ):
    text = clean_for_comparison( html_text )
    if text is None:
        text = u''
    return text
In [13]:
def get_ancestors( element ):
    ancestors = [ element ]
    ancestor = element.getparent()
    while ancestor is not None:
        ancestors.append( ancestor )
        ancestor = ancestor.getparent()
    return ancestors
In [14]:
def text_from_lxml_object( obj ):
    # XPath text() results come back as special string subclasses; anything
    # else is serialized to its text content.
    if isinstance( obj, ( etree._ElementStringResult, etree._ElementUnicodeResult ) ):
        return u'' + obj
    try:
        return etree.tostring( obj, method='text', encoding="UTF-8" )
    except:
        print type( obj )
        print obj
        raise
In [15]:
from lxml import etree

downloads_id = 582817308
download = get_download( downloads_id )
raw_content = download[u'raw_content']
with open( '/tmp/' + str( downloads_id ), 'wb' ) as f:
    f.write( raw_content.encode( 'utf-8' ) )
In [16]:
def remove_duplicate_sentences( article_html, story ):
    # Split the extracted HTML into sentences via the local API, then drop any
    # sentence that also appears in other stories from the same media source.
    extract_params = { 'key': loc_key, 'story_html': article_html }
    extract_result = requests.put( 'http://0:3000/api/v2/extractlines/sentences_from_html',
                                   data=json.dumps( extract_params ),
                                   headers={ 'Content-type': 'application/json' } )
    extract_result.raise_for_status()
    sentences = extract_result.json()
    non_duplicate_sentences = [ sentence for sentence in sentences if not sentence_is_duplicate( sentence, story ) ]
    return u"\n".join( non_duplicate_sentences )
In [17]:
def text_children( element ):
    # Direct text nodes of an element: at most its .text and its .tail.
    ret = [ t for t in element.xpath( "//text()" ) if t.getparent() == element ]
    assert len( ret ) <= 2
    if len( ret ) == 2:
        assert ret[0].is_text
        assert ret[1].is_tail
    for r in ret:
        if r.is_text:
            assert element.text == r
        else:
            assert r.is_tail
            assert element.tail == r
    return ret
In [18]:
def html_strip( s ):
    if s.isspace() or s == '':
        return u' '
    if s == '<':
        return u' '
    try:
        return html.fromstring( s ).text_content()
    except:
        print "Unexpected error on string '" + s + "'", sys.exc_info()[0]
        return u''

def clean_for_comparison( s ):
    if len( s ) > 0:
        return html_strip( s )
    else:
        return s
In [19]:
def extract_with_mc_extractor( eto, method ):
    story = eto['story']
    preprocessed_lines = eto['preprocessed_lines']
    title = story[u'title']
    description = story[u'description']
    extract_result = extract_story( preprocessed_lines, title, description, method )
    return { 'extracted_html': extract_result['extracted_html'] }

def extract_with_heur( eto ):
    return extract_with_mc_extractor( eto, 'HeuristicExtractor' )

def extract_with_crf( eto ):
    return extract_with_mc_extractor( eto, 'CrfExtractor' )
In [20]:
import difflib
from IPython.display import HTML
from collections import Counter

def ratcliff_obershelp_compare( actual_text, expected_text ):
    # Word-level diff between expected and actual text: words present in both
    # count as true positives, extra words in the actual text as false
    # positives, and missing expected words as false negatives.
    words_expected = expected_text.split()
    words_actual = actual_text.split()
    differ = difflib.Differ()
    counts = Counter( [ d[0] for d in differ.compare( words_expected, words_actual ) ] )
    tp = counts[' ']
    fp = counts['+']
    fn = counts['-']
    if float( tp + fp ) == 0:
        precision = 0.0
    else:
        precision = tp / float( tp + fp )
    if float( tp + fn ) == 0:
        recall = 0.0
    else:
        recall = tp / float( tp + fn )
    if ( precision + recall ) > 0:
        f1 = 2 * ( precision * recall ) / ( precision + recall )
    else:
        f1 = 0.0
    return { 'precision': precision,
             'recall': recall,
             'f1': f1 }

def compare_with_expected( extractor_name, actual_text, actual_html, expected_text, story ):
    ret = {}
    ret[extractor_name] = ratcliff_obershelp_compare( actual_text, expected_text )
    dedup_text = remove_duplicate_sentences( actual_html, story )
    ret[extractor_name + "_dedup"] = ratcliff_obershelp_compare( dedup_text, expected_text )
    return ret
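As a sanity check of the scoring, here is a small worked example. Differ.compare() prefixes words common to both texts with ' ', words only in the expected text with '-', and words only in the actual text with '+'. The example below yields tp=4, fn=1 ('fox'), fp=1 ('dog'):

print ratcliff_obershelp_compare( actual_text=u'the quick brown dog jumps',
                                  expected_text=u'the quick brown fox jumps' )
# precision, recall and f1 all come out to 0.8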
In [21]:
ratcliff_obershelp_compare( actual_text='foo', expected_text='bar foo baz BAST')
Out[21]:
{'f1': 0.4, 'precision': 1.0, 'recall': 0.25}
In [22]:
def get_extraction_results( eto ):
    raw_content = eto['raw_content']
    ret = {}
    ret['heur'] = extract_with_heur( eto )
    ret['crf'] = extract_with_crf( eto )
    ret['boiler_pipe'] = extract_with_boilerpipe( raw_content )
    ret['python_readability'] = { 'extracted_html': extract_with_python_readability( raw_content ) }
    ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
    ret['justext'] = { 'extracted_html': extract_with_justext( raw_content ) }
    for method, result in ret.iteritems():
        if 'extracted_text' not in result:
            result['extracted_text'] = html_to_comparable_text( result['extracted_html'] )
    return ret
In [23]:
def compare_extractors_for_download( downloads_id ):
    eto = create_extractor_training_object( downloads_id )
    return comp_extractors( eto )

def comp_extractors( eto ):
    downloads_id = eto['downloads_id']
    media_id = eto['media_id']
    story = eto['story']
    expected_text = eto['expected_text']
    extraction_results = get_extraction_results( eto )
    comp_results = {}
    comp_results['downloads_id'] = downloads_id
    comp_results['media_id'] = media_id
    comp_results['story_is_spidered'] = eto['story_is_spidered']
    for name, value in extraction_results.iteritems():
        comp_results.update( compare_with_expected( name, value['extracted_text'], value['extracted_html'], expected_text, story ) )
    # Comparing the expected text with itself gives a perfect-score baseline.
    comp_results.update( compare_with_expected( 'gold', expected_text, expected_text, expected_text, story ) )
    return comp_results
In [24]:
def create_extractor_training_object( downloads_id, expected_text=None ):
    download = get_download( downloads_id )
    raw_content = download[u'raw_content']
    stories_id = download[u'stories_id']
    story = requests.get( 'https://api.mediacloud.org/api/v2/stories/single/' + str( stories_id ) + '?key=' + api_key )
    story = story.json()[0]
    story_lines = get_story_lines( raw_content )
    preprocessed_lines = story_lines.json()
    if not expected_text:
        expected_lines = get_extractor_training_text( downloads_id, preprocessed_lines )
        expected_text = lines_to_comparable_text( expected_lines )
    return { 'downloads_id': downloads_id,
             'raw_content': raw_content,
             'media_id': story['media_id'],
             'story': story,
             'story_is_spidered': story_is_spidered( story ),
             'preprocessed_lines': preprocessed_lines,
             'expected_text': expected_text }
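A usage sketch (requires API access; 416655019 is one of the hand-annotated Brazil downloads listed further below):

eto = create_extractor_training_object( 416655019 )
print sorted( eto.keys() )
# ['downloads_id', 'expected_text', 'media_id', 'preprocessed_lines',
#  'raw_content', 'story', 'story_is_spidered']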
In [25]:
import sys
sys.path.append('../')
import mc_config
import psycopg2
import psycopg2.extras

def get_db_info():
    config_file = mc_config.read_config()
    db_infos = config_file['database']
    return next( db_info for db_info in db_infos if db_info['port'] == '6000' )

conn = None
story_sentence_counts_cache = {}

def get_sentence_counts( sentence, story ):
    # Look up how often this sentence occurred across stories from the same
    # media source in the same week, caching results per story. The database
    # connection is opened lazily on first use.
    stories_id = story['stories_id']
    if stories_id not in story_sentence_counts_cache:
        story_sentence_counts_cache[stories_id] = {}
    if sentence in story_sentence_counts_cache[stories_id]:
        return story_sentence_counts_cache[stories_id][sentence]
    global conn
    if conn is None:
        db_info = get_db_info()
        conn = psycopg2.connect( database=db_info['db'], user=db_info['user'],
                                 password=db_info['pass'], host=db_info['host'], port=db_info['port'] )
    cursor = conn.cursor( cursor_factory=psycopg2.extras.DictCursor )
    query = '''
        SELECT MIN( story_sentence_counts_id ) AS story_sentence_counts_id, sentence_count, first_stories_id,
               sentence_md5
        FROM story_sentence_counts
        WHERE sentence_md5 = md5( %(sentence)s )
          AND media_id = %(media_id)s
          AND publish_week = DATE_TRUNC( 'week', %(publish_date)s::date )
        GROUP BY story_sentence_counts_id
    '''
    params = { 'sentence': sentence,
               'media_id': story['media_id'],
               'publish_date': story['publish_date'] }
    cursor.execute( query, params )
    fetched = cursor.fetchall()
    if len( fetched ) == 0:
        story_sentence_counts_cache[stories_id][sentence] = None
    else:
        story_sentence_counts_cache[stories_id][sentence] = dict( fetched[0] )
    return story_sentence_counts_cache[stories_id][sentence]

def sentence_is_duplicate( sentence, story ):
    # A sentence is a duplicate if it occurs in more than one story, or if it
    # was first seen in a story other than this one.
    sentence_counts = get_sentence_counts( sentence, story )
    if sentence_counts is None:
        return False
    if sentence_counts['sentence_count'] > 1:
        return True
    return sentence_counts['first_stories_id'] != story['stories_id']
In [26]:
def get_extractor_training_objects_legacy( downloads_ids ):
    extractor_training_objects = []
    for downloads_id in downloads_ids:
        print 'downloads_id:', downloads_id
        extractor_training_objects.append( create_extractor_training_object( downloads_id ) )
    return extractor_training_objects
In [27]:
import sqlite3

def get_extractor_training_objects_sqlite( db_file ):
    db = sqlite3.connect( db_file )
    db.row_factory = sqlite3.Row
    cursor = db.cursor()
    cursor.execute( "SELECT * FROM dlannotations WHERE selected_texts_json IS NOT NULL ORDER BY downloads_id" )
    extractor_training_objects = []
    for row in cursor.fetchall():
        row = dict( [ ( k, row[k] ) for k in row.keys() ] )
        row['annotations'] = json.loads( row['annotations_json'] )
        row['raw_content'] = u'' + row['raw_content']
        row['selected_texts'] = json.loads( row['selected_texts_json'] )
        assert row['selected_texts'] is not None
        assert len( row['selected_texts'] ) > 0
        eto = create_extractor_training_object( row['downloads_id'], expected_text=u"\n".join( row['selected_texts'] ) )
        if eto['raw_content'] != row['raw_content']:
            # TODO: figure out why the stored raw_content can differ from the
            # content fetched through the API.
            pass
        extractor_training_objects.append( eto )
    print "processed", len( extractor_training_objects )
    return extractor_training_objects
In [28]:
import pandas as pd

def get_data_frame_from_comparison_objects( comparison_objects ):
    # Flatten each comparison dict into one row with a column per
    # (measure, extractor) pair, e.g. 'f1_crf' or 'precision_heur'.
    new_comps = []
    for comp in comparison_objects:
        new_comp = { 'downloads_id': comp['downloads_id'] }
        extractor_types = [ k for k in comp.keys() if k not in { 'downloads_id', 'media_id', 'story_is_spidered' } ]
        for extractor_type in extractor_types:
            new_comp.update( [ ( k + '_' + extractor_type, v ) for k, v in comp[extractor_type].iteritems() ] )
        new_comps.append( new_comp )
    df = pd.DataFrame( new_comps )
    df.set_index( 'downloads_id', inplace=True )
    return df
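A tiny constructed example of the resulting shape (the values are made up; column order may vary by pandas version):

sample = { 'downloads_id': 1, 'media_id': 2, 'story_is_spidered': False,
           'crf': { 'precision': 1.0, 'recall': 0.5, 'f1': 2 / 3.0 } }
df = get_data_frame_from_comparison_objects( [ sample ] )
print list( df.columns )   # ['f1_crf', 'precision_crf', 'recall_crf']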
In [29]:
import boilerpipe.extract

def extract_with_py_boilerpipe( raw_content ):
    e = boilerpipe.extract.Extractor( extractor='ArticleExtractor', html=raw_content )
    return { 'extracted_html': e.getHTML() }
In [30]:
def print_results_by_measurement_type( df ):
    result_types = [ 'precision', 'recall', 'f1' ]
    for result_type in result_types:
        res_columns = [ col for col in df.columns if col.startswith( result_type ) ]
        print df.ix[:, res_columns].describe( percentiles=[0.02, 0.05, 0.1, 0.5] )
In [31]:
def filter_by_media_tags_id( comps_downloads, media_tags_ids ):
    # Keep only downloads whose media source carries at least one of the tags.
    media_ids_matching = set()
    for media_id, media in media_id_media_map.iteritems():
        if not media['media_source_tags_ids'].isdisjoint( media_tags_ids ):
            media_ids_matching.add( media_id )
    return [ cd for cd in comps_downloads if cd['media_id'] in media_ids_matching ]
In [32]:
def remove_spidered_downloads( comps_downloads ):
    return [ cd for cd in comps_downloads if not cd['story_is_spidered'] ]

def only_spidered_downloads( comps_downloads ):
    return [ cd for cd in comps_downloads if cd['story_is_spidered'] ]
In [33]:
def story_is_spidered( story ):
    for taginfo in story['story_tags']:
        if taginfo['tag'] == 'spidered' and taginfo['tag_set'] == 'spidered':
            return True
    return False
In [34]:
regenerate_extractor_training_objects = False
regenerate_media_id_media_map = False
regenerate_comps_downloads = False
In [35]:
brazil_downloads_ids = [391881020,401370599,412896439,412952145,412977048,413024519,413657081,413835576,414040102,
414257623,414377428,414480464,414818749,414983458,415185946,415186582,415197547,415424551,
415978069,416026460,416026587,416047494,416047513,416210404,416263840,416306952,416426245,
416655019,416730837,416802690,417347290,417347524,417368539,417389613,417477837,417653177,
418489742,418544762,418574641,418648698,418661859,419404469,419440474,419483895,419873979,
420430754,420599387,420666122,421520860,421834553,422181106,422280595,422910963,423318170,
424080271,424369085,424796346,424840366,425206279,426405203,426560018,426632784,426709900,
428449440,429607289,430363249,430995428,433457459,435624796,435659593,461175103,461175549,
461176415,461176844,461177487,461178557,461178590,461179203,461179222,461179441,461179762,
461179818,461179954,461179956,461180307,461181039,461181597,461186137,461186258,461186833,
461187188,461187261,461187577,461188549,461189069,461190586,461193383]
sqlite_db_file = 'extractor_train_dbs/dev_2014-12-09T12_27_40-0500.db'
In [36]:
extractor_training_objects = []
if regenerate_extractor_training_objects:
    eto_sqlite = get_extractor_training_objects_sqlite( sqlite_db_file )
    eto_brazil = get_extractor_training_objects_legacy( brazil_downloads_ids )
    extractor_training_objects.extend( eto_brazil )
    extractor_training_objects.extend( eto_sqlite )
    cPickle.dump( extractor_training_objects, open( "extractor_traning_objects.pickle", "wb" ) )
In [37]:
extractor_training_objects = cPickle.load( open( "extractor_traning_objects.pickle", "rb") )
In [38]:
import itertools
from collections import Counter

mc = mediacloud.api.MediaCloud( api_key )
if regenerate_media_id_media_map:
    media_id_media_map = {}
    media_ids = sorted( set( [ eto['media_id'] for eto in extractor_training_objects ] ) )
    for media_id in media_ids:
        media = mc.media( media_id )
        media['media_source_tags_ids'] = set( [ media_source_tag['tags_id']
                                                for media_source_tag in media['media_source_tags'] ] )
        media_id_media_map[media_id] = media
    cPickle.dump( media_id_media_map, open( "media_id_media_map.pickle", "wb" ) )
In [39]:
media_id_media_map = cPickle.load( open( "media_id_media_map.pickle", "rb" ) )
media_tag_counts = Counter( itertools.chain.from_iterable(
    media['media_source_tags_ids'] for media in media_id_media_map.values() ) )

tags_id_to_media_tags_map = {}
for media in media_id_media_map.values():
    for source_tag in media['media_source_tags']:
        tags_id_to_media_tags_map[source_tag['tags_id']] = source_tag
In [40]:
[ (tags_id_to_media_tags_map[tag_id], count) for tag_id, count in media_tag_counts.most_common( 15 ) ]
Out[40]:
In [41]:
extraction_results = []
for eto in extractor_training_objects[:2]:
    er = dict( eto )
    er['extractor_results'] = get_extraction_results( eto )
    extraction_results.append( er )
eto.keys()
Out[41]:
In [42]:
if regenerate_comps_downloads:
    comps_downloads = []
    processed = 0
    skipped = 0
    for extractor_training_object in extractor_training_objects:
        print 'processed ', processed
        print 'skipped ', skipped
        print extractor_training_object['downloads_id']
        try:
            res = comp_extractors( extractor_training_object )
            comps_downloads.append( res )
            processed += 1
        except Exception:
            print "error on download {}".format( extractor_training_object['downloads_id'] )
            import traceback
            traceback.print_exc()
            skipped += 1
    cPickle.dump( comps_downloads, open( "comps_downloads.pickle", "wb" ) )
In [43]:
comps_downloads = cPickle.load( open( "comps_downloads.pickle", "rb") )
In [44]:
comps_downloads[0]
Out[44]:
In [45]:
df = get_data_frame_from_comparison_objects( comps_downloads )
print_results_by_measurement_type( df )
In [46]:
print "spidered"
df = get_data_frame_from_comparison_objects( only_spidered_downloads( comps_downloads ) )
print_results_by_measurement_type( df )
In [47]:
regional = { 2453107 }
print "regional / pew knight study / 2453107"
df = get_data_frame_from_comparison_objects( filter_by_media_tags_id( comps_downloads, regional ) )
print_results_by_measurement_type( df )

ap_english_us_top_25 = { 8875027 }
print "ap_english_us_top_25 / 8875027"
df = get_data_frame_from_comparison_objects( filter_by_media_tags_id( comps_downloads, ap_english_us_top_25 ) )
print_results_by_measurement_type( df )

political_blogs = { 125 }
print "political blogs / 125"
df = get_data_frame_from_comparison_objects( filter_by_media_tags_id( comps_downloads, political_blogs ) )
print_results_by_measurement_type( df )

russian = { 7796878 }
print 'russian'
df = get_data_frame_from_comparison_objects( filter_by_media_tags_id( comps_downloads, russian ) )
print_results_by_measurement_type( df )

brazil = { 8877968, 8877969, 8877970, 8877973 }
print 'brazil'
df = get_data_frame_from_comparison_objects( filter_by_media_tags_id( comps_downloads, brazil ) )
print_results_by_measurement_type( df )

arabic = { 8878255 }
print 'arabic'
df = get_data_frame_from_comparison_objects( filter_by_media_tags_id( comps_downloads, arabic ) )
print_results_by_measurement_type( df )