This notebook is intended to enable testing and evaluation of various extractor methods, such as Media Cloud's heuristic and CRF extractors, Boilerpipe, python-readability, python-goose, and jusText.
In [1]:
import cPickle
import os.path
api_key = cPickle.load( open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) )
In [2]:
import cPickle
import os.path
cPickle.dump( api_key, open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) )
In [3]:
import sys
sys.path.append('../../foreign_modules/python/')
In [4]:
loc_key = 'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52'
In [5]:
import mediacloud, requests, csv, sys, os, json, cPickle
def get_download( downloads_id ):
download = requests.get('https://api.mediacloud.org/api/v2/downloads/single/'+str(downloads_id)+'?key='+api_key)
return download.json()[0]
def extract_story( preprocessed_lines, title, description, extractor_method ):
extract_params = {'key':loc_key, 'preprocessed_lines':preprocessed_lines,
'story_title':title, 'story_description':description, 'extractor_method': extractor_method}
extract_result = requests.put('http://0:3000/api/v2/extractlines/extract',data=json.dumps(extract_params),
headers = {'Content-type': 'application/json'})
extract_result.raise_for_status()
return extract_result.json()
def get_story_lines( raw_content ):
story_lines_params = {'key':loc_key, 'body_html':raw_content }
headers = {'Content-type': 'application/json'}
story_lines = requests.put('http://0:3000/api/v2/extractlines/story_lines',data=json.dumps(story_lines_params),
params={ 'key': loc_key },headers=headers)
story_lines.raise_for_status()
return story_lines
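# Illustrative sketch (commented out, not part of the evaluation run): the three
# helpers above are meant to be chained -- fetch the raw HTML for a download from
# the public API, split it into candidate lines via the local extractor service
# on port 3000 ('http://0:3000' is shorthand for a locally running instance),
# then score the lines with a chosen extractor method. The downloads_id below is
# a hypothetical placeholder.
#example_download = get_download( 12345 )
#example_lines = get_story_lines( example_download[u'raw_content'] ).json()
#example_extract = extract_story( example_lines, 'Example title', 'Example description', 'HeuristicExtractor' )
#print example_extract['included_line_numbers']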
In [6]:
import subprocess
import tempfile
import codecs
import time
from lxml import html
#download = get_download( downloads_id )
#raw_content = download[u'raw_content']
def extract_with_boilerpipe( raw_content ):
with tempfile.NamedTemporaryFile( suffix='.html', delete=False ) as t:
#print t.name
UTF8Writer = codecs.getwriter('utf8')
t.file = UTF8Writer(t.file)
t.file.write( raw_content )
t.close()
#time.sleep( 2 )
print "original article tmp file ", t.name
#input_file = '/tmp/416655019.htm'
input_file = t.name
output_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )
output_file = output_tmp.name
#output_file = '/tmp/highlighted.html'
print output_file
print subprocess.check_output(['java', '-jar',
'/home/dlarochelle/dev_scratch/boilerpipe_test/out/artifacts/boilerpipe_test_jar/boilerpipe_test.jar',
input_file, output_file ] )
f = open( output_file, 'rb' )
annotated_file_str = f.read()
#t.unlink( t.name )
output_tmp.close()
#output_tmp.unlink( output_tmp.name )
return annotated_file_str
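# Sketch of a line-oriented wrapper (the helper name is introduced here only for
# illustration): the jar writes annotated HTML in which extracted content is
# wrapped in <span class="x-boilerpipe-mark1"> elements, mirroring the
# commented-out usage inside comp_extractors() further down.
def extract_with_boilerpipe_lines( raw_content ):
    tree = html.fromstring( extract_with_boilerpipe( raw_content ) )
    spans = tree.xpath( '//span[@class="x-boilerpipe-mark1"]' )
    return [ s.text for s in spans ]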
In [7]:
import readability
def extract_with_python_readability( raw_content ):
doc = readability.Document( raw_content )
return [ doc.short_title(),
doc.summary() ]
In [8]:
import goose
def extract_with_python_goose( raw_content ):
g = goose.Goose()
r = g.extract( raw_html=raw_content )
return [r.title, r.cleaned_text ]
In [10]:
import justext
def extract_with_justext( raw_content ):
ret = []
paragraphs = justext.justext( raw_content, justext.get_stoplist('Portuguese') )
#p = paragraphs[0]
for p in paragraphs:
if not p.is_boilerplate:
ret.append(p.text)
return ret
#extract_with_justext( raw_content )
#raw_html
#justext.get_stoplists()
In [11]:
import operator
def get_extractor_training_text( downloads_id, preprocessed_lines ):
extractor_training_lines_result = requests.get(
'https://api.mediacloud.org/api/v2/extractlines/extractor_training_lines/' + str(downloads_id),
headers = {'Content-type': 'application/json'}
, params= {'key': api_key}
)
extractor_training_lines_result.raise_for_status()
extractor_training_lines_result = extractor_training_lines_result.json()
line_numbers = [ x['line_number'] for x in extractor_training_lines_result ]
    line_numbers.sort()
#print line_numbers
return operator.itemgetter( * line_numbers )( preprocessed_lines )
import operator
def get_extracted_text( extractor_results ):
included_line_numbers = extractor_results['included_line_numbers']
#print included_line_numbers
dl = extractor_results['download_lines']
if len( included_line_numbers ) == 0:
return []
else:
return operator.itemgetter( * extractor_results['included_line_numbers'] )(dl)
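# Caveat worth noting for both helpers above: operator.itemgetter(*indexes)
# returns a bare element rather than a tuple when given exactly one index, so a
# single selected line comes back as a string instead of a sequence.
print operator.itemgetter( *[1] )( ['a', 'b', 'c'] )     # -> 'b' (bare string)
print operator.itemgetter( *[1, 2] )( ['a', 'b', 'c'] )  # -> ('b', 'c') (tuple)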
In [12]:
def html_strip( s ):
    if s.isspace() or s == '':
        return u' '
    if s == '<':
        return u' '
    try:
        return html.fromstring( s ).text_content()
    except:
        print "Unexpected error on string '" + s + "'" , sys.exc_info()[0]
        #raise
        return u''
def clean_for_comparison( s ):
    if len( s ) > 0:
        ret = html_strip( s )
    else:
        return s
    if len( ret ) > 0:
        ret = ret.strip()
    return ret
In [13]:
import Levenshtein
def lines_to_comparable_text( lines ):
text = " ".join([ clean_for_comparison(line) for line in lines ])
if text == '':
text = u''
return text
def compare_accuracy( lines, lines_expected ):
return Levenshtein.distance( lines_to_comparable_text( lines ) , lines_to_comparable_text( lines_expected ) )
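# Quick sanity check of the edit-distance comparison on made-up lines: identical
# text gives 0, while appending the word "jumps" costs the six inserted characters.
print compare_accuracy( [u'the quick brown fox'], [u'the quick brown fox'] )        # -> 0
print compare_accuracy( [u'the quick brown fox jumps'], [u'the quick brown fox'] )  # -> 6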
In [14]:
def get_anncestors( element ):
anncestors = [ element ];
anncestor = element.getparent()
while anncestor != None :
#print 'loop'
anncestors.append( anncestor )
anncestor = anncestor.getparent()
return anncestors
In [15]:
def text_from_lxml_object( obj):
if type(obj) is etree._ElementStringResult:
return u'' + obj
if type(obj) == etree._ElementUnicodeResult:
return u'' + obj
else:
try:
return etree.tostring( obj , method='text', encoding="UTF-8")
except:
print type(obj)
print obj
            raise  # re-raise the original exception
In [16]:
from lxml import etree
downloads_id = 582817308
download = get_download( downloads_id )
raw_content = download[ 'raw_content' ]
with open( '/tmp/' + str(downloads_id) , 'wb' ) as f:
f.write( raw_content )
In [17]:
def text_children( element):
ret = [ t for t in element.xpath("//text()" ) if t.getparent() == element ]
assert len( ret ) <= 2
if len( ret ) == 2:
assert ret[0].is_text
assert ret[1].is_tail
for r in ret:
if r.is_text:
assert element.text == r
else:
assert r.is_tail
assert element.tail == r
return ret
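# Small self-contained illustration (variable names are only for this demo) of
# the lxml behaviour the helper relies on: text() nodes are "smart strings" that
# know whether they are an element's .text or the .tail following its close tag.
_demo_root = etree.fromstring( '<p>leading<b>bold</b>trailing</p>' )
_demo_b = _demo_root.find( 'b' )
print [ ( t, t.is_text, t.is_tail ) for t in text_children( _demo_b ) ]
# roughly: [('bold', True, False), ('trailing', False, True)]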
In [17]:
#start_container[ annotation['start_offset']: annotation['end_offset'] + 1 ]
In [49]:
def get_annotated_text( raw_content, annotation):
htmlparser = etree.HTMLParser()
root = etree.fromstring( raw_content, htmlparser )
#print annotation[ 'start_xpath' ]
start_container = get_element_from_xpath(root, annotation[ 'start_xpath' ] )
end_container = get_element_from_xpath( root, annotation[ 'end_xpath' ] )
if ( start_container == end_container ):
return [start_container[ annotation['start_offset']: annotation['end_offset'] + 1 ]]
if start_container.getparent() == end_container.getparent():
common_parent = start_container.getparent()
assert start_container.is_text
assert end_container.is_tail
assert common_parent.text == start_container
assert common_parent.tail == end_container
return [start_container[ annotation['start_offset']:], end_container[ : annotation['end_offset'] + 1 ]]
start_anncestors = get_anncestors( start_container )
end_anncestors = get_anncestors( end_container )
print 'sc', start_container
print 'ec', end_container
print 'common'
middle_contents = []
p = start_container.getparent()
prev_p = start_container
if start_container.is_text:
# append the tail
texts = text_children( start_container.getparent() )
if len( texts ) == 2:
assert texts[1].is_tail
middle_contents.append( texts[1] )
while p not in end_anncestors:
#print "parent:", p, "\n", etree.tostring( p )
assert p in start_anncestors
if prev_p != start_container:
child_index = p.index( prev_p )
if (child_index + 1) < len( list( p )):
el = list(p)[ child_index + 1]
else:
el = None
#print "stripping previous parent is the last child of curparrent"
else:
el = None
while (el not in end_anncestors) and (el != None) :
#print "inner loop"
#print el
#print etree.tostring( el )
middle_contents.append( el )
print el
el = el.getnext()
print "end inner loop"
prev_p = p
p = p.getparent()
print "end loop"
print p
commonanncestors = list([ s for s in start_anncestors if s in end_anncestors ] )
assert p in commonanncestors
commonanncestor = commonanncestors[0]
print commonanncestors
print start_container == end_container
assert p == commonanncestor
print "commonacccestor", commonanncestor
print etree.tostring( commonanncestor )
processed_children = [ c for c in list (commonanncestor) if c in start_anncestors ]
print "ca's processed children"
#print processed_children
#print [ etree.tostring( c ) + "\n" for c in list( commonanncestor ) ]
assert( len( processed_children ) == 1 )
processed_child = processed_children[ 0]
print "processed_child", processed_child
print etree.tostring( processed_child )
el = processed_child.getnext()
print "start True"
assert el != None
#print etree.tostring( el )
while True:
print 'outer loop:', el, "\n", etree.tostring( el )
while (el not in end_anncestors):
print "inner loop"
print el
assert el != None
print etree.tostring( el )
middle_contents.append( el )
el = el.getnext()
assert el != None
print 'end inner loop'
print el
print etree.tostring( el )
        # element is an ancestor of end_container and has no (non-text) children
        # since end_container is text, el must be its parent, so we can stop
        if (len(list(el)) == 0 ):
            print 'found end_container parent, exiting loop:', el, "\n", etree.tostring( el )
assert end_container in text_children(el)
assert end_container.getparent() == el
break
        ## HACK: because lxml/etree doesn't have a real text node
## treat the text as the first child node
texts = text_children(el)
assert ( all ( [t.getparent() == el for t in texts] ) )
assert( len (texts ) <= 2 )
if len( texts) > 0:
if texts[0].is_text:
assert el.text == texts[0]
middle_contents.append(texts[0] );
el = el[0]
assert el != None
#[ text_from_lxml_object( mc ) for mc in middle_contents ]
print etree.tostring(el)
#print etree.tostring(middle_contents[-1] )
print end_container.is_tail
print end_container.is_text
print "escape while"
#print list ( el.itertext() )
#print 'ca'
#commonanncestor.text
#type(el)
type( end_container )
assert end_container in text_children(el)
assert el == end_container.getparent()
print etree.tostring( end_container.getparent() )
#print middle_contents[-2:]
#print end_container
text_children(el )
etree.tostring( el.getparent() )
#p_el = el.getparent()
#print etree.tostring( p_el )
#text_children( p_el )[0].is_text
#annotation
#list(p_el)
if end_container.is_tail:
assert len(text_children(el )) == 2
middle_contents.append( text_children( el )[ 0] )
else:
assert end_container.is_text
end_text = end_container[:annotation['end_offset'] - 1]
print 'start container'
print etree.tostring( start_container.getparent())
print start_container
print 'offset', annotation['start_offset']
print 'end container'
print etree.tostring( end_container.getparent())
print end_container
print 'offset', annotation['end_offset']
#assert start_container.is_text
start_text = start_container[annotation['start_offset']:]
target_text = [ start_text ]
target_text.extend( [ text_from_lxml_object( mc ) for mc in middle_contents ] )
target_text.append( end_text )
return target_text
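# Tiny sanity check on a made-up snippet (kept commented out; it needs
# get_element_from_xpath(), which is defined further down in this notebook):
# when the start and end xpaths point at the same text node, the function
# returns a single slice of that node.
#_toy_html = u'<html><body><h1>Example headline</h1></body></html>'
#_toy_annotation = { 'start_xpath': '/html[1]/body[1]/h1[1]/text()[1]',
#                    'end_xpath': '/html[1]/body[1]/h1[1]/text()[1]',
#                    'start_offset': 0, 'end_offset': 6 }
#print get_annotated_text( _toy_html, _toy_annotation )   # -> [u'Example']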
In [50]:
import io
annotation = {u'end_offset': 67, u'end_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h1[1]/text()[1]', u'start_offset': 0,
u'start_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h1[1]/text()[1]'}
annotation = {u'end_offset': 142, u'end_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/div[1]/div[1]/div[1]/p[10]/text()[2]',
u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/div[1]/div[1]/div[1]/p[1]/text()[1]'}
annotation = {u'end_offset': 1, u'end_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h5[1]/text()[3]',
u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[3]/section[1]/div[2]/hgroup[1]/h5[1]/time[1]/text()[1]'}
downloads_id = 582815971
annotation = {u'end_offset': 123, u'end_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/p[1]/text()[1]', u'start_offset': 0,
u'start_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/h1[1]/text()[1]'}
downloads_id = 413070223
annotation = {u'end_offset': 0, u'end_xpath': u'/html[1]/body[1]/div[2]/div[3]/div[1]/div[2]/div[2]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[3]/div[1]/div[2]/h1[1]/text()[1]'}
download = get_download( downloads_id )
raw_content = u'' + download[ 'raw_content' ]
#with io.open( '/tmp/' + str(downloads_id) + '.html' , 'w', encoding='utf8' ) as f:
# f.write( raw_content )
get_annotated_text( u''+ raw_content, annotation )
htmlparser = etree.HTMLParser()
root = etree.fromstring( raw_content, htmlparser )
d = root.xpath( annotation['start_xpath'] )[0]
p = d.getparent()
In [20]:
import sqlite3
db = sqlite3.connect('extractor_train_dbs/dev_2014-11-03T09_06_11-0500.db')
db.row_factory = sqlite3.Row
cursor = db.cursor()
cursor.execute( "SELECT * from dlannotations where downloads_id = 582817308" )
row = cursor.fetchone()
print row
d = dict([ (k, row[k]) for k in row.keys() ])
d.keys()
d['annotations_json']
annotations = json.loads( d['annotations_json'] )
annotations
raw_content == d['raw_content']
#get_annotated_text( d['raw_content'], annotations[1] )
Out[20]:
In [21]:
#print 'end el'
#print el
#print etree.tostring( el )
#print start_anncestors
#print end_anncestors
#commonanncestors = list([ s for s in start_anncestors if s in end_anncestors ] )
#commonanncestor = commonanncestors[0]
#commonanncestor
#print commonanncestor
#print 'full dump'
#print etree.tostring( commonanncestor )
#parent = start_container.getparent()
#parent.getnext()
#parent.getnext().index( parent )
#list(parent.itertext() )
#list(parent)
#.index( start_container )
#root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/h1[1]/text()[1]' )
#root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[1]/text()[1]' )
#res = root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[3]/text()[1]' )
#res = root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/span[2]/p[1]/text()[1]' )
#res = root.xpath( '/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[15]/text()[2]' )
#len ( res[0] )
#res
In [21]:
import difflib
from IPython.display import HTML
from collections import Counter
def ratcliff_obershelp_compare( actual_lines, expected_lines ):
words_expected = lines_to_comparable_text(expected_lines ).split()
words_crf = lines_to_comparable_text(actual_lines ).split()
differ = difflib.Differ( )
#print words_crf[:10]
#print words_expected[:10]
    #list( differ.compare( words_crf , words_expected ) )  # unused result; the counting below runs its own compare
counts = Counter([ d[0] for d in differ.compare( words_expected, words_crf ) ])
tp = counts[' ']
fp = counts['+']
fn = counts['-']
if float(tp+fp) == 0:
precision = 0.0
else:
precision = tp/float(tp+fp)
if float( tp + fn ) == 0:
recall = 0
else:
recall = tp/float( tp + fn )
if ( precision + recall ) > 0:
f1 = 2*(precision*recall)/( precision + recall )
else:
f1 = 0
ret = { 'precision': precision,
'recall': recall,
'f1': f1
}
return ret
#ratcliff_obershelp_compare( words_crf, words_expected )
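# Worked example on made-up word lists: with expected text "the quick brown fox"
# and extracted text "the quick red fox jumps", 3 words match, 2 are spurious and
# 1 is missed, giving precision 3/5 = 0.6, recall 3/4 = 0.75 and F1 of about 0.667.
print ratcliff_obershelp_compare( [u'the quick red fox jumps'], [u'the quick brown fox'] )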
In [22]:
#downloads_id
#story
#raw_content
#expected_lines
#preprocessed_lines
def create_extractor_training_object( downloads_id, expected_lines=None ):
download = get_download( downloads_id )
raw_content = download[u'raw_content']
stories_id = download[u'stories_id']
print download['url']
story = requests.get('https://api.mediacloud.org/api/v2/stories/single/'+str(stories_id)+'?key='+api_key)
story = story.json()[0]
story_lines = get_story_lines( raw_content )
#print story_lines.content
preprocessed_lines = story_lines.json()
if not expected_lines:
expected_lines = get_extractor_training_text( downloads_id, preprocessed_lines )
ret = { 'downloads_id': downloads_id,
'raw_content': raw_content,
'story': story,
'preprocessed_lines': preprocessed_lines,
'expected_lines': expected_lines
}
return ret
In [23]:
def compare_extractors_for_download( downloads_id ):
eto = create_extractor_training_object( downloads_id )
return comp_extractors( eto )
download = get_download( downloads_id )
raw_content = download[u'raw_content']
stories_id = download[u'stories_id']
story = requests.get('https://api.mediacloud.org/api/v2/stories/single/'+str(stories_id)+'?key='+api_key)
story = story.json()[0]
story_lines = get_story_lines( raw_content )
#print story_lines.content
preprocessed_lines = story_lines.json()
expected_lines = get_extractor_training_text( downloads_id, preprocessed_lines )
def comp_extractors( eto ):
downloads_id = eto['downloads_id']
story = eto['story']
raw_content = eto['raw_content']
preprocessed_lines = eto['preprocessed_lines']
expected_lines = eto['expected_lines']
title = story[u'title']
description = story[u'description']
url = story[u'url']
heur_extract = extract_story( preprocessed_lines, title, description, 'HeuristicExtractor')
crf_extract = extract_story( preprocessed_lines, title, description, 'CrfExtractor')
heur_lines = get_extracted_text( heur_extract )
crf_lines = get_extracted_text( crf_extract )
python_readability_lines = extract_with_python_readability( raw_content )
py_goose_lines = extract_with_python_goose( raw_content )
justext_lines = extract_with_justext( raw_content )
global glob_expected_lines
global glob_crf_lines
glob_expected_lines = expected_lines
glob_crf_lines = crf_lines
#tree = html.fromstring( extract_with_boilerpipe( raw_content) )
#spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
#boiler_pipe_lines = [ s.text for s in spans ]
#print "expected_lines:"
#print lines_to_comparable_text(expected_lines)
#print "boilerpipe lines"
#print lines_to_comparable_text(boiler_pipe_lines)
comp_results = {}
comp_results['heur'] = ratcliff_obershelp_compare( heur_lines, expected_lines )
comp_results['crf'] = ratcliff_obershelp_compare( crf_lines, expected_lines )
#comp_results['boiler_pipe'] = ratcliff_obershelp_compare( boiler_pipe_lines, expected_lines )
    comp_results['python_readability'] = ratcliff_obershelp_compare( python_readability_lines, expected_lines )
comp_results['py_goose'] = ratcliff_obershelp_compare( py_goose_lines, expected_lines )
comp_results['justext'] = ratcliff_obershelp_compare( justext_lines, expected_lines )
comp_results['downloads_id'] = downloads_id
#comp_results['expected'] = compare_accuracy( expected_lines, expected_lines )
return comp_results
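# Each comparison result is a flat dict keyed by extractor name plus the
# downloads_id; the numbers below are illustrative only:
# { 'downloads_id': 12345,
#   'crf': {'precision': 0.91, 'recall': 0.88, 'f1': 0.89},
#   'heur': {'precision': 0.84, 'recall': 0.90, 'f1': 0.87},
#   'python_readability': {...}, 'py_goose': {...}, 'justext': {...} }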
In [25]:
#comps_expected = comps
In [71]:
downloads_id = 416655019
downloads_ids = [391881020,401370599,412896439,412952145,412977048,413024519,413657081,413835576,414040102,414257623,
414377428,414480464,414818749,414983458,415185946,415186582,415197547,415424551,415978069,416026460,
416026587,416047494,416047513,416210404,416263840,416306952,416426245,416655019,416730837,416802690,
417347290,417347524,417368539,417389613,417477837,417653177,418489742,418544762,418574641,418648698,
418661859,419404469,419440474,419483895,419873979,420430754,420599387,420666122,421520860,421834553,
422181106,422280595,422910963,423318170,424080271,424369085,424796346,424840366,425206279,426405203,
426560018,426632784,426709900,428449440,429607289,430363249,430995428,433457459,435624796,435659593,461175103,461175549,461176415,461176844,461177487,461178557,461178590,461179203,461179222,461179441,461179762,461179818,461179954,461179956,461180307,461181039,461181597,461186137,461186258,461186833,461187188,461187261,461187577,461188549,461189069,461190586,461193383]
print len( downloads_ids )
comps = []
extractor_training_objects = []
for downloads_id in downloads_ids[:10]:
print 'downloads_id:', downloads_id
extractor_training_objects.append( create_extractor_training_object( downloads_id ) )
In [128]:
for extractor_training_object in extractor_training_objects:
res = comp_extractors( extractor_training_object )
#print res
comps.append( res )
#print 'comps_expected', comps_expected
#print 'comps', comps
#comps == comps_expected
In [30]:
download = get_download( 391881020 )
download['raw_content']
download['url']
#None
Out[30]:
In [31]:
print lines_to_comparable_text( extract_with_python_readability( download['raw_content' ] ) )
In [32]:
import readability
doc = readability.Document( download['raw_content'] )
doc.content()
print doc.short_title()
print lines_to_comparable_text( [ doc.summary() ] )
In [25]:
extractor_training_objects = []
In [24]:
#raw_content
#annotations
def get_element_anncestor_indexes( anchor_el ):
element_indexes = []
if anchor_el.is_text:
element_indexes.append( 0 )
    elif anchor_el.is_tail:
element_indexes.append( 1 )
else:
assert False
element = anchor_el.getparent()
while element.getparent() is not None:
#print 'el', element
element_index = element.getparent().index( element )
#print 'index', element.getparent().index( element )
element_indexes.append( element_index )
element = element.getparent()
#print element_indexes
element_indexes.reverse()
return element_indexes
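# The reversed index path works as a document-order sort key: e.g. a text node
# reached via html -> body (child index 1) -> 3rd child -> 1st child yields
# something like [1, 2, 0, 0], and Python's list comparison then orders
# annotations the way their start points appear in the page.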
def remove_last_div( xpath ):
if '/div' not in xpath:
print 'no div in xpath', xpath
return xpath
else:
i = xpath.rfind( '/div' )
e = xpath.find('/', i + 1 )
return xpath[:i] + xpath[e:]
def get_element_from_xpath( root, xpath ):
if len(root.xpath( xpath )) > 0:
return root.xpath( xpath )[0]
elif len(root.xpath( remove_last_div( xpath )) ) > 0:
return root.xpath( remove_last_div( xpath ))[0]
else:
while len(root.xpath( xpath )) == 0:
pass
assert '/' in xpath[2:]
xpath = '/' + xpath[ xpath.find( '/', 2 ): ]
return root.xpath( xpath )[0]
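# Quick illustration of the xpath-repair fallback on a made-up path: dropping the
# last div step (presumably to cope with pages that gain or lose a wrapper div
# between annotation time and re-parsing) lets the annotation xpath still resolve.
print remove_last_div( '/html[1]/body[1]/div[3]/p[1]/text()[1]' )
# -> /html[1]/body[1]/p[1]/text()[1]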
def sort_annotations( annotations, raw_content ):
htmlparser = etree.HTMLParser()
root = etree.fromstring( raw_content, htmlparser )
#print annotation[ 'start_xpath' ]
annotation = annotations[0]
start_container = root.xpath( annotation[ 'start_xpath' ] )[0]
end_container = root.xpath( annotation[ 'end_xpath' ] )[0]
get_element_anncestor_indexes( start_container )
#element = start_container.getparent()
annotations.sort( key= lambda a: get_element_anncestor_indexes( get_element_from_xpath( root, a['start_xpath'] ) ) )
return annotations
f = "/html/div/d"
'/' + f[f.find('/',2):]
#"oood" in f
Out[24]:
In [40]:
import sqlite3
db = sqlite3.connect('extractor_train_dbs/dev_2014-11-03T09_06_11-0500.db')
db.row_factory = sqlite3.Row
cursor = db.cursor()
cursor.execute( "SELECT * from dlannotations where last_updated > '2014-10-31 00:00:00.069409' order by downloads_id" )
extractor_training_objects = []
skipped_downloads = 0
for row in list( cursor.fetchall() )[:]:
row = dict([ (k, row[k]) for k in row.keys() ])
#print row
row['annotations'] = json.loads( row['annotations_json'] )
row['raw_content'] = u'' + row['raw_content']
annotated_text = []
try:
annotations = row['annotations']
download = get_download( row['downloads_id'] )
        raw_content = u'' + download[u'raw_content']
annotations = sort_annotations( annotations, raw_content )
for annotation in annotations:
annotated_text.extend(get_annotated_text( u''+ raw_content, annotation ))
eto = create_extractor_training_object( row['downloads_id'], expected_lines=annotated_text )
#assert eto['raw_content'] != row['raw_content']
if eto['raw_content'] != row['raw_content']:
#TODO figure out why these may differ
pass
#d = difflib.Differ()
#diff = d.compare(eto['raw_content'].splitlines(1), row['raw_content'].splitlines(1))
#print '\n'.join(diff)
extractor_training_objects.append( eto )
except:
print "error"
print 'downloads_id', row['downloads_id']
print annotation
skipped_downloads += 1
raise
print "skipped", skipped_downloads
print "processed", len(extractor_training_objects)
In [192]:
import re
htmlparser = etree.HTMLParser()
root = etree.fromstring( raw_content, htmlparser )
#print annotation[ 'start_xpath' ]
annotation = annotations[0]
start_container = root.xpath( annotation[ 'start_xpath' ] )[0]
end_container = root.xpath( annotation[ 'end_xpath' ] )[0]
get_element_anncestor_indexes( start_container )
#element = start_container.getparent()
#annotations.sort( key= lambda a: get_element_anncestor_indexes( root.xpath( a['start_xpath'] )[0] ) )
#print row['annotations']
print download['downloads_id']
a = {u'end_offset': 28, u'end_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]'}
sp = '/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]'
s = '/html[1]/body[1]/div[2]/div[1]/div[1]/div[3]/div[2]/div[1]/div[1]/div[1]/div[1]/div[1]/article[1]/div[2]/ul[1]/li[2]/time[1]/text()[1]'
remove_last_div( sp )
#root.xpath( remove_last_div( sp ) )
#m = re.search( '\/div', sp )
#m.pos
#for a in row['annotations']:
# print a[ 'start_xpath']
#print raw_content
# if len( root.xpath( a[ 'start_xpath' ] ) ) == 0:
# sp = a['start_xpath']
Out[192]:
In [205]:
print len( extractor_training_objects )
comps_new_downloads = []
for extractor_training_object in extractor_training_objects:
res = comp_extractors( extractor_training_object )
#print res
comps_new_downloads.append( res )
#extractor_training_objects
In [206]:
import pandas as pd
new_comps = []
for comp in comps_new_downloads:
new_comp = {}
new_comp = { 'downloads_id': comp['downloads_id'] }
#del comp['downloads_id']
extractor_types = [ k for k in comp.keys() if k != 'downloads_id' ]
for extractor_type in extractor_types:
new_comp.update([ ( k + '_' + extractor_type , v) for k,v in comp[ extractor_type ].iteritems() ])
#new_comp[ k + 'boiler_pipe
new_comps.append( new_comp )
new_comps
df = pd.DataFrame( new_comps )
df.set_index('downloads_id', inplace=True )
df.describe(percentiles=[.5] )
result_types = [ 'precision', 'recall', 'f1' ]
for result_type in result_types:
res_columns = [ col for col in df.columns if col.startswith( result_type ) ]
    #df.ix[:,['f1_boiler_pipe', 'f1_crf', 'f1_heur', 'f1_python_readability']].describe()
print df.ix[:,res_columns].describe( percentiles=[0.5])
#df.describe()
In [37]:
df.describe()
Out[37]:
In [38]:
res = compare_extractors_for_download( 461179954 )
res
Out[38]:
In [39]:
def gen_data(downloads_id, included_line_numbers):
heuristic_training_ip = []
c_t_ip = []
h_t_ip = []
try:
#api_key = ''
loc_key = api_key
download = requests.get('https://api.mediacloud.org/api/v2/downloads/single/'+str(downloads_id)+'?key='+api_key)
raw_content = download.json()[0][u'raw_content']
stories_id = download.json()[0][u'stories_id']
story = requests.get('https://api.mediacloud.org/api/v2/stories/single/'+str(stories_id)+'?key='+api_key)
title = story.json()[0][u'title']
description = story.json()[0][u'description']
url = story.json()[0][u'url']
story_lines_params = {'key':loc_key, 'body_html':raw_content}
headers = {'Content-type': 'application/json'}
story_lines = requests.put('https://api.mediacloud.org/api/v2/extractlines/story_lines',data=story_lines_params, headers=headers)
#story_lines = requests.get('https://api.mediacloud.org/api/v2/extractlines/story_lines',params=story_lines_params)
preprocessed_lines = story_lines.text
heur_extract_params = {'key':loc_key, 'preprocessed_lines':preprocessed_lines, 'story_title':title, 'story_description':description, 'extractor_method':'HeuristicExtractor'}
heur_extract = requests.get('https://api.mediacloud.org/api/v2/extractlines/extract',params=heur_extract_params)
crf_extract_params = {'key':loc_key, 'preprocessed_lines':preprocessed_lines, 'story_title':title, 'story_description':description, 'extractor_method':'CrfExtractor'}
crf_extract = requests.get('https://api.mediacloud.org/api/v2/extractlines/extract',params=crf_extract_params)
for ln, hscore in enumerate(heur_extract.json()[u'scores']):
t = 1 if str(ln) in included_line_numbers else 0
if hscore[u'autoexcluded'] != 1:
h_t_ip.append( (t, hscore[u'include_probability']) )
cscore = crf_extract.json()[u'scores'][ln]
if u'autoexcluded' not in cscore:
c_t_ip.append( (t, cscore[u'include_probability']) )
except Exception as e:
pass
return h_t_ip, c_t_ip
In [40]:
import mediacloud, requests, csv, sys, os, json, cPickle
from pyroc import *
#extractor_training_lines_checked has the training lines for downloads for which the highest line listed as 'included' was less than the number of lines in the download (max(included_line_numbers) < len(story_lines.json()))
f = open("extractor_training_lines_checked.json").read()
reader = json.loads(f)
heur = []
cPickle.dump(heur, open("heur.p", "wb"))
crf = []
cPickle.dump(crf, open("crf.p", "wb"))
done = []
cPickle.dump(done, open("done.p", "wb"))
for row in reader[:30]:
did = row[u'downloads_id']
lns = row[u'included_line_numbers']
curh, curc = gen_data(did, lns)
heur+=curh
crf+=curc
done.append(did)
cPickle.dump(done, open("done.p", "wb"))
cPickle.dump(heur, open("heur.p", "wb"))
cPickle.dump(crf, open("crf.p", "wb"))
In [41]:
import cPickle
from pyroc import *
heur = cPickle.load(open("heur.p","rb"))
crf = cPickle.load(open("crf.p","rb"))
rocheur = ROCData(heur)
roccrf = ROCData(crf)
print rocheur.auc()
plot_multiple_roc(rocList=(rocheur,roccrf), title='Extractor ROC Curve', labels=("heuristic curve","crf curve"))
In [74]:
annotation = {u'end_offset': 61,
u'start_offset': 0,
u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/h1[1]/text()[1]',
u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/h1[1]/text()[1]'}
annotation = {u'end_offset': 28, u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[3]/text()[1]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/div[1]/p[1]/span[1]/text()[1]'}
annotation = {u'end_offset': 26, u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[15]/text()[2]', u'start_offset': 1, u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/span[2]/p[1]/text()[1]'}
annotation = {u'end_offset': 272, u'end_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[2]/text()[1]', u'start_offset': 0, u'start_xpath': u'/html[1]/body[1]/div[2]/div[4]/div[1]/div[3]/div[1]/span[1]/p[1]/text()[1]'}
print annotation
get_annotated_text( raw_content, annotation )