This notebook evaluates both Media Cloud's internal extractors and third-party FLOSS extractor libraries against a corpus of hand-annotated articles.
Readers may wish to skip to the results section at the end.
In [1]:
import cPickle
import os.path
api_key = cPickle.load( file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'r' ) )
In [2]:
import cPickle
import os.path
cPickle.dump( api_key, file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) )
In [3]:
import sys
sys.path.append('../../foreign_modules/python/')
In [4]:
loc_key = 'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52'
In [5]:
import mediacloud, requests, csv, sys, os, json, cPickle
def get_download( downloads_id ):
download = requests.get('https://api.mediacloud.org/api/v2/downloads/single/'+str(downloads_id)+'?key='+api_key)
return download.json()[0]
def extract_story( preprocessed_lines, title, description, extractor_method ):
extract_params = {'key':loc_key, 'preprocessed_lines':preprocessed_lines,
'story_title':title, 'story_description':description, 'extractor_method': extractor_method}
extract_result = requests.put('http://0:3000/api/v2/extractlines/extract',data=json.dumps(extract_params),
headers = {'Content-type': 'application/json'})
extract_result.raise_for_status()
return extract_result.json()
def get_story_lines( raw_content ):
story_lines_params = {'key':loc_key, 'body_html':raw_content }
headers = {'Content-type': 'application/json'}
story_lines = requests.put('http://0:3000/api/v2/extractlines/story_lines',data=json.dumps(story_lines_params),
params={ 'key': loc_key },headers=headers)
story_lines.raise_for_status()
return story_lines
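A minimal usage sketch of how these helpers chain together (illustrative only, not executed as part of this notebook; it assumes the local extractor API on port 3000 is up, that the downloads_id exists, and that the story_lines response body is the list of preprocessed lines that extract_story expects):

# fetch one download, split its HTML into preprocessed lines,
# then run one of the Media Cloud extractors over those lines
sketch_download = get_download( 582817308 )
sketch_lines = get_story_lines( sketch_download[u'raw_content'] ).json()
sketch_result = extract_story( sketch_lines, u'story title', u'story description', 'HeuristicExtractor' )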
In [6]:
import subprocess
import tempfile
import codecs
import time
from lxml import etree, html  # etree is used below for etree.tostring() on the boilerpipe spans
#download = get_download( downloads_id )
#raw_content = download[u'raw_content']
def extract_with_boilerpipe( raw_content ):
with tempfile.NamedTemporaryFile( suffix='.html', delete=False ) as t:
#print t.name
UTF8Writer = codecs.getwriter('utf8')
t.file = UTF8Writer(t.file)
t.file.write( raw_content )
t.close()
#time.sleep( 2 )
#print "original article tmp file ", t.name
#input_file = '/tmp/416655019.htm'
input_file = t.name
output_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )
output_file = output_tmp.name
#output_file = '/tmp/highlighted.html'
#print output_file
subprocess.check_output(['java', '-jar',
'/home/dlarochelle/dev_scratch/boilerpipe_test/out/artifacts/boilerpipe_test_jar/boilerpipe_test.jar',
input_file, output_file ] )
f = open( output_file, 'rb' )
annotated_file_str = f.read()
#t.unlink( t.name )
output_tmp.close()
#output_tmp.unlink( output_tmp.name )
tree = html.fromstring( annotated_file_str )
spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
boiler_pipe_lines = [ etree.tostring(s) for s in spans ]
ret = { 'extracted_html': "\n\n".join(boiler_pipe_lines) }
return ret
In [7]:
#f = open( '/tmp/tmp01CV6F.html' )
#annotated_file_str = f.read()
#tree = html.fromstring( annotated_file_str )
#spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
#span = spans[0]
#etree.tostring( span )
In [8]:
import readability
def extract_with_python_readability( raw_content ):
doc = readability.Document( raw_content )
return doc.short_title() + "\n\n" + doc.summary()
In [9]:
import goose
def extract_with_python_goose( raw_content ):
g = goose.Goose()
r = g.extract( raw_html=raw_content )
return r.title + "\n\n" + r.cleaned_text
In [10]:
import justext
def extract_with_justext( raw_content ):
ret = []
paragraphs = justext.justext( raw_content, justext.get_stoplist('English') )
#p = paragraphs[0]
for p in paragraphs:
if not p.is_boilerplate:
ret.append(p.text)
return "\n\n".join(ret)
#extract_with_justext( raw_content )
#raw_html
#justext.get_stoplists()
In [11]:
import operator
def get_extractor_training_text( downloads_id, preprocessed_lines ):
extractor_training_lines_result = requests.get(
'https://api.mediacloud.org/api/v2/extractlines/extractor_training_lines/' + str(downloads_id),
headers = {'Content-type': 'application/json'}
, params= {'key': api_key}
)
extractor_training_lines_result.raise_for_status()
extractor_training_lines_result = extractor_training_lines_result.json()
line_numbers = [ x['line_number'] for x in extractor_training_lines_result ]
line_numbers = sorted(line_numbers)
#print line_numbers
return operator.itemgetter( * line_numbers )( preprocessed_lines )
def get_extracted_text( extractor_results ):
included_line_numbers = extractor_results['included_line_numbers']
#print included_line_numbers
dl = extractor_results['download_lines']
if len( included_line_numbers ) == 0:
return []
else:
return operator.itemgetter( * extractor_results['included_line_numbers'] )(dl)
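As a tiny illustration (hypothetical data, not from the corpus) of how operator.itemgetter pulls the annotated line numbers out of the preprocessed lines:

demo_lines = [ '<html>', '<p>headline</p>', '<p>body text</p>', '<div>footer</div>' ]
operator.itemgetter( *[ 1, 2 ] )( demo_lines )
# -> ('<p>headline</p>', '<p>body text</p>')

Note that when only a single line number is selected, itemgetter returns the bare item rather than a one-element tuple.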
In [12]:
#import Levenshtein
def lines_to_comparable_text( lines ):
    return u"\n\n".join( [ clean_for_comparison( line ) for line in lines ] )

def html_to_comparable_text( html_text ):
    text = clean_for_comparison( html_text )
    if text is None:
        return u''
    return text
#def compare_accuracy( lines, lines_expected ):
# return Levenshtein.distance( lines_to_comparable_text( lines ) , lines_to_comparable_text( lines_expected ) )
In [13]:
def get_anncestors( element ):
anncestors = [ element ];
anncestor = element.getparent()
while anncestor != None :
#print 'loop'
anncestors.append( anncestor )
anncestor = anncestor.getparent()
return anncestors
In [14]:
def text_from_lxml_object( obj ):
    # xpath results can be plain (byte or unicode) strings as well as elements
    if isinstance( obj, ( etree._ElementStringResult, etree._ElementUnicodeResult ) ):
        return u'' + obj
    try:
        return etree.tostring( obj, method='text', encoding="UTF-8" )
    except:
        print type(obj)
        print obj
        raise
In [15]:
from lxml import etree
downloads_id = 582817308
download = get_download( downloads_id )
raw_content = download[ 'raw_content' ]
with open( '/tmp/' + str(downloads_id) , 'wb' ) as f:
f.write( raw_content )
In [16]:
from nltk import tokenize
def remove_duplicate_sentences( article_html, story ):
#sentences_from_html
extract_params = {'key':loc_key, 'story_html': article_html }
extract_result = requests.put('http://0:3000/api/v2/extractlines/sentences_from_html',data=json.dumps(extract_params),
headers = {'Content-type': 'application/json'})
#print "article_html:\n", article_html
extract_result.raise_for_status()
sentences = extract_result.json()
#print "sentences", sentences
#comp_text = lines_to_comparable_text( text_lines )
#sentences = [ sent.strip() for sent in tokenize.sent_tokenize( comp_text ) ]
#print sentences
non_duplicate_sentences = [sentence for sentence in sentences if not sentence_is_duplicate( sentence, story ) ]
return u"\n".join( non_duplicate_sentences )
In [17]:
def html_strip( str ):
if str.isspace() or str == '':
return u' '
if str == '<':
return u' '
try:
return html.fromstring(str).text_content()
except:
print "Unexpected error on string '" + str + "'" , sys.exc_info()[0]
#raise
return u''
def clean_for_comparison( str ):
if len(str) > 0:
ret = html_strip( str )
else:
return str
return ret
In [18]:
def extract_with_mc_extractor( eto, method ):
story = eto['story']
preprocessed_lines = eto['preprocessed_lines']
title = story[u'title']
description = story[u'description']
extract_result = extract_story( preprocessed_lines, title, description, method)
#html_lines = get_extracted_text( extract_result )
ret = {}
ret['extracted_html'] = extract_result[ 'extracted_html' ]
return ret
def extract_with_heur( eto ):
return extract_with_mc_extractor( eto, 'HeuristicExtractor' )
def extract_with_crf( eto ):
return extract_with_mc_extractor( eto, 'CrfExtractor' )
In [19]:
import difflib
from IPython.display import HTML
from collections import Counter
def ratcliff_obershelp_compare( actual_text, expected_text ):
words_expected = expected_text.split()
words_crf = actual_text.split()
differ = difflib.Differ( )
#print words_crf[:10]
#print words_expected[:10]
counts = Counter([ d[0] for d in differ.compare( words_expected, words_crf ) ])
tp = counts[' ']
fp = counts['+']
fn = counts['-']
if float(tp+fp) == 0:
precision = 0.0
else:
precision = tp/float(tp+fp)
if float( tp + fn ) == 0:
recall = 0
else:
recall = tp/float( tp + fn )
if ( precision + recall ) > 0:
f1 = 2*(precision*recall)/( precision + recall )
else:
f1 = 0
ret = { 'precision': precision,
'recall': recall,
'f1': f1
}
return ret
def compare_with_expected( extractor_name, actual_text, actual_html, expected_text, story ):
#actual_text = lines_to_comparable_text( actual_lines )
#expected_text = lines_to_comparable_text( expected_lines )
ret = {}
ret[ extractor_name ] = ratcliff_obershelp_compare( actual_text, expected_text )
if compare_deduplicated:
dedup_text = remove_duplicate_sentences( actual_html, story )
ret[ extractor_name + "_dedup" ] = ratcliff_obershelp_compare( dedup_text, expected_text )
return ret
In [20]:
ratcliff_obershelp_compare( actual_text='foo', expected_text='bar foo baz BAST')
Out[20]:
{'f1': 0.4, 'precision': 1.0, 'recall': 0.25}
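Spelling that example out: comparing the expected words ['bar', 'foo', 'baz', 'BAST'] against the actual words ['foo'] gives tp = 1 ('foo'), fp = 0 and fn = 3, so precision = 1 / (1 + 0) = 1.0, recall = 1 / (1 + 3) = 0.25 and f1 = 2 * (1.0 * 0.25) / (1.0 + 0.25) = 0.4, which matches the output above.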
In [66]:
def get_extraction_results( eto ):
raw_content = eto[ 'raw_content' ]
assert raw_content != None
assert len(raw_content ) > 0
ret = {}
#ret['heur'] = extract_with_heur( eto )
#ret['crf'] = extract_with_crf( eto )
#ret['boiler_pipe'] = extract_with_boilerpipe( raw_content)
print 'extracting with thr_boilerpipe'
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
#ret['thr_boiler_pipe_ArticleSentencesExtractor'] = extract_with_thr_boilerpipe_ArticleSentencesExtractor( raw_content)
ret['thr_boiler_pipe_DefaultExtractor'] = extract_with_thr_boilerpipe_DefaultExtractor( raw_content )
ret['py_boiler_pipe_ArticleExtractor'] = extract_with_py_boilerpipe_ArticleExtractor( raw_content)
ret['py_boiler_pipe_ArticleSentencesExtractor'] = extract_with_py_boilerpipe_ArticleSentencesExtractor( raw_content)
ret['py_boiler_pipe_CanolaExtractor'] = extract_with_py_boilerpipe_CanolaExtractor( raw_content)
ret['py_boiler_pipe_DefaultExtractor'] = extract_with_py_boilerpipe_DefaultExtractor( raw_content )
ret['py_boiler_pipe_KeepEverythingExtractor'] = extract_with_py_boilerpipe_KeepEverythingExtractor( raw_content)
#ret['py_boiler_pipe_KeepEverythingWithMinKWordsExtractor'] = extract_with_py_boilerpipe_KeepEverythingWithMinKWordsExtractor( raw_content)
ret['py_boiler_pipe_LargestContentExtractor'] = extract_with_py_boilerpipe_LargestContentExtractor( raw_content)
ret['py_boiler_pipe_NumWordsRulesExtractor'] = extract_with_py_boilerpipe_NumWordsRulesExtractor( raw_content)
ret['python_readibilty'] = { 'extracted_html': extract_with_python_readability( raw_content ) }
#ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
#ret['justext'] = { 'extracted_html': extract_with_justext( raw_content ) }
for method, result in ret.iteritems():
if 'extracted_text' not in result:
result['extracted_text'] = html_to_comparable_text( result['extracted_html' ] )
return ret
In [22]:
def compare_extractors_for_download( downloads_id ):
eto = create_extractor_training_object( downloads_id )
return comp_extractors( eto )
def comp_extractors( eto ):
downloads_id = eto['downloads_id']
media_id = eto['media_id' ]
story = eto['story']
raw_content = eto['raw_content']
preprocessed_lines = eto['preprocessed_lines']
expected_text = eto['expected_text']
title = story[u'title']
description = story[u'description']
url = story[u'url']
extraction_results = get_extraction_results( eto )
comp_results = {}
comp_results['downloads_id'] = downloads_id
comp_results['media_id'] = media_id
comp_results['story_is_spidered'] = eto['story_is_spidered']
for name, value in extraction_results.iteritems():
#print name, value
comp_results.update (compare_with_expected( name, value['extracted_text'], value['extracted_html'], expected_text, story ) )
comp_results.update( compare_with_expected( 'gold', expected_text, expected_text, expected_text, story ) )
return comp_results
In [23]:
import sys
sys.path.append('../')
import mc_config
def get_db_info():
config_file = mc_config.read_config()
db_infos = config_file['database']
db_info = next (db_info for db_info in db_infos if db_info['port'] == '6000' )
return db_info
import psycopg2
#import solr_reimport
import psycopg2.extras
#db_info = get_db_info()
#conn = psycopg2.connect( database=db_info['db'], user=db_info['user'],
# password=db_info['pass'], host=db_info['host'], port=db_info['port'] )
conn = None
story_sentence_counts_cache = {}
def get_sentence_counts( sentence, story ):
stories_id = story['stories_id']
if not stories_id in story_sentence_counts_cache:
story_sentence_counts_cache[ stories_id ] = {}
if sentence in story_sentence_counts_cache[ stories_id ]:
return story_sentence_counts_cache[stories_id ][sentence]
global conn
if conn == None:
db_info = get_db_info()
conn = psycopg2.connect( database=db_info['db'], user=db_info['user'],
password=db_info['pass'], host=db_info['host'], port=db_info['port'] )
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
query = '''
SELECT MIN( story_sentence_counts_id) AS story_sentence_counts_id, sentence_count, first_stories_id,
sentence_md5
FROM story_sentence_counts
WHERE sentence_md5 = md5(%(sentence)s)
AND media_id = %(media_id)s
AND publish_week = DATE_TRUNC( 'week', %(publish_date)s::date )
GROUP BY story_sentence_counts_id
'''
#print sentence
#md5_sum = md5.new( sentence ).hexdigest()
params = { 'sentence': sentence,
'media_id': story['media_id'],
'publish_date': story['publish_date']
}
#print params
#print eto[ 'story'] ['stories_id' ]
cursor.execute( query, params )
fetched = cursor.fetchall()
if len( fetched ) == 0:
story_sentence_counts_cache[ stories_id ][sentence] = None
else:
story_sentence_counts_cache[ stories_id ][sentence] = dict(fetched[0])
return story_sentence_counts_cache[stories_id ][sentence]
def sentence_is_duplicate( sentence, story ):
sentence_counts = get_sentence_counts( sentence, story )
if sentence_counts != None:
if sentence_counts['sentence_count'] > 1:
#print "duplicate sentence", sentence
return True
elif sentence_counts['first_stories_id'] == story['stories_id']:
return True
#print "duplicate sentence (diff first_stories_id) ", sentence
else:
return False
pass
#print "sentence not found ", sentence
In [24]:
import pandas as pd
def get_data_frame_from_comparision_objects( comparison_objects ):
assert len( comparison_objects ) > 0
new_comps = []
for comp in comparison_objects:
new_comp = {}
new_comp = { 'downloads_id': comp['downloads_id'] }
extractor_types = [ k for k in comp.keys() if k not in { 'downloads_id', 'media_id', 'story_is_spidered' } ]
for extractor_type in extractor_types:
new_comp.update([ ( k + '_' + extractor_type , v) for k,v in comp[ extractor_type ].iteritems() ])
new_comps.append( new_comp )
df = pd.DataFrame( new_comps )
df.set_index('downloads_id', inplace=True )
return df
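For example (made-up numbers, shown only to illustrate the flattening), a single comparison object becomes one row whose columns are measurement_extractor pairs:

demo_comp = { 'downloads_id': 1, 'media_id': 2, 'story_is_spidered': False,
              'gold': { 'f1': 1.0, 'precision': 1.0, 'recall': 1.0 } }
get_data_frame_from_comparision_objects( [ demo_comp ] )
# -> a one-row frame indexed by downloads_id with columns f1_gold, precision_gold and recall_gold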
In [25]:
import sys
sys.path = ['/home/dlarochelle/git_dev/mediacloud/python_scripts/notebook/thriftboilerpipe'] + sys.path
from thrift.transport import TTransport
from thrift.transport import TSocket
from thrift.transport import TSSLSocket
from thrift.transport import THttpClient
from thrift.protocol import TBinaryProtocol
from thriftboilerpipe import ExtractorService
from thriftboilerpipe.ttypes import *
In [53]:
host = 'localhost'
port = 9090
uri = ''
socket = TSocket.TSocket(host, port)
transport = TTransport.TBufferedTransport(socket)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
client = ExtractorService.Client(protocol)
transport.open()
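If the boilerpipe Thrift service is restarted mid-run, the buffered transport has to be reopened before further extract_html calls; a minimal guard (not part of the original run) would be:

if not transport.isOpen():
    transport.open()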
In [68]:
def thrift_bp_extract( raw_content, extractor_type ) :
#print 'start thrift_bp_extract'
thrift_ret = client.extract_html( raw_content, extractor_type )
#print thrift_ret[1][15230:]
#print type(thrift_ret[1])
#print repr(thrift_ret[1])
#unicode( thrift_ret[1], 'utf-8' )
extracted_text = u"\n\n".join( [ u'' + s for s in thrift_ret ] )
#print 'returning from thrift_bp_extract'
ret = { 'extracted_text': extracted_text,
'extracted_html': ''
}
return ret
extract_with_thr_boilerpipe_DefaultExtractor = lambda raw_content : thrift_bp_extract( raw_content, 'DefaultExtractor' )
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
extract_with_thr_boilerpipe_ArticleSentencesExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleSentencesExtractor')
In [28]:
import boilerpipe.extract
def extract_with_py_boilerpipe( raw_content, extractor ):
e = boilerpipe.extract.Extractor( extractor=extractor, html=raw_content )
html = e.getHTML()
ret = { 'extracted_html': html }
return ret
extract_with_py_boilerpipe_DefaultExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'DefaultExtractor')
extract_with_py_boilerpipe_ArticleExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'ArticleExtractor')
extract_with_py_boilerpipe_ArticleSentencesExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'ArticleSentencesExtractor')
extract_with_py_boilerpipe_KeepEverythingExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'KeepEverythingExtractor')
extract_with_py_boilerpipe_KeepEverythingWithMinKWordsExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'KeepEverythingWithMinKWordsExtractor')
extract_with_py_boilerpipe_LargestContentExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'LargestContentExtractor')
extract_with_py_boilerpipe_NumWordsRulesExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'NumWordsRulesExtractor')
extract_with_py_boilerpipe_CanolaExtractor = lambda raw_content: extract_with_py_boilerpipe( raw_content,
'CanolaExtractor')
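The one-lambda-per-extractor wiring above could equally be generated in a loop; a sketch of the same behaviour using functools.partial:

import functools
for _extractor in [ 'DefaultExtractor', 'ArticleExtractor', 'ArticleSentencesExtractor',
                    'KeepEverythingExtractor', 'KeepEverythingWithMinKWordsExtractor',
                    'LargestContentExtractor', 'NumWordsRulesExtractor', 'CanolaExtractor' ]:
    globals()[ 'extract_with_py_boilerpipe_' + _extractor ] = functools.partial(
        extract_with_py_boilerpipe, extractor=_extractor )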
In [29]:
def print_results_by_measurement_type( df ):
df.describe(percentiles=[.5] )
result_types = [ 'precision', 'recall', 'f1' ]
for result_type in result_types:
res_columns = [ col for col in df.columns if col.startswith( result_type ) ]
#df.ix[:,['f1_boiler_pipe', 'f1_crf', 'f1_heur', 'f1_python_readibilty']].describe()
print df.ix[:,res_columns].describe( percentiles=[0.02, 0.05,.1,0.5])
In [30]:
def filter_by_media_tags_id( comps_downloads, media_tags_ids ):
media_ids_matching = set()
#print media_id_media_map
for media_id, media in media_id_media_map.iteritems():
if not media[ 'media_source_tags_ids'].isdisjoint( media_tags_ids ):
media_ids_matching.add( media_id )
return [cd for cd in comps_downloads if cd['media_id'] in media_ids_matching ]
In [31]:
def remove_spidered_downloads( comps_downloads ):
return [cd for cd in comps_downloads if not cd['story_is_spidered'] ]
def only_spidered_downloads( comps_downloads ):
return [cd for cd in comps_downloads if cd['story_is_spidered'] ]
In [32]:
def story_is_spidered( story ):
for taginfo in story['story_tags']:
if taginfo['tag'] == 'spidered' and taginfo['tag_set'] == 'spidered':
return True
return False
In [71]:
regenerate_media_id_media_map = False
regenerate_comps_downloads = True
compare_deduplicated = False
In [34]:
extractor_training_objects = cPickle.load( file(
os.path.expanduser( '~/Dropbox/mc/extractor_test/extractor_training_objects.pickle' ), "rb" ) )
#cPickle.load( open( "extractor_traning_objects.pickle", "rb") )
In [35]:
len( extractor_training_objects )
c = Counter( sorted([ eto['media_id'] for eto in extractor_training_objects ]) )
c.most_common()
sorted( c.keys() )
Out[35]:
[1,
2,
4,
6,
7,
8,
14,
15,
23,
28,
39,
109,
110,
113,
115,
117,
118,
121,
125,
129,
131,
139,
147,
160,
268,
285,
294,
307,
336,
564,
669,
687,
697,
711,
712,
713,
731,
751,
752,
771,
788,
789,
801,
805,
809,
853,
870,
889,
955,
1004,
1027,
1040,
1056,
1062,
1063,
1082,
1089,
1092,
1094,
1095,
1104,
1127,
1147,
1149,
1200,
1259,
1270,
1311,
1347,
1349,
1352,
1359,
1376,
1398,
1420,
1452,
1490,
1536,
1549,
1568,
1585,
1589,
1596,
1607,
1626,
1630,
1641,
1648,
1650,
1651,
1654,
1655,
1658,
1667,
1670,
1674,
1675,
1680,
1684,
1685,
1694,
1724,
1725,
1726,
1728,
1729,
1730,
1731,
1733,
1740,
1742,
1747,
1750,
1751,
1752,
1757,
4415,
4418,
4419,
5527,
6162,
6335,
18203,
18204,
18210,
18213,
18346,
18350,
18364,
18370,
18761,
18886,
19027,
19081,
19127,
19327,
19347,
19445,
19690,
19854,
20763,
21629,
21936,
21990,
23209,
23464,
23881,
24697,
24767,
25170,
25369,
25527,
26309,
26705,
26924,
27692,
29363,
32120,
35065,
35400,
35625,
39008,
39648,
39677,
40405,
40584,
40789,
41502,
41771,
48279,
48768,
53459,
57137,
58992,
61322,
63247,
64110,
65731,
66924,
66949,
66951,
66957,
68627,
72816,
73384,
77657,
79784,
83352,
83353,
83371,
83860,
84057,
84249,
84256,
84654,
84991,
87821,
88695,
88819,
95973,
96863,
102611,
102618,
102622,
102629,
102634,
102635,
102637,
102639,
102647,
102652,
102657,
102661,
102668,
102670,
102671,
102673,
102680,
102689,
102691,
102693,
102694,
102708,
102739,
102741,
102744,
102745,
102747,
102759,
102766,
102769,
102770,
102781,
104881,
104903,
104942,
104950,
105130,
105403,
107088,
107637,
107692,
107764,
111691,
111851,
112357,
112386,
112982,
113157,
120850,
128098,
133122,
137922,
142103,
143836,
144943,
145564,
146357,
146402,
147230,
177339,
177382]
In [36]:
import itertools
from collections import Counter
mc = mediacloud.api.MediaCloud(api_key)
if regenerate_media_id_media_map:
media_id_media_map = {}
media_ids = sorted(list(set([ eto['media_id'] for eto in extractor_training_objects ])))
for media_id in list(media_ids)[:]:
media = mc.media( media_id )
media[ 'media_source_tags_ids' ] = set( [ media_source_tag['tags_id']
for media_source_tag in media['media_source_tags'] ] )
media_id_media_map[ media_id ] = media
print len( media_ids )
print 'pickling'
cPickle.dump( media_id_media_map,
file( os.path.expanduser( '~/Dropbox/mc/extractor_test/media_id_media_map.pickle'), "wb") )
In [37]:
media_id_media_map = cPickle.load(
file( os.path.expanduser(
'~/Dropbox/mc/extractor_test/media_id_media_map.pickle'),
"rb") )
media_tag_counts = Counter(list ( itertools.chain.from_iterable( media_source['media_source_tags_ids'] for media_source in media_id_media_map.values() )) )
tags_id_to_media_tags_map = {}
for media_tag in media_id_media_map.values():
source_tags = media_tag[ 'media_source_tags' ]
for source_tag in source_tags:
tags_id_to_media_tags_map[ source_tag[ 'tags_id' ] ] = source_tag
In [38]:
media_id_media_map.keys()
Out[38]:
[1536,
1,
2,
1027,
4,
113157,
6,
7,
8,
72816,
1549,
14,
15,
1040,
6162,
68627,
23,
28,
41502,
1056,
1062,
39,
27692,
144943,
1585,
564,
1589,
1082,
1596,
1089,
1654,
1092,
1094,
1607,
35400,
268,
111691,
1104,
26705,
25170,
19027,
84057,
1626,
63247,
1630,
96863,
39008,
128098,
955,
1095,
1127,
1641,
4419,
109,
110,
58992,
113,
1650,
115,
117,
118,
1655,
121,
1658,
1147,
1149,
48768,
129,
133122,
131,
1670,
40584,
19081,
1674,
1675,
1730,
102639,
1680,
147,
1684,
1685,
48279,
145564,
669,
1694,
26309,
160,
1648,
18203,
73384,
23209,
105130,
107692,
84654,
687,
1200,
29363,
19127,
697,
177339,
1724,
1725,
1726,
6335,
1728,
1729,
137922,
1731,
1733,
711,
712,
713,
88695,
1740,
1742,
40789,
1747,
1750,
1751,
1752,
102618,
731,
1757,
102622,
805,
39648,
120850,
95973,
65731,
19690,
1259,
102637,
21629,
752,
53459,
107764,
1270,
809,
35065,
102652,
39677,
102657,
112386,
771,
102661,
102668,
87821,
102670,
102671,
102673,
1667,
788,
789,
142103,
102680,
25369,
20763,
18204,
285,
147230,
1311,
84256,
102689,
18210,
102691,
18213,
294,
35625,
41771,
26924,
102635,
57137,
307,
102708,
24697,
84249,
4415,
4418,
139,
1349,
102611,
1352,
18761,
1359,
336,
102739,
102741,
112982,
88819,
102744,
102745,
102747,
112357,
1376,
102629,
177382,
870,
102759,
66924,
24767,
102766,
102769,
102770,
19347,
102694,
1398,
32120,
889,
102781,
23881,
19327,
111851,
66949,
66951,
61322,
1420,
66957,
19854,
125,
1347,
83860,
104942,
5527,
83352,
83353,
751,
1063,
23464,
18346,
83371,
1452,
18350,
21936,
104881,
1651,
146357,
25527,
105403,
18364,
18370,
102693,
18886,
104903,
102647,
1490,
40405,
143836,
107637,
107088,
146402,
21990,
1004,
77657,
64110,
79784,
19445,
104950,
1568,
102634,
84991,
801,
853]
In [39]:
[ m['media_source_tags_ids'] for m in media_id_media_map.values() ]
Out[39]:
[{2453107, 2454253, 2491715},
{109, 6071565, 6729599, 8875027, 8878390, 8878416},
{6, 7, 18, 6071565, 6729599, 8875027, 8878390, 8878416},
{125, 8878332},
{5, 6, 6071565, 6729599, 8875027, 8878416},
{8875452},
{6, 14, 8875027, 8875460, 8878390, 8878416},
{6, 16, 6071565, 6729599, 8875027, 8875676, 8878416},
{17, 6071565, 6729599, 8875027, 8875676, 8878416},
{8875452, 8878416},
{2453107, 2454099, 2454253},
{21, 8875027, 8875460, 8878416},
{22, 8875027, 8878416},
{125, 8878415},
{8875028, 8875033, 8875107, 8878420},
{8875452},
{32, 2453107, 2496423, 6260468},
{11, 2453107, 2496423, 6260349, 8878416},
{8876474, 8876475, 8876476, 8876479, 8876504, 8876548, 8876585},
{125, 8875031, 8875108, 8875110, 8875113, 8875456, 8878415},
{125},
{43, 2453107, 2496423, 6260349, 8875031},
{8875458, 8878418},
{8875452},
{2453107, 2495238, 2495253},
{118, 2453107, 2496423, 2497397, 8875031, 8875108, 8875110, 8875113, 8878416},
{2453107, 2495238, 2495239},
{125, 796, 8878292},
{2453107, 2495238, 2495255},
{129,
8875027,
8876474,
8876475,
8876476,
8876484,
8876499,
8876509,
8876510,
8878416},
{2453107, 2496423, 2497397},
{8875027, 8875028, 8878416},
{6124858, 8875027, 8878416},
{796, 2453107, 2495238, 2495240},
{8875452},
{117, 8878292, 8878415, 8878451},
{8875452},
{8875027, 8878416},
{8875452},
{8875452, 8878416},
{8875452, 8876987, 8876988, 8878419},
{8877968, 8877969, 8877973, 8877990},
{142317, 2453107, 2496423, 2497397},
{8876987, 8876988, 8878416},
{2453107, 2496423, 2496424, 5648819},
{8875452},
{8876474,
8876475,
8876484,
8876492,
8876499,
8876500,
8876562,
8876987,
8876988,
8878413},
{8875452},
{125, 8878416},
{8875027, 8875031, 8878416},
{142001, 8878293, 8878332, 8878416},
{2453107, 2496423, 2496424, 7055831, 8878416},
{8875027, 8878416},
{7, 117, 118, 125, 6071565, 6729401, 6729599, 8875028, 8878292, 8878415},
{7, 117, 118, 125, 6124858},
{8876987, 8876988},
{7,
117,
118,
125,
6071565,
6729549,
6729599,
8875028,
8875031,
8875108,
8875109,
8875114,
8875456,
8878293,
8878416},
{2453107, 2496423, 2497397},
{7,
117,
118,
125,
6124858,
8875028,
8875031,
8875108,
8875109,
8875114,
8875456,
8875458,
8878063,
8878293,
8878423},
{7,
117,
118,
125,
796,
142001,
6071565,
6729401,
6729599,
8875028,
8875031,
8875108,
8875111,
8875115,
8875459,
8878062,
8878292,
8878416},
{7, 117, 118, 125, 6071565, 6729549, 6729599, 8875028, 8878415},
{2453107, 2496423, 2497397},
{6, 117, 118, 125, 796, 8878293},
{2453107, 2496423, 2497397},
{796,
142001,
8875028,
8875031,
8875108,
8875109,
8875114,
8875456,
8878063,
8878293,
8878332,
8878413},
{8875027,
8876474,
8876475,
8876476,
8876479,
8876484,
8876509,
8876510,
8878293,
8878416},
{8875452, 8878416},
{7, 117, 125},
{8875452},
{117, 118, 125, 8875456, 8878293},
{2453107, 2496423, 2497397},
{8876474,
8876475,
8876476,
8876479,
8876502,
8876508,
8876576,
8878255,
8878259,
8878263},
{8875452, 8878423},
{2453107, 2496423, 2497397},
{2453107, 2496423, 2497397},
{7796878},
{8878255, 8878259, 8878263},
{2453107, 2496423, 2497397},
{7, 117, 118, 125, 8878415},
{2453107, 2496423, 2497397},
{2453107, 2496423, 2497397},
{8875452, 8876987, 8877044, 8878443},
{8875452},
{125, 8878416},
{50, 2453107, 2496423, 2497397, 7055831, 8878416},
{8875452},
{7, 117, 125, 796, 8875458, 8878293},
{2453107, 2496423, 2497397},
{8875227, 8875361, 8878255, 8878259, 8878269, 8878416},
{8875452, 8878293, 8878416},
{8875452, 8876987, 8876988, 8878416, 8878442},
{8875452, 8878415},
{8875452, 8878416},
{8875452},
{125, 6071565, 6729401, 6729599},
{2453107, 2496423, 2497397, 8878416},
{8875452, 8878416},
{8875452, 8878413},
{125},
{8875452},
{7796878, 8876474, 8876475, 8876476, 8876479, 8876491, 8876504, 8876505},
{7796878,
8875024,
8876474,
8876475,
8876476,
8876479,
8876491,
8876504,
8876505},
{7, 7796878, 8875024},
{8875031, 8875108, 8875111, 8878062, 8878292, 8878415},
{7796878, 8875024, 8876987, 8876995},
{7796878, 8875024, 8876987, 8876995},
{8875452},
{7796878},
{7796878, 8875035},
{125, 8875031, 8875108, 8875111, 8875115, 8878062, 8878292, 8878416},
{125, 8875031, 8875108, 8875110, 8875113, 8878332},
{125, 8878293},
{8875452},
{7796878, 8875026},
{7796878, 8875026},
{8876474,
8876475,
8876476,
8876479,
8876508,
8876569,
8876570,
8877914,
8878255,
8878259,
8878273},
{8875027,
8876474,
8876475,
8876476,
8876478,
8876479,
8876484,
8876487,
8878292,
8878416},
{8875027,
8876474,
8876475,
8876476,
8876478,
8876479,
8876484,
8876487,
8878292,
8878416},
{8875027,
8875031,
8876474,
8876475,
8876476,
8876478,
8876479,
8876484,
8876487,
8878293,
8878416},
{8875027, 8878416},
{8878255, 8878259, 8878263},
{125, 796, 8878293},
{796, 8875456, 8878293, 8878390, 8878416},
{8878255, 8878259, 8878263},
{125},
{8876474,
8876475,
8876476,
8876479,
8876502,
8876508,
8876576,
8877914,
8878255,
8878261},
{8875452},
{8875452, 8878416},
{8876987, 8877000, 8878332, 8878416},
{8875452, 8878292, 8878416},
{2453107, 2496423, 6260328},
{8878255, 8878259, 8878273},
{8875452, 8876987, 8876988, 8878416},
{125},
{8875452},
{8875452},
{2453107, 2496423, 6260328, 8878416},
{125, 8875456, 8875458, 8878293},
{8875452},
{8878255, 8878259, 8878263},
{8876474,
8876475,
8876476,
8876479,
8876502,
8876508,
8876576,
8878255,
8878259,
8878273},
{8878255, 8878259, 8878263},
{8875452},
{125},
{8878255, 8878259, 8878263},
{8878255, 8878259, 8878270},
{8875452},
{8878255, 8878259, 8878263},
{8878255, 8878259, 8878263},
{8878255, 8878259, 8878263},
{2453107, 2496423, 2497397},
{125, 8878293},
{125},
{8875452},
{8878255, 8878259, 8878263},
{8875452, 8875456, 8878293, 8878415},
{8875452, 8876987, 8876988},
{8875227,
8875361,
8876474,
8876476,
8876478,
8876479,
8876487,
8876508,
8876544,
8878255,
8878259,
8878269},
{117, 125},
{8875452},
{2453107, 2496423, 2496424},
{8877968, 8877989, 8877997},
{8878255, 8878259, 8878270},
{8875228, 8877914, 8878255, 8878259, 8878263},
{8878255, 8878259, 8878270},
{8875228, 8878255, 8878257},
{118, 125, 8878416},
{8875452},
{8876474,
8876475,
8876476,
8876479,
8876502,
8876508,
8876576,
8878255,
8878259,
8878270},
{8875452, 8878416},
{8878255, 8878259, 8878263},
{8876987, 8877008},
{118, 125, 8875028, 8875458, 8878293, 8878415},
{8878255, 8878259, 8878263},
{8875452, 8876987, 8877043},
{8877968, 8877989, 8878011},
{8875027, 8878420},
{796, 8875027, 8878416},
{7, 117, 118, 125, 8875456, 8878293},
{2453107, 2496423, 6260349},
{8878255, 8878257},
{2453107, 2496423, 6260349},
{8875452, 8876987, 8876988},
{2453107, 2496423, 6260349},
{118, 125, 8878292, 8878415},
{8878255, 8878259, 8878270},
{8878255, 8878259, 8878261, 8878273},
{8875452},
{8875452},
{8878255, 8878259, 8878263},
{8878255, 8878259, 8878269, 8878416},
{8878255, 8878259, 8878266},
{8875452},
{2453107, 2496423, 6260349},
{8878255, 8878259, 8878263},
{8875452},
{125},
{8878255, 8878259, 8878273},
{8877913, 8878255, 8878261},
{8875452},
{8878255, 8878259, 8878273},
{8878255, 8878259, 8878263},
{8878255, 8878259, 8878263},
{8875452, 8876987, 8877046},
{8878255, 8878259, 8878263},
{2453107, 2496423, 6260468},
{8875452},
{125},
{8878255, 8878259, 8878270},
{142364, 8875361, 8878255, 8878259, 8878269},
{8875452},
{8875452},
{8877914, 8878255, 8878259, 8878273},
{8877914, 8878255, 8878259, 8878273},
{8876987, 8877006, 8878255, 8878259},
{2453107, 2496423, 6260468, 8878416},
{8877915, 8878255, 8878259, 8878263, 8878429},
{8875452, 8875456, 8878416},
{7,
117,
125,
796,
8875031,
8875108,
8875111,
8875115,
8878062,
8878293,
8878416},
{2453107, 2496423, 6260349},
{8877968, 8877969},
{8875452},
{8875028, 8875033, 8875107, 8878420},
{8877968, 8877969, 8877970, 8878416},
{8877968, 8877969, 8877970, 8878416},
{125},
{125, 8875031, 8875108, 8875110, 8875113, 8878416},
{8875452, 8875456, 8876987, 8877000},
{8875452, 8876987, 8876988, 8878423},
{8877968, 8877969, 8877980},
{2453107, 2454099, 2454253},
{8875452, 8878416},
{8875452, 8878429},
{8875452},
{2453107, 2496423, 2497397},
{8875452},
{8875452, 8878416},
{8875452},
{8875452, 8875456, 8878416},
{8875452, 8876987, 8876988, 8878423},
{8878255, 8878259, 8878273},
{8875452, 8876987, 8876988, 8878416},
{8875452},
{8878255, 8878259, 8878263},
{2453107, 2454253, 2484526},
{8876474,
8876475,
8876476,
8876479,
8876491,
8876492,
8876508,
8878255,
8878259,
8878269},
{8875452},
{8875452, 8878413},
{8875452},
{8875452},
{8875452,
8876474,
8876475,
8876476,
8876484,
8876499,
8876509,
8876510,
8876987,
8876988,
8878416},
{125, 8875028, 8878293, 8878332, 8878415},
{8875452, 8878293},
{8876987, 8877006, 8878255, 8878260, 8878416},
{8875452, 8878443},
{8875452},
{8875452},
{2453107, 2495238, 2495240},
{8878255, 8878259, 8878263, 8878273},
{8875452},
{125, 8875456, 8875458, 8878293, 8878416},
{125, 8878415}]
In [64]:
extract_with_thr_boilerpipe_ArticleExtractor( extractor_training_objects[0]['raw_content'] )
start thrift_bp_extract
returning from thrift_bp_extract
Out[64]:
{'extracted_html': '',
'extracted_text': u'Metr\xf4 tem confus\xe3o por causa de encontro em shopping; veja v\xeddeo - Bizarro\n\nfonte: Reprodu\xe7\xe3o/Facebook Rol\xea Shopping Itaquera\nHouve superlota\xe7\xe3o e a confus\xe3o foi generalizada\nEra para ser um s\xe1bado muvucado como todos aqueles que antecedem o Natal em mais um shopping de S\xe3o Paulo. Mas foi muito mais do que isso. Milhares de jovens compareceram a um encontro promovido por uma p\xe1gina \xa0no Facebook para este s\xe1bado (7), no segundo andar do Shopping Metr\xf4 Itaquera, \xe0s 17h. Mas o que aconteceu foi uma bagun\xe7a total, com a chegada da pol\xedcia e o fechamento do shopping duas horas antes que o previsto.\nNingu\xe9m sabe ao certo o que aconteceu, alguns dizem que foi arrast\xe3o, mas os pr\xf3prios lojistas negam. De acordo com um rapaz que registrou a movimenta\xe7\xe3o na sa\xedda do metr\xf4, algumas pessoas isoladamente tiveram itens furtados. Infelizmente, o celular do garoto era meio ruim, mas deu para ver a confus\xe3o. Veja aqui o v\xeddeo que ele fez.\nComo muita gente compareceu, houve a confus\xe3o. O certo \xe9 que j\xe1 est\xe1 marcado o terceiro encontro, que \xe9 mais uma badala\xe7\xe3o entre a galera, ou melhor, como diz a pr\xf3pria p\xe1gina , \xe9 s\xf3 um Rol\xeazinho Parte 3. Tudo come\xe7ou na semana passado, no domingo (1), com o Vuuk no Shopping Itaquera.\nVEJA+\n'}
In [72]:
comp_extractors( extractor_training_objects[ 0] )
extracting with thr_boilerpipe
Out[72]:
{'downloads_id': 391881020,
'gold': {'f1': 1.0, 'precision': 1.0, 'recall': 1.0},
'media_id': 83371,
'py_boiler_pipe_ArticleExtractor': {'f1': 0.898148148148148,
'precision': 0.8220338983050848,
'recall': 0.9897959183673469},
'py_boiler_pipe_ArticleSentencesExtractor': {'f1': 0,
'precision': 0.0,
'recall': 0.0},
'py_boiler_pipe_CanolaExtractor': {'f1': 0.8729016786570742,
'precision': 0.8235294117647058,
'recall': 0.9285714285714286},
'py_boiler_pipe_DefaultExtractor': {'f1': 0.8656036446469247,
'precision': 0.7818930041152263,
'recall': 0.9693877551020408},
'py_boiler_pipe_KeepEverythingExtractor': {'f1': 0.4714459295261239,
'precision': 0.3094098883572568,
'recall': 0.9897959183673469},
'py_boiler_pipe_LargestContentExtractor': {'f1': 0.9440389294403893,
'precision': 0.9023255813953488,
'recall': 0.9897959183673469},
'py_boiler_pipe_NumWordsRulesExtractor': {'f1': 0.900473933649289,
'precision': 0.8407079646017699,
'recall': 0.9693877551020408},
'python_readibilty': {'f1': 0.9411764705882353,
'precision': 0.9435897435897436,
'recall': 0.9387755102040817},
'story_is_spidered': False,
'thr_boiler_pipe_ArticleExtractor': {'f1': 0.938875305623472,
'precision': 0.9014084507042254,
'recall': 0.9795918367346939},
'thr_boiler_pipe_DefaultExtractor': {'f1': 0.7840670859538783,
'precision': 0.6654804270462633,
'recall': 0.9540816326530612}}
In [73]:
import datetime
if regenerate_comps_downloads:
comps_downloads = []
processed = 0
skipped = 0
start_time = datetime.datetime.now()
e=None
for extractor_training_object in extractor_training_objects[:]:
print 'processed ', processed
print 'skipped ', skipped
print extractor_training_object[ 'downloads_id']
try:
res = comp_extractors( extractor_training_object )
#print res
comps_downloads.append( res )
processed += 1
except Exception, e:
print "error on download{}".format( extractor_training_object[ 'downloads_id'] )
e = sys.exc_info()
import traceback
traceback.print_exc()
print e
#raise e
skipped += 1
end_time = datetime.datetime.now()
print "Total_time", end_time - start_time
print "Time per download", (end_time - start_time)/ (processed + skipped )
cPickle.dump( comps_downloads, file(
os.path.expanduser( "~/Dropbox/mc/extractor_test/comps_downloads.pickle"), "wb"))
e
#extractor_training_objects
processed 0
skipped 0
391881020
extracting with thr_boilerpipe
processed 1
skipped 0
401370599
extracting with thr_boilerpipe
[... the same four-line progress block repeats for each subsequent download; per-download log truncated ...]
error on download439340014
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8d51b8>)
[... per-download progress log truncated ...]
Traceback (most recent call last):
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
error on download492035281
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8dd248>)
[... remaining per-download progress log truncated; at the point where this section ends the loop had processed 344 downloads, skipped 2 (both skips were the Thrift TApplicationException errors shown above), and was extracting download 530078770 ...]
processed 345
skipped 2
530528969
extracting with thr_boilerpipe
processed 346
skipped 2
530758329
extracting with thr_boilerpipe
processed 347
skipped 2
530760478
extracting with thr_boilerpipe
processed 348
skipped 2
531116330
extracting with thr_boilerpipe
processed 349
skipped 2
531271665
extracting with thr_boilerpipe
processed 350
skipped 2
531541254
extracting with thr_boilerpipe
processed 351
skipped 2
531805217
extracting with thr_boilerpipe
processed 352
skipped 2
531997009
extracting with thr_boilerpipe
processed 353
skipped 2
532554918
extracting with thr_boilerpipe
processed 354
skipped 2
532672804
extracting with thr_boilerpipe
processed 355
skipped 2
533194544
extracting with thr_boilerpipe
processed 356
skipped 2
533730844
extracting with thr_boilerpipe
processed 357
skipped 2
533889958
extracting with thr_boilerpipe
processed 358
skipped 2
534093073
extracting with thr_boilerpipe
processed 359
skipped 2
534505078
extracting with thr_boilerpipe
processed 360
skipped 2
534718899
extracting with thr_boilerpipe
processed 361
skipped 2
534742315
extracting with thr_boilerpipe
processed 362
skipped 2
534981910
extracting with thr_boilerpipe
processed 363
skipped 2
535201962
extracting with thr_boilerpipe
processed 364
skipped 2
535469332
extracting with thr_boilerpipe
processed 365
skipped 2
535880958
extracting with thr_boilerpipe
processed 366
skipped 2
536622410
extracting with thr_boilerpipe
processed 367
skipped 2
536622427
extracting with thr_boilerpipe
processed 368
skipped 2
536670142
extracting with thr_boilerpipe
processed 369
skipped 2
536689603
extracting with thr_boilerpipe
processed 370
skipped 2
536697475
extracting with thr_boilerpipe
processed 371
skipped 2
536720884
extracting with thr_boilerpipe
processed 372
skipped 2
536817371
extracting with thr_boilerpipe
processed 373
skipped 2
536828101
extracting with thr_boilerpipe
processed 374
skipped 2
536833731
extracting with thr_boilerpipe
processed 375
skipped 2
536870204
extracting with thr_boilerpipe
processed 376
skipped 2
536977072
extracting with thr_boilerpipe
processed 377
skipped 2
537231678
extracting with thr_boilerpipe
processed 378
skipped 2
537256396
extracting with thr_boilerpipe
processed 379
skipped 2
537501183
extracting with thr_boilerpipe
processed 380
skipped 2
537704893
extracting with thr_boilerpipe
processed 381
skipped 2
538721777
extracting with thr_boilerpipe
processed 382
skipped 2
539056055
extracting with thr_boilerpipe
processed 383
skipped 2
539126425
extracting with thr_boilerpipe
processed 384
skipped 2
539382819
extracting with thr_boilerpipe
processed 385
skipped 2
539387198
extracting with thr_boilerpipe
processed 386
skipped 2
539389371
extracting with thr_boilerpipe
processed 387
skipped 2
539392922
extracting with thr_boilerpipe
processed 388
skipped 2
539411169
extracting with thr_boilerpipe
processed 389
skipped 2
539415012
extracting with thr_boilerpipe
processed 390
skipped 2
539423034
extracting with thr_boilerpipe
processed 391
skipped 2
539444342
extracting with thr_boilerpipe
processed 392
skipped 2
539444757
extracting with thr_boilerpipe
processed 393
skipped 2
539445540
extracting with thr_boilerpipe
processed 394
skipped 2
539453644
extracting with thr_boilerpipe
processed 395
skipped 2
539482866
extracting with thr_boilerpipe
processed 396
skipped 2
539483121
extracting with thr_boilerpipe
processed 397
skipped 2
539535265
extracting with thr_boilerpipe
processed 398
skipped 2
539535898
extracting with thr_boilerpipe
processed 399
skipped 2
539639827
extracting with thr_boilerpipe
processed 400
skipped 2
539693458
extracting with thr_boilerpipe
processed 401
skipped 2
539699310
extracting with thr_boilerpipe
processed 402
skipped 2
539784484
extracting with thr_boilerpipe
processed 403
skipped 2
539855645
extracting with thr_boilerpipe
processed 404
skipped 2
539862886
extracting with thr_boilerpipe
processed 405
skipped 2
540049468
extracting with thr_boilerpipe
processed 406
skipped 2
540072482
extracting with thr_boilerpipe
processed 407
skipped 2
540124060
extracting with thr_boilerpipe
processed 408
skipped 2
540219129
extracting with thr_boilerpipe
processed 409
skipped 2
540246359
extracting with thr_boilerpipe
processed 410
skipped 2
540356681
extracting with thr_boilerpipe
processed 411
skipped 2
540448548
extracting with thr_boilerpipe
processed 412
skipped 2
540628443
extracting with thr_boilerpipe
processed 413
skipped 2
540729123
extracting with thr_boilerpipe
processed 414
skipped 2
540938254
extracting with thr_boilerpipe
processed 415
skipped 2
541044672
extracting with thr_boilerpipe
processed 416
skipped 2
541380062
extracting with thr_boilerpipe
processed 417
skipped 2
541787974
extracting with thr_boilerpipe
processed 418
skipped 2
541944011
extracting with thr_boilerpipe
processed 419
skipped 2
542151181
extracting with thr_boilerpipe
processed 420
skipped 2
542292829
extracting with thr_boilerpipe
processed 421
skipped 2
542601235
extracting with thr_boilerpipe
processed 422
skipped 2
542607593
extracting with thr_boilerpipe
processed 423
skipped 2
542722436
extracting with thr_boilerpipe
processed 424
skipped 2
543002984
extracting with thr_boilerpipe
processed 425
skipped 2
544804153
extracting with thr_boilerpipe
processed 426
skipped 2
545408829
extracting with thr_boilerpipe
processed 427
skipped 2
545490257
extracting with thr_boilerpipe
processed 428
skipped 2
546226391
extracting with thr_boilerpipe
processed 429
skipped 2
546447285
extracting with thr_boilerpipe
processed 430
skipped 2
546452241
extracting with thr_boilerpipe
processed 431
skipped 2
546475377
extracting with thr_boilerpipe
processed 432
skipped 2
546744400
extracting with thr_boilerpipe
processed 433
skipped 2
546865980
extracting with thr_boilerpipe
processed 434
skipped 2
547134278
extracting with thr_boilerpipe
processed 435
skipped 2
547947151
extracting with thr_boilerpipe
processed 436
skipped 2
550077777
extracting with thr_boilerpipe
processed 437
skipped 2
550220223
extracting with thr_boilerpipe
processed 438
skipped 2
550246134
extracting with thr_boilerpipe
processed 439
skipped 2
550918776
extracting with thr_boilerpipe
processed 440
skipped 2
551714821
extracting with thr_boilerpipe
processed 441
skipped 2
551991048
extracting with thr_boilerpipe
processed 442
skipped 2
552179030
extracting with thr_boilerpipe
processed 443
skipped 2
552285278
extracting with thr_boilerpipe
processed 444
skipped 2
552343286
extracting with thr_boilerpipe
processed 445
skipped 2
552450557
extracting with thr_boilerpipe
processed 446
skipped 2
553165338
extracting with thr_boilerpipe
processed 447
skipped 2
554883505
extracting with thr_boilerpipe
processed 448
skipped 2
555372352
extracting with thr_boilerpipe
processed 449
skipped 2
555689825
extracting with thr_boilerpipe
processed 450
skipped 2
555843132
extracting with thr_boilerpipe
processed 451
skipped 2
556645479
extracting with thr_boilerpipe
processed 452
skipped 2
556934454
extracting with thr_boilerpipe
processed 453
skipped 2
557152619
extracting with thr_boilerpipe
processed 454
skipped 2
557237971
extracting with thr_boilerpipe
processed 455
skipped 2
557521276
extracting with thr_boilerpipe
processed 456
skipped 2
558095303
extracting with thr_boilerpipe
processed 457
skipped 2
558197649
extracting with thr_boilerpipe
processed 458
skipped 2
558655687
extracting with thr_boilerpipe
processed 459
skipped 2
558851890
extracting with thr_boilerpipe
processed 460
skipped 2
559736417
extracting with thr_boilerpipe
processed 461
skipped 2
559785151
extracting with thr_boilerpipe
processed 462
skipped 2
560048673
extracting with thr_boilerpipe
processed 463
skipped 2
560090309
extracting with thr_boilerpipe
processed 464
skipped 2
560127916
extracting with thr_boilerpipe
processed 465
skipped 2
560262829
extracting with thr_boilerpipe
processed 466
skipped 2
560310961
extracting with thr_boilerpipe
processed 467
skipped 2
560339085
extracting with thr_boilerpipe
processed 468
skipped 2
560351631
extracting with thr_boilerpipe
processed 469
skipped 2
560378287
extracting with thr_boilerpipe
processed 470
skipped 2
560417790
extracting with thr_boilerpipe
processed 471
skipped 2
560535896
extracting with thr_boilerpipe
processed 472
skipped 2
560707952
extracting with thr_boilerpipe
processed 473
skipped 2
560751009
extracting with thr_boilerpipe
processed 474
skipped 2
560768548
extracting with thr_boilerpipe
processed 475
skipped 2
560842330
extracting with thr_boilerpipe
processed 476
skipped 2
561122957
extracting with thr_boilerpipe
processed 477
skipped 2
561174738
extracting with thr_boilerpipe
processed 478
skipped 2
561368626
extracting with thr_boilerpipe
processed 479
skipped 2
561800981
extracting with thr_boilerpipe
processed 480
skipped 2
562073055
extracting with thr_boilerpipe
processed 481
skipped 2
562399059
extracting with thr_boilerpipe
processed 482
skipped 2
562399486
extracting with thr_boilerpipe
processed 483
skipped 2
562736854
extracting with thr_boilerpipe
processed 484
skipped 2
562742684
extracting with thr_boilerpipe
processed 485
skipped 2
562984785
extracting with thr_boilerpipe
processed 486
skipped 2
563073521
extracting with thr_boilerpipe
processed 487
skipped 2
563250031
extracting with thr_boilerpipe
processed 488
skipped 2
563556588
extracting with thr_boilerpipe
processed 489
skipped 2
563582892
extracting with thr_boilerpipe
processed 490
skipped 2
563851373
extracting with thr_boilerpipe
processed 491
skipped 2
564075589
extracting with thr_boilerpipe
processed 492
skipped 2
564196161
extracting with thr_boilerpipe
processed 493
skipped 2
564418488
extracting with thr_boilerpipe
processed 494
skipped 2
564457008
extracting with thr_boilerpipe
processed 495
skipped 2
565254169
extracting with thr_boilerpipe
processed 496
skipped 2
565270689
extracting with thr_boilerpipe
processed 497
skipped 2
565612572
extracting with thr_boilerpipe
processed 498
skipped 2
565620132
extracting with thr_boilerpipe
processed 499
skipped 2
565774715
extracting with thr_boilerpipe
error on download565774715
File "<ipython-input-73-5462c770f344>", line 17, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
thrift_ret = client.extract_html( raw_content, extractor_type )
File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
return self.recv_extract_html()
File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c69406098>)
processed 499
skipped 3
565831221
extracting with thr_boilerpipe
processed 500
skipped 3
566175982
extracting with thr_boilerpipe
processed 501
skipped 3
566194155
extracting with thr_boilerpipe
processed 502
skipped 3
566220098
extracting with thr_boilerpipe
processed 503
skipped 3
566546448
extracting with thr_boilerpipe
processed 504
skipped 3
566592645
extracting with thr_boilerpipe
processed 505
skipped 3
566726127
extracting with thr_boilerpipe
processed 506
skipped 3
567132910
extracting with thr_boilerpipe
processed 507
skipped 3
567149912
extracting with thr_boilerpipe
processed 508
skipped 3
567150914
extracting with thr_boilerpipe
processed 509
skipped 3
567201539
extracting with thr_boilerpipe
processed 510
skipped 3
567387189
extracting with thr_boilerpipe
processed 511
skipped 3
567440968
extracting with thr_boilerpipe
processed 512
skipped 3
567594230
extracting with thr_boilerpipe
processed 513
skipped 3
567706084
extracting with thr_boilerpipe
processed 514
skipped 3
567808993
extracting with thr_boilerpipe
processed 515
skipped 3
568058112
extracting with thr_boilerpipe
processed 516
skipped 3
568220153
extracting with thr_boilerpipe
processed 517
skipped 3
568278639
extracting with thr_boilerpipe
processed 518
skipped 3
568314417
extracting with thr_boilerpipe
processed 519
skipped 3
568405515
extracting with thr_boilerpipe
processed 520
skipped 3
568873296
extracting with thr_boilerpipe
processed 521
skipped 3
569385275
extracting with thr_boilerpipe
processed 522
skipped 3
569400896
extracting with thr_boilerpipe
processed 523
skipped 3
569440622
extracting with thr_boilerpipe
error on download569440622
File "<ipython-input-73-5462c770f344>", line 17, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
thrift_ret = client.extract_html( raw_content, extractor_type )
File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
return self.recv_extract_html()
File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8ca518>)
processed 523
skipped 4
569448540
extracting with thr_boilerpipe
processed 524
skipped 4
569458361
extracting with thr_boilerpipe
processed 525
skipped 4
569473458
extracting with thr_boilerpipe
processed 526
skipped 4
570126417
extracting with thr_boilerpipe
processed 527
skipped 4
570281609
extracting with thr_boilerpipe
processed 528
skipped 4
570420066
extracting with thr_boilerpipe
processed 529
skipped 4
570820516
extracting with thr_boilerpipe
processed 530
skipped 4
571250692
extracting with thr_boilerpipe
processed 531
skipped 4
572427751
extracting with thr_boilerpipe
processed 532
skipped 4
572497331
extracting with thr_boilerpipe
processed 533
skipped 4
572595598
extracting with thr_boilerpipe
processed 534
skipped 4
576800952
extracting with thr_boilerpipe
processed 535
skipped 4
576826346
extracting with thr_boilerpipe
processed 536
skipped 4
576906221
extracting with thr_boilerpipe
processed 537
skipped 4
577070880
extracting with thr_boilerpipe
processed 538
skipped 4
577076453
extracting with thr_boilerpipe
processed 539
skipped 4
577226126
extracting with thr_boilerpipe
error on download577226126
File "<ipython-input-73-5462c770f344>", line 17, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
thrift_ret = client.extract_html( raw_content, extractor_type )
File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
return self.recv_extract_html()
File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8d51b8>)
processed 539
skipped 5
578103835
extracting with thr_boilerpipe
processed 540
skipped 5
578124694
extracting with thr_boilerpipe
processed 541
skipped 5
578153622
extracting with thr_boilerpipe
processed 542
skipped 5
578156115
extracting with thr_boilerpipe
processed 543
skipped 5
578156372
extracting with thr_boilerpipe
processed 544
skipped 5
578156412
extracting with thr_boilerpipe
processed 545
skipped 5
578167861
extracting with thr_boilerpipe
processed 546
skipped 5
578210309
extracting with thr_boilerpipe
error on download578210309
File "<ipython-input-73-5462c770f344>", line 17, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
thrift_ret = client.extract_html( raw_content, extractor_type )
File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
return self.recv_extract_html()
File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8ce680>)
processed 546
skipped 6
578210949
extracting with thr_boilerpipe
processed 547
skipped 6
578294304
extracting with thr_boilerpipe
processed 548
skipped 6
578364597
extracting with thr_boilerpipe
processed 549
skipped 6
578371687
extracting with thr_boilerpipe
processed 550
skipped 6
578520886
extracting with thr_boilerpipe
processed 551
skipped 6
578636827
extracting with thr_boilerpipe
processed 552
skipped 6
578653839
extracting with thr_boilerpipe
processed 553
skipped 6
578713987
extracting with thr_boilerpipe
processed
File "<ipython-input-73-5462c770f344>", line 17, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
thrift_ret = client.extract_html( raw_content, extractor_type )
File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
return self.recv_extract_html()
File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
554
skipped 6
578974914
extracting with thr_boilerpipe
error on download578974914
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8d6098>)
processed 554
skipped 7
579044624
extracting with thr_boilerpipe
processed 555
skipped 7
580296869
extracting with thr_boilerpipe
processed 556
skipped 7
581963231
extracting with thr_boilerpipe
processed 557
skipped 7
589072496
extracting with thr_boilerpipe
processed 558
skipped 7
589074541
extracting with thr_boilerpipe
processed 559
skipped 7
589074546
extracting with thr_boilerpipe
processed 560
skipped 7
589088428
extracting with thr_boilerpipe
processed 561
skipped 7
589089635
extracting with thr_boilerpipe
processed 562
skipped 7
589090828
extracting with thr_boilerpipe
processed 563
skipped 7
589116902
extracting with thr_boilerpipe
processed 564
skipped 7
589116915
extracting with thr_boilerpipe
processed 565
skipped 7
589129653
extracting with thr_boilerpipe
processed 566
skipped 7
589153423
extracting with thr_boilerpipe
processed 567
skipped 7
589156380
extracting with thr_boilerpipe
processed 568
skipped 7
589208129
extracting with thr_boilerpipe
processed 569
skipped 7
589213448
extracting with thr_boilerpipe
processed 570
skipped 7
589213994
extracting with thr_boilerpipe
processed 571
skipped 7
589224922
extracting with thr_boilerpipe
processed 572
skipped 7
589225296
extracting with thr_boilerpipe
processed 573
skipped 7
589239170
extracting with thr_boilerpipe
processed 574
skipped 7
589240076
extracting with thr_boilerpipe
processed 575
skipped 7
589261134
extracting with thr_boilerpipe
processed 576
skipped 7
589261136
extracting with thr_boilerpipe
processed 577
skipped 7
589273978
extracting with thr_boilerpipe
processed 578
skipped 7
589285851
extracting with thr_boilerpipe
processed 579
skipped 7
589285856
extracting with thr_boilerpipe
processed 580
skipped 7
589298598
extracting with thr_boilerpipe
processed 581
skipped 7
589305788
extracting with thr_boilerpipe
processed 582
skipped 7
589310534
extracting with thr_boilerpipe
processed 583
skipped 7
589316558
extracting with thr_boilerpipe
processed 584
skipped 7
589324992
extracting with thr_boilerpipe
processed 585
skipped 7
589326873
extracting with thr_boilerpipe
processed 586
skipped 7
589335687
extracting with thr_boilerpipe
processed 587
skipped 7
589355839
extracting with thr_boilerpipe
processed 588
skipped 7
589368807
extracting with thr_boilerpipe
processed 589
skipped 7
589371772
extracting with thr_boilerpipe
processed 590
skipped 7
589377853
extracting with thr_boilerpipe
processed 591
skipped 7
589384584
extracting with thr_boilerpipe
processed 592
skipped 7
589440172
extracting with thr_boilerpipe
processed 593
skipped 7
589502987
extracting with thr_boilerpipe
processed 594
skipped 7
589513642
extracting with thr_boilerpipe
processed 595
skipped 7
589568015
extracting with thr_boilerpipe
processed 596
skipped 7
589625611
extracting with thr_boilerpipe
processed 597
skipped 7
589655068
extracting with thr_boilerpipe
processed 598
skipped 7
589674386
extracting with thr_boilerpipe
processed 599
skipped 7
589683282
extracting with thr_boilerpipe
processed 600
skipped 7
589686438
extracting with thr_boilerpipe
processed 601
skipped 7
589754761
extracting with thr_boilerpipe
processed 602
skipped 7
589755411
extracting with thr_boilerpipe
processed 603
skipped 7
589755612
extracting with thr_boilerpipe
processed 604
skipped 7
589758021
extracting with thr_boilerpipe
processed 605
skipped 7
589768387
extracting with thr_boilerpipe
processed 606
skipped 7
589786104
extracting with thr_boilerpipe
processed 607
skipped 7
589786154
extracting with thr_boilerpipe
processed 608
skipped 7
589786414
extracting with thr_boilerpipe
error on download589786414
File "<ipython-input-73-5462c770f344>", line 17, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
thrift_ret = client.extract_html( raw_content, extractor_type )
File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
return self.recv_extract_html()
File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c692f25a8>)
processed 608
skipped 8
589862512
extracting with thr_boilerpipe
processed 609
skipped 8
589862834
extracting with thr_boilerpipe
processed 610
skipped 8
589862835
extracting with thr_boilerpipe
processed 611
skipped 8
589867651
extracting with thr_boilerpipe
processed 612
skipped 8
589877044
extracting with thr_boilerpipe
processed 613
skipped 8
589878988
extracting with thr_boilerpipe
processed 614
skipped 8
589902239
extracting with thr_boilerpipe
processed 615
skipped 8
589902589
extracting with thr_boilerpipe
processed 616
skipped 8
589903803
extracting with thr_boilerpipe
processed 617
skipped 8
589912699
extracting with thr_boilerpipe
processed 618
skipped 8
589925438
extracting with thr_boilerpipe
processed 619
skipped 8
589929254
extracting with thr_boilerpipe
processed 620
skipped 8
589929573
extracting with thr_boilerpipe
processed 621
skipped 8
589934787
extracting with thr_boilerpipe
processed 622
skipped 8
589938190
extracting with thr_boilerpipe
processed 623
skipped 8
589945649
extracting with thr_boilerpipe
processed 624
skipped 8
589946203
extracting with thr_boilerpipe
processed 625
skipped 8
589956136
extracting with thr_boilerpipe
processed 626
skipped 8
589985214
extracting with thr_boilerpipe
processed 627
skipped 8
589992578
extracting with thr_boilerpipe
processed 628
skipped 8
589992873
extracting with thr_boilerpipe
processed 629
skipped 8
590003031
extracting with thr_boilerpipe
processed 630
skipped 8
590003045
extracting with thr_boilerpipe
processed 631
skipped 8
590033845
extracting with thr_boilerpipe
processed 632
skipped 8
590033849
extracting with thr_boilerpipe
processed 633
skipped 8
590033852
extracting with thr_boilerpipe
processed 634
skipped 8
590057803
extracting with thr_boilerpipe
processed 635
skipped 8
590237580
extracting with thr_boilerpipe
processed 636
skipped 8
590249522
extracting with thr_boilerpipe
processed 637
skipped 8
590264671
extracting with thr_boilerpipe
processed 638
skipped 8
590293497
extracting with thr_boilerpipe
processed 639
skipped 8
590323886
extracting with thr_boilerpipe
processed 640
skipped 8
590323913
extracting with thr_boilerpipe
processed 641
skipped 8
590324177
extracting with thr_boilerpipe
processed 642
skipped 8
590338530
extracting with thr_boilerpipe
processed 643
skipped 8
590339267
extracting with thr_boilerpipe
processed 644
skipped 8
590340446
extracting with thr_boilerpipe
processed 645
skipped 8
590351087
extracting with thr_boilerpipe
processed 646
skipped 8
590353661
extracting with thr_boilerpipe
processed 647
skipped 8
590356585
extracting with thr_boilerpipe
processed 648
skipped 8
590356597
extracting with thr_boilerpipe
processed 649
skipped 8
590386215
extracting with thr_boilerpipe
processed 650
skipped 8
590386218
extracting with thr_boilerpipe
processed 651
skipped 8
590386225
extracting with thr_boilerpipe
processed 652
skipped 8
590397660
extracting with thr_boilerpipe
processed 653
skipped 8
590398422
extracting with thr_boilerpipe
processed 654
skipped 8
590399931
extracting with thr_boilerpipe
processed 655
skipped 8
590411501
extracting with thr_boilerpipe
processed 656
skipped 8
590424369
extracting with thr_boilerpipe
processed 657
skipped 8
590425749
extracting with thr_boilerpipe
processed 658
skipped 8
590425776
extracting with thr_boilerpipe
processed 659
skipped 8
590425824
extracting with thr_boilerpipe
processed 660
skipped 8
590437581
extracting with thr_boilerpipe
processed 661
skipped 8
590441661
extracting with thr_boilerpipe
processed 662
skipped 8
590441662
extracting with thr_boilerpipe
processed 663
skipped 8
590460432
extracting with thr_boilerpipe
processed 664
skipped 8
590462467
extracting with thr_boilerpipe
processed 665
skipped 8
590512426
extracting with thr_boilerpipe
processed 666
skipped 8
590514891
extracting with thr_boilerpipe
processed 667
skipped 8
590515180
extracting with thr_boilerpipe
processed 668
skipped 8
590529801
extracting with thr_boilerpipe
processed 669
skipped 8
590531075
extracting with thr_boilerpipe
processed 670
skipped 8
590542372
extracting with thr_boilerpipe
processed 671
skipped 8
590542427
extracting with thr_boilerpipe
processed 672
skipped 8
590551407
extracting with thr_boilerpipe
processed 673
skipped 8
590552268
extracting with thr_boilerpipe
processed 674
skipped 8
590552363
extracting with thr_boilerpipe
processed 675
skipped 8
590552784
extracting with thr_boilerpipe
processed 676
skipped 8
590560741
extracting with thr_boilerpipe
processed 677
skipped 8
590576816
extracting with thr_boilerpipe
processed 678
skipped 8
590585094
extracting with thr_boilerpipe
processed 679
skipped 8
590593066
extracting with thr_boilerpipe
processed 680
skipped 8
590596271
extracting with thr_boilerpipe
processed 681
skipped 8
590605895
extracting with thr_boilerpipe
processed 682
skipped 8
590620426
extracting with thr_boilerpipe
processed 683
skipped 8
590623508
extracting with thr_boilerpipe
processed 684
skipped 8
590623511
extracting with thr_boilerpipe
processed 685
skipped 8
590639137
extracting with thr_boilerpipe
processed 686
skipped 8
590646856
extracting with thr_boilerpipe
processed 687
skipped 8
590647873
extracting with thr_boilerpipe
processed 688
skipped 8
590654819
extracting with thr_boilerpipe
processed 689
skipped 8
590661408
extracting with thr_boilerpipe
processed 690
skipped 8
590685837
extracting with thr_boilerpipe
processed 691
skipped 8
590720858
extracting with thr_boilerpipe
processed 692
skipped 8
590806668
extracting with thr_boilerpipe
processed 693
skipped 8
590841021
extracting with thr_boilerpipe
processed 694
skipped 8
590877233
extracting with thr_boilerpipe
processed 695
skipped 8
590917078
extracting with thr_boilerpipe
processed 696
skipped 8
590918616
extracting with thr_boilerpipe
processed 697
skipped 8
590918621
extracting with thr_boilerpipe
processed 698
skipped 8
590929956
extracting with thr_boilerpipe
processed 699
skipped 8
590957300
extracting with thr_boilerpipe
processed 700
skipped 8
590957745
extracting with thr_boilerpipe
processed 701
skipped 8
590961196
extracting with thr_boilerpipe
processed 702
skipped 8
590962451
extracting with thr_boilerpipe
processed 703
skipped 8
590975183
extracting with thr_boilerpipe
processed 704
skipped 8
590975515
extracting with thr_boilerpipe
processed 705
skipped 8
590975517
extracting with thr_boilerpipe
processed 706
skipped 8
590976484
extracting with thr_boilerpipe
processed 707
skipped 8
591998702
extracting with thr_boilerpipe
processed 708
skipped 8
591998982
extracting with thr_boilerpipe
Total_time 0:05:07.021219
Time per download 0:00:00.428202
File "<ipython-input-73-5462c770f344>", line 17, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
ret['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
thrift_ret = client.extract_html( raw_content, extractor_type )
File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
return self.recv_extract_html()
File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
raise x
TApplicationException: Internal error processing extract_html
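For orientation, here is a minimal sketch of the driver loop implied by the output above. The real loop lives in an earlier cell (ipython-input-73) and calls comp_extractors() on each extractor training object; the variable names, error handling, and timing code below are assumptions reconstructed from the "processed" / "skipped" counters and the timing lines, not the actual code.
import datetime
import traceback
def run_comparisons( extractor_training_objects ):
    # Hypothetical reconstruction: score each training object with comp_extractors()
    # (defined in an earlier cell) and skip downloads whose extraction raises,
    # e.g. a TApplicationException from the Thrift boilerpipe service.
    comps = []
    processed = 0
    skipped = 0
    start = datetime.datetime.now()
    for extractor_training_object in extractor_training_objects:
        print extractor_training_object['downloads_id']
        try:
            res = comp_extractors( extractor_training_object )
            comps.append( res )
            processed += 1
        except Exception:
            print "error on download" + str( extractor_training_object['downloads_id'] )
            traceback.print_exc()
            skipped += 1
        print "processed", processed
        print "skipped", skipped
    total_time = datetime.datetime.now() - start
    print "Total_time", total_time
    print "Time per download", total_time / len( extractor_training_objects )
    return comps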
In [65]:
comps_downloads = cPickle.load( file(
os.path.expanduser( "~/Dropbox/mc/extractor_test/comps_downloads.pickle"), "rb") )
In [74]:
comps_downloads[0]
Out[74]:
{'downloads_id': 391881020,
'gold': {'f1': 1.0, 'precision': 1.0, 'recall': 1.0},
'media_id': 83371,
'py_boiler_pipe_ArticleExtractor': {'f1': 0.898148148148148,
'precision': 0.8220338983050848,
'recall': 0.9897959183673469},
'py_boiler_pipe_ArticleSentencesExtractor': {'f1': 0,
'precision': 0.0,
'recall': 0.0},
'py_boiler_pipe_CanolaExtractor': {'f1': 0.8729016786570742,
'precision': 0.8235294117647058,
'recall': 0.9285714285714286},
'py_boiler_pipe_DefaultExtractor': {'f1': 0.8656036446469247,
'precision': 0.7818930041152263,
'recall': 0.9693877551020408},
'py_boiler_pipe_KeepEverythingExtractor': {'f1': 0.4714459295261239,
'precision': 0.3094098883572568,
'recall': 0.9897959183673469},
'py_boiler_pipe_LargestContentExtractor': {'f1': 0.9440389294403893,
'precision': 0.9023255813953488,
'recall': 0.9897959183673469},
'py_boiler_pipe_NumWordsRulesExtractor': {'f1': 0.900473933649289,
'precision': 0.8407079646017699,
'recall': 0.9693877551020408},
'python_readibilty': {'f1': 0.9411764705882353,
'precision': 0.9435897435897436,
'recall': 0.9387755102040817},
'story_is_spidered': False,
'thr_boiler_pipe_ArticleExtractor': {'f1': 0.938875305623472,
'precision': 0.9014084507042254,
'recall': 0.9795918367346939},
'thr_boiler_pipe_DefaultExtractor': {'f1': 0.7840670859538783,
'precision': 0.6654804270462633,
'recall': 0.9540816326530612}}
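Each comparison object stores, for every extractor, the precision, recall, and F1 of its extracted text against the hand-annotated gold text (the 'gold' entry scores the annotation against itself, hence 1.0 across the board; 'story_is_spidered' flags spidered stories). The actual scoring function is defined in an earlier cell and determines the unit of comparison; the following is only a hedged sketch of one plausible token-overlap scorer, with the tokenizer and the score_extraction name being illustrative assumptions.
from collections import Counter
import re
def tokenize( text ):
    # crude word tokenizer, for illustration only
    return re.findall( r'\w+', text.lower(), re.UNICODE )
def score_extraction( extracted_text, gold_text ):
    # hypothetical scorer: precision / recall / F1 of the multiset of extracted
    # tokens against the multiset of gold tokens
    extracted = Counter( tokenize( extracted_text ) )
    gold = Counter( tokenize( gold_text ) )
    overlap = sum( ( extracted & gold ).values() )
    precision = float( overlap ) / sum( extracted.values() ) if extracted else 0.0
    recall = float( overlap ) / sum( gold.values() ) if gold else 0.0
    f1 = 2 * precision * recall / ( precision + recall ) if ( precision + recall ) > 0 else 0
    return { 'precision': precision, 'recall': recall, 'f1': f1 }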
In [75]:
df = get_data_frame_from_comparision_objects( comps_downloads )
print_results_by_measurement_type( df )
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 709 709.000000
mean 1 0.743752
std 0 0.283451
min 1 0.000000
2% 1 0.004002
5% 1 0.023810
10% 1 0.190711
50% 1 0.861842
max 1 0.990968
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 709.000000
mean 0.000437
std 0.011624
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.309524
precision_py_boiler_pipe_CanolaExtractor \
count 709.000000
mean 0.559867
std 0.289836
min 0.000000
2% 0.004046
5% 0.053150
10% 0.125632
50% 0.627586
max 0.982085
precision_py_boiler_pipe_DefaultExtractor \
count 709.000000
mean 0.633871
std 0.282888
min 0.000000
2% 0.000000
5% 0.028471
10% 0.165822
50% 0.726000
max 0.993469
precision_py_boiler_pipe_KeepEverythingExtractor \
count 709.000000
mean 0.351158
std 0.246308
min 0.000000
2% 0.012139
5% 0.039472
10% 0.067484
50% 0.305634
max 0.956033
precision_py_boiler_pipe_LargestContentExtractor \
count 709.000000
mean 0.744173
std 0.303411
min 0.000000
2% 0.004587
5% 0.018513
10% 0.047491
50% 0.870968
max 0.994749
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 709.000000
mean 0.654955
std 0.276770
min 0.000000
2% 0.003950
5% 0.069395
10% 0.217952
50% 0.760870
max 0.991265
precision_python_readibilty \
count 709.000000
mean 0.909932
std 0.196690
min 0.009811
2% 0.073263
5% 0.409343
10% 0.780636
50% 0.977199
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 709.000000
mean 0.628442
std 0.398688
min 0.000000
2% 0.003212
5% 0.008029
10% 0.019322
50% 0.878049
max 1.000000
precision_thr_boiler_pipe_DefaultExtractor
count 709.000000
mean 0.555218
std 0.349223
min 0.000000
2% 0.005259
5% 0.012785
10% 0.024934
50% 0.637229
max 1.000000
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 709 709.000000
mean 1 0.805785
std 0 0.274766
min 1 0.000000
2% 1 0.022585
5% 1 0.058824
10% 1 0.233867
50% 1 0.920000
max 1 1.000000
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 709.000000
mean 0.000029
std 0.000769
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.020472
recall_py_boiler_pipe_CanolaExtractor \
count 709.000000
mean 0.809303
std 0.200048
min 0.000000
2% 0.081745
5% 0.333653
10% 0.577662
50% 0.880184
max 0.997429
recall_py_boiler_pipe_DefaultExtractor \
count 709.000000
mean 0.780991
std 0.249576
min 0.000000
2% 0.000000
5% 0.095378
10% 0.353777
50% 0.874126
max 0.997429
recall_py_boiler_pipe_KeepEverythingExtractor \
count 709.000000
mean 0.914876
std 0.118623
min 0.000000
2% 0.538462
5% 0.782886
10% 0.865614
50% 0.938620
max 1.000000
recall_py_boiler_pipe_LargestContentExtractor \
count 709.000000
mean 0.701460
std 0.317048
min 0.000000
2% 0.011802
5% 0.034231
10% 0.075996
50% 0.857143
max 0.997409
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 709.000000 709.000000
mean 0.833080 0.859992
std 0.196529 0.246466
min 0.000000 0.002874
2% 0.080144 0.038143
5% 0.310962 0.093615
10% 0.641447 0.596880
50% 0.891654 0.954128
max 0.997852 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 709.000000
mean 0.706731
std 0.404080
min 0.000000
2% 0.008620
5% 0.019078
10% 0.040008
50% 0.975962
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 709.000000
mean 0.751379
std 0.360973
min 0.000000
2% 0.017128
5% 0.032103
10% 0.053490
50% 0.954274
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 709 709.000000
mean 1 0.756813
std 0 0.280207
min 1 0.000000
2% 1 0.007380
5% 1 0.030489
10% 1 0.165069
50% 1 0.882155
max 1 0.994777
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 709.000000
mean 0.000054
std 0.001442
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.038405
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 709.000000 709.000000
mean 0.623477 0.670189
std 0.268275 0.271653
min 0.000000 0.000000
2% 0.007836 0.000000
5% 0.094016 0.033595
10% 0.203038 0.187118
50% 0.699983 0.777651
max 0.986559 0.986933
f1_py_boiler_pipe_KeepEverythingExtractor \
count 709.000000
mean 0.463295
std 0.257087
min 0.000000
2% 0.023895
5% 0.075663
10% 0.126259
50% 0.458901
max 0.975148
f1_py_boiler_pipe_LargestContentExtractor \
count 709.000000
mean 0.707175
std 0.307054
min 0.000000
2% 0.008247
5% 0.022703
10% 0.055075
50% 0.848148
max 0.987313
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 709.000000 709.000000
mean 0.704248 0.861987
std 0.254059 0.242245
min 0.000000 0.005525
2% 0.007593 0.040732
5% 0.117909 0.127670
10% 0.318135 0.568254
50% 0.804428 0.950690
max 0.989595 1.000000
f1_thr_boiler_pipe_ArticleExtractor \
count 709.000000
mean 0.639403
std 0.402629
min 0.000000
2% 0.005083
5% 0.010944
10% 0.025470
50% 0.901982
max 1.000000
f1_thr_boiler_pipe_DefaultExtractor
count 709.000000
mean 0.590392
std 0.349566
min 0.000000
2% 0.008021
5% 0.016681
10% 0.033278
50% 0.695518
max 0.998779
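get_data_frame_from_comparision_objects() and print_results_by_measurement_type() are defined in earlier cells. A minimal sketch of what they plausibly do, under the assumption that each comparison object is flattened into one row with columns named <measure>_<extractor> and that the summary above is a pandas describe() with custom percentiles (2%, 5%, 10%, 50%):
import pandas as pd
def get_data_frame_from_comparision_objects( comps ):
    # flatten {'extractor': {'precision': p, 'recall': r, 'f1': f}, ...} into columns
    # such as 'precision_python_readibilty' or 'f1_thr_boiler_pipe_ArticleExtractor'
    rows = []
    for comp in comps:
        row = {}
        for key, value in comp.items():
            if isinstance( value, dict ):
                for measure, score in value.items():
                    row[ measure + '_' + key ] = score
        rows.append( row )
    return pd.DataFrame( rows )
def print_results_by_measurement_type( df ):
    # one describe() block per measurement type, matching the column groups above
    for measure in [ 'precision', 'recall', 'f1' ]:
        cols = sorted( c for c in df.columns if c.startswith( measure + '_' ) )
        print df[ cols ].describe( percentiles=[ .02, .05, .10, .50 ] )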
In [76]:
non_spidered_downloads = remove_spidered_downloads( comps_downloads )
df = get_data_frame_from_comparision_objects( non_spidered_downloads )
print_results_by_measurement_type( df )
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 601 601.000000
mean 1 0.740157
std 0 0.279127
min 1 0.000000
2% 1 0.003984
5% 1 0.023438
10% 1 0.213873
50% 1 0.859296
max 1 0.988089
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 601.000000
mean 0.000515
std 0.012626
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.309524
precision_py_boiler_pipe_CanolaExtractor \
count 601.000000
mean 0.549850
std 0.285852
min 0.000000
2% 0.002198
5% 0.042590
10% 0.125749
50% 0.609137
max 0.979773
precision_py_boiler_pipe_DefaultExtractor \
count 601.000000
mean 0.629166
std 0.277645
min 0.000000
2% 0.000000
5% 0.017544
10% 0.174323
50% 0.713615
max 0.981651
precision_py_boiler_pipe_KeepEverythingExtractor \
count 601.000000
mean 0.329332
std 0.233642
min 0.000000
2% 0.010258
5% 0.036174
10% 0.062707
50% 0.285622
max 0.913591
precision_py_boiler_pipe_LargestContentExtractor \
count 601.000000
mean 0.745410
std 0.296023
min 0.000000
2% 0.003861
5% 0.020619
10% 0.064394
50% 0.862944
max 0.982402
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 601.000000
mean 0.653895
std 0.270699
min 0.000000
2% 0.003175
5% 0.067548
10% 0.236220
50% 0.755814
max 0.982402
precision_python_readibilty \
count 601.000000
mean 0.924880
std 0.165005
min 0.013793
2% 0.241379
5% 0.615385
10% 0.849057
50% 0.977778
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 601.000000
mean 0.609960
std 0.404626
min 0.000000
2% 0.003058
5% 0.006623
10% 0.018248
50% 0.870079
max 1.000000
precision_thr_boiler_pipe_DefaultExtractor
count 601.000000
mean 0.534960
std 0.350879
min 0.000000
2% 0.005222
5% 0.010448
10% 0.021786
50% 0.608193
max 1.000000
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 601 601.000000
mean 1 0.812731
std 0 0.265081
min 1 0.000000
2% 1 0.022222
5% 1 0.058824
10% 1 0.310811
50% 1 0.917178
max 1 1.000000
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 601.000000
mean 0.000034
std 0.000835
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.020472
recall_py_boiler_pipe_CanolaExtractor \
count 601.000000
mean 0.802074
std 0.203839
min 0.000000
2% 0.052632
5% 0.307692
10% 0.571429
50% 0.871795
max 0.997429
recall_py_boiler_pipe_DefaultExtractor \
count 601.000000
mean 0.773389
std 0.255432
min 0.000000
2% 0.000000
5% 0.080000
10% 0.319372
50% 0.872222
max 0.997429
recall_py_boiler_pipe_KeepEverythingExtractor \
count 601.000000
mean 0.914629
std 0.117758
min 0.000000
2% 0.692308
5% 0.789474
10% 0.865169
50% 0.937050
max 1.000000
recall_py_boiler_pipe_LargestContentExtractor \
count 601.000000
mean 0.717584
std 0.306372
min 0.000000
2% 0.010778
5% 0.034483
10% 0.093750
50% 0.859649
max 0.997409
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 601.000000 601.000000
mean 0.830285 0.867044
std 0.196843 0.227136
min 0.000000 0.012255
2% 0.052632 0.049689
5% 0.392947 0.166667
10% 0.641026 0.657534
50% 0.890957 0.950820
max 0.997852 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 601.000000
mean 0.692083
std 0.412559
min 0.000000
2% 0.007156
5% 0.017483
10% 0.034446
50% 0.974194
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 601.000000
mean 0.730151
std 0.376488
min 0.000000
2% 0.016484
5% 0.027933
10% 0.046092
50% 0.954248
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 601 601.000000
mean 1 0.759977
std 0 0.271806
min 1 0.000000
2% 1 0.007246
5% 1 0.029412
10% 1 0.256983
50% 1 0.879257
max 1 0.987313
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 601.000000
mean 0.000064
std 0.001567
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.038405
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 601.000000 601.000000
mean 0.617464 0.668067
std 0.266504 0.269835
min 0.000000 0.000000
2% 0.004315 0.000000
5% 0.080780 0.023256
10% 0.189921 0.187135
50% 0.691293 0.772388
max 0.974256 0.986933
f1_py_boiler_pipe_KeepEverythingExtractor \
count 601.000000
mean 0.443883
std 0.249808
min 0.000000
2% 0.020263
5% 0.066079
10% 0.117464
50% 0.434783
max 0.946723
f1_py_boiler_pipe_LargestContentExtractor \
count 601.000000
mean 0.718765
std 0.297337
min 0.000000
2% 0.007194
5% 0.024818
10% 0.062827
50% 0.851711
max 0.987313
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 601.000000 601.000000
mean 0.707795 0.874278
std 0.247907 0.218192
min 0.000000 0.023585
2% 0.006314 0.072539
5% 0.110977 0.181818
10% 0.359116 0.708571
50% 0.803134 0.948718
max 0.987313 1.000000
f1_thr_boiler_pipe_ArticleExtractor \
count 601.000000
mean 0.624692
std 0.408753
min 0.000000
2% 0.004386
5% 0.010417
10% 0.023077
50% 0.893434
max 1.000000
f1_thr_boiler_pipe_DefaultExtractor
count 601.000000
mean 0.572334
std 0.356012
min 0.000000
2% 0.007509
5% 0.014286
10% 0.028986
50% 0.663594
max 0.998779
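remove_spidered_downloads() (used above), only_spidered_downloads(), and filter_by_media_tags_id() (used in the next cells) are likewise defined earlier in the notebook. A hedged sketch of the shape they plausibly take: the spidered filters can key off the story_is_spidered flag stored in each comparison object, while the media-tag filter needs some media_id-to-tags_id lookup; the media_id_to_tags_ids cache below is a hypothetical stand-in for however the real code obtains media tags (e.g. from the Media Cloud API).
# hypothetical cache mapping media_id -> set of tags_id; how it is populated is
# outside this sketch
media_id_to_tags_ids = {}
def remove_spidered_downloads( comps ):
    return [ c for c in comps if not c['story_is_spidered'] ]
def only_spidered_downloads( comps ):
    return [ c for c in comps if c['story_is_spidered'] ]
def filter_by_media_tags_id( comps, tags_ids ):
    # keep comparison objects whose media source carries at least one of the given tags
    return [ c for c in comps if media_id_to_tags_ids.get( c['media_id'], set() ) & tags_ids ]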
In [77]:
print "spidered"
df = get_data_frame_from_comparision_objects( only_spidered_downloads( comps_downloads ) )
print_results_by_measurement_type( df )
spidered
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 108 108.000000
mean 1 0.763759
std 0 0.307015
min 1 0.002899
2% 1 0.013435
5% 1 0.029351
10% 1 0.076326
50% 1 0.887346
max 1 0.990968
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 108
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
precision_py_boiler_pipe_CanolaExtractor \
count 108.000000
mean 0.615616
std 0.306512
min 0.002242
2% 0.052741
5% 0.098303
10% 0.134120
50% 0.730356
max 0.982085
precision_py_boiler_pipe_DefaultExtractor \
count 108.000000
mean 0.660055
std 0.310547
min 0.000000
2% 0.018757
5% 0.061666
10% 0.125806
50% 0.816143
max 0.993469
precision_py_boiler_pipe_KeepEverythingExtractor \
count 108.000000
mean 0.472615
std 0.278852
min 0.002789
2% 0.044233
5% 0.062028
10% 0.090576
50% 0.484823
max 0.956033
precision_py_boiler_pipe_LargestContentExtractor \
count 108.000000
mean 0.737291
std 0.343068
min 0.003731
2% 0.009898
5% 0.012590
10% 0.030961
50% 0.896185
max 0.994749
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 108.000000
mean 0.660853
std 0.309699
min 0.003404
2% 0.054765
5% 0.098685
10% 0.154611
50% 0.799741
max 0.991265
precision_python_readibilty \
count 108.000000
mean 0.826749
std 0.308338
min 0.009811
2% 0.027668
5% 0.069709
10% 0.177386
50% 0.974176
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 108.000000
mean 0.731285
std 0.347983
min 0.003079
2% 0.008216
5% 0.017428
10% 0.041506
50% 0.913215
max 0.996908
precision_thr_boiler_pipe_DefaultExtractor
count 108.000000
mean 0.667947
std 0.318537
min 0.003623
2% 0.025591
5% 0.069110
10% 0.134852
50% 0.797931
max 0.998761
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 108 108.000000
mean 1 0.767135
std 0 0.322093
min 1 0.012876
2% 1 0.046481
5% 1 0.060836
10% 1 0.087225
50% 1 0.929098
max 1 1.000000
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 108
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
recall_py_boiler_pipe_CanolaExtractor \
count 108.000000
mean 0.849529
std 0.172865
min 0.156566
2% 0.196731
5% 0.439646
10% 0.703911
50% 0.906307
max 0.994856
recall_py_boiler_pipe_DefaultExtractor \
count 108.000000
mean 0.823295
std 0.210135
min 0.000000
2% 0.109326
5% 0.287291
10% 0.609275
50% 0.886602
max 0.994580
recall_py_boiler_pipe_KeepEverythingExtractor \
count 108.000000
mean 0.916251
std 0.123883
min 0.204773
2% 0.509903
5% 0.752874
10% 0.881987
50% 0.943998
max 1.000000
recall_py_boiler_pipe_LargestContentExtractor \
count 108.000000
mean 0.611730
std 0.359432
min 0.005025
2% 0.013555
5% 0.032341
10% 0.051268
50% 0.719308
max 0.986637
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 108.000000 108.000000
mean 0.848634 0.820751
std 0.194951 0.332890
min 0.079292 0.002874
2% 0.130391 0.016990
5% 0.294764 0.037407
10% 0.710042 0.081159
50% 0.900348 0.971610
max 0.987930 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 108.000000
mean 0.788241
std 0.343611
min 0.014540
2% 0.032170
5% 0.058052
10% 0.089529
50% 0.979877
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 108.000000
mean 0.869507
std 0.225256
min 0.087533
2% 0.114921
5% 0.271450
10% 0.543603
50% 0.954387
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 108 108.000000
mean 1 0.739209
std 0 0.323862
min 1 0.005777
2% 1 0.015187
5% 1 0.040443
10% 1 0.069610
50% 1 0.896467
max 1 0.994777
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 108
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 108.000000 108.000000
mean 0.656935 0.681997
std 0.276804 0.282563
min 0.004464 0.000000
2% 0.099180 0.033732
5% 0.177400 0.100176
10% 0.232113 0.200943
50% 0.783502 0.811930
max 0.986559 0.981556
f1_py_boiler_pipe_KeepEverythingExtractor \
count 108.000000
mean 0.571323
std 0.271120
min 0.005559
2% 0.084523
5% 0.115850
10% 0.165602
50% 0.602696
max 0.975148
f1_py_boiler_pipe_LargestContentExtractor \
count 108.000000
mean 0.642674
std 0.350910
min 0.004283
2% 0.013151
5% 0.020266
10% 0.035722
50% 0.796850
max 0.970168
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 108.000000 108.000000
mean 0.684515 0.793592
std 0.286360 0.340243
min 0.006780 0.005525
2% 0.102864 0.018270
5% 0.140361 0.036748
10% 0.218346 0.080691
50% 0.824475 0.961409
max 0.989595 1.000000
f1_thr_boiler_pipe_ArticleExtractor \
count 108.000000
mean 0.721266
std 0.357463
min 0.006138
2% 0.011561
5% 0.029779
10% 0.062218
50% 0.909680
max 0.997525
f1_thr_boiler_pipe_DefaultExtractor
count 108.000000
mean 0.690878
std 0.292937
min 0.007220
2% 0.048827
5% 0.112937
10% 0.187165
50% 0.833973
max 0.998452
In [78]:
regional = { 2453107 }
print "region / pew knight study / 245107 "
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, regional ) )
print_results_by_measurement_type( df )
ap_english_us_top_25 = { 8875027 }
print "ap_english_us_top25 / 8875027 "
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, ap_english_us_top_25 ) )
print_results_by_measurement_type( df )
political_blogs = { 125 }
print "political blogs / 125"
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, political_blogs ) )
print_results_by_measurement_type( df )
russian = { 7796878 }
print 'russian'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, russian ) )
print_results_by_measurement_type( df )
print 'brazil'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, {8877968, 8877969, 8877973, 8877970 } ) )
print_results_by_measurement_type( df )
arabic = { 8878255 }
print 'arabic'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( non_spidered_downloads, arabic ) )
print_results_by_measurement_type( df )
region / pew knight study / 2453107
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 99 99.000000
mean 1 0.730497
std 0 0.254126
min 1 0.021638
2% 1 0.058144
5% 1 0.191812
10% 1 0.339346
50% 1 0.855634
max 1 0.977451
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 99.000000
mean 0.003127
std 0.031108
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.309524
precision_py_boiler_pipe_CanolaExtractor \
count 99.000000
mean 0.603325
std 0.220402
min 0.068873
2% 0.178570
5% 0.195998
10% 0.239692
50% 0.653451
max 0.911409
precision_py_boiler_pipe_DefaultExtractor \
count 99.000000
mean 0.669334
std 0.210591
min 0.087109
2% 0.142523
5% 0.287539
10% 0.334865
50% 0.709030
max 0.951528
precision_py_boiler_pipe_KeepEverythingExtractor \
count 99.000000
mean 0.402181
std 0.204563
min 0.058161
2% 0.076351
5% 0.091449
10% 0.112205
50% 0.371308
max 0.773504
precision_py_boiler_pipe_LargestContentExtractor \
count 99.000000
mean 0.786516
std 0.253222
min 0.033708
2% 0.035928
5% 0.052695
10% 0.401654
50% 0.882629
max 0.977451
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 99.000000
mean 0.645020
std 0.216599
min 0.075733
2% 0.189406
5% 0.257510
10% 0.300302
50% 0.690299
max 0.963362
precision_python_readibilty \
count 99.000000
mean 0.926673
std 0.138609
min 0.143822
2% 0.438315
5% 0.614948
10% 0.841220
50% 0.968974
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 99.000000
mean 0.638565
std 0.325689
min 0.022250
2% 0.026711
5% 0.042044
10% 0.061152
50% 0.753870
max 0.989749
precision_thr_boiler_pipe_DefaultExtractor
count 99.000000
mean 0.610429
std 0.249978
min 0.087406
2% 0.097743
5% 0.187686
10% 0.236001
50% 0.617523
max 0.980851
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 99 99.000000
mean 1 0.848680
std 0 0.193062
min 1 0.067901
2% 1 0.097876
5% 1 0.409609
10% 1 0.676557
50% 1 0.914703
max 1 0.993341
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 99.000000
mean 0.000207
std 0.002058
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.020472
recall_py_boiler_pipe_CanolaExtractor \
count 99.000000
mean 0.844213
std 0.108269
min 0.422374
2% 0.542033
5% 0.593156
10% 0.705541
50% 0.882083
max 0.987791
recall_py_boiler_pipe_DefaultExtractor \
count 99.000000
mean 0.860696
std 0.121499
min 0.253968
2% 0.517735
5% 0.579318
10% 0.710183
50% 0.895105
max 0.986681
recall_py_boiler_pipe_KeepEverythingExtractor \
count 99.000000
mean 0.924781
std 0.036528
min 0.738562
2% 0.849661
5% 0.866116
10% 0.888651
50% 0.933036
max 0.993341
recall_py_boiler_pipe_LargestContentExtractor \
count 99.000000
mean 0.728798
std 0.274766
min 0.041379
2% 0.047393
5% 0.069693
10% 0.226484
50% 0.854911
max 0.985572
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 99.000000 99.000000
mean 0.876498 0.915441
std 0.089291 0.128770
min 0.448276 0.077586
2% 0.600374 0.640930
5% 0.697441 0.685469
10% 0.789726 0.847998
50% 0.904639 0.946387
max 0.986681 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 99.000000
mean 0.813821
std 0.341946
min 0.027972
2% 0.056410
5% 0.079944
10% 0.092131
50% 0.993174
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 99.000000
mean 0.942651
std 0.115500
min 0.285714
2% 0.510198
5% 0.778831
10% 0.860101
50% 0.985882
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 99 99.000000
mean 1 0.765140
std 0 0.219826
min 1 0.035176
2% 1 0.065534
5% 1 0.311998
10% 1 0.488654
50% 1 0.868217
max 1 0.971837
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 99.000000
mean 0.000388
std 0.003860
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.038405
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 99.000000 99.000000
mean 0.678045 0.733985
std 0.188601 0.177721
min 0.127827 0.159309
2% 0.293293 0.223785
5% 0.311104 0.428092
10% 0.366320 0.484591
50% 0.717622 0.791304
max 0.933186 0.959617
f1_py_boiler_pipe_KeepEverythingExtractor \
count 99.000000
mean 0.530552
std 0.210729
min 0.109541
2% 0.140531
5% 0.166162
10% 0.200544
50% 0.525896
max 0.851478
f1_py_boiler_pipe_LargestContentExtractor \
count 99.000000
mean 0.746387
std 0.266300
min 0.038462
2% 0.039443
5% 0.059673
10% 0.251082
50% 0.866142
max 0.979592
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 99.000000 99.000000
mean 0.720843 0.908288
std 0.179827 0.140196
min 0.140069 0.136364
2% 0.315054 0.384693
5% 0.396596 0.715988
10% 0.441076 0.794279
50% 0.766467 0.950276
max 0.960774 1.000000
f1_thr_boiler_pipe_ArticleExtractor \
count 99.000000
mean 0.693991
std 0.325722
min 0.025532
2% 0.038306
5% 0.056527
10% 0.070245
50% 0.830508
max 0.992573
f1_thr_boiler_pipe_DefaultExtractor
count 99.000000
mean 0.709473
std 0.212386
min 0.143426
2% 0.174606
5% 0.290379
10% 0.376066
50% 0.748092
max 0.988210
ap_english_us_top25 / 8875027
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 99 99.000000
mean 1 0.730497
std 0 0.254126
min 1 0.021638
2% 1 0.058144
5% 1 0.191812
10% 1 0.339346
50% 1 0.855634
max 1 0.977451
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 99.000000
mean 0.003127
std 0.031108
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.309524
precision_py_boiler_pipe_CanolaExtractor \
count 99.000000
mean 0.603325
std 0.220402
min 0.068873
2% 0.178570
5% 0.195998
10% 0.239692
50% 0.653451
max 0.911409
precision_py_boiler_pipe_DefaultExtractor \
count 99.000000
mean 0.669334
std 0.210591
min 0.087109
2% 0.142523
5% 0.287539
10% 0.334865
50% 0.709030
max 0.951528
precision_py_boiler_pipe_KeepEverythingExtractor \
count 99.000000
mean 0.402181
std 0.204563
min 0.058161
2% 0.076351
5% 0.091449
10% 0.112205
50% 0.371308
max 0.773504
precision_py_boiler_pipe_LargestContentExtractor \
count 99.000000
mean 0.786516
std 0.253222
min 0.033708
2% 0.035928
5% 0.052695
10% 0.401654
50% 0.882629
max 0.977451
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 99.000000
mean 0.645020
std 0.216599
min 0.075733
2% 0.189406
5% 0.257510
10% 0.300302
50% 0.690299
max 0.963362
precision_python_readibilty \
count 99.000000
mean 0.926673
std 0.138609
min 0.143822
2% 0.438315
5% 0.614948
10% 0.841220
50% 0.968974
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 99.000000
mean 0.638565
std 0.325689
min 0.022250
2% 0.026711
5% 0.042044
10% 0.061152
50% 0.753870
max 0.989749
precision_thr_boiler_pipe_DefaultExtractor
count 99.000000
mean 0.610429
std 0.249978
min 0.087406
2% 0.097743
5% 0.187686
10% 0.236001
50% 0.617523
max 0.980851
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 99 99.000000
mean 1 0.848680
std 0 0.193062
min 1 0.067901
2% 1 0.097876
5% 1 0.409609
10% 1 0.676557
50% 1 0.914703
max 1 0.993341
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 99.000000
mean 0.000207
std 0.002058
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.020472
recall_py_boiler_pipe_CanolaExtractor \
count 99.000000
mean 0.844213
std 0.108269
min 0.422374
2% 0.542033
5% 0.593156
10% 0.705541
50% 0.882083
max 0.987791
recall_py_boiler_pipe_DefaultExtractor \
count 99.000000
mean 0.860696
std 0.121499
min 0.253968
2% 0.517735
5% 0.579318
10% 0.710183
50% 0.895105
max 0.986681
recall_py_boiler_pipe_KeepEverythingExtractor \
count 99.000000
mean 0.924781
std 0.036528
min 0.738562
2% 0.849661
5% 0.866116
10% 0.888651
50% 0.933036
max 0.993341
recall_py_boiler_pipe_LargestContentExtractor \
count 99.000000
mean 0.728798
std 0.274766
min 0.041379
2% 0.047393
5% 0.069693
10% 0.226484
50% 0.854911
max 0.985572
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 99.000000 99.000000
mean 0.876498 0.915441
std 0.089291 0.128770
min 0.448276 0.077586
2% 0.600374 0.640930
5% 0.697441 0.685469
10% 0.789726 0.847998
50% 0.904639 0.946387
max 0.986681 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 99.000000
mean 0.813821
std 0.341946
min 0.027972
2% 0.056410
5% 0.079944
10% 0.092131
50% 0.993174
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 99.000000
mean 0.942651
std 0.115500
min 0.285714
2% 0.510198
5% 0.778831
10% 0.860101
50% 0.985882
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 99 99.000000
mean 1 0.765140
std 0 0.219826
min 1 0.035176
2% 1 0.065534
5% 1 0.311998
10% 1 0.488654
50% 1 0.868217
max 1 0.971837
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 99.000000
mean 0.000388
std 0.003860
min 0.000000
2% 0.000000
5% 0.000000
10% 0.000000
50% 0.000000
max 0.038405
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 99.000000 99.000000
mean 0.678045 0.733985
std 0.188601 0.177721
min 0.127827 0.159309
2% 0.293293 0.223785
5% 0.311104 0.428092
10% 0.366320 0.484591
50% 0.717622 0.791304
max 0.933186 0.959617
f1_py_boiler_pipe_KeepEverythingExtractor \
count 99.000000
mean 0.530552
std 0.210729
min 0.109541
2% 0.140531
5% 0.166162
10% 0.200544
50% 0.525896
max 0.851478
f1_py_boiler_pipe_LargestContentExtractor \
count 99.000000
mean 0.746387
std 0.266300
min 0.038462
2% 0.039443
5% 0.059673
10% 0.251082
50% 0.866142
max 0.979592
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 99.000000 99.000000
mean 0.720843 0.908288
std 0.179827 0.140196
min 0.140069 0.136364
2% 0.315054 0.384693
5% 0.396596 0.715988
10% 0.441076 0.794279
50% 0.766467 0.950276
max 0.960774 1.000000
f1_thr_boiler_pipe_ArticleExtractor \
count 99.000000
mean 0.693991
std 0.325722
min 0.025532
2% 0.038306
5% 0.056527
10% 0.070245
50% 0.830508
max 0.992573
f1_thr_boiler_pipe_DefaultExtractor
count 99.000000
mean 0.709473
std 0.212386
min 0.143426
2% 0.174606
5% 0.290379
10% 0.376066
50% 0.748092
max 0.988210
political blogs / 125
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 150 150.000000
mean 1 0.703533
std 0 0.360671
min 1 0.000000
2% 1 0.000000
5% 1 0.003984
10% 1 0.011152
50% 1 0.898303
max 1 0.988089
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 150
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
precision_py_boiler_pipe_CanolaExtractor \
count 150.000000
mean 0.515234
std 0.341081
min 0.000000
2% 0.000000
5% 0.003016
10% 0.014508
50% 0.555674
max 0.973795
precision_py_boiler_pipe_DefaultExtractor \
count 150.000000
mean 0.522629
std 0.342647
min 0.000000
2% 0.000000
5% 0.000000
10% 0.005803
50% 0.599262
max 0.973963
precision_py_boiler_pipe_KeepEverythingExtractor \
count 150.000000
mean 0.369914
std 0.278387
min 0.002865
2% 0.007767
5% 0.015080
10% 0.023499
50% 0.298086
max 0.913591
precision_py_boiler_pipe_LargestContentExtractor \
count 150.000000
mean 0.677239
std 0.383520
min 0.000000
2% 0.000000
5% 0.004188
10% 0.012684
50% 0.892375
max 0.978162
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 150.000000
mean 0.572627
std 0.333536
min 0.000000
2% 0.000000
5% 0.002471
10% 0.012913
50% 0.681146
max 0.973301
precision_python_readibilty \
count 150.000000
mean 0.880689
std 0.241102
min 0.013793
2% 0.030579
5% 0.186217
10% 0.599048
50% 0.985371
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 150.000000
mean 0.616935
std 0.421249
min 0.003302
2% 0.005500
5% 0.008824
10% 0.014676
50% 0.906002
max 0.997970
precision_thr_boiler_pipe_DefaultExtractor
count 150.000000
mean 0.484702
std 0.325502
min 0.003189
2% 0.005959
5% 0.012064
10% 0.029434
50% 0.455373
max 0.997809
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 150 150.000000
mean 1 0.756760
std 0 0.342598
min 1 0.000000
2% 1 0.000000
5% 1 0.023901
10% 1 0.051914
50% 1 0.928193
max 1 0.991501
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 150
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
recall_py_boiler_pipe_CanolaExtractor \
count 150.000000
mean 0.775613
std 0.281738
min 0.000000
2% 0.000000
5% 0.072667
10% 0.266144
50% 0.898780
max 0.995704
recall_py_boiler_pipe_DefaultExtractor \
count 150.000000
mean 0.715632
std 0.333288
min 0.000000
2% 0.000000
5% 0.000000
10% 0.077706
50% 0.880511
max 0.991501
recall_py_boiler_pipe_KeepEverythingExtractor \
count 150.000000
mean 0.911380
std 0.109811
min 0.050909
2% 0.689231
5% 0.722857
10% 0.779471
50% 0.946547
max 0.998389
recall_py_boiler_pipe_LargestContentExtractor \
count 150.000000
mean 0.660467
std 0.372133
min 0.000000
2% 0.000000
5% 0.011912
10% 0.034427
50% 0.879248
max 0.991501
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 150.000000 150.000000
mean 0.781152 0.807832
std 0.280683 0.313501
min 0.000000 0.020690
2% 0.000000 0.035595
5% 0.084909 0.069238
10% 0.234492 0.166667
50% 0.905032 0.970968
max 0.991501 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 150.000000
mean 0.769863
std 0.359786
min 0.010331
2% 0.023252
5% 0.036122
10% 0.069009
50% 0.976322
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 150.000000
mean 0.815129
std 0.282623
min 0.003704
2% 0.031039
5% 0.131712
10% 0.301923
50% 0.962453
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 150 150.000000
mean 1 0.718112
std 0 0.358481
min 1 0.000000
2% 1 0.000000
5% 1 0.006798
10% 1 0.017960
50% 1 0.912683
max 1 0.984093
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 150
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 150.000000 150.000000
mean 0.577217 0.568006
std 0.330633 0.345523
min 0.000000 0.000000
2% 0.000000 0.000000
5% 0.005862 0.000000
10% 0.027886 0.011238
50% 0.674047 0.682651
max 0.969320 0.971197
f1_py_boiler_pipe_KeepEverythingExtractor \
count 150.000000
mean 0.472859
std 0.292861
min 0.005706
2% 0.015378
5% 0.027311
10% 0.045693
50% 0.453552
max 0.944083
f1_py_boiler_pipe_LargestContentExtractor \
count 150.000000
mean 0.659167
std 0.376714
min 0.000000
2% 0.000000
5% 0.007661
10% 0.020191
50% 0.867735
max 0.980839
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 150.000000 150.000000
mean 0.627502 0.807779
std 0.324330 0.304467
min 0.000000 0.026549
2% 0.000000 0.034729
5% 0.004894 0.072607
10% 0.025091 0.165152
50% 0.756476 0.963556
max 0.973301 1.000000
f1_thr_boiler_pipe_ArticleExtractor \
count 150.000000
mean 0.635670
std 0.419068
min 0.006577
2% 0.009602
5% 0.012090
10% 0.020877
50% 0.933609
max 0.998469
f1_thr_boiler_pipe_DefaultExtractor
count 150.000000
mean 0.530499
std 0.330138
min 0.006349
2% 0.010201
5% 0.020156
10% 0.042055
50% 0.580317
max 0.994242
russian
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 132 132.000000
mean 1 0.718432
std 0 0.252618
min 1 0.012500
2% 1 0.051124
5% 1 0.185462
10% 1 0.340725
50% 1 0.835164
max 1 0.982402
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 132
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
precision_py_boiler_pipe_CanolaExtractor \
count 132.000000
mean 0.488699
std 0.250472
min 0.054201
2% 0.078920
5% 0.114551
10% 0.175987
50% 0.441274
max 0.925352
precision_py_boiler_pipe_DefaultExtractor \
count 132.000000
mean 0.610990
std 0.258515
min 0.022857
2% 0.112440
5% 0.165727
10% 0.247548
50% 0.665667
max 0.981651
precision_py_boiler_pipe_KeepEverythingExtractor \
count 132.000000
mean 0.188878
std 0.142571
min 0.010258
2% 0.033324
5% 0.039598
10% 0.048948
50% 0.148603
max 0.835991
precision_py_boiler_pipe_LargestContentExtractor \
count 132.000000
mean 0.702744
std 0.288666
min 0.012500
2% 0.044430
5% 0.084635
10% 0.158502
50% 0.835874
max 0.982402
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 132.000000
mean 0.614402
std 0.262220
min 0.067548
2% 0.117832
5% 0.162064
10% 0.238983
50% 0.711282
max 0.982402
precision_python_readibilty \
count 132.000000
mean 0.946815
std 0.139890
min 0.018519
2% 0.503566
5% 0.842260
10% 0.904014
50% 0.980456
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 132.000000
mean 0.234777
std 0.370754
min 0.000000
2% 0.000000
5% 0.000000
10% 0.003215
50% 0.043435
max 1.000000
precision_thr_boiler_pipe_DefaultExtractor
count 132.000000
mean 0.220128
std 0.347734
min 0.001456
2% 0.003306
5% 0.004450
10% 0.006159
50% 0.030528
max 1.000000
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 132 132.000000
mean 1 0.818267
std 0 0.220713
min 1 0.032882
2% 1 0.092117
5% 1 0.201786
10% 1 0.581737
50% 1 0.898746
max 1 1.000000
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 132
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
recall_py_boiler_pipe_CanolaExtractor \
count 132.000000
mean 0.804646
std 0.146584
min 0.297297
2% 0.412065
5% 0.503986
10% 0.611248
50% 0.845416
max 0.993498
recall_py_boiler_pipe_DefaultExtractor \
count 132.000000
mean 0.789282
std 0.184720
min 0.037736
2% 0.222724
5% 0.390545
10% 0.594575
50% 0.840412
max 0.996778
recall_py_boiler_pipe_KeepEverythingExtractor \
count 132.000000
mean 0.913321
std 0.078472
min 0.381356
2% 0.785777
5% 0.833346
10% 0.851894
50% 0.921185
max 1.000000
recall_py_boiler_pipe_LargestContentExtractor \
count 132.000000
mean 0.717450
std 0.302408
min 0.010778
2% 0.035458
5% 0.085721
10% 0.127878
50% 0.875702
max 0.997409
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 132.000000 132.000000
mean 0.851578 0.879423
std 0.120377 0.194934
min 0.310811 0.020725
2% 0.476644 0.057854
5% 0.608980 0.655007
10% 0.714950 0.754755
50% 0.866012 0.937048
max 0.997852 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 132.000000
mean 0.237385
std 0.376244
min 0.000000
2% 0.000000
5% 0.000000
10% 0.007983
50% 0.051471
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 132.000000
mean 0.301652
std 0.406205
min 0.005319
2% 0.009765
5% 0.014811
10% 0.017790
50% 0.058824
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 132 132.000000
mean 1 0.749676
std 0 0.227920
min 1 0.018114
2% 1 0.063775
5% 1 0.187938
10% 1 0.478066
50% 1 0.818793
max 1 0.987313
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 132
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 132.000000 132.000000
mean 0.573987 0.661798
std 0.222509 0.225256
min 0.092437 0.036866
2% 0.137648 0.101141
5% 0.183537 0.253035
10% 0.280725 0.340049
50% 0.570221 0.715686
max 0.920245 0.986933
f1_py_boiler_pipe_KeepEverythingExtractor \
count 132.000000
mean 0.293188
std 0.173252
min 0.020263
2% 0.062051
5% 0.075902
10% 0.088069
50% 0.254443
max 0.878564
f1_py_boiler_pipe_LargestContentExtractor \
count 132.000000
mean 0.692104
std 0.285086
min 0.012821
2% 0.033738
5% 0.075729
10% 0.135805
50% 0.802698
max 0.987313
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 132.000000 132.000000
mean 0.684560 0.899931
std 0.222306 0.184317
min 0.110977 0.026781
2% 0.194573 0.098648
5% 0.268575 0.778182
10% 0.365804 0.843508
50% 0.768441 0.944767
max 0.987313 0.999427
f1_thr_boiler_pipe_ArticleExtractor \
count 132.000000
mean 0.231709
std 0.371334
min 0.000000
2% 0.000000
5% 0.000000
10% 0.005065
50% 0.047168
max 1.000000
f1_thr_boiler_pipe_DefaultExtractor
count 132.000000
mean 0.241120
std 0.357583
min 0.002375
2% 0.005276
5% 0.006912
10% 0.009496
50% 0.039170
max 0.997347
brazil
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 5 5.000000
mean 1 0.801957
std 0 0.153987
min 1 0.538341
2% 1 0.561037
5% 1 0.595080
10% 1 0.651818
50% 1 0.828571
max 1 0.929329
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 5
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
precision_py_boiler_pipe_CanolaExtractor \
count 5.000000
mean 0.664670
std 0.273746
min 0.180902
2% 0.224373
5% 0.289580
10% 0.398258
50% 0.767857
max 0.826772
precision_py_boiler_pipe_DefaultExtractor \
count 5.000000
mean 0.719327
std 0.319628
min 0.157421
2% 0.207379
5% 0.282316
10% 0.407210
50% 0.840909
max 0.946043
precision_py_boiler_pipe_KeepEverythingExtractor \
count 5.000000
mean 0.284243
std 0.107933
min 0.118172
2% 0.128293
5% 0.143474
10% 0.168776
50% 0.309410
max 0.390855
precision_py_boiler_pipe_LargestContentExtractor \
count 5.000000
mean 0.888106
std 0.039403
min 0.820312
2% 0.826008
5% 0.834552
10% 0.848791
50% 0.902326
max 0.921622
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 5.000000
mean 0.734233
std 0.323688
min 0.161388
2% 0.215733
5% 0.297252
10% 0.433116
50% 0.840909
max 0.956124
precision_python_readibilty \
count 5.000000
mean 0.962387
std 0.030849
min 0.933014
2% 0.933860
5% 0.935129
10% 0.937245
50% 0.944000
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 5.000000
mean 0.742769
std 0.388031
min 0.050514
2% 0.116920
5% 0.216530
10% 0.382547
50% 0.901408
max 0.957516
precision_thr_boiler_pipe_DefaultExtractor
count 5.000000
mean 0.640169
std 0.355717
min 0.037340
2% 0.087591
5% 0.162968
10% 0.288596
50% 0.690391
max 0.920415
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 5 5.000000
mean 1 0.979646
std 0 0.009266
min 1 0.969231
2% 1 0.969331
5% 1 0.969481
10% 1 0.969730
50% 1 0.983051
max 1 0.989796
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 5
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
recall_py_boiler_pipe_CanolaExtractor \
count 5.000000
mean 0.914842
std 0.039027
min 0.882051
2% 0.882674
5% 0.883607
10% 0.885163
50% 0.896679
max 0.977077
recall_py_boiler_pipe_DefaultExtractor \
count 5.000000
mean 0.949445
std 0.028846
min 0.902579
2% 0.905627
5% 0.910199
10% 0.917818
50% 0.964103
max 0.970480
recall_py_boiler_pipe_KeepEverythingExtractor \
count 5.000000
mean 0.978402
std 0.010026
min 0.964103
2% 0.964940
5% 0.966197
10% 0.968292
50% 0.977860
max 0.989796
recall_py_boiler_pipe_LargestContentExtractor \
count 5.000000
mean 0.870353
std 0.196538
min 0.525830
2% 0.554950
5% 0.598630
10% 0.671430
50% 0.969231
max 0.989796
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 5.000000 5.000000
mean 0.947674 0.979889
std 0.020087 0.025059
min 0.919771 0.938776
2% 0.921443 0.941902
5% 0.923952 0.946592
10% 0.928134 0.954409
50% 0.943590 0.982808
max 0.969388 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 5.000000
mean 0.736822
std 0.372712
min 0.169054
2% 0.198777
5% 0.243362
10% 0.317669
50% 0.979592
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 5.000000
mean 0.822777
std 0.357876
min 0.183381
2% 0.245037
5% 0.337521
10% 0.491661
50% 0.981550
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 5 5.000000
mean 1 0.874387
std 0 0.101817
min 1 0.696356
2% 1 0.712500
5% 1 0.736715
10% 1 0.777073
50% 1 0.899225
max 1 0.949458
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 5
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 5.000000 5.000000
mean 0.731530 0.778927
std 0.239962 0.287640
min 0.305282 0.268085
2% 0.344965 0.315887
5% 0.404489 0.387589
10% 0.503697 0.507093
50% 0.821002 0.888000
max 0.872902 0.958106
f1_py_boiler_pipe_KeepEverythingExtractor \
count 5.000000
mean 0.430870
std 0.138002
min 0.211043
2% 0.225452
5% 0.247066
10% 0.283088
50% 0.471446
max 0.558483
f1_py_boiler_pipe_LargestContentExtractor \
count 5.000000
mean 0.868019
std 0.119717
min 0.665111
2% 0.680195
5% 0.702820
10% 0.740530
50% 0.928747
max 0.948540
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 5.000000 5.000000
mean 0.785997 0.970715
std 0.287227 0.019337
min 0.274594 0.941176
2% 0.323666 0.943110
5% 0.397275 0.946010
10% 0.519956 0.950844
50% 0.900474 0.971193
max 0.960514 0.988806
f1_thr_boiler_pipe_ArticleExtractor \
count 5.000000
mean 0.720446
std 0.375738
min 0.077785
2% 0.126845
5% 0.200436
10% 0.323086
50% 0.936508
max 0.958025
f1_thr_boiler_pipe_DefaultExtractor
count 5.000000
mean 0.710296
std 0.369795
min 0.062046
2% 0.119807
5% 0.206450
10% 0.350854
50% 0.815126
max 0.950000
arabic
precision_gold precision_py_boiler_pipe_ArticleExtractor \
count 117 117.000000
mean 1 0.741376
std 0 0.252984
min 1 0.010726
2% 1 0.022730
5% 1 0.060870
10% 1 0.330385
50% 1 0.828947
max 1 0.980379
precision_py_boiler_pipe_ArticleSentencesExtractor \
count 117
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
precision_py_boiler_pipe_CanolaExtractor \
count 117.000000
mean 0.588046
std 0.253096
min 0.001637
2% 0.027450
5% 0.159398
10% 0.235953
50% 0.638584
max 0.976285
precision_py_boiler_pipe_DefaultExtractor \
count 117.000000
mean 0.657585
std 0.245890
min 0.005348
2% 0.058647
5% 0.155817
10% 0.283340
50% 0.736264
max 0.978873
precision_py_boiler_pipe_KeepEverythingExtractor \
count 117.000000
mean 0.336254
std 0.219627
min 0.001162
2% 0.027610
5% 0.079325
10% 0.100802
50% 0.305410
max 0.910245
precision_py_boiler_pipe_LargestContentExtractor \
count 117.000000
mean 0.755409
std 0.255409
min 0.000000
2% 0.014231
5% 0.023593
10% 0.490368
50% 0.837349
max 0.980232
precision_py_boiler_pipe_NumWordsRulesExtractor \
count 117.000000
mean 0.711242
std 0.224088
min 0.013514
2% 0.058882
5% 0.280358
10% 0.358444
50% 0.785714
max 0.978873
precision_python_readibilty \
count 117.000000
mean 0.929890
std 0.134994
min 0.241379
2% 0.414138
5% 0.666667
10% 0.857168
50% 0.980952
max 1.000000
precision_thr_boiler_pipe_ArticleExtractor \
count 117.000000
mean 0.756990
std 0.297713
min 0.011182
2% 0.027757
5% 0.043883
10% 0.130529
50% 0.881720
max 0.997958
precision_thr_boiler_pipe_DefaultExtractor
count 117.000000
mean 0.674588
std 0.298915
min 0.000000
2% 0.033720
5% 0.120281
10% 0.236147
50% 0.802956
max 1.000000
recall_gold recall_py_boiler_pipe_ArticleExtractor \
count 117 117.000000
mean 1 0.839014
std 0 0.255579
min 1 0.012346
2% 1 0.024636
5% 1 0.037730
10% 1 0.632387
50% 1 0.936508
max 1 1.000000
recall_py_boiler_pipe_ArticleSentencesExtractor \
count 117
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
recall_py_boiler_pipe_CanolaExtractor \
count 117.000000
mean 0.766722
std 0.208122
min 0.012346
2% 0.139507
5% 0.195693
10% 0.552758
50% 0.825397
max 0.997429
recall_py_boiler_pipe_DefaultExtractor \
count 117.000000
mean 0.701666
std 0.293111
min 0.012346
2% 0.026452
5% 0.045974
10% 0.126923
50% 0.819788
max 0.997429
recall_py_boiler_pipe_KeepEverythingExtractor \
count 117.000000
mean 0.922749
std 0.133383
min 0.007463
2% 0.590000
5% 0.830303
10% 0.874678
50% 0.952000
max 1.000000
recall_py_boiler_pipe_LargestContentExtractor \
count 117.000000
mean 0.768446
std 0.282090
min 0.000000
2% 0.020848
5% 0.032891
10% 0.138356
50% 0.868613
max 0.994859
recall_py_boiler_pipe_NumWordsRulesExtractor recall_python_readibilty \
count 117.000000 117.000000
mean 0.801901 0.830034
std 0.193662 0.255519
min 0.012346 0.012255
2% 0.112381 0.041673
5% 0.398237 0.093980
10% 0.607539 0.509681
50% 0.865079 0.921986
max 0.997429 1.000000
recall_thr_boiler_pipe_ArticleExtractor \
count 117.000000
mean 0.842179
std 0.314171
min 0.023932
2% 0.040051
5% 0.062071
10% 0.151961
50% 1.000000
max 1.000000
recall_thr_boiler_pipe_DefaultExtractor
count 117.000000
mean 0.759565
std 0.347585
min 0.000000
2% 0.032252
5% 0.060219
10% 0.117895
50% 0.966667
max 1.000000
f1_gold f1_py_boiler_pipe_ArticleExtractor \
count 117 117.000000
mean 1 0.774978
std 0 0.255278
min 1 0.014469
2% 1 0.028784
5% 1 0.036998
10% 1 0.405667
50% 1 0.874074
max 1 0.981043
f1_py_boiler_pipe_ArticleSentencesExtractor \
count 117
mean 0
std 0
min 0
2% 0
5% 0
10% 0
50% 0
max 0
f1_py_boiler_pipe_CanolaExtractor f1_py_boiler_pipe_DefaultExtractor \
count 117.000000 117.000000
mean 0.645211 0.655535
std 0.235679 0.267798
min 0.003236 0.010309
2% 0.024550 0.025866
5% 0.194915 0.068729
10% 0.296855 0.171742
50% 0.707692 0.765854
max 0.973399 0.971214
f1_py_boiler_pipe_KeepEverythingExtractor \
count 117.000000
mean 0.459616
std 0.228261
min 0.002321
2% 0.049776
5% 0.146173
10% 0.182303
50% 0.453368
max 0.946723
f1_py_boiler_pipe_LargestContentExtractor \
count 117.000000
mean 0.754560
std 0.268122
min 0.000000
2% 0.016508
5% 0.031962
10% 0.192451
50% 0.850000
max 0.980392
f1_py_boiler_pipe_NumWordsRulesExtractor f1_python_readibilty \
count 117.000000 117.000000
mean 0.741682 0.844823
std 0.209849 0.235643
min 0.014599 0.023585
2% 0.056917 0.079781
5% 0.366651 0.171765
10% 0.473788 0.558496
50% 0.820755 0.933902
max 0.971641 1.000000
f1_thr_boiler_pipe_ArticleExtractor \
count 117.000000
mean 0.776713
std 0.305971
min 0.015242
2% 0.032739
5% 0.051157
10% 0.145678
50% 0.927273
max 0.997361
f1_thr_boiler_pipe_DefaultExtractor
count 117.000000
mean 0.656418
std 0.314089
min 0.000000
2% 0.037984
5% 0.055188
10% 0.154126
50% 0.735484
max 0.998779
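The per-corpus blocks above are pandas describe() output over per-story precision, recall, and F1 columns, with extra 2%, 5%, and 10% percentiles to expose worst-case stories. Below is a minimal sketch of how such a summary can be produced; the comps list, the tag field, and the hand-written values are illustrative stand-ins rather than the notebook's actual variables.

import pandas as pd

# Illustrative per-story rows; in the notebook these come from the
# comp_extractors() results rather than being written out by hand.
comps = [
    { 'tag': 'russian', 'precision_python_readibilty': 0.97, 'recall_python_readibilty': 0.93 },
    { 'tag': 'russian', 'precision_python_readibilty': 0.88, 'recall_python_readibilty': 0.91 },
    { 'tag': 'brazil',  'precision_python_readibilty': 0.95, 'recall_python_readibilty': 0.98 },
]
df = pd.DataFrame( comps )

# One summary block per corpus tag; describe() always includes the 50% row.
for tag, group in df.groupby( 'tag' ):
    print tag
    print group.describe( percentiles=[ 0.02, 0.05, 0.10 ] )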
In [67]:
boiler_pipe_extractor_training_objects = cPickle.load( open( "boiler_pipe_google_news_extractor_training_objects.pickle", "rb" ) )
#eto = extractor_training_objects[ 0 ]
#eto.keys()
#print eto['expected_text']
#get_extraction_results( eto )
#comp_extractors ( eto )

# Run every extractor comparison over the boilerpipe / Google News training set,
# counting successes and failures so a single bad download does not abort the run.
comps_downloads_boiler_pipe = []
processed = 0
skipped = 0
start_time = datetime.datetime.now()
e = None
for extractor_training_object in boiler_pipe_extractor_training_objects[:]:
    try:
        res = comp_extractors( extractor_training_object )
        #print res
        comps_downloads_boiler_pipe.append( res )
        processed += 1
    except Exception as e:
        # Report the failing download and its traceback, then keep going.
        print "error on download{}".format( extractor_training_object[ 'downloads_id' ] )
        e = sys.exc_info()
        import traceback
        traceback.print_exc()
        print e
        #raise e
        skipped += 1
    print 'processed', processed, 'skipped', skipped
#extraction_results.append( er )
end_time = datetime.datetime.now()
print "Total_time", end_time - start_time
print "Time per download", ( end_time - start_time ) / ( processed + skipped )
res.keys()
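Most of the skips reported in the output below are not failures of text extraction itself: they occur inside goose's top-image pipeline, where PIL cannot identify a downloaded image file. When only cleaned text is needed, a possible workaround, assuming the installed python-goose release honours the enable_image_fetching configuration key, is to disable image fetching; the separate ValueError about Unicode strings with an encoding declaration would still need the raw content handed to goose as UTF-8 bytes rather than a decoded unicode string.

import goose

def extract_goose_text_only( raw_content ):
    # Hypothetical variant of the goose extraction used in this notebook;
    # enable_image_fetching is assumed to be supported by the installed python-goose.
    g = goose.Goose( { 'enable_image_fetching': False } )
    article = g.extract( raw_html=raw_content )
    return article.title + "\n\n" + article.cleaned_text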
processed 1 skipped 0
processed 2 skipped 0
processed 3 skipped 0
processed 4 skipped 0
processed 5 skipped 0
processed 6 skipped 0
processed 7 skipped 0
error on download045cb317-60ad-454d-add8-3baa40789258.html
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcdc70248>)
processed 7 skipped 1
processed 8 skipped 1
processed 9 skipped 1
processed 10 skipped 1
error on download065445c6-e5e0-4006-ba4b-31711c4a6a4b.html
Traceback (most recent call last):
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 52, in store_image
image = self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd4117a0>)
processed 10 skipped 2
processed 11 skipped 2
processed 12 skipped 2
processed 13 skipped 2
processed 14 skipped 2
processed 15 skipped 2
processed 16 skipped 2
processed 17 skipped 2
processed 18 skipped 2
processed 19 skipped 2
processed 20 skipped 2
error on download0acd5213-35e1-4039-adb5-6c7611911b9e.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 52, in store_image
image = self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de71a79e0>)
processed 20 skipped 3
processed 21 skipped 3
processed 22 skipped 3
processed 23 skipped 3
processed 24 skipped 3
processed 25 skipped 3
processed 26 skipped 3
processed 27 skipped 3
processed 28 skipped 3
error on download0fb4f846-1043-4578-8863-a4bf82dccb74.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4e0f04e3b0>)
processed 28 skipped 4
processed 29 skipped 4
processed 30 skipped 4
processed 31 skipped 4
processed 32 skipped 4
processed 33 skipped 4
processed 34 skipped 4
processed 35 skipped 4
processed 36 skipped 4
processed 37 skipped 4
processed 38 skipped 4
processed 39 skipped 4
processed 40 skipped 4
processed 41 skipped 4
processed 42 skipped 4
processed 43 skipped 4
processed 44 skipped 4
processed 45 skipped 4
processed 46 skipped 4
processed 47 skipped 4
processed 48 skipped 4
processed 49 skipped 4
processed 50 skipped 4
processed 51 skipped 4
processed 52 skipped 4
processed 53 skipped 4
processed 54 skipped 4
processed 55 skipped 4
processed 56 skipped 4
processed 57 skipped 4
processed 58 skipped 4
processed 59 skipped 4
processed 60 skipped 4
processed 61 skipped 4
processed 62 skipped 4
processed 63 skipped 4
processed 64 skipped 4
processed 65 skipped 4
processed 66 skipped 4
processed 67 skipped 4
processed 68 skipped 4
processed 69 skipped 4
processed 70 skipped 4
processed 71 skipped 4
processed 72 skipped 4
processed 73 skipped 4
processed 74 skipped 4
processed 75 skipped 4
processed 76 skipped 4
processed 77 skipped 4
processed 78 skipped 4
processed 79 skipped 4
processed 80 skipped 4
processed 81 skipped 4
processed 82 skipped 4
processed 83 skipped 4
processed 84 skipped 4
processed 85 skipped 4
processed 86 skipped 4
processed 87 skipped 4
processed 88 skipped 4
processed 89 skipped 4
processed 90 skipped 4
processed 91 skipped 4
processed 92 skipped 4
processed 93 skipped 4
processed 94 skipped 4
error on download2d25dd6c-5093-4be6-b801-d671416c2e61.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcdba0098>)
processed 94 skipped 5
processed 95 skipped 5
processed 96 skipped 5
processed 97 skipped 5
error on download2fc045e8-a8a6-4ae4-9269-a8892fe69085.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4dcdc7a248>)
processed 97 skipped 6
error on download2fd3440e-8cf1-422d-8b6b-0ec1082cbebb.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
doc = self.get_document(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
doc = self.parser.fromstring(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
self.doc = lxml.html.fromstring(html)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccd12098>)
processed 97 skipped 7
processed 98 skipped 7
processed 99 skipped 7
processed 100 skipped 7
processed 101 skipped 7
processed 102 skipped 7
processed 103 skipped 7
processed 104 skipped 7
processed 105 skipped 7
processed 106 skipped 7
error on download3164baec-188a-4116-9aed-a18041854535.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError("Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-fa.txt",), <traceback object at 0x7f4dcd474560>)
processed 106 skipped 8
processed 107 skipped 8
processed 108 skipped 8
processed 109 skipped 8
processed 110 skipped 8
processed 111 skipped 8
processed 112 skipped 8
processed 113 skipped 8
processed 114 skipped 8
processed 115 skipped 8
processed 116 skipped 8
processed 117 skipped 8
processed 118 skipped 8
processed 119 skipped 8
processed 120 skipped 8
processed 121 skipped 8
processed 122 skipped 8
processed 123 skipped 8
processed 124 skipped 8
processed 125 skipped 8
processed 126 skipped 8
processed 127 skipped 8
processed 128 skipped 8
processed 129 skipped 8
processed 130 skipped 8
processed 131 skipped 8
processed 132 skipped 8
processed 133 skipped 8
processed 134 skipped 8
processed 135 skipped 8
processed 136 skipped 8
processed 137 skipped 8
processed 138 skipped 8
error on download41f7673a-553d-4893-9a51-ef7bb0ce6293.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 131, in crawl
self.article.cleaned_text = self.formatter.get_formatted_text()
File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 66, in get_formatted_text
self.remove_fewwords_paragraphs()
File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 123, in remove_fewwords_paragraphs
stop_words = self.stopwords_class(language=self.get_language()).get_stopword_count(text)
File "/usr/local/lib/python2.7/dist-packages/goose/text.py", line 98, in __init__
self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines())
File "/usr/local/lib/python2.7/dist-packages/goose/utils/__init__.py", line 79, in loadResourceFile
raise IOError("Couldn't open file %s" % path)
IOError: Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-fa.txt
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccce5908>)
processed 138 skipped 9
error on download42abb137-8a90-47db-90e2-3f8013cb9ae8.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da2191b48>)
processed 138 skipped 10
processed 139 skipped 10
error on download43fb5dbe-6f8c-45f8-bd59-7b258df028fb.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4de618fc68>)
processed 139 skipped 11
processed 140 skipped 11
processed 141 skipped 11
processed 142 skipped 11
processed 143 skipped 11
processed 144 skipped 11
processed 145 skipped 11
processed 146 skipped 11
processed 147 skipped 11
processed 148 skipped 11
processed 149 skipped 11
processed 150 skipped 11
processed 151 skipped 11
processed 152 skipped 11
processed 153 skipped 11
processed 154 skipped 11
processed 155 skipped 11
processed 156 skipped 11
processed 157 skipped 11
processed 158 skipped 11
processed 159 skipped 11
processed 160 skipped 11
processed 161 skipped 11
processed 162 skipped 11
processed 163 skipped 11
processed 164 skipped 11
processed 165 skipped 11
processed 166 skipped 11
processed 167 skipped 11
processed 168 skipped 11
processed 169 skipped 11
processed 170 skipped 11
processed 171 skipped 11
processed 172 skipped 11
processed 173 skipped 11
error on download50f75f84-f64b-4d08-87dc-b351742a4e4b.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
doc = self.get_document(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
doc = self.parser.fromstring(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
self.doc = lxml.html.fromstring(html)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da21b62d8>)
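The other recurring failure is the ValueError shown above: goose passes raw_html straight to lxml.html.fromstring, and lxml refuses a Python unicode string that still carries an <?xml ... encoding=...?> declaration. A minimal sketch, assuming raw_content arrives as a unicode string, is to re-encode it to UTF-8 bytes before calling goose; the helper name is hypothetical and not defined elsewhere in the notebook.

import goose

# Sketch only (hypothetical variant of extract_with_python_goose): lxml wants
# bytes when the document declares its own encoding, so encode before parsing.
def extract_with_python_goose_bytes( raw_content ):
    if isinstance( raw_content, unicode ):
        raw_content = raw_content.encode( 'utf-8' )
    g = goose.Goose()
    r = g.extract( raw_html=raw_content )
    return r.title + "\n\n" + r.cleaned_text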
processed 173-226 skipped 12
error on download693c707d-eeab-4486-97d0-2f3d286d1a0e.html
IOError: cannot identify image file
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4da21aca28>)
processed 226-229 skipped 13
error on download69df3069-b6f2-4624-8f94-c9aa23298e3b.html
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4df95eea28>)
processed 229-230 skipped 14
error on download6bb2477c-bb70-4027-8851-d15a51cd9a49.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de72344d0>)
processed 230 skipped 15
error on download6bfd738f-ffe9-4303-948e-c8d75a2944e1.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de618f098>)
processed 230-236 skipped 16
error on download6e371db7-72dc-4e89-9bc5-2c259cf456e1.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccd0a950>)
processed 236-257 skipped 17
error on download756ff125-8afa-43d2-a2bb-f9e742109202.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da15cc2d8>)
processed 257-265 skipped 18
error on download79f28b11-26b6-4a20-8e9b-de1064b8bc01.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da21ac638>)
processed 265-293 skipped 19
error on download89fc8a0d-3841-4ffc-8366-18fa10cbae64.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcdb816c8>)
processed 293-313 skipped 20
error on download931bd5b0-b1c7-4026-b937-a2404e3d8891.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccd12ea8>)
processed 313-371 skipped 21
error on downloada8bbed64-c630-4e89-bfc3-795a7aebe780.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da15f0248>)
processed 371-391 skipped 22
error on downloadb09fe710-8e21-48bb-abf7-1cc842b668ea.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de604b7a0>)
processed 391-418 skipped 23
error on downloadba66beed-a665-4cce-8ad3-61f7e05f0c47.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd647d88>)
processed 418 skipped 24
error on downloadbaeee9ce-b4a7-4fdc-ba0c-f5400a0a76ef.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de71a7518>)
processed 418-450 skipped 25
error on downloadc6604e4c-239d-43df-9023-906bc4136622.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccceae18>)
processed 450-455 skipped 26
error on downloadc8a6ace8-c4e8-4d05-8e85-5e58284151bb.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd661c68>)
processed 455-462 skipped 27
error on downloadcbeeb77e-294d-4ac9-9139-ce587ec19626.html
IOError: cannot identify image file
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4df95ee0e0>)
processed 462 skipped 28
processed 463 skipped 28
processed 464 skipped 28
processed 465 skipped 28
processed 466 skipped 28
processed 467 skipped 28
processed 468 skipped 28
error on downloadceaaf8b9-4739-4250-9d9d-f868e68872fc.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4da2a1db48>)
processed 468 skipped 29
processed 469 skipped 29
error on downloadd00505ef-6e10-4616-88f0-d1165292c417.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
doc = self.get_document(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
doc = self.parser.fromstring(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
self.doc = lxml.html.fromstring(html)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError("Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-vi.txt",), <traceback object at 0x7f4de6585d88>)
processed 469 skipped 30
processed 470 skipped 30
processed 471 skipped 30
processed 472 skipped 30
processed 473 skipped 30
processed 474 skipped 30
processed 475 skipped 30
processed 476 skipped 30
processed 477 skipped 30
processed 478 skipped 30
error on downloadd35ceaa6-3fb5-4d60-bc6c-82f1a23bb59c.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 131, in crawl
self.article.cleaned_text = self.formatter.get_formatted_text()
File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 66, in get_formatted_text
self.remove_fewwords_paragraphs()
File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 123, in remove_fewwords_paragraphs
stop_words = self.stopwords_class(language=self.get_language()).get_stopword_count(text)
File "/usr/local/lib/python2.7/dist-packages/goose/text.py", line 98, in __init__
self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines())
File "/usr/local/lib/python2.7/dist-packages/goose/utils/__init__.py", line 79, in loadResourceFile
raise IOError("Couldn't open file %s" % path)
IOError: Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-vi.txt
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccce5f38>)
processed 478 skipped 31
processed 479 skipped 31
processed 480 skipped 31
processed 481 skipped 31
processed 482 skipped 31
processed 483 skipped 31
processed 484 skipped 31
processed 485 skipped 31
processed 486 skipped 31
processed 487 skipped 31
processed 488 skipped 31
processed 489 skipped 31
error on downloaddc9814d5-b1eb-4095-a183-05c3be64c537.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4dcd474dd0>)
processed 489 skipped 32
processed 490 skipped 32
processed 491 skipped 32
processed 492 skipped 32
processed 493 skipped 32
processed 494 skipped 32
processed 495 skipped 32
processed 496 skipped 32
processed 497 skipped 32
processed 498 skipped 32
processed 499 skipped 32
processed 500 skipped 32
processed 501 skipped 32
processed 502 skipped 32
processed 503 skipped 32
processed 504 skipped 32
processed 505 skipped 32
processed 506 skipped 32
processed 507 skipped 32
processed 508 skipped 32
processed 509 skipped 32
processed 510 skipped 32
processed 511 skipped 32
processed 512 skipped 32
processed 513 skipped 32
processed 514 skipped 32
processed 515 skipped 32
processed 516 skipped 32
processed 517 skipped 32
processed 518 skipped 32
processed 519 skipped 32
processed 520 skipped 32
processed 521 skipped 32
processed 522 skipped 32
processed 523 skipped 32
processed 524 skipped 32
processed 525 skipped 32
processed 526 skipped 32
processed 527 skipped 32
processed 528 skipped 32
processed 529 skipped 32
error on downloadeba86c40-98c9-4af1-bf0a-c26ce4db3536.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
doc = self.get_document(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
doc = self.parser.fromstring(raw_html)
File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
self.doc = lxml.html.fromstring(html)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd65ce60>)
processed 529 skipped 33
processed 530 skipped 33
processed 531 skipped 33
processed 532 skipped 33
processed 533 skipped 33
processed 534 skipped 33
processed 535 skipped 33
processed 536 skipped 33
processed 537 skipped 33
processed 538 skipped 33
processed 539 skipped 33
processed 540 skipped 33
processed 541 skipped 33
processed 542 skipped 33
processed 543 skipped 33
processed 544 skipped 33
processed 545 skipped 33
processed 546 skipped 33
processed 547 skipped 33
processed 548 skipped 33
processed 549 skipped 33
processed 550 skipped 33
processed 551 skipped 33
processed 552 skipped 33
processed 553 skipped 33
processed 554 skipped 33
processed 555 skipped 33
processed 556 skipped 33
processed 557 skipped 33
processed 558 skipped 33
processed 559 skipped 33
processed 560 skipped 33
processed 561 skipped 33
processed 562 skipped 33
processed 563 skipped 33
processed 564 skipped 33
processed 565 skipped 33
processed 566 skipped 33
processed 567 skipped 33
processed 568 skipped 33
processed 569 skipped 33
processed 570 skipped 33
processed 571 skipped 33
processed 572 skipped 33
processed 573 skipped 33
error on downloadfbdd64ab-0d00-4a1f-99ec-e21855eae2af.html
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dac691128>)
processed 573 skipped 34
processed 574 skipped 34
processed 575 skipped 34
processed 576 skipped 34
processed 577 skipped 34
processed 578 skipped 34
processed 579 skipped 34
processed 580 skipped 34
processed 581 skipped 34
processed 582 skipped 34
processed 583 skipped 34
processed 584 skipped 34
processed 585 skipped 34
processed 586 skipped 34
processed 587 skipped 34
Total_time 1:25:34.219244
Time per download 0:00:08.267663
File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
res = comp_extractors( extractor_training_object )
File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
extraction_results = get_extraction_results( eto )
File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
ret['py_goose'] = { 'extracted_html': extract_with_python_goose( raw_content ) }
File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
r = g.extract( raw_html=raw_content )
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
self.get_image()
File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
image = self.check_large_images(topNode, 0, 0)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
depth_obj.parent_depth, depth_obj.sibling_depth)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
good_images = self.get_image_candidates(node)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
good_images = self.get_images_bytesize_match(filtered_images)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
local_image = self.get_local_image(src)
File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
self.link_hash, src, self.config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
image = self.write_localfile(data, link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
return self.read_localfile(link_hash, src, config)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
image_details = self.get_image_dimensions(identify, local_image_name)
File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
image = Image.open(path)
File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
raise IOError("cannot identify image file")
IOError: cannot identify image file
Out[67]:
['justext',
'heur',
'media_id',
'gold',
'py_boiler_pipe_CanolaExtractor',
'downloads_id',
'py_boiler_pipe_DefaultExtractor',
'py_boiler_pipe_KeepEverythingExtractor',
'story_is_spidered',
'py_boiler_pipe_ArticleExtractor',
'crf',
'python_readibilty',
'py_boiler_pipe_NumWordsRulesExtractor',
'py_goose',
'py_boiler_pipe_ArticleSentencesExtractor',
'py_boiler_pipe_LargestContentExtractor',
'boiler_pipe']
In [69]:
df = get_data_frame_from_comparision_objects( comps_downloads_boiler_pipe )
print_results_by_measurement_type( df )
Summary statistics over 587 stories, by measurement type (percentiles at 2/5/10/50):

precision                                count  mean      std       min       2%        5%        10%       50%       max
boiler_pipe                                587  0.947119  0.103217  0.000000  0.682820  0.827871  0.892255  0.974522  1.000000
crf                                        587  0.731588  0.260810  0.000000  0.000000  0.100000  0.325697  0.822270  1.000000
gold                                       587  1.000000  0.000000  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000
heur                                       587  0.791331  0.236207  0.000000  0.000000  0.194313  0.499278  0.871486  1.000000
justext                                    587  0.832769  0.320992  0.000000  0.000000  0.000000  0.020766  0.990854  1.000000
py_boiler_pipe_ArticleExtractor            587  0.881723  0.105574  0.000000  0.660794  0.755209  0.804754  0.905172  0.990328
py_boiler_pipe_ArticleSentencesExtractor   587  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
py_boiler_pipe_CanolaExtractor             587  0.773246  0.187061  0.000000  0.174271  0.390601  0.507277  0.839002  0.987124
py_boiler_pipe_DefaultExtractor            587  0.817369  0.167634  0.000000  0.178499  0.472519  0.640371  0.871854  0.987685
py_boiler_pipe_KeepEverythingExtractor     587  0.562141  0.201950  0.000000  0.146591  0.212001  0.288187  0.579137  0.973542
py_boiler_pipe_LargestContentExtractor     587  0.874354  0.145622  0.000000  0.056011  0.754319  0.804667  0.907063  0.991265
py_boiler_pipe_NumWordsRulesExtractor      587  0.821370  0.161591  0.000000  0.264646  0.471592  0.632772  0.871508  0.987117
py_goose                                   587  0.918389  0.185575  0.000000  0.218827  0.489141  0.698180  0.990566  1.000000
python_readibilty                          587  0.918135  0.166788  0.000000  0.174974  0.671403  0.827754  0.967532  1.000000

recall                                   count  mean      std       min       2%        5%        10%       50%       max
boiler_pipe                                587  0.951461  0.110789  0.000000  0.583534  0.796251  0.899326  0.983333  1.000000
crf                                        587  0.864146  0.264172  0.000000  0.000000  0.012519  0.533212  0.956835  1.000000
gold                                       587  1.000000  0.000000  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000
heur                                       587  0.876037  0.202423  0.000000  0.000000  0.529092  0.796895  0.928230  1.000000
justext                                    587  0.797343  0.306030  0.000000  0.000000  0.000000  0.006784  0.932203  1.000000
py_boiler_pipe_ArticleExtractor            587  0.914903  0.112087  0.000000  0.574515  0.752301  0.851506  0.932927  1.000000
py_boiler_pipe_ArticleSentencesExtractor   587  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
py_boiler_pipe_CanolaExtractor             587  0.882045  0.106958  0.000000  0.611500  0.737168  0.792529  0.903988  0.999087
py_boiler_pipe_DefaultExtractor            587  0.885074  0.127238  0.000000  0.532425  0.699313  0.781537  0.911950  1.000000
py_boiler_pipe_KeepEverythingExtractor     587  0.934558  0.089161  0.000000  0.831792  0.869596  0.896111  0.940937  1.000000
py_boiler_pipe_LargestContentExtractor     587  0.858449  0.178183  0.000000  0.067892  0.494472  0.657884  0.916667  1.000000
py_boiler_pipe_NumWordsRulesExtractor      587  0.907176  0.101053  0.000000  0.645752  0.792025  0.847497  0.920863  1.000000
py_goose                                   587  0.881010  0.197481  0.000000  0.031562  0.512564  0.769361  0.938462  1.000000
python_readibilty                          587  0.901399  0.194292  0.000000  0.030122  0.658983  0.860628  0.947047  1.000000

f1                                       count  mean      std       min       2%        5%        10%       50%       max
boiler_pipe                                587  0.945981  0.103467  0.000000  0.696706  0.808139  0.881987  0.974648  1.000000
crf                                        587  0.773358  0.259278  0.000000  0.000000  0.018326  0.380801  0.875940  1.000000
gold                                       587  1.000000  0.000000  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000
heur                                       587  0.818156  0.216310  0.000000  0.000000  0.294773  0.617992  0.890909  0.999224
justext                                    587  0.801968  0.307921  0.000000  0.000000  0.000000  0.010730  0.938389  1.000000
py_boiler_pipe_ArticleExtractor            587  0.895349  0.103379  0.000000  0.684377  0.777131  0.824159  0.915521  0.992034
py_boiler_pipe_ArticleSentencesExtractor   587  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
py_boiler_pipe_CanolaExtractor             587  0.811977  0.154431  0.000000  0.291535  0.519562  0.626885  0.863076  0.987124
py_boiler_pipe_DefaultExtractor            587  0.840813  0.148167  0.000000  0.297843  0.599921  0.696688  0.886466  0.985744
py_boiler_pipe_KeepEverythingExtractor     587  0.681243  0.178612  0.000000  0.254176  0.341140  0.437733  0.715528  0.982999
py_boiler_pipe_LargestContentExtractor     587  0.861334  0.156952  0.000000  0.056688  0.622468  0.740102  0.904841  0.992034
py_boiler_pipe_NumWordsRulesExtractor      587  0.853158  0.136401  0.000000  0.405971  0.616763  0.716640  0.894185  0.986814
py_goose                                   587  0.884124  0.204271  0.000000  0.048493  0.506277  0.681772  0.953125  1.000000
python_readibilty                          587  0.894271  0.200252  0.000000  0.038057  0.407503  0.817409  0.949510  1.000000
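For reference, the precision/recall/F1 figures above score each extractor's output against the hand-annotated gold text for the same story. The notebook's own scoring functions are defined in earlier cells and may well work at the line or sentence level; the sketch below only illustrates the general idea with whitespace tokens and set overlap (an assumption, not the actual comparison code).

def precision_recall_f1( extracted_text, gold_text ):
    # Illustrative token-overlap scoring for one story (hypothetical helper,
    # not the notebook's comparison code).
    extracted = set( extracted_text.lower().split() )
    gold = set( gold_text.lower().split() )
    if not extracted or not gold:
        return 0.0, 0.0, 0.0
    overlap = len( extracted & gold )
    precision = overlap / float( len( extracted ) )
    recall = overlap / float( len( gold ) )
    if precision + recall == 0:
        return precision, recall, 0.0
    return precision, recall, 2 * precision * recall / ( precision + recall )

Two features of the table follow directly from this kind of scoring: the gold rows are identically 1 presumably because the gold standard is compared against itself, and the all-zero py_boiler_pipe_ArticleSentencesExtractor rows suggest that variant produced no comparable output in this run.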