Extractor evaluation

This notebook evaluates both Media Cloud's internal extractors and third-party FLOSS extractor libraries against a corpus of hand-annotated articles.

Readers may wish to skip to the results section at the end.

Set up / Methods


In [1]:
import cPickle
import os.path

# Load the Media Cloud API key from a pickle file in the user's home directory.
# The file is opened in binary mode (a pickle is a byte stream) and is closed
# deterministically by the `with` block.
# NOTE: unpickling can execute arbitrary code; only load a file you created yourself.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) as api_key_file:
    api_key = cPickle.load( api_key_file )

In [2]:
import cPickle
import os.path

# Write the in-memory api_key back to the same pickle file it is loaded from.
# NOTE(review): on a fresh kernel this only round-trips the value that was just
# loaded; it appears to exist so the key file can be (re)created after assigning
# api_key by hand -- confirm before relying on it. The file handle is never
# explicitly closed.
cPickle.dump( api_key, file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) )

In [3]:
import sys
sys.path.append('../../foreign_modules/python/')

In [4]:
# Key sent as 'key' to the locally hosted extractor endpoints (http://0:3000/...)
# used by the helper functions in this notebook.
# NOTE(review): this is a credential hard-coded in the notebook; consider loading
# it from the environment or a local file instead of committing it.
loc_key = 'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52'

In [5]:
import mediacloud, requests, csv, sys, os, json, cPickle

def get_download( downloads_id ):
    """Fetch a single download record from the Media Cloud API.

    Args:
        downloads_id: id of the download; converted with str() to build the URL.

    Returns:
        The first element of the JSON list returned by the endpoint.

    Raises:
        requests.HTTPError: if the API answers with an error status. Previously
            an error response surfaced later as a confusing JSON/index error.
    """
    # Pass the key through `params` (as the other API helpers here do) so it is
    # URL-encoded by requests instead of being concatenated into the URL.
    response = requests.get( 'https://api.mediacloud.org/api/v2/downloads/single/' + str( downloads_id ),
                             params={ 'key': api_key } )
    response.raise_for_status()
    return response.json()[0]

def extract_story( preprocessed_lines, title, description, extractor_method ):
    """Run one of the locally hosted Media Cloud extractors on preprocessed lines.

    The arguments are sent, together with loc_key, as a JSON body in a PUT
    request to the local 'extractlines/extract' endpoint.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    payload = json.dumps( {
        'key': loc_key,
        'preprocessed_lines': preprocessed_lines,
        'story_title': title,
        'story_description': description,
        'extractor_method': extractor_method,
    } )

    response = requests.put( 'http://0:3000/api/v2/extractlines/extract',
                             data=payload,
                             headers={ 'Content-type': 'application/json' } )
    response.raise_for_status()

    return response.json()

def get_story_lines( raw_content ):
    """PUT raw HTML to the local 'extractlines/story_lines' endpoint.

    Returns:
        The requests response object itself (not the decoded JSON); callers
        have to call .json() on it themselves.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    body = json.dumps( { 'key': loc_key, 'body_html': raw_content } )

    # The key is sent both in the JSON body and as a query parameter.
    response = requests.put( 'http://0:3000/api/v2/extractlines/story_lines',
                             data=body,
                             params={ 'key': loc_key },
                             headers={ 'Content-type': 'application/json' } )
    response.raise_for_status()

    return response

In [6]:
import subprocess
import tempfile
import codecs
import time
from lxml import html

#download = get_download( downloads_id )
#raw_content = download[u'raw_content']

def extract_with_boilerpipe( raw_content ):
    """Extract article content by running the boilerpipe test jar on raw HTML.

    raw_content is written UTF-8 encoded to a temporary .html file, the jar is
    run with that file as input and a second temporary file as output, and every
    <span class="x-boilerpipe-mark1"> element of the output is serialized.

    Returns:
        dict with one key, 'extracted_html': the serialized spans joined by
        blank lines.

    Raises:
        subprocess.CalledProcessError: if the java process exits non-zero.
    """
    import os  # local import: only needed for temp-file cleanup

    jar_path = '/home/dlarochelle/dev_scratch/boilerpipe_test/out/artifacts/boilerpipe_test_jar/boilerpipe_test.jar'

    # delete=False because the java process reopens both files by name; they are
    # removed explicitly in the finally block (they used to be left behind).
    input_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )
    output_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )

    try:
        try:
            codecs.getwriter( 'utf8' )( input_tmp.file ).write( raw_content )
        finally:
            input_tmp.close()

        # Only the name is needed; the jar writes the file itself.
        output_tmp.close()

        subprocess.check_output( [ 'java', '-jar', jar_path, input_tmp.name, output_tmp.name ] )

        with open( output_tmp.name, 'rb' ) as annotated_file:
            annotated_file_str = annotated_file.read()
    finally:
        for tmp_path in ( input_tmp.name, output_tmp.name ):
            if os.path.exists( tmp_path ):
                os.remove( tmp_path )

    tree = html.fromstring( annotated_file_str )
    spans = tree.xpath( '//span[@class="x-boilerpipe-mark1"]' )
    # etree is imported in a later cell; it only has to exist by the time this runs.
    boiler_pipe_lines = [ etree.tostring( span ) for span in spans ]

    return { 'extracted_html': "\n\n".join( boiler_pipe_lines ) }

In [7]:
#f = open( '/tmp/tmp01CV6F.html' )
#annotated_file_str = f.read()
#tree = html.fromstring( annotated_file_str )   
#spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
#span = spans[0]
#etree.tostring( span )

In [8]:
import readability

def extract_with_python_readability( raw_content ):
    """Extract with python-readability: short title, a blank line, then the summary."""
    document = readability.Document( raw_content )

    title = document.short_title()
    body = document.summary()

    return title + "\n\n" + body

In [9]:
import goose

def extract_with_python_goose( raw_content ):
    """Extract with goose: article title, a blank line, then the cleaned text."""
    article = goose.Goose().extract( raw_html=raw_content )

    return article.title + "\n\n" + article.cleaned_text

In [10]:
import justext

def extract_with_justext( raw_content ):
    """Extract with jusText (English stoplist), keeping non-boilerplate paragraphs.

    Returns:
        The text of every paragraph not flagged as boilerplate, joined by blank lines.
    """
    stoplist = justext.get_stoplist( 'English' )

    kept_paragraphs = [ paragraph.text
                        for paragraph in justext.justext( raw_content, stoplist )
                        if not paragraph.is_boilerplate ]

    return "\n\n".join( kept_paragraphs )

#extract_with_justext( raw_content )
#raw_html

#justext.get_stoplists()

In [11]:
import operator

def get_extractor_training_text( downloads_id, preprocessed_lines ):
    """Return the hand-annotated (expected) lines of a download.

    The line numbers marked for the download are fetched from the Media Cloud
    'extractor_training_lines' endpoint and used to index preprocessed_lines.

    Args:
        downloads_id: id of the download; converted with str() to build the URL.
        preprocessed_lines: indexable sequence of the download's lines.

    Returns:
        A tuple of the selected lines in ascending line-number order. Always a
        tuple: empty when no line is marked and a 1-tuple for a single line.
        (operator.itemgetter raised on zero indices and returned a bare item for
        exactly one.)

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    response = requests.get(
        'https://api.mediacloud.org/api/v2/extractlines/extractor_training_lines/' + str( downloads_id ),
        headers={ 'Content-type': 'application/json' },
        params={ 'key': api_key } )
    response.raise_for_status()

    line_numbers = sorted( entry['line_number'] for entry in response.json() )

    return tuple( preprocessed_lines[ line_number ] for line_number in line_numbers )

import operator

def get_extracted_text( extractor_results ):
    """Select the lines an extractor chose to keep.

    Args:
        extractor_results: mapping with 'included_line_numbers' (indices) and
            'download_lines' (indexable sequence of lines).

    Returns:
        [] when no line is included, otherwise a tuple of the selected lines in
        the order of 'included_line_numbers'. A single included line yields a
        1-tuple (operator.itemgetter returned the bare line in that case, which
        callers iterating over the result would walk character by character).
    """
    included_line_numbers = extractor_results['included_line_numbers']
    download_lines = extractor_results['download_lines']

    if len( included_line_numbers ) == 0:
        return []

    return tuple( download_lines[ line_number ] for line_number in included_line_numbers )

In [12]:
#import Levenshtein

def lines_to_comparable_text( lines ):
    """Clean every line with clean_for_comparison and join them with blank lines.

    Returns:
        A unicode string; u'' for an empty sequence of lines.
    """
    # Joining on a unicode separator already yields unicode (u'' when there is
    # nothing to join), so no separate empty-string case is needed.
    return u"\n\n".join( clean_for_comparison( line ) for line in lines )

def html_to_comparable_text( html_text ):
    """Clean an HTML string with clean_for_comparison, normalizing empty results to u''."""
    cleaned = clean_for_comparison( html_text )

    return u'' if ( cleaned == '' or cleaned == None ) else cleaned
    
    
#def compare_accuracy( lines, lines_expected ):
#    return Levenshtein.distance( lines_to_comparable_text( lines ) , lines_to_comparable_text( lines_expected ) )

In [13]:
def get_anncestors( element ):
    """Return the element followed by all of its ancestors, nearest first.

    Args:
        element: any object with a getparent() method that returns the parent
            node, or None at the root (e.g. an lxml element).

    Returns:
        A list [element, parent, grandparent, ..., root].
    """
    anncestors = [ element ]
    anncestor = element.getparent()

    # Identity test against None: avoids going through the node's own
    # comparison / truth-value methods.
    while anncestor is not None:
        anncestors.append( anncestor )
        anncestor = anncestor.getparent()

    return anncestors

In [14]:
def text_from_lxml_object( obj ):
    """Return the text of an lxml XPath result.

    String results (etree._ElementStringResult / etree._ElementUnicodeResult)
    are returned as unicode; anything else is serialized with
    etree.tostring( method='text', encoding='UTF-8' ).

    Raises:
        Whatever etree.tostring raises. The offending object's type and value
        are printed first and the original exception is re-raised (the old
        `raise ''` replaced it with an unrelated TypeError).
    """
    if isinstance( obj, ( etree._ElementStringResult, etree._ElementUnicodeResult ) ):
        return u'' + obj

    try:
        return etree.tostring( obj, method='text', encoding="UTF-8" )
    except Exception:
        print( type( obj ) )
        print( obj )
        raise

In [15]:
from lxml import etree

# Fetch one hard-coded download and save its raw content under /tmp/<downloads_id>.
# These module-level names (downloads_id, download, raw_content, f) stay defined
# for the rest of the kernel session.
downloads_id =  582817308 
download = get_download( downloads_id )
raw_content = download[ 'raw_content' ]
# NOTE(review): the file is opened in binary mode; if raw_content is a unicode
# string with non-ASCII characters this write presumably fails under Python 2
# without an explicit encode -- confirm.
with open( '/tmp/' + str(downloads_id) , 'wb' ) as f:
    f.write( raw_content )

In [16]:
from nltk import tokenize

def remove_duplicate_sentences( article_html, story ):
    """Split HTML into sentences and drop those flagged as duplicates for the story.

    The sentences come from the local 'extractlines/sentences_from_html'
    endpoint; each one is checked with sentence_is_duplicate( sentence, story ).

    Returns:
        The remaining sentences joined with newlines, as unicode.

    Raises:
        requests.HTTPError: if the endpoint answers with an error status.
    """
    request_body = json.dumps( { 'key': loc_key, 'story_html': article_html } )

    response = requests.put( 'http://0:3000/api/v2/extractlines/sentences_from_html',
                             data=request_body,
                             headers={ 'Content-type': 'application/json' } )
    response.raise_for_status()

    kept_sentences = []
    for sentence in response.json():
        if sentence_is_duplicate( sentence, story ):
            continue
        kept_sentences.append( sentence )

    return u"\n".join( kept_sentences )

In [17]:
def html_strip( str ):
    """Strip markup from an HTML fragment and return its text content.

    Args:
        str: the HTML fragment (the name shadows the builtin inside this
            function; kept so keyword callers keep working).

    Returns:
        u' ' for empty / whitespace-only input and for a lone '<';
        otherwise the parsed fragment's text content; u'' when parsing fails
        (the failure is reported on stdout, not raised).
    """
    if str == '' or str == '<' or str.isspace():
        return u' '

    try:
        return html.fromstring( str ).text_content()
    except Exception:
        # Best effort by design: report and carry on. `except Exception` (rather
        # than a bare except) lets KeyboardInterrupt / SystemExit through.
        print "Unexpected error on string '" + str + "'" , sys.exc_info()[0]
        return u''

def clean_for_comparison( str ):
    """Return the text content of an HTML fragment; empty input is returned unchanged."""
    if len( str ) == 0:
        return str

    return html_strip( str )

In [18]:
def extract_with_mc_extractor( eto, method ):
    """Run a Media Cloud extractor on an extractor training object.

    Args:
        eto: mapping with 'story' (providing u'title' and u'description') and
            'preprocessed_lines'.
        method: extractor name passed through to extract_story.

    Returns:
        dict with one key, 'extracted_html', taken from the extractor response.
    """
    story = eto['story']

    extract_result = extract_story( eto['preprocessed_lines'],
                                    story[u'title'],
                                    story[u'description'],
                                    method )

    return { 'extracted_html': extract_result[ 'extracted_html' ] }

def extract_with_heur( eto ):
    """Run extract_with_mc_extractor on eto with the 'HeuristicExtractor' method."""
    return extract_with_mc_extractor( eto, 'HeuristicExtractor' )

def extract_with_crf( eto ):
    """Run extract_with_mc_extractor on eto with the 'CrfExtractor' method."""
    return extract_with_mc_extractor( eto, 'CrfExtractor' )

In [19]:
import difflib
from IPython.display import HTML

from collections import Counter

def ratcliff_obershelp_compare( actual_text, expected_text ):
    """Score extracted text against expected text with a word-level diff.

    Both texts are split on whitespace and compared with difflib.Differ:
    words common to both count as true positives, words only in actual_text as
    false positives, and words only in expected_text as false negatives.

    Args:
        actual_text: text produced by an extractor.
        expected_text: hand-annotated reference text.

    Returns:
        dict with float 'precision', 'recall' and 'f1'. Any ratio whose
        denominator is zero is reported as 0.0.
    """
    words_expected = expected_text.split()
    words_actual = actual_text.split()

    # Differ output lines start with ' ' (in both), '-' (only in the first
    # sequence, i.e. expected) or '+' (only in the second, i.e. actual).
    # '?' hint lines are counted but never read.
    # The diff is computed once; a second, discarded diff used to double the cost.
    counts = Counter( delta[0] for delta in difflib.Differ().compare( words_expected, words_actual ) )

    tp = counts[' ']
    fp = counts['+']
    fn = counts['-']

    precision = tp / float( tp + fp ) if ( tp + fp ) > 0 else 0.0
    recall = tp / float( tp + fn ) if ( tp + fn ) > 0 else 0.0

    if ( precision + recall ) > 0:
        f1 = 2 * ( precision * recall ) / ( precision + recall )
    else:
        f1 = 0.0

    return { 'precision': precision, 'recall': recall, 'f1': f1 }

def compare_with_expected( extractor_name, actual_text, actual_html, expected_text, story ):
    """Score one extractor's output against the expected text.

    (The function name was garbled into invalid syntax; it is restored to the
    name the callers in this notebook use.)

    Args:
        extractor_name: key under which the scores are stored.
        actual_text: extracted text to score.
        actual_html: extracted HTML; only used for the de-duplicated variant.
        expected_text: hand-annotated reference text.
        story: story record passed to remove_duplicate_sentences.

    Returns:
        dict mapping extractor_name to its precision/recall/f1 dict and, when
        the module-level flag compare_deduplicated is true, also
        extractor_name + "_dedup" to the scores of the text with duplicate
        sentences removed.
    """
    ret = { extractor_name: ratcliff_obershelp_compare( actual_text, expected_text ) }

    if compare_deduplicated:
        dedup_text = remove_duplicate_sentences( actual_html, story )
        ret[ extractor_name + "_dedup" ] = ratcliff_obershelp_compare( dedup_text, expected_text )

    return ret

In [20]:
# Sanity check: one of the four expected words is extracted and nothing else,
# giving precision 1.0 and recall 0.25.
ratcliff_obershelp_compare( actual_text='foo', expected_text='bar foo baz BAST')


Out[20]:
{'f1': 0.4, 'precision': 1.0, 'recall': 0.25}

In [66]:
def get_extraction_results( eto ):
    """Run every enabled extractor on the raw content of a training object.

    Args:
        eto: mapping with a non-empty 'raw_content' entry.

    Returns:
        dict mapping extractor name to a result dict that holds both
        'extracted_html' and 'extracted_text'. Extractors that only return HTML
        get their 'extracted_text' derived with html_to_comparable_text.
    """
    raw_content = eto[ 'raw_content' ]

    assert raw_content != None
    assert len( raw_content ) > 0

    results = {}

    # Currently disabled: heur, crf, boiler_pipe (jar), the thrift
    # ArticleSentencesExtractor, the python KeepEverythingWithMinKWordsExtractor,
    # py_goose and justext.

    print('extracting with thr_boilerpipe')

    # boilerpipe through the Thrift service
    results['thr_boiler_pipe_ArticleExtractor'] = extract_with_thr_boilerpipe_ArticleExtractor( raw_content )
    results['thr_boiler_pipe_DefaultExtractor'] = extract_with_thr_boilerpipe_DefaultExtractor( raw_content )

    # boilerpipe through the python wrapper
    results['py_boiler_pipe_ArticleExtractor'] = extract_with_py_boilerpipe_ArticleExtractor( raw_content )
    results['py_boiler_pipe_ArticleSentencesExtractor'] = extract_with_py_boilerpipe_ArticleSentencesExtractor( raw_content )
    results['py_boiler_pipe_CanolaExtractor'] = extract_with_py_boilerpipe_CanolaExtractor( raw_content )
    results['py_boiler_pipe_DefaultExtractor'] = extract_with_py_boilerpipe_DefaultExtractor( raw_content )
    results['py_boiler_pipe_KeepEverythingExtractor'] = extract_with_py_boilerpipe_KeepEverythingExtractor( raw_content )
    results['py_boiler_pipe_LargestContentExtractor'] = extract_with_py_boilerpipe_LargestContentExtractor( raw_content )
    results['py_boiler_pipe_NumWordsRulesExtractor'] = extract_with_py_boilerpipe_NumWordsRulesExtractor( raw_content )

    results['python_readibilty'] = { 'extracted_html': extract_with_python_readability( raw_content ) }

    # Fill in the comparable text for every extractor that did not supply one.
    for result in results.values():
        if 'extracted_text' in result:
            continue
        result['extracted_text'] = html_to_comparable_text( result['extracted_html' ] )

    return results

In [22]:
def compare_extractors_for_download( downloads_id ):
    """Build the extractor training object for a download and score every extractor on it.

    NOTE(review): create_extractor_training_object is not defined in the visible
    part of this notebook; it must be defined elsewhere before this is called.
    """
    
    eto = create_extractor_training_object( downloads_id )
    
    return comp_extractors( eto )    
    
def comp_extractors( eto ):
    """Score every extractor's output for one extractor training object.

    Args:
        eto: mapping with 'downloads_id', 'media_id', 'story',
            'story_is_spidered', 'expected_text' and whatever
            get_extraction_results reads from it.

    Returns:
        dict with 'downloads_id', 'media_id' and 'story_is_spidered' copied
        from eto, plus one scores entry per extractor (as produced by
        compare_with_expected) and a 'gold' entry that scores the expected text
        against itself.
    """
    story = eto['story']
    expected_text = eto['expected_text']

    comp_results = {
        'downloads_id': eto['downloads_id'],
        'media_id': eto['media_id'],
        'story_is_spidered': eto['story_is_spidered'],
    }

    extraction_results = get_extraction_results( eto )

    for name, value in extraction_results.items():
        comp_results.update( compare_with_expected( name, value['extracted_text'], value['extracted_html'],
                                                    expected_text, story ) )

    # Baseline: the expected text compared with itself.
    comp_results.update( compare_with_expected( 'gold', expected_text, expected_text, expected_text, story ) )

    return comp_results

In [23]:
import sys

sys.path.append('../')

import mc_config

def get_db_info():
    """Return the first 'database' entry of the Media Cloud config whose port is '6000'.

    Raises:
        StopIteration: if no database entry uses that port.
    """
    databases = mc_config.read_config()['database']

    return next( entry for entry in databases if entry['port'] == '6000' )

import psycopg2
#import solr_reimport
import psycopg2.extras

#db_info = get_db_info()

#conn = psycopg2.connect( database=db_info['db'], user=db_info['user'], 
#                        password=db_info['pass'], host=db_info['host'], port=db_info['port'] )

# Database connection shared by get_sentence_counts; opened on first use.
conn = None

# stories_id -> { sentence -> row dict, or None when no row was found }
story_sentence_counts_cache = {}

def get_sentence_counts( sentence, story ):
    """Look up the story_sentence_counts row for a sentence of a story.

    Rows are matched on the md5 of the sentence, the story's media_id and the
    week of its publish_date. Results (including misses) are cached per
    stories_id in story_sentence_counts_cache, and the database connection is
    created on first use and kept in the module-level `conn`.

    Args:
        sentence: the sentence text.
        story: mapping with 'stories_id', 'media_id' and 'publish_date'.

    Returns:
        A dict built from the first matching row, or None if nothing matched.
    """
    global conn

    stories_id = story['stories_id']
    story_cache = story_sentence_counts_cache.setdefault( stories_id, {} )

    if sentence in story_cache:
        return story_cache[ sentence ]

    if conn is None:
        db_info = get_db_info()

        conn = psycopg2.connect( database=db_info['db'], user=db_info['user'],
                                 password=db_info['pass'], host=db_info['host'], port=db_info['port'] )

    query = '''               
                   SELECT MIN( story_sentence_counts_id) AS story_sentence_counts_id, sentence_count, first_stories_id,
                   sentence_md5
            FROM story_sentence_counts
            WHERE sentence_md5  = md5(%(sentence)s)
              AND media_id = %(media_id)s
              AND publish_week =  DATE_TRUNC( 'week', %(publish_date)s::date )
            GROUP BY story_sentence_counts_id
    '''

    params = { 'sentence': sentence,
               'media_id': story['media_id'],
               'publish_date': story['publish_date'] }

    cursor = conn.cursor( cursor_factory=psycopg2.extras.DictCursor )
    try:
        cursor.execute( query, params )
        fetched = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails (it was never closed before).
        cursor.close()

    story_cache[ sentence ] = dict( fetched[0] ) if len( fetched ) > 0 else None

    return story_cache[ sentence ]
    
def sentence_is_duplicate( sentence, story ):
    """Decide whether a sentence counts as a duplicate for the given story.

    Returns:
        False when there is no story_sentence_counts row for the sentence;
        True when the row's sentence_count is greater than 1 or its
        first_stories_id equals the story's stories_id; False otherwise.
        Always a bool (one path used to fall through and return None).
    """
    sentence_counts = get_sentence_counts( sentence, story )

    if sentence_counts is None:
        return False

    if sentence_counts['sentence_count'] > 1:
        return True

    # NOTE(review): the original debug comment on this branch read "duplicate
    # sentence (diff first_stories_id)", which suggests the intended test may
    # have been `!=`. The comparison is left as it was -- confirm.
    return sentence_counts['first_stories_id'] == story['stories_id']

In [24]:
import pandas as pd

def get_data_frame_from_comparision_objects( comparison_objects ):
    """Flatten per-download comparison results into a DataFrame.

    Args:
        comparison_objects: non-empty sequence of dicts as returned by
            comp_extractors: the keys 'downloads_id', 'media_id' and
            'story_is_spidered' plus one {measurement: value} dict per extractor.

    Returns:
        DataFrame indexed by 'downloads_id' with one column per
        '<measurement>_<extractor>' pair (e.g. 'f1_crf').

    Raises:
        AssertionError: if comparison_objects is empty.
    """
    assert len( comparison_objects ) > 0

    metadata_keys = { 'downloads_id', 'media_id', 'story_is_spidered' }

    rows = []
    for comp in comparison_objects:
        row = { 'downloads_id': comp['downloads_id'] }

        # .items() instead of .iteritems() so this runs on Python 2 and 3 alike.
        for extractor_type, measurements in comp.items():
            if extractor_type in metadata_keys:
                continue
            for measurement, value in measurements.items():
                row[ measurement + '_' + extractor_type ] = value

        rows.append( row )

    return pd.DataFrame( rows ).set_index( 'downloads_id' )

In [25]:
import sys
sys.path = ['/home/dlarochelle/git_dev/mediacloud/python_scripts/notebook/thriftboilerpipe'] + sys.path
from thrift.transport import TTransport
from thrift.transport import TSocket
from thrift.transport import TSSLSocket
from thrift.transport import THttpClient
from thrift.protocol import TBinaryProtocol

from thriftboilerpipe import ExtractorService
from thriftboilerpipe.ttypes import *

In [53]:
# Connection settings for the thriftboilerpipe ExtractorService.
host = 'localhost'
port = 9090
# NOTE(review): uri is not used in the visible part of the notebook.
uri = ''

# NOTE: this binds the name `socket` at module level; it would shadow the
# standard-library module of the same name if that were imported.
socket = TSocket.TSocket(host, port)
transport = TTransport.TBufferedTransport(socket)

protocol = TBinaryProtocol.TBinaryProtocol(transport)
# Module-level client used by thrift_bp_extract.
client = ExtractorService.Client(protocol)
# The connection is opened as soon as this cell runs and is not closed in the
# visible part of the notebook.
transport.open()

In [68]:
def thrift_bp_extract( raw_content, extractor_type ) :
    """Extract text through the boilerpipe Thrift service.

    Args:
        raw_content: HTML passed to client.extract_html.
        extractor_type: extractor name passed to client.extract_html.

    Returns:
        dict with 'extracted_text' (the returned pieces coerced to unicode and
        joined by blank lines) and an empty 'extracted_html'.
    """
    extracted_segments = client.extract_html( raw_content, extractor_type )

    unicode_segments = [ u'' + segment for segment in extracted_segments ]

    return { 'extracted_text': u"\n\n".join( unicode_segments ),
             'extracted_html': '' }

def extract_with_thr_boilerpipe_DefaultExtractor( raw_content ):
    """Extract through the Thrift boilerpipe service with 'DefaultExtractor'."""
    return thrift_bp_extract( raw_content, 'DefaultExtractor' )

def extract_with_thr_boilerpipe_ArticleExtractor( raw_content ):
    """Extract through the Thrift boilerpipe service with 'ArticleExtractor'."""
    return thrift_bp_extract( raw_content, 'ArticleExtractor' )

def extract_with_thr_boilerpipe_ArticleSentencesExtractor( raw_content ):
    """Extract through the Thrift boilerpipe service with 'ArticleSentencesExtractor'."""
    return thrift_bp_extract( raw_content, 'ArticleSentencesExtractor' )

In [28]:
import boilerpipe.extract

def extract_with_py_boilerpipe( raw_content, extractor ):
    """Extract with the python boilerpipe wrapper.

    Args:
        raw_content: HTML handed to boilerpipe.extract.Extractor as `html`.
        extractor: boilerpipe extractor name, e.g. 'ArticleExtractor'.

    Returns:
        dict with one key, 'extracted_html': the extractor's getHTML() result.
    """
    bp_extractor = boilerpipe.extract.Extractor( extractor=extractor, html=raw_content )

    return { 'extracted_html': bp_extractor.getHTML() }

def extract_with_py_boilerpipe_DefaultExtractor( raw_content ):
    """Extract with python boilerpipe using 'DefaultExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'DefaultExtractor' )

def extract_with_py_boilerpipe_ArticleExtractor( raw_content ):
    """Extract with python boilerpipe using 'ArticleExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'ArticleExtractor' )

def extract_with_py_boilerpipe_ArticleSentencesExtractor( raw_content ):
    """Extract with python boilerpipe using 'ArticleSentencesExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'ArticleSentencesExtractor' )

def extract_with_py_boilerpipe_KeepEverythingExtractor( raw_content ):
    """Extract with python boilerpipe using 'KeepEverythingExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'KeepEverythingExtractor' )

def extract_with_py_boilerpipe_KeepEverythingWithMinKWordsExtractor( raw_content ):
    """Extract with python boilerpipe using 'KeepEverythingWithMinKWordsExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'KeepEverythingWithMinKWordsExtractor' )

def extract_with_py_boilerpipe_LargestContentExtractor( raw_content ):
    """Extract with python boilerpipe using 'LargestContentExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'LargestContentExtractor' )

def extract_with_py_boilerpipe_NumWordsRulesExtractor( raw_content ):
    """Extract with python boilerpipe using 'NumWordsRulesExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'NumWordsRulesExtractor' )

def extract_with_py_boilerpipe_CanolaExtractor( raw_content ):
    """Extract with python boilerpipe using 'CanolaExtractor'."""
    return extract_with_py_boilerpipe( raw_content, 'CanolaExtractor' )

In [29]:
def print_results_by_measurement_type( df ):
    """Print summary statistics of the result columns, one table per measurement.

    For each of 'precision', 'recall' and 'f1', the columns whose name starts
    with that measurement are selected and their describe() table (with the
    2%, 5%, 10% and 50% percentiles) is printed.

    Args:
        df: DataFrame with columns named '<measurement>_<extractor>'.
    """
    for result_type in ( 'precision', 'recall', 'f1' ):
        res_columns = [ col for col in df.columns if col.startswith( result_type ) ]
        # .loc replaces the deprecated .ix indexer for label-based selection.
        print( df.loc[ :, res_columns ].describe( percentiles=[ 0.02, 0.05, 0.1, 0.5 ] ) )

In [30]:
def filter_by_media_tags_id( comps_downloads, media_tags_ids ):
    """Keep the comparison results whose media source carries at least one of the given tags.

    Args:
        comps_downloads: iterable of dicts with a 'media_id' entry.
        media_tags_ids: iterable of tags_ids to match against each media
            source's 'media_source_tags_ids' set in media_id_media_map.

    Returns:
        List of the matching entries of comps_downloads, in their original order.
    """
    media_ids_matching = { media_id
                           for media_id, media in media_id_media_map.items()
                           if not media[ 'media_source_tags_ids' ].isdisjoint( media_tags_ids ) }

    return [ comp for comp in comps_downloads if comp['media_id'] in media_ids_matching ]

In [31]:
def remove_spidered_downloads( comps_downloads ):
    """Return the entries whose 'story_is_spidered' value is falsy, as a list."""
    kept = []
    for comp in comps_downloads:
        if comp['story_is_spidered']:
            continue
        kept.append( comp )
    return kept

def only_spidered_downloads( comps_downloads ):
    """Return the entries whose 'story_is_spidered' value is truthy, as a list."""
    kept = []
    for comp in comps_downloads:
        if comp['story_is_spidered']:
            kept.append( comp )
    return kept

In [32]:
def story_is_spidered( story ):
    """Return True if the story has a tag 'spidered' in the tag set 'spidered'.

    Args:
        story: mapping whose 'story_tags' is an iterable of dicts with 'tag'
            and 'tag_set' entries.
    """
    return any( taginfo['tag'] == 'spidered' and taginfo['tag_set'] == 'spidered'
                for taginfo in story['story_tags'] )

Flags


In [71]:
# When True, the media_id -> media record map is re-fetched through the Media Cloud
# API and re-pickled before it is loaded; when False the existing pickle is used as-is.
regenerate_media_id_media_map         = False
# NOTE(review): not referenced in the visible part of the notebook; presumably
# gates re-running the extractor comparisons -- confirm.
regenerate_comps_downloads            = True
# When True, compare_with_expected also scores a "<name>_dedup" variant of each
# extractor's output with duplicate sentences removed.
compare_deduplicated                  = False

Constants

Data Analysis

Load Data


In [34]:
# Load the pickled extractor training objects from the user's Dropbox folder.
# NOTE: unpickling can execute arbitrary code; only load a file from a trusted
# source. The file handle is not explicitly closed.
extractor_training_objects = cPickle.load( file( 
                                                os.path.expanduser( '~/Dropbox/mc/extractor_test/extractor_training_objects.pickle' ), "rb" ) )

#cPickle.load( open( "extractor_traning_objects.pickle", "rb") )

Look up Media Tags


In [35]:
# Only the final expression (the sorted list of distinct media ids) is displayed;
# the results of len(...) and most_common() are computed and discarded.
len( extractor_training_objects )
# Number of training objects per media_id.
c = Counter( sorted([ eto['media_id'] for eto in extractor_training_objects ]) )
c.most_common()
sorted( c.keys() )


Out[35]:
[1,
 2,
 4,
 6,
 7,
 8,
 14,
 15,
 23,
 28,
 39,
 109,
 110,
 113,
 115,
 117,
 118,
 121,
 125,
 129,
 131,
 139,
 147,
 160,
 268,
 285,
 294,
 307,
 336,
 564,
 669,
 687,
 697,
 711,
 712,
 713,
 731,
 751,
 752,
 771,
 788,
 789,
 801,
 805,
 809,
 853,
 870,
 889,
 955,
 1004,
 1027,
 1040,
 1056,
 1062,
 1063,
 1082,
 1089,
 1092,
 1094,
 1095,
 1104,
 1127,
 1147,
 1149,
 1200,
 1259,
 1270,
 1311,
 1347,
 1349,
 1352,
 1359,
 1376,
 1398,
 1420,
 1452,
 1490,
 1536,
 1549,
 1568,
 1585,
 1589,
 1596,
 1607,
 1626,
 1630,
 1641,
 1648,
 1650,
 1651,
 1654,
 1655,
 1658,
 1667,
 1670,
 1674,
 1675,
 1680,
 1684,
 1685,
 1694,
 1724,
 1725,
 1726,
 1728,
 1729,
 1730,
 1731,
 1733,
 1740,
 1742,
 1747,
 1750,
 1751,
 1752,
 1757,
 4415,
 4418,
 4419,
 5527,
 6162,
 6335,
 18203,
 18204,
 18210,
 18213,
 18346,
 18350,
 18364,
 18370,
 18761,
 18886,
 19027,
 19081,
 19127,
 19327,
 19347,
 19445,
 19690,
 19854,
 20763,
 21629,
 21936,
 21990,
 23209,
 23464,
 23881,
 24697,
 24767,
 25170,
 25369,
 25527,
 26309,
 26705,
 26924,
 27692,
 29363,
 32120,
 35065,
 35400,
 35625,
 39008,
 39648,
 39677,
 40405,
 40584,
 40789,
 41502,
 41771,
 48279,
 48768,
 53459,
 57137,
 58992,
 61322,
 63247,
 64110,
 65731,
 66924,
 66949,
 66951,
 66957,
 68627,
 72816,
 73384,
 77657,
 79784,
 83352,
 83353,
 83371,
 83860,
 84057,
 84249,
 84256,
 84654,
 84991,
 87821,
 88695,
 88819,
 95973,
 96863,
 102611,
 102618,
 102622,
 102629,
 102634,
 102635,
 102637,
 102639,
 102647,
 102652,
 102657,
 102661,
 102668,
 102670,
 102671,
 102673,
 102680,
 102689,
 102691,
 102693,
 102694,
 102708,
 102739,
 102741,
 102744,
 102745,
 102747,
 102759,
 102766,
 102769,
 102770,
 102781,
 104881,
 104903,
 104942,
 104950,
 105130,
 105403,
 107088,
 107637,
 107692,
 107764,
 111691,
 111851,
 112357,
 112386,
 112982,
 113157,
 120850,
 128098,
 133122,
 137922,
 142103,
 143836,
 144943,
 145564,
 146357,
 146402,
 147230,
 177339,
 177382]

In [36]:
import itertools
from collections import Counter

# Media Cloud API client, created whether or not the map is regenerated.
mc = mediacloud.api.MediaCloud(api_key)

# Optionally rebuild the media_id -> media record map from the API and pickle it.
# When the flag is False nothing here defines media_id_media_map.
if regenerate_media_id_media_map:
    media_id_media_map = {}
    
    media_ids = sorted(list(set([ eto['media_id'] for eto in extractor_training_objects ])))
    
    # One API call per distinct media source.
    for media_id in list(media_ids)[:]:
        media = mc.media( media_id )
        # Add a set of the source's tags_ids next to the full tag records.
        media[ 'media_source_tags_ids' ] = set( [ media_source_tag['tags_id'] 
                                                 for media_source_tag in media['media_source_tags'] ] )
        media_id_media_map[ media_id ] = media
    
    print len( media_ids )
    print 'pickling'
    
    # NOTE: the file handle is not explicitly closed.
    cPickle.dump( media_id_media_map, 
                 file( os.path.expanduser( '~/Dropbox/mc/extractor_test/media_id_media_map.pickle'), "wb") )

In [37]:
# Load the media_id -> media record map from its pickle.
# NOTE: unpickling can execute arbitrary code; only load a file from a trusted source.
media_id_media_map = cPickle.load(  
                                  file( os.path.expanduser( 
                                                           '~/Dropbox/mc/extractor_test/media_id_media_map.pickle'), 
                                                           "rb") )
                                  
# tags_id -> number of media sources in the map that carry that tag.
media_tag_counts = Counter(list ( itertools.chain.from_iterable( media_source['media_source_tags_ids'] for media_source in media_id_media_map.values() )) ) 
# tags_id -> tag record; when several sources carry the same tag the last one seen wins.
tags_id_to_media_tags_map = {}
# Despite its name, `media_tag` holds a whole media record here.
for media_tag in media_id_media_map.values():
    source_tags = media_tag[ 'media_source_tags' ]
    for source_tag in source_tags:
        tags_id_to_media_tags_map[ source_tag[ 'tags_id' ] ] = source_tag

In [38]:
# Media ids present in the loaded map, in dict key order (hence unsorted).
media_id_media_map.keys()


Out[38]:
[1536,
 1,
 2,
 1027,
 4,
 113157,
 6,
 7,
 8,
 72816,
 1549,
 14,
 15,
 1040,
 6162,
 68627,
 23,
 28,
 41502,
 1056,
 1062,
 39,
 27692,
 144943,
 1585,
 564,
 1589,
 1082,
 1596,
 1089,
 1654,
 1092,
 1094,
 1607,
 35400,
 268,
 111691,
 1104,
 26705,
 25170,
 19027,
 84057,
 1626,
 63247,
 1630,
 96863,
 39008,
 128098,
 955,
 1095,
 1127,
 1641,
 4419,
 109,
 110,
 58992,
 113,
 1650,
 115,
 117,
 118,
 1655,
 121,
 1658,
 1147,
 1149,
 48768,
 129,
 133122,
 131,
 1670,
 40584,
 19081,
 1674,
 1675,
 1730,
 102639,
 1680,
 147,
 1684,
 1685,
 48279,
 145564,
 669,
 1694,
 26309,
 160,
 1648,
 18203,
 73384,
 23209,
 105130,
 107692,
 84654,
 687,
 1200,
 29363,
 19127,
 697,
 177339,
 1724,
 1725,
 1726,
 6335,
 1728,
 1729,
 137922,
 1731,
 1733,
 711,
 712,
 713,
 88695,
 1740,
 1742,
 40789,
 1747,
 1750,
 1751,
 1752,
 102618,
 731,
 1757,
 102622,
 805,
 39648,
 120850,
 95973,
 65731,
 19690,
 1259,
 102637,
 21629,
 752,
 53459,
 107764,
 1270,
 809,
 35065,
 102652,
 39677,
 102657,
 112386,
 771,
 102661,
 102668,
 87821,
 102670,
 102671,
 102673,
 1667,
 788,
 789,
 142103,
 102680,
 25369,
 20763,
 18204,
 285,
 147230,
 1311,
 84256,
 102689,
 18210,
 102691,
 18213,
 294,
 35625,
 41771,
 26924,
 102635,
 57137,
 307,
 102708,
 24697,
 84249,
 4415,
 4418,
 139,
 1349,
 102611,
 1352,
 18761,
 1359,
 336,
 102739,
 102741,
 112982,
 88819,
 102744,
 102745,
 102747,
 112357,
 1376,
 102629,
 177382,
 870,
 102759,
 66924,
 24767,
 102766,
 102769,
 102770,
 19347,
 102694,
 1398,
 32120,
 889,
 102781,
 23881,
 19327,
 111851,
 66949,
 66951,
 61322,
 1420,
 66957,
 19854,
 125,
 1347,
 83860,
 104942,
 5527,
 83352,
 83353,
 751,
 1063,
 23464,
 18346,
 83371,
 1452,
 18350,
 21936,
 104881,
 1651,
 146357,
 25527,
 105403,
 18364,
 18370,
 102693,
 18886,
 104903,
 102647,
 1490,
 40405,
 143836,
 107637,
 107088,
 146402,
 21990,
 1004,
 77657,
 64110,
 79784,
 19445,
 104950,
 1568,
 102634,
 84991,
 801,
 853]

Run extractors


In [39]:
# The set of tags_ids of every media source in the map (one set per source).
[ m['media_source_tags_ids'] for m in media_id_media_map.values() ]


Out[39]:
[{2453107, 2454253, 2491715},
 {109, 6071565, 6729599, 8875027, 8878390, 8878416},
 {6, 7, 18, 6071565, 6729599, 8875027, 8878390, 8878416},
 {125, 8878332},
 {5, 6, 6071565, 6729599, 8875027, 8878416},
 {8875452},
 {6, 14, 8875027, 8875460, 8878390, 8878416},
 {6, 16, 6071565, 6729599, 8875027, 8875676, 8878416},
 {17, 6071565, 6729599, 8875027, 8875676, 8878416},
 {8875452, 8878416},
 {2453107, 2454099, 2454253},
 {21, 8875027, 8875460, 8878416},
 {22, 8875027, 8878416},
 {125, 8878415},
 {8875028, 8875033, 8875107, 8878420},
 {8875452},
 {32, 2453107, 2496423, 6260468},
 {11, 2453107, 2496423, 6260349, 8878416},
 {8876474, 8876475, 8876476, 8876479, 8876504, 8876548, 8876585},
 {125, 8875031, 8875108, 8875110, 8875113, 8875456, 8878415},
 {125},
 {43, 2453107, 2496423, 6260349, 8875031},
 {8875458, 8878418},
 {8875452},
 {2453107, 2495238, 2495253},
 {118, 2453107, 2496423, 2497397, 8875031, 8875108, 8875110, 8875113, 8878416},
 {2453107, 2495238, 2495239},
 {125, 796, 8878292},
 {2453107, 2495238, 2495255},
 {129,
  8875027,
  8876474,
  8876475,
  8876476,
  8876484,
  8876499,
  8876509,
  8876510,
  8878416},
 {2453107, 2496423, 2497397},
 {8875027, 8875028, 8878416},
 {6124858, 8875027, 8878416},
 {796, 2453107, 2495238, 2495240},
 {8875452},
 {117, 8878292, 8878415, 8878451},
 {8875452},
 {8875027, 8878416},
 {8875452},
 {8875452, 8878416},
 {8875452, 8876987, 8876988, 8878419},
 {8877968, 8877969, 8877973, 8877990},
 {142317, 2453107, 2496423, 2497397},
 {8876987, 8876988, 8878416},
 {2453107, 2496423, 2496424, 5648819},
 {8875452},
 {8876474,
  8876475,
  8876484,
  8876492,
  8876499,
  8876500,
  8876562,
  8876987,
  8876988,
  8878413},
 {8875452},
 {125, 8878416},
 {8875027, 8875031, 8878416},
 {142001, 8878293, 8878332, 8878416},
 {2453107, 2496423, 2496424, 7055831, 8878416},
 {8875027, 8878416},
 {7, 117, 118, 125, 6071565, 6729401, 6729599, 8875028, 8878292, 8878415},
 {7, 117, 118, 125, 6124858},
 {8876987, 8876988},
 {7,
  117,
  118,
  125,
  6071565,
  6729549,
  6729599,
  8875028,
  8875031,
  8875108,
  8875109,
  8875114,
  8875456,
  8878293,
  8878416},
 {2453107, 2496423, 2497397},
 {7,
  117,
  118,
  125,
  6124858,
  8875028,
  8875031,
  8875108,
  8875109,
  8875114,
  8875456,
  8875458,
  8878063,
  8878293,
  8878423},
 {7,
  117,
  118,
  125,
  796,
  142001,
  6071565,
  6729401,
  6729599,
  8875028,
  8875031,
  8875108,
  8875111,
  8875115,
  8875459,
  8878062,
  8878292,
  8878416},
 {7, 117, 118, 125, 6071565, 6729549, 6729599, 8875028, 8878415},
 {2453107, 2496423, 2497397},
 {6, 117, 118, 125, 796, 8878293},
 {2453107, 2496423, 2497397},
 {796,
  142001,
  8875028,
  8875031,
  8875108,
  8875109,
  8875114,
  8875456,
  8878063,
  8878293,
  8878332,
  8878413},
 {8875027,
  8876474,
  8876475,
  8876476,
  8876479,
  8876484,
  8876509,
  8876510,
  8878293,
  8878416},
 {8875452, 8878416},
 {7, 117, 125},
 {8875452},
 {117, 118, 125, 8875456, 8878293},
 {2453107, 2496423, 2497397},
 {8876474,
  8876475,
  8876476,
  8876479,
  8876502,
  8876508,
  8876576,
  8878255,
  8878259,
  8878263},
 {8875452, 8878423},
 {2453107, 2496423, 2497397},
 {2453107, 2496423, 2497397},
 {7796878},
 {8878255, 8878259, 8878263},
 {2453107, 2496423, 2497397},
 {7, 117, 118, 125, 8878415},
 {2453107, 2496423, 2497397},
 {2453107, 2496423, 2497397},
 {8875452, 8876987, 8877044, 8878443},
 {8875452},
 {125, 8878416},
 {50, 2453107, 2496423, 2497397, 7055831, 8878416},
 {8875452},
 {7, 117, 125, 796, 8875458, 8878293},
 {2453107, 2496423, 2497397},
 {8875227, 8875361, 8878255, 8878259, 8878269, 8878416},
 {8875452, 8878293, 8878416},
 {8875452, 8876987, 8876988, 8878416, 8878442},
 {8875452, 8878415},
 {8875452, 8878416},
 {8875452},
 {125, 6071565, 6729401, 6729599},
 {2453107, 2496423, 2497397, 8878416},
 {8875452, 8878416},
 {8875452, 8878413},
 {125},
 {8875452},
 {7796878, 8876474, 8876475, 8876476, 8876479, 8876491, 8876504, 8876505},
 {7796878,
  8875024,
  8876474,
  8876475,
  8876476,
  8876479,
  8876491,
  8876504,
  8876505},
 {7, 7796878, 8875024},
 {8875031, 8875108, 8875111, 8878062, 8878292, 8878415},
 {7796878, 8875024, 8876987, 8876995},
 {7796878, 8875024, 8876987, 8876995},
 {8875452},
 {7796878},
 {7796878, 8875035},
 {125, 8875031, 8875108, 8875111, 8875115, 8878062, 8878292, 8878416},
 {125, 8875031, 8875108, 8875110, 8875113, 8878332},
 {125, 8878293},
 {8875452},
 {7796878, 8875026},
 {7796878, 8875026},
 {8876474,
  8876475,
  8876476,
  8876479,
  8876508,
  8876569,
  8876570,
  8877914,
  8878255,
  8878259,
  8878273},
 {8875027,
  8876474,
  8876475,
  8876476,
  8876478,
  8876479,
  8876484,
  8876487,
  8878292,
  8878416},
 {8875027,
  8876474,
  8876475,
  8876476,
  8876478,
  8876479,
  8876484,
  8876487,
  8878292,
  8878416},
 {8875027,
  8875031,
  8876474,
  8876475,
  8876476,
  8876478,
  8876479,
  8876484,
  8876487,
  8878293,
  8878416},
 {8875027, 8878416},
 {8878255, 8878259, 8878263},
 {125, 796, 8878293},
 {796, 8875456, 8878293, 8878390, 8878416},
 {8878255, 8878259, 8878263},
 {125},
 {8876474,
  8876475,
  8876476,
  8876479,
  8876502,
  8876508,
  8876576,
  8877914,
  8878255,
  8878261},
 {8875452},
 {8875452, 8878416},
 {8876987, 8877000, 8878332, 8878416},
 {8875452, 8878292, 8878416},
 {2453107, 2496423, 6260328},
 {8878255, 8878259, 8878273},
 {8875452, 8876987, 8876988, 8878416},
 {125},
 {8875452},
 {8875452},
 {2453107, 2496423, 6260328, 8878416},
 {125, 8875456, 8875458, 8878293},
 {8875452},
 {8878255, 8878259, 8878263},
 {8876474,
  8876475,
  8876476,
  8876479,
  8876502,
  8876508,
  8876576,
  8878255,
  8878259,
  8878273},
 {8878255, 8878259, 8878263},
 {8875452},
 {125},
 {8878255, 8878259, 8878263},
 {8878255, 8878259, 8878270},
 {8875452},
 {8878255, 8878259, 8878263},
 {8878255, 8878259, 8878263},
 {8878255, 8878259, 8878263},
 {2453107, 2496423, 2497397},
 {125, 8878293},
 {125},
 {8875452},
 {8878255, 8878259, 8878263},
 {8875452, 8875456, 8878293, 8878415},
 {8875452, 8876987, 8876988},
 {8875227,
  8875361,
  8876474,
  8876476,
  8876478,
  8876479,
  8876487,
  8876508,
  8876544,
  8878255,
  8878259,
  8878269},
 {117, 125},
 {8875452},
 {2453107, 2496423, 2496424},
 {8877968, 8877989, 8877997},
 {8878255, 8878259, 8878270},
 {8875228, 8877914, 8878255, 8878259, 8878263},
 {8878255, 8878259, 8878270},
 {8875228, 8878255, 8878257},
 {118, 125, 8878416},
 {8875452},
 {8876474,
  8876475,
  8876476,
  8876479,
  8876502,
  8876508,
  8876576,
  8878255,
  8878259,
  8878270},
 {8875452, 8878416},
 {8878255, 8878259, 8878263},
 {8876987, 8877008},
 {118, 125, 8875028, 8875458, 8878293, 8878415},
 {8878255, 8878259, 8878263},
 {8875452, 8876987, 8877043},
 {8877968, 8877989, 8878011},
 {8875027, 8878420},
 {796, 8875027, 8878416},
 {7, 117, 118, 125, 8875456, 8878293},
 {2453107, 2496423, 6260349},
 {8878255, 8878257},
 {2453107, 2496423, 6260349},
 {8875452, 8876987, 8876988},
 {2453107, 2496423, 6260349},
 {118, 125, 8878292, 8878415},
 {8878255, 8878259, 8878270},
 {8878255, 8878259, 8878261, 8878273},
 {8875452},
 {8875452},
 {8878255, 8878259, 8878263},
 {8878255, 8878259, 8878269, 8878416},
 {8878255, 8878259, 8878266},
 {8875452},
 {2453107, 2496423, 6260349},
 {8878255, 8878259, 8878263},
 {8875452},
 {125},
 {8878255, 8878259, 8878273},
 {8877913, 8878255, 8878261},
 {8875452},
 {8878255, 8878259, 8878273},
 {8878255, 8878259, 8878263},
 {8878255, 8878259, 8878263},
 {8875452, 8876987, 8877046},
 {8878255, 8878259, 8878263},
 {2453107, 2496423, 6260468},
 {8875452},
 {125},
 {8878255, 8878259, 8878270},
 {142364, 8875361, 8878255, 8878259, 8878269},
 {8875452},
 {8875452},
 {8877914, 8878255, 8878259, 8878273},
 {8877914, 8878255, 8878259, 8878273},
 {8876987, 8877006, 8878255, 8878259},
 {2453107, 2496423, 6260468, 8878416},
 {8877915, 8878255, 8878259, 8878263, 8878429},
 {8875452, 8875456, 8878416},
 {7,
  117,
  125,
  796,
  8875031,
  8875108,
  8875111,
  8875115,
  8878062,
  8878293,
  8878416},
 {2453107, 2496423, 6260349},
 {8877968, 8877969},
 {8875452},
 {8875028, 8875033, 8875107, 8878420},
 {8877968, 8877969, 8877970, 8878416},
 {8877968, 8877969, 8877970, 8878416},
 {125},
 {125, 8875031, 8875108, 8875110, 8875113, 8878416},
 {8875452, 8875456, 8876987, 8877000},
 {8875452, 8876987, 8876988, 8878423},
 {8877968, 8877969, 8877980},
 {2453107, 2454099, 2454253},
 {8875452, 8878416},
 {8875452, 8878429},
 {8875452},
 {2453107, 2496423, 2497397},
 {8875452},
 {8875452, 8878416},
 {8875452},
 {8875452, 8875456, 8878416},
 {8875452, 8876987, 8876988, 8878423},
 {8878255, 8878259, 8878273},
 {8875452, 8876987, 8876988, 8878416},
 {8875452},
 {8878255, 8878259, 8878263},
 {2453107, 2454253, 2484526},
 {8876474,
  8876475,
  8876476,
  8876479,
  8876491,
  8876492,
  8876508,
  8878255,
  8878259,
  8878269},
 {8875452},
 {8875452, 8878413},
 {8875452},
 {8875452},
 {8875452,
  8876474,
  8876475,
  8876476,
  8876484,
  8876499,
  8876509,
  8876510,
  8876987,
  8876988,
  8878416},
 {125, 8875028, 8878293, 8878332, 8878415},
 {8875452, 8878293},
 {8876987, 8877006, 8878255, 8878260, 8878416},
 {8875452, 8878443},
 {8875452},
 {8875452},
 {2453107, 2495238, 2495240},
 {8878255, 8878259, 8878263, 8878273},
 {8875452},
 {125, 8875456, 8875458, 8878293, 8878416},
 {125, 8878415}]

In [64]:
# Spot check: run the Thrift-backed Boilerpipe ArticleExtractor on the
# raw_content of the first training object. The displayed result is a dict
# with 'extracted_html' and 'extracted_text' keys.
extract_with_thr_boilerpipe_ArticleExtractor( extractor_training_objects[0]['raw_content'] )


start thrift_bp_extract
returning from thrift_bp_extract
Out[64]:
{'extracted_html': '',
 'extracted_text': u'Metr\xf4 tem confus\xe3o por causa de encontro em shopping; veja v\xeddeo - Bizarro\n\nfonte: Reprodu\xe7\xe3o/Facebook Rol\xea Shopping Itaquera\nHouve superlota\xe7\xe3o e a confus\xe3o foi generalizada\nEra para ser um s\xe1bado muvucado como todos aqueles que antecedem o Natal em mais um shopping de S\xe3o Paulo. Mas foi muito mais do que isso. Milhares de jovens compareceram a um encontro promovido por uma p\xe1gina \xa0no Facebook para este s\xe1bado (7), no segundo andar do Shopping Metr\xf4 Itaquera, \xe0s 17h. Mas o que aconteceu foi uma bagun\xe7a total, com a chegada da pol\xedcia e o fechamento do shopping duas horas antes que o previsto.\nNingu\xe9m sabe ao certo o que aconteceu, alguns dizem que foi arrast\xe3o, mas os pr\xf3prios lojistas negam. De acordo com um rapaz que registrou a movimenta\xe7\xe3o na sa\xedda do metr\xf4, algumas pessoas isoladamente tiveram itens furtados. Infelizmente, o celular do garoto era meio ruim, mas deu para ver a confus\xe3o. Veja aqui o v\xeddeo que ele fez.\nComo muita gente compareceu, houve a confus\xe3o. O certo \xe9 que j\xe1 est\xe1 marcado o terceiro encontro, que \xe9 mais uma badala\xe7\xe3o entre a galera, ou melhor, como diz a pr\xf3pria p\xe1gina , \xe9 s\xf3 um Rol\xeazinho Parte 3. Tudo come\xe7ou na semana passado, no domingo (1), com o Vuuk no Shopping Itaquera.\nVEJA+\n'}

In [72]:
# Spot check: compare the extractors on the first training object. The
# displayed dict holds precision / recall / f1 per extractor name, alongside
# the object's downloads_id, media_id and story_is_spidered fields.
comp_extractors( extractor_training_objects[ 0] )


extracting with thr_boilerpipe
Out[72]:
{'downloads_id': 391881020,
 'gold': {'f1': 1.0, 'precision': 1.0, 'recall': 1.0},
 'media_id': 83371,
 'py_boiler_pipe_ArticleExtractor': {'f1': 0.898148148148148,
  'precision': 0.8220338983050848,
  'recall': 0.9897959183673469},
 'py_boiler_pipe_ArticleSentencesExtractor': {'f1': 0,
  'precision': 0.0,
  'recall': 0.0},
 'py_boiler_pipe_CanolaExtractor': {'f1': 0.8729016786570742,
  'precision': 0.8235294117647058,
  'recall': 0.9285714285714286},
 'py_boiler_pipe_DefaultExtractor': {'f1': 0.8656036446469247,
  'precision': 0.7818930041152263,
  'recall': 0.9693877551020408},
 'py_boiler_pipe_KeepEverythingExtractor': {'f1': 0.4714459295261239,
  'precision': 0.3094098883572568,
  'recall': 0.9897959183673469},
 'py_boiler_pipe_LargestContentExtractor': {'f1': 0.9440389294403893,
  'precision': 0.9023255813953488,
  'recall': 0.9897959183673469},
 'py_boiler_pipe_NumWordsRulesExtractor': {'f1': 0.900473933649289,
  'precision': 0.8407079646017699,
  'recall': 0.9693877551020408},
 'python_readibilty': {'f1': 0.9411764705882353,
  'precision': 0.9435897435897436,
  'recall': 0.9387755102040817},
 'story_is_spidered': False,
 'thr_boiler_pipe_ArticleExtractor': {'f1': 0.938875305623472,
  'precision': 0.9014084507042254,
  'recall': 0.9795918367346939},
 'thr_boiler_pipe_DefaultExtractor': {'f1': 0.7840670859538783,
  'precision': 0.6654804270462633,
  'recall': 0.9540816326530612}}

In [73]:
import datetime

if regenerate_comps_downloads:
    
    comps_downloads = []
    processed = 0
    skipped = 0
    
    start_time = datetime.datetime.now()
    
    e=None
    for extractor_training_object in extractor_training_objects[:]:
        print 'processed ', processed
        print 'skipped ', skipped
        print extractor_training_object[ 'downloads_id']
        try:
            res = comp_extractors( extractor_training_object )
            #print res
            comps_downloads.append( res )
            processed += 1
        except Exception, e:
            print "error on download{}".format( extractor_training_object[ 'downloads_id'] )
            e = sys.exc_info()
            
            import traceback
            
            traceback.print_exc()
            print e
            #raise e
            skipped += 1

    end_time = datetime.datetime.now()
    
    print "Total_time", end_time - start_time
    
    print "Time per download", (end_time - start_time)/ (processed + skipped )
    
    cPickle.dump( comps_downloads, file( 
            os.path.expanduser( "~/Dropbox/mc/extractor_test/comps_downloads.pickle"), "wb"))
    
    
    e
#extractor_training_objects


processed  0
skipped  0
391881020
extracting with thr_boilerpipe
processed  1
skipped  0
401370599
extracting with thr_boilerpipe
processed  2
skipped  0
412896439
extracting with thr_boilerpipe
processed  3
skipped  0
412952145
extracting with thr_boilerpipe
processed  4
skipped  0
412977048
extracting with thr_boilerpipe
processed  5
skipped  0
406397565
extracting with thr_boilerpipe
processed  6
skipped  0
406946308
extracting with thr_boilerpipe
processed  7
skipped  0
407440556
extracting with thr_boilerpipe
processed  8
skipped  0
408175125
extracting with thr_boilerpipe
processed  9
skipped  0
408604940
extracting with thr_boilerpipe
processed  10
skipped  0
408964569
extracting with thr_boilerpipe
processed  11
skipped  0
409307571
extracting with thr_boilerpipe
processed  12
skipped  0
409362014
extracting with thr_boilerpipe
processed  13
skipped  0
410019932
extracting with thr_boilerpipe
processed  14
skipped  0
410769033
extracting with thr_boilerpipe
processed  15
skipped  0
410893054
extracting with thr_boilerpipe
processed  16
skipped  0
411494070
extracting with thr_boilerpipe
processed  17
skipped  0
412822633
extracting with thr_boilerpipe
processed  18
skipped  0
412832327
extracting with thr_boilerpipe
processed  19
skipped  0
413070223
extracting with thr_boilerpipe
processed  20
skipped  0
413080625
extracting with thr_boilerpipe
processed  21
skipped  0
413724188
extracting with thr_boilerpipe
processed  22
skipped  0
413990063
extracting with thr_boilerpipe
processed  23
skipped  0
414001946
extracting with thr_boilerpipe
processed  24
skipped  0
414705852
extracting with thr_boilerpipe
processed  25
skipped  0
414754369
extracting with thr_boilerpipe
processed  26
skipped  0
414974366
extracting with thr_boilerpipe
processed  27
skipped  0
415381471
extracting with thr_boilerpipe
processed  28
skipped  0
415752815
extracting with thr_boilerpipe
processed  29
skipped  0
415977284
extracting with thr_boilerpipe
processed  30
skipped  0
416560140
extracting with thr_boilerpipe
processed  31
skipped  0
416773947
extracting with thr_boilerpipe
processed  32
skipped  0
416931217
extracting with thr_boilerpipe
processed  33
skipped  0
417026931
extracting with thr_boilerpipe
processed  34
skipped  0
417913575
extracting with thr_boilerpipe
processed  35
skipped  0
418921816
extracting with thr_boilerpipe
processed  36
skipped  0
419059149
extracting with thr_boilerpipe
processed  37
skipped  0
419061380
extracting with thr_boilerpipe
processed  38
skipped  0
419312194
extracting with thr_boilerpipe
processed  39
skipped  0
419463576
extracting with thr_boilerpipe
processed  40
skipped  0
419897309
extracting with thr_boilerpipe
processed  41
skipped  0
420605489
extracting with thr_boilerpipe
processed  42
skipped  0
420695976
extracting with thr_boilerpipe
processed  43
skipped  0
420972612
extracting with thr_boilerpipe
processed  44
skipped  0
421512071
extracting with thr_boilerpipe
processed  45
skipped  0
421586812
extracting with thr_boilerpipe
processed  46
skipped  0
421950008
extracting with thr_boilerpipe
processed  47
skipped  0
422329395
extracting with thr_boilerpipe
processed  48
skipped  0
423644184
extracting with thr_boilerpipe
processed  49
skipped  0
423780415
extracting with thr_boilerpipe
processed  50
skipped  0
423811752
extracting with thr_boilerpipe
processed  51
skipped  0
423826345
extracting with thr_boilerpipe
processed  52
skipped  0
426274790
extracting with thr_boilerpipe
processed  53
skipped  0
426745030
extracting with thr_boilerpipe
processed  54
skipped  0
426958717
extracting with thr_boilerpipe
processed  55
skipped  0
427023406
extracting with thr_boilerpipe
processed  56
skipped  0
427039192
extracting with thr_boilerpipe
processed  57
skipped  0
427131787
extracting with thr_boilerpipe
processed  58
skipped  0
427645929
extracting with thr_boilerpipe
processed  59
skipped  0
427851499
extracting with thr_boilerpipe
processed  60
skipped  0
428523804
extracting with thr_boilerpipe
processed  61
skipped  0
429112619
extracting with thr_boilerpipe
processed  62
skipped  0
429500447
extracting with thr_boilerpipe
processed  63
skipped  0
429714766
extracting with thr_boilerpipe
processed  64
skipped  0
429793967
extracting with thr_boilerpipe
processed  65
skipped  0
430099220
extracting with thr_boilerpipe
processed  66
skipped  0
430660615
extracting with thr_boilerpipe
processed  67
skipped  0
431012388
extracting with thr_boilerpipe
processed  68
skipped  0
431311136
extracting with thr_boilerpipe
processed  69
skipped  0
431387007
extracting with thr_boilerpipe
processed  70
skipped  0
431606529
extracting with thr_boilerpipe
processed  71
skipped  0
431839510
extracting with thr_boilerpipe
processed  72
skipped  0
431905299
extracting with thr_boilerpipe
processed  73
skipped  0
432896524
extracting with thr_boilerpipe
processed  74
skipped  0
432929604
extracting with thr_boilerpipe
processed  75
skipped  0
433044130
extracting with thr_boilerpipe
processed  76
skipped  0
434192180
extracting with thr_boilerpipe
processed  77
skipped  0
434250046
extracting with thr_boilerpipe
processed  78
skipped  0
434842660
extracting with thr_boilerpipe
processed  79
skipped  0
435142187
extracting with thr_boilerpipe
processed  80
skipped  0
435417572
extracting with thr_boilerpipe
processed  81
skipped  0
435417726
extracting with thr_boilerpipe
processed  82
skipped  0
435795096
extracting with thr_boilerpipe
processed  83
skipped  0
436540640
extracting with thr_boilerpipe
processed  84
skipped  0
436554531
extracting with thr_boilerpipe
processed  85
skipped  0
437087573
extracting with thr_boilerpipe
processed  86
skipped  0
437239472
extracting with thr_boilerpipe
processed  87
skipped  0
437281074
extracting with thr_boilerpipe
processed  88
skipped  0
437315186
extracting with thr_boilerpipe
processed  89
skipped  0
437523403
extracting with thr_boilerpipe
processed  90
skipped  0
437526788
extracting with thr_boilerpipe
processed  91
skipped  0
438596131
extracting with thr_boilerpipe
processed  92
skipped  0
438672710
extracting with thr_boilerpipe
processed  93
skipped  0
439340014
extracting with thr_boilerpipe
error on download439340014
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8d51b8>)
processed  93
skipped  1
439491986
extracting with thr_boilerpipe
processed  94
skipped  1
439619864
extracting with thr_boilerpipe
processed  95
skipped  1
439631363
extracting with thr_boilerpipe
processed  96
skipped  1
440193709
extracting with thr_boilerpipe
processed  97
skipped  1
440219216
extracting with thr_boilerpipe
processed  98
skipped  1
440780462
extracting with thr_boilerpipe
processed  99
skipped  1
441039343
extracting with thr_boilerpipe
processed  100
skipped  1
441313630
extracting with thr_boilerpipe
processed  101
skipped  1
441455864
extracting with thr_boilerpipe
processed  102
skipped  1
441846112
extracting with thr_boilerpipe
processed  103
skipped  1
442808951
extracting with thr_boilerpipe
processed  104
skipped  1
442862362
extracting with thr_boilerpipe
processed  105
skipped  1
443616755
extracting with thr_boilerpipe
processed  106
skipped  1
444570682
extracting with thr_boilerpipe
processed  107
skipped  1
444894095
extracting with thr_boilerpipe
processed  108
skipped  1
445086933
extracting with thr_boilerpipe
processed  109
skipped  1
445428584
extracting with thr_boilerpipe
processed  110
skipped  1
445477281
extracting with thr_boilerpipe
processed  111
skipped  1
446094222
extracting with thr_boilerpipe
processed  112
skipped  1
446613221
extracting with thr_boilerpipe
processed  113
skipped  1
447062817
extracting with thr_boilerpipe
processed  114
skipped  1
447776250
extracting with thr_boilerpipe
processed  115
skipped  1
448661284
extracting with thr_boilerpipe
processed  116
skipped  1
448661534
extracting with thr_boilerpipe
processed  117
skipped  1
448892156
extracting with thr_boilerpipe
processed  118
skipped  1
448905892
extracting with thr_boilerpipe
processed  119
skipped  1
449094788
extracting with thr_boilerpipe
processed  120
skipped  1
449256732
extracting with thr_boilerpipe
processed  121
skipped  1
450223351
extracting with thr_boilerpipe
processed  122
skipped  1
450797637
extracting with thr_boilerpipe
processed  123
skipped  1
452232437
extracting with thr_boilerpipe
processed  124
skipped  1
453272806
extracting with thr_boilerpipe
processed  125
skipped  1
454046798
extracting with thr_boilerpipe
processed  126
skipped  1
454492602
extracting with thr_boilerpipe
processed  127
skipped  1
455066382
extracting with thr_boilerpipe
processed  128
skipped  1
455593030
extracting with thr_boilerpipe
processed  129
skipped  1
456789442
extracting with thr_boilerpipe
processed  130
skipped  1
457113351
extracting with thr_boilerpipe
processed  131
skipped  1
457749321
extracting with thr_boilerpipe
processed  132
skipped  1
457918791
extracting with thr_boilerpipe
processed  133
skipped  1
457945604
extracting with thr_boilerpipe
processed  134
skipped  1
458053172
extracting with thr_boilerpipe
processed  135
skipped  1
458710845
extracting with thr_boilerpipe
processed  136
skipped  1
459297373
extracting with thr_boilerpipe
processed  137
skipped  1
459314297
extracting with thr_boilerpipe
processed  138
skipped  1
459678712
extracting with thr_boilerpipe
processed  139
skipped  1
459731765
extracting with thr_boilerpipe
processed  140
skipped  1
461252885
extracting with thr_boilerpipe
processed  141
skipped  1
462404061
extracting with thr_boilerpipe
processed  142
skipped  1
462983422
extracting with thr_boilerpipe
processed  143
skipped  1
463954045
extracting with thr_boilerpipe
processed  144
skipped  1
464270345
extracting with thr_boilerpipe
processed  145
skipped  1
465068895
extracting with thr_boilerpipe
processed  146
skipped  1
465090983
extracting with thr_boilerpipe
processed  147
skipped  1
466804114
extracting with thr_boilerpipe
processed  148
skipped  1
467150362
extracting with thr_boilerpipe
processed  149
skipped  1
468568475
extracting with thr_boilerpipe
processed  150
skipped  1
468788039
extracting with thr_boilerpipe
processed  151
skipped  1
468996262
extracting with thr_boilerpipe
processed  152
skipped  1
469058922
extracting with thr_boilerpipe
processed  153
skipped  1
470488107
extracting with thr_boilerpipe
processed  154
skipped  1
473138193
extracting with thr_boilerpipe
processed  155
skipped  1
474060840
extracting with thr_boilerpipe
processed  156
skipped  1
474649098
extracting with thr_boilerpipe
processed  157
skipped  1
474891663
extracting with thr_boilerpipe
processed  158
skipped  1
475232740
extracting with thr_boilerpipe
processed  159
skipped  1
475489411
extracting with thr_boilerpipe
processed  160
skipped  1
475550192
extracting with thr_boilerpipe
processed  161
skipped  1
475842962
extracting with thr_boilerpipe
processed  162
skipped  1
476050916
extracting with thr_boilerpipe
processed  163
skipped  1
476079482
extracting with thr_boilerpipe
processed  164
skipped  1
476189164
extracting with thr_boilerpipe
processed  165
skipped  1
476365487
extracting with thr_boilerpipe
processed  166
skipped  1
476850484
extracting with thr_boilerpipe
processed  167
skipped  1
476936059
extracting with thr_boilerpipe
processed  168
skipped  1
476962103
extracting with thr_boilerpipe
processed  169
skipped  1
477126381
extracting with thr_boilerpipe
processed  170
skipped  1
477171081
extracting with thr_boilerpipe
processed  171
skipped  1
477673129
extracting with thr_boilerpipe
processed  172
skipped  1
477798748
extracting with thr_boilerpipe
processed  173
skipped  1
477850182
extracting with thr_boilerpipe
processed  174
skipped  1
478307074
extracting with thr_boilerpipe
processed  175
skipped  1
478793359
extracting with thr_boilerpipe
processed  176
skipped  1
479262110
extracting with thr_boilerpipe
processed  177
skipped  1
479410656
extracting with thr_boilerpipe
processed  178
skipped  1
480072496
extracting with thr_boilerpipe
processed  179
skipped  1
480080971
extracting with thr_boilerpipe
processed  180
skipped  1
480850060
extracting with thr_boilerpipe
processed  181
skipped  1
480965210
extracting with thr_boilerpipe
processed  182
skipped  1
481064362
extracting with thr_boilerpipe
processed  183
skipped  1
481747325
extracting with thr_boilerpipe
processed  184
skipped  1
481956983
extracting with thr_boilerpipe
processed  185
skipped  1
481995599
extracting with thr_boilerpipe
processed  186
skipped  1
482339280
extracting with thr_boilerpipe
processed  187
skipped  1
482455316
extracting with thr_boilerpipe
processed  188
skipped  1
482905957
extracting with thr_boilerpipe
processed  189
skipped  1
483167965
extracting with thr_boilerpipe
processed  190
skipped  1
483933348
extracting with thr_boilerpipe
processed  191
skipped  1
484173730
extracting with thr_boilerpipe
processed  192
skipped  1
484469651
extracting with thr_boilerpipe
processed  193
skipped  1
485257678
extracting with thr_boilerpipe
processed  194
skipped  1
485289153
extracting with thr_boilerpipe
processed  195
skipped  1
485312050
extracting with thr_boilerpipe
processed  196
skipped  1
485772601
extracting with thr_boilerpipe
processed  197
skipped  1
485874387
extracting with thr_boilerpipe
processed  198
skipped  1
486956929
extracting with thr_boilerpipe
processed  199
skipped  1
486963918
extracting with thr_boilerpipe
processed  200
skipped  1
486975782
extracting with thr_boilerpipe
processed  201
skipped  1
487099193
extracting with thr_boilerpipe
processed  202
skipped  1
487930345
extracting with thr_boilerpipe
processed  203
skipped  1
487938684
extracting with thr_boilerpipe
processed  204
skipped  1
488325235
extracting with thr_boilerpipe
processed  205
skipped  1
489427373
extracting with thr_boilerpipe
processed  206
skipped  1
489785301
extracting with thr_boilerpipe
processed  207
skipped  1
490000422
extracting with thr_boilerpipe
processed  208
skipped  1
491390357
extracting with thr_boilerpipe
processed  209
skipped  1
491394627
extracting with thr_boilerpipe
processed  210
skipped  1
491580307
extracting with thr_boilerpipe
processed  211
skipped  1
491868589
extracting with thr_boilerpipe
processed 
Traceback (most recent call last):
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
 212
skipped  1
492035281
extracting with thr_boilerpipe
error on download492035281
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8dd248>)
processed  212
skipped  2
492212969
extracting with thr_boilerpipe
processed  213
skipped  2
492220330
extracting with thr_boilerpipe
processed  214
skipped  2
492825637
extracting with thr_boilerpipe
processed  215
skipped  2
492998487
extracting with thr_boilerpipe
processed  216
skipped  2
493395197
extracting with thr_boilerpipe
processed  217
skipped  2
493603632
extracting with thr_boilerpipe
processed  218
skipped  2
494910637
extracting with thr_boilerpipe
processed  219
skipped  2
495132412
extracting with thr_boilerpipe
processed  220
skipped  2
495402183
extracting with thr_boilerpipe
processed  221
skipped  2
495442862
extracting with thr_boilerpipe
processed  222
skipped  2
495702037
extracting with thr_boilerpipe
processed  223
skipped  2
495856120
extracting with thr_boilerpipe
processed  224
skipped  2
496830079
extracting with thr_boilerpipe
processed  225
skipped  2
496960779
extracting with thr_boilerpipe
processed  226
skipped  2
497510792
extracting with thr_boilerpipe
processed  227
skipped  2
497750204
extracting with thr_boilerpipe
processed  228
skipped  2
498241896
extracting with thr_boilerpipe
processed  229
skipped  2
498496981
extracting with thr_boilerpipe
processed  230
skipped  2
498497707
extracting with thr_boilerpipe
processed  231
skipped  2
499288413
extracting with thr_boilerpipe
processed  232
skipped  2
499367463
extracting with thr_boilerpipe
processed  233
skipped  2
499577868
extracting with thr_boilerpipe
processed  234
skipped  2
499738311
extracting with thr_boilerpipe
processed  235
skipped  2
499979607
extracting with thr_boilerpipe
processed  236
skipped  2
500618275
extracting with thr_boilerpipe
processed  237
skipped  2
501409722
extracting with thr_boilerpipe
processed  238
skipped  2
501420351
extracting with thr_boilerpipe
processed  239
skipped  2
501420884
extracting with thr_boilerpipe
processed  240
skipped  2
501421014
extracting with thr_boilerpipe
processed  241
skipped  2
501431149
extracting with thr_boilerpipe
processed  242
skipped  2
501442658
extracting with thr_boilerpipe
processed  243
skipped  2
501453175
extracting with thr_boilerpipe
processed  244
skipped  2
501471473
extracting with thr_boilerpipe
processed  245
skipped  2
502622175
extracting with thr_boilerpipe
processed  246
skipped  2
502653177
extracting with thr_boilerpipe
processed  247
skipped  2
502830407
extracting with thr_boilerpipe
processed  248
skipped  2
503101426
extracting with thr_boilerpipe
processed  249
skipped  2
503171821
extracting with thr_boilerpipe
processed  250
skipped  2
503183518
extracting with thr_boilerpipe
processed  251
skipped  2
503244112
extracting with thr_boilerpipe
processed  252
skipped  2
503478903
extracting with thr_boilerpipe
processed  253
skipped  2
504316681
extracting with thr_boilerpipe
processed  254
skipped  2
504570818
extracting with thr_boilerpipe
processed  255
skipped  2
504702541
extracting with thr_boilerpipe
processed  256
skipped  2
505354116
extracting with thr_boilerpipe
processed  257
skipped  2
505354986
extracting with thr_boilerpipe
processed  258
skipped  2
505362351
extracting with thr_boilerpipe
processed  259
skipped  2
505513151
extracting with thr_boilerpipe
processed  260
skipped  2
505554845
extracting with thr_boilerpipe
processed  261
skipped  2
505630012
extracting with thr_boilerpipe
processed  262
skipped  2
505666313
extracting with thr_boilerpipe
processed  263
skipped  2
505709237
extracting with thr_boilerpipe
processed  264
skipped  2
505875260
extracting with thr_boilerpipe
processed  265
skipped  2
505891713
extracting with thr_boilerpipe
processed  266
skipped  2
505960251
extracting with thr_boilerpipe
processed  267
skipped  2
506245926
extracting with thr_boilerpipe
processed  268
skipped  2
506248868
extracting with thr_boilerpipe
processed  269
skipped  2
506288913
extracting with thr_boilerpipe
processed  270
skipped  2
506374300
extracting with thr_boilerpipe
processed  271
skipped  2
506905377
extracting with thr_boilerpipe
processed  272
skipped  2
508032017
extracting with thr_boilerpipe
processed  273
skipped  2
508653746
extracting with thr_boilerpipe
processed  274
skipped  2
509162462
extracting with thr_boilerpipe
processed  275
skipped  2
509252393
extracting with thr_boilerpipe
processed  276
skipped  2
509863893
extracting with thr_boilerpipe
processed  277
skipped  2
510413590
extracting with thr_boilerpipe
processed  278
skipped  2
510662547
extracting with thr_boilerpipe
processed  279
skipped  2
510881596
extracting with thr_boilerpipe
processed  280
skipped  2
510954456
extracting with thr_boilerpipe
processed  281
skipped  2
510955330
extracting with thr_boilerpipe
processed  282
skipped  2
511823889
extracting with thr_boilerpipe
processed  283
skipped  2
512505124
extracting with thr_boilerpipe
processed  284
skipped  2
512677517
extracting with thr_boilerpipe
processed  285
skipped  2
512765019
extracting with thr_boilerpipe
processed  286
skipped  2
514105704
extracting with thr_boilerpipe
processed  287
skipped  2
514153344
extracting with thr_boilerpipe
processed  288
skipped  2
514364011
extracting with thr_boilerpipe
processed  289
skipped  2
514378750
extracting with thr_boilerpipe
processed  290
skipped  2
514572763
extracting with thr_boilerpipe
processed  291
skipped  2
514931592
extracting with thr_boilerpipe
processed  292
skipped  2
515594059
extracting with thr_boilerpipe
processed  293
skipped  2
515980414
extracting with thr_boilerpipe
processed  294
skipped  2
516147422
extracting with thr_boilerpipe
processed  295
skipped  2
516155213
extracting with thr_boilerpipe
processed  296
skipped  2
516527178
extracting with thr_boilerpipe
processed  297
skipped  2
517088904
extracting with thr_boilerpipe
processed  298
skipped  2
517204863
extracting with thr_boilerpipe
processed  299
skipped  2
517393170
extracting with thr_boilerpipe
processed  300
skipped  2
518149302
extracting with thr_boilerpipe
processed  301
skipped  2
518752724
extracting with thr_boilerpipe
processed  302
skipped  2
518815044
extracting with thr_boilerpipe
processed  303
skipped  2
518917973
extracting with thr_boilerpipe
processed  304
skipped  2
519698497
extracting with thr_boilerpipe
processed  305
skipped  2
519788971
extracting with thr_boilerpipe
processed  306
skipped  2
519948923
extracting with thr_boilerpipe
processed  307
skipped  2
520302517
extracting with thr_boilerpipe
processed  308
skipped  2
520916358
extracting with thr_boilerpipe
processed  309
skipped  2
521501901
extracting with thr_boilerpipe
processed  310
skipped  2
521596942
extracting with thr_boilerpipe
processed  311
skipped  2
524336614
extracting with thr_boilerpipe
processed  312
skipped  2
524583342
extracting with thr_boilerpipe
processed  313
skipped  2
524747370
extracting with thr_boilerpipe
processed  314
skipped  2
525018299
extracting with thr_boilerpipe
processed  315
skipped  2
525274342
extracting with thr_boilerpipe
processed  316
skipped  2
525762267
extracting with thr_boilerpipe
processed  317
skipped  2
526210669
extracting with thr_boilerpipe
processed  318
skipped  2
526399492
extracting with thr_boilerpipe
processed  319
skipped  2
526437284
extracting with thr_boilerpipe
processed  320
skipped  2
526513004
extracting with thr_boilerpipe
processed  321
skipped  2
526972862
extracting with thr_boilerpipe
processed  322
skipped  2
527258084
extracting with thr_boilerpipe
processed  323
skipped  2
528174162
extracting with thr_boilerpipe
processed  324
skipped  2
528717265
extracting with thr_boilerpipe
processed  325
skipped  2
529134306
extracting with thr_boilerpipe
processed  326
skipped  2
529174292
extracting with thr_boilerpipe
processed  327
skipped  2
529318707
extracting with thr_boilerpipe
processed  328
skipped  2
529350751
extracting with thr_boilerpipe
processed  329
skipped  2
529500550
extracting with thr_boilerpipe
processed  330
skipped  2
529638751
extracting with thr_boilerpipe
processed  331
skipped  2
529645874
extracting with thr_boilerpipe
processed  332
skipped  2
529645941
extracting with thr_boilerpipe
processed  333
skipped  2
529698279
extracting with thr_boilerpipe
processed  334
skipped  2
529792413
extracting with thr_boilerpipe
processed  335
skipped  2
529835888
extracting with thr_boilerpipe
processed  336
skipped  2
529849320
extracting with thr_boilerpipe
processed  337
skipped  2
529860654
extracting with thr_boilerpipe
processed  338
skipped  2
529872483
extracting with thr_boilerpipe
processed  339
skipped  2
529878837
extracting with thr_boilerpipe
processed  340
skipped  2
529887477
extracting with thr_boilerpipe
processed  341
skipped  2
529897403
extracting with thr_boilerpipe
processed  342
skipped  2
529897845
extracting with thr_boilerpipe
processed  343
skipped  2
529985569
extracting with thr_boilerpipe
processed  344
skipped  2
530078770
extracting with thr_boilerpipe
processed  345
skipped  2
530528969
extracting with thr_boilerpipe
processed  346
skipped  2
530758329
extracting with thr_boilerpipe
processed  347
skipped  2
530760478
extracting with thr_boilerpipe
processed  348
skipped  2
531116330
extracting with thr_boilerpipe
processed  349
skipped  2
531271665
extracting with thr_boilerpipe
processed  350
skipped  2
531541254
extracting with thr_boilerpipe
processed  351
skipped  2
531805217
extracting with thr_boilerpipe
processed  352
skipped  2
531997009
extracting with thr_boilerpipe
processed  353
skipped  2
532554918
extracting with thr_boilerpipe
processed  354
skipped  2
532672804
extracting with thr_boilerpipe
processed  355
skipped  2
533194544
extracting with thr_boilerpipe
processed  356
skipped  2
533730844
extracting with thr_boilerpipe
processed  357
skipped  2
533889958
extracting with thr_boilerpipe
processed  358
skipped  2
534093073
extracting with thr_boilerpipe
processed  359
skipped  2
534505078
extracting with thr_boilerpipe
processed  360
skipped  2
534718899
extracting with thr_boilerpipe
processed  361
skipped  2
534742315
extracting with thr_boilerpipe
processed  362
skipped  2
534981910
extracting with thr_boilerpipe
processed  363
skipped  2
535201962
extracting with thr_boilerpipe
processed  364
skipped  2
535469332
extracting with thr_boilerpipe
processed  365
skipped  2
535880958
extracting with thr_boilerpipe
processed  366
skipped  2
536622410
extracting with thr_boilerpipe
processed  367
skipped  2
536622427
extracting with thr_boilerpipe
processed  368
skipped  2
536670142
extracting with thr_boilerpipe
processed  369
skipped  2
536689603
extracting with thr_boilerpipe
processed  370
skipped  2
536697475
extracting with thr_boilerpipe
processed  371
skipped  2
536720884
extracting with thr_boilerpipe
processed  372
skipped  2
536817371
extracting with thr_boilerpipe
processed  373
skipped  2
536828101
extracting with thr_boilerpipe
processed  374
skipped  2
536833731
extracting with thr_boilerpipe
processed  375
skipped  2
536870204
extracting with thr_boilerpipe
processed  376
skipped  2
536977072
extracting with thr_boilerpipe
processed  377
skipped  2
537231678
extracting with thr_boilerpipe
processed  378
skipped  2
537256396
extracting with thr_boilerpipe
processed  379
skipped  2
537501183
extracting with thr_boilerpipe
processed  380
skipped  2
537704893
extracting with thr_boilerpipe
processed  381
skipped  2
538721777
extracting with thr_boilerpipe
processed  382
skipped  2
539056055
extracting with thr_boilerpipe
processed  383
skipped  2
539126425
extracting with thr_boilerpipe
processed  384
skipped  2
539382819
extracting with thr_boilerpipe
processed  385
skipped  2
539387198
extracting with thr_boilerpipe
processed  386
skipped  2
539389371
extracting with thr_boilerpipe
processed  387
skipped  2
539392922
extracting with thr_boilerpipe
processed  388
skipped  2
539411169
extracting with thr_boilerpipe
processed  389
skipped  2
539415012
extracting with thr_boilerpipe
processed  390
skipped  2
539423034
extracting with thr_boilerpipe
processed  391
skipped  2
539444342
extracting with thr_boilerpipe
processed  392
skipped  2
539444757
extracting with thr_boilerpipe
processed  393
skipped  2
539445540
extracting with thr_boilerpipe
processed  394
skipped  2
539453644
extracting with thr_boilerpipe
processed  395
skipped  2
539482866
extracting with thr_boilerpipe
processed  396
skipped  2
539483121
extracting with thr_boilerpipe
processed  397
skipped  2
539535265
extracting with thr_boilerpipe
processed  398
skipped  2
539535898
extracting with thr_boilerpipe
processed  399
skipped  2
539639827
extracting with thr_boilerpipe
processed  400
skipped  2
539693458
extracting with thr_boilerpipe
processed  401
skipped  2
539699310
extracting with thr_boilerpipe
processed  402
skipped  2
539784484
extracting with thr_boilerpipe
processed  403
skipped  2
539855645
extracting with thr_boilerpipe
processed  404
skipped  2
539862886
extracting with thr_boilerpipe
processed  405
skipped  2
540049468
extracting with thr_boilerpipe
processed  406
skipped  2
540072482
extracting with thr_boilerpipe
processed  407
skipped  2
540124060
extracting with thr_boilerpipe
processed  408
skipped  2
540219129
extracting with thr_boilerpipe
processed  409
skipped  2
540246359
extracting with thr_boilerpipe
processed  410
skipped  2
540356681
extracting with thr_boilerpipe
processed  411
skipped  2
540448548
extracting with thr_boilerpipe
processed  412
skipped  2
540628443
extracting with thr_boilerpipe
processed  413
skipped  2
540729123
extracting with thr_boilerpipe
processed  414
skipped  2
540938254
extracting with thr_boilerpipe
processed  415
skipped  2
541044672
extracting with thr_boilerpipe
processed  416
skipped  2
541380062
extracting with thr_boilerpipe
processed  417
skipped  2
541787974
extracting with thr_boilerpipe
processed  418
skipped  2
541944011
extracting with thr_boilerpipe
processed  419
skipped  2
542151181
extracting with thr_boilerpipe
processed  420
skipped  2
542292829
extracting with thr_boilerpipe
processed  421
skipped  2
542601235
extracting with thr_boilerpipe
processed  422
skipped  2
542607593
extracting with thr_boilerpipe
processed  423
skipped  2
542722436
extracting with thr_boilerpipe
processed  424
skipped  2
543002984
extracting with thr_boilerpipe
processed  425
skipped  2
544804153
extracting with thr_boilerpipe
processed  426
skipped  2
545408829
extracting with thr_boilerpipe
processed  427
skipped  2
545490257
extracting with thr_boilerpipe
processed  428
skipped  2
546226391
extracting with thr_boilerpipe
processed  429
skipped  2
546447285
extracting with thr_boilerpipe
processed  430
skipped  2
546452241
extracting with thr_boilerpipe
processed  431
skipped  2
546475377
extracting with thr_boilerpipe
processed  432
skipped  2
546744400
extracting with thr_boilerpipe
processed  433
skipped  2
546865980
extracting with thr_boilerpipe
processed  434
skipped  2
547134278
extracting with thr_boilerpipe
processed  435
skipped  2
547947151
extracting with thr_boilerpipe
processed  436
skipped  2
550077777
extracting with thr_boilerpipe
processed  437
skipped  2
550220223
extracting with thr_boilerpipe
processed  438
skipped  2
550246134
extracting with thr_boilerpipe
processed  439
skipped  2
550918776
extracting with thr_boilerpipe
processed  440
skipped  2
551714821
extracting with thr_boilerpipe
processed  441
skipped  2
551991048
extracting with thr_boilerpipe
processed  442
skipped  2
552179030
extracting with thr_boilerpipe
processed  443
skipped  2
552285278
extracting with thr_boilerpipe
processed  444
skipped  2
552343286
extracting with thr_boilerpipe
processed  445
skipped  2
552450557
extracting with thr_boilerpipe
processed  446
skipped  2
553165338
extracting with thr_boilerpipe
processed  447
skipped  2
554883505
extracting with thr_boilerpipe
processed  448
skipped  2
555372352
extracting with thr_boilerpipe
processed  449
skipped  2
555689825
extracting with thr_boilerpipe
processed  450
skipped  2
555843132
extracting with thr_boilerpipe
processed  451
skipped  2
556645479
extracting with thr_boilerpipe
processed  452
skipped  2
556934454
extracting with thr_boilerpipe
processed  453
skipped  2
557152619
extracting with thr_boilerpipe
processed  454
skipped  2
557237971
extracting with thr_boilerpipe
processed  455
skipped  2
557521276
extracting with thr_boilerpipe
processed  456
skipped  2
558095303
extracting with thr_boilerpipe
processed  457
skipped  2
558197649
extracting with thr_boilerpipe
processed  458
skipped  2
558655687
extracting with thr_boilerpipe
processed  459
skipped  2
558851890
extracting with thr_boilerpipe
processed  460
skipped  2
559736417
extracting with thr_boilerpipe
processed  461
skipped  2
559785151
extracting with thr_boilerpipe
processed  462
skipped  2
560048673
extracting with thr_boilerpipe
processed  463
skipped  2
560090309
extracting with thr_boilerpipe
processed  464
skipped  2
560127916
extracting with thr_boilerpipe
processed  465
skipped  2
560262829
extracting with thr_boilerpipe
processed  466
skipped  2
560310961
extracting with thr_boilerpipe
processed  467
skipped  2
560339085
extracting with thr_boilerpipe
processed  468
skipped  2
560351631
extracting with thr_boilerpipe
processed  469
skipped  2
560378287
extracting with thr_boilerpipe
processed  470
skipped  2
560417790
extracting with thr_boilerpipe
processed  471
skipped  2
560535896
extracting with thr_boilerpipe
processed  472
skipped  2
560707952
extracting with thr_boilerpipe
processed  473
skipped  2
560751009
extracting with thr_boilerpipe
processed  474
skipped  2
560768548
extracting with thr_boilerpipe
processed  475
skipped  2
560842330
extracting with thr_boilerpipe
processed  476
skipped  2
561122957
extracting with thr_boilerpipe
processed  477
skipped  2
561174738
extracting with thr_boilerpipe
processed  478
skipped  2
561368626
extracting with thr_boilerpipe
processed  479
skipped  2
561800981
extracting with thr_boilerpipe
processed  480
skipped  2
562073055
extracting with thr_boilerpipe
processed  481
skipped  2
562399059
extracting with thr_boilerpipe
processed  482
skipped  2
562399486
extracting with thr_boilerpipe
processed  483
skipped  2
562736854
extracting with thr_boilerpipe
processed  484
skipped  2
562742684
extracting with thr_boilerpipe
processed  485
skipped  2
562984785
extracting with thr_boilerpipe
processed  486
skipped  2
563073521
extracting with thr_boilerpipe
processed  487
skipped  2
563250031
extracting with thr_boilerpipe
processed  488
skipped  2
563556588
extracting with thr_boilerpipe
processed  489
skipped  2
563582892
extracting with thr_boilerpipe
processed  490
skipped  2
563851373
extracting with thr_boilerpipe
processed  491
skipped  2
564075589
extracting with thr_boilerpipe
processed  492
skipped  2
564196161
extracting with thr_boilerpipe
processed  493
skipped  2
564418488
extracting with thr_boilerpipe
processed  494
skipped  2
564457008
extracting with thr_boilerpipe
processed  495
skipped  2
565254169
extracting with thr_boilerpipe
processed  496
skipped  2
565270689
extracting with thr_boilerpipe
processed  497
skipped  2
565612572
extracting with thr_boilerpipe
processed  498
skipped  2
565620132
extracting with thr_boilerpipe
processed  499
skipped  2
565774715
extracting with thr_boilerpipe
error on download565774715
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c69406098>)
processed  499
skipped  3
565831221
extracting with thr_boilerpipe
processed  500
skipped  3
566175982
extracting with thr_boilerpipe
processed  501
skipped  3
566194155
extracting with thr_boilerpipe
processed  502
skipped  3
566220098
extracting with thr_boilerpipe
processed  503
skipped  3
566546448
extracting with thr_boilerpipe
processed  504
skipped  3
566592645
extracting with thr_boilerpipe
processed  505
skipped  3
566726127
extracting with thr_boilerpipe
processed  506
skipped  3
567132910
extracting with thr_boilerpipe
processed  507
skipped  3
567149912
extracting with thr_boilerpipe
processed  508
skipped  3
567150914
extracting with thr_boilerpipe
processed  509
skipped  3
567201539
extracting with thr_boilerpipe
processed  510
skipped  3
567387189
extracting with thr_boilerpipe
processed  511
skipped  3
567440968
extracting with thr_boilerpipe
processed  512
skipped  3
567594230
extracting with thr_boilerpipe
processed  513
skipped  3
567706084
extracting with thr_boilerpipe
processed  514
skipped  3
567808993
extracting with thr_boilerpipe
processed  515
skipped  3
568058112
extracting with thr_boilerpipe
processed  516
skipped  3
568220153
extracting with thr_boilerpipe
processed  517
skipped  3
568278639
extracting with thr_boilerpipe
processed  518
skipped  3
568314417
extracting with thr_boilerpipe
processed  519
skipped  3
568405515
extracting with thr_boilerpipe
processed  520
skipped  3
568873296
extracting with thr_boilerpipe
processed  521
skipped  3
569385275
extracting with thr_boilerpipe
processed  522
skipped  3
569400896
extracting with thr_boilerpipe
processed  523
skipped  3
569440622
extracting with thr_boilerpipe
error on download569440622
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8ca518>)
processed  523
skipped  4
569448540
extracting with thr_boilerpipe
processed  524
skipped  4
569458361
extracting with thr_boilerpipe
processed  525
skipped  4
569473458
extracting with thr_boilerpipe
processed  526
skipped  4
570126417
extracting with thr_boilerpipe
processed  527
skipped  4
570281609
extracting with thr_boilerpipe
processed  528
skipped  4
570420066
extracting with thr_boilerpipe
processed  529
skipped  4
570820516
extracting with thr_boilerpipe
processed  530
skipped  4
571250692
extracting with thr_boilerpipe
processed  531
skipped  4
572427751
extracting with thr_boilerpipe
processed  532
skipped  4
572497331
extracting with thr_boilerpipe
processed  533
skipped  4
572595598
extracting with thr_boilerpipe
processed  534
skipped  4
576800952
extracting with thr_boilerpipe
processed  535
skipped  4
576826346
extracting with thr_boilerpipe
processed  536
skipped  4
576906221
extracting with thr_boilerpipe
processed  537
skipped  4
577070880
extracting with thr_boilerpipe
processed  538
skipped  4
577076453
extracting with thr_boilerpipe
processed  539
skipped  4
577226126
extracting with thr_boilerpipe
error on download577226126
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8d51b8>)
processed  539
skipped  5
578103835
extracting with thr_boilerpipe
processed  540
skipped  5
578124694
extracting with thr_boilerpipe
processed  541
skipped  5
578153622
extracting with thr_boilerpipe
processed  542
skipped  5
578156115
extracting with thr_boilerpipe
processed  543
skipped  5
578156372
extracting with thr_boilerpipe
processed  544
skipped  5
578156412
extracting with thr_boilerpipe
processed  545
skipped  5
578167861
extracting with thr_boilerpipe
processed  546
skipped  5
578210309
extracting with thr_boilerpipe
error on download578210309
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8ce680>)
processed  546
skipped  6
578210949
extracting with thr_boilerpipe
processed  547
skipped  6
578294304
extracting with thr_boilerpipe
processed  548
skipped  6
578364597
extracting with thr_boilerpipe
processed  549
skipped  6
578371687
extracting with thr_boilerpipe
processed  550
skipped  6
578520886
extracting with thr_boilerpipe
processed  551
skipped  6
578636827
extracting with thr_boilerpipe
processed  552
skipped  6
578653839
extracting with thr_boilerpipe
processed  553
skipped  6
578713987
extracting with thr_boilerpipe
processed 
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
 554
skipped  6
578974914
extracting with thr_boilerpipe
error on download578974914
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c8a8d6098>)
processed  554
skipped  7
579044624
extracting with thr_boilerpipe
processed  555
skipped  7
580296869
extracting with thr_boilerpipe
processed  556
skipped  7
581963231
extracting with thr_boilerpipe
processed  557
skipped  7
589072496
extracting with thr_boilerpipe
processed  558
skipped  7
589074541
extracting with thr_boilerpipe
processed  559
skipped  7
589074546
extracting with thr_boilerpipe
processed  560
skipped  7
589088428
extracting with thr_boilerpipe
processed  561
skipped  7
589089635
extracting with thr_boilerpipe
processed  562
skipped  7
589090828
extracting with thr_boilerpipe
processed  563
skipped  7
589116902
extracting with thr_boilerpipe
processed  564
skipped  7
589116915
extracting with thr_boilerpipe
processed  565
skipped  7
589129653
extracting with thr_boilerpipe
processed  566
skipped  7
589153423
extracting with thr_boilerpipe
processed  567
skipped  7
589156380
extracting with thr_boilerpipe
processed  568
skipped  7
589208129
extracting with thr_boilerpipe
processed  569
skipped  7
589213448
extracting with thr_boilerpipe
processed  570
skipped  7
589213994
extracting with thr_boilerpipe
processed  571
skipped  7
589224922
extracting with thr_boilerpipe
processed  572
skipped  7
589225296
extracting with thr_boilerpipe
processed  573
skipped  7
589239170
extracting with thr_boilerpipe
processed  574
skipped  7
589240076
extracting with thr_boilerpipe
processed  575
skipped  7
589261134
extracting with thr_boilerpipe
processed  576
skipped  7
589261136
extracting with thr_boilerpipe
processed  577
skipped  7
589273978
extracting with thr_boilerpipe
processed  578
skipped  7
589285851
extracting with thr_boilerpipe
processed  579
skipped  7
589285856
extracting with thr_boilerpipe
processed  580
skipped  7
589298598
extracting with thr_boilerpipe
processed  581
skipped  7
589305788
extracting with thr_boilerpipe
processed  582
skipped  7
589310534
extracting with thr_boilerpipe
processed  583
skipped  7
589316558
extracting with thr_boilerpipe
processed  584
skipped  7
589324992
extracting with thr_boilerpipe
processed  585
skipped  7
589326873
extracting with thr_boilerpipe
processed  586
skipped  7
589335687
extracting with thr_boilerpipe
processed  587
skipped  7
589355839
extracting with thr_boilerpipe
processed  588
skipped  7
589368807
extracting with thr_boilerpipe
processed  589
skipped  7
589371772
extracting with thr_boilerpipe
processed  590
skipped  7
589377853
extracting with thr_boilerpipe
processed  591
skipped  7
589384584
extracting with thr_boilerpipe
processed  592
skipped  7
589440172
extracting with thr_boilerpipe
processed  593
skipped  7
589502987
extracting with thr_boilerpipe
processed  594
skipped  7
589513642
extracting with thr_boilerpipe
processed  595
skipped  7
589568015
extracting with thr_boilerpipe
processed  596
skipped  7
589625611
extracting with thr_boilerpipe
processed  597
skipped  7
589655068
extracting with thr_boilerpipe
processed  598
skipped  7
589674386
extracting with thr_boilerpipe
processed  599
skipped  7
589683282
extracting with thr_boilerpipe
processed  600
skipped  7
589686438
extracting with thr_boilerpipe
processed  601
skipped  7
589754761
extracting with thr_boilerpipe
processed  602
skipped  7
589755411
extracting with thr_boilerpipe
processed  603
skipped  7
589755612
extracting with thr_boilerpipe
processed  604
skipped  7
589758021
extracting with thr_boilerpipe
processed  605
skipped  7
589768387
extracting with thr_boilerpipe
processed  606
skipped  7
589786104
extracting with thr_boilerpipe
processed  607
skipped  7
589786154
extracting with thr_boilerpipe
processed  608
skipped  7
589786414
extracting with thr_boilerpipe
error on download589786414
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html
Traceback (most recent call last):
(<class 'thrift.Thrift.TApplicationException'>, TApplicationException(None,), <traceback object at 0x7f3c692f25a8>)
processed  608
skipped  8
589862512
extracting with thr_boilerpipe
processed  609
skipped  8
589862834
extracting with thr_boilerpipe
processed  610
skipped  8
589862835
extracting with thr_boilerpipe
processed  611
skipped  8
589867651
extracting with thr_boilerpipe
processed  612
skipped  8
589877044
extracting with thr_boilerpipe
processed  613
skipped  8
589878988
extracting with thr_boilerpipe
processed  614
skipped  8
589902239
extracting with thr_boilerpipe
processed  615
skipped  8
589902589
extracting with thr_boilerpipe
processed  616
skipped  8
589903803
extracting with thr_boilerpipe
processed  617
skipped  8
589912699
extracting with thr_boilerpipe
processed  618
skipped  8
589925438
extracting with thr_boilerpipe
processed  619
skipped  8
589929254
extracting with thr_boilerpipe
processed  620
skipped  8
589929573
extracting with thr_boilerpipe
processed  621
skipped  8
589934787
extracting with thr_boilerpipe
processed  622
skipped  8
589938190
extracting with thr_boilerpipe
processed  623
skipped  8
589945649
extracting with thr_boilerpipe
processed  624
skipped  8
589946203
extracting with thr_boilerpipe
processed  625
skipped  8
589956136
extracting with thr_boilerpipe
processed  626
skipped  8
589985214
extracting with thr_boilerpipe
processed  627
skipped  8
589992578
extracting with thr_boilerpipe
processed  628
skipped  8
589992873
extracting with thr_boilerpipe
processed  629
skipped  8
590003031
extracting with thr_boilerpipe
processed  630
skipped  8
590003045
extracting with thr_boilerpipe
processed  631
skipped  8
590033845
extracting with thr_boilerpipe
processed  632
skipped  8
590033849
extracting with thr_boilerpipe
processed  633
skipped  8
590033852
extracting with thr_boilerpipe
processed  634
skipped  8
590057803
extracting with thr_boilerpipe
processed  635
skipped  8
590237580
extracting with thr_boilerpipe
processed  636
skipped  8
590249522
extracting with thr_boilerpipe
processed  637
skipped  8
590264671
extracting with thr_boilerpipe
processed  638
skipped  8
590293497
extracting with thr_boilerpipe
processed  639
skipped  8
590323886
extracting with thr_boilerpipe
processed  640
skipped  8
590323913
extracting with thr_boilerpipe
processed  641
skipped  8
590324177
extracting with thr_boilerpipe
processed  642
skipped  8
590338530
extracting with thr_boilerpipe
processed  643
skipped  8
590339267
extracting with thr_boilerpipe
processed  644
skipped  8
590340446
extracting with thr_boilerpipe
processed  645
skipped  8
590351087
extracting with thr_boilerpipe
processed  646
skipped  8
590353661
extracting with thr_boilerpipe
processed  647
skipped  8
590356585
extracting with thr_boilerpipe
processed  648
skipped  8
590356597
extracting with thr_boilerpipe
processed  649
skipped  8
590386215
extracting with thr_boilerpipe
processed  650
skipped  8
590386218
extracting with thr_boilerpipe
processed  651
skipped  8
590386225
extracting with thr_boilerpipe
processed  652
skipped  8
590397660
extracting with thr_boilerpipe
processed  653
skipped  8
590398422
extracting with thr_boilerpipe
processed  654
skipped  8
590399931
extracting with thr_boilerpipe
processed  655
skipped  8
590411501
extracting with thr_boilerpipe
processed  656
skipped  8
590424369
extracting with thr_boilerpipe
processed  657
skipped  8
590425749
extracting with thr_boilerpipe
processed  658
skipped  8
590425776
extracting with thr_boilerpipe
processed  659
skipped  8
590425824
extracting with thr_boilerpipe
processed  660
skipped  8
590437581
extracting with thr_boilerpipe
processed  661
skipped  8
590441661
extracting with thr_boilerpipe
processed  662
skipped  8
590441662
extracting with thr_boilerpipe
processed  663
skipped  8
590460432
extracting with thr_boilerpipe
processed  664
skipped  8
590462467
extracting with thr_boilerpipe
processed  665
skipped  8
590512426
extracting with thr_boilerpipe
processed  666
skipped  8
590514891
extracting with thr_boilerpipe
processed  667
skipped  8
590515180
extracting with thr_boilerpipe
processed  668
skipped  8
590529801
extracting with thr_boilerpipe
processed  669
skipped  8
590531075
extracting with thr_boilerpipe
processed  670
skipped  8
590542372
extracting with thr_boilerpipe
processed  671
skipped  8
590542427
extracting with thr_boilerpipe
processed  672
skipped  8
590551407
extracting with thr_boilerpipe
processed  673
skipped  8
590552268
extracting with thr_boilerpipe
processed  674
skipped  8
590552363
extracting with thr_boilerpipe
processed  675
skipped  8
590552784
extracting with thr_boilerpipe
processed  676
skipped  8
590560741
extracting with thr_boilerpipe
processed  677
skipped  8
590576816
extracting with thr_boilerpipe
processed  678
skipped  8
590585094
extracting with thr_boilerpipe
processed  679
skipped  8
590593066
extracting with thr_boilerpipe
processed  680
skipped  8
590596271
extracting with thr_boilerpipe
processed  681
skipped  8
590605895
extracting with thr_boilerpipe
processed  682
skipped  8
590620426
extracting with thr_boilerpipe
processed  683
skipped  8
590623508
extracting with thr_boilerpipe
processed  684
skipped  8
590623511
extracting with thr_boilerpipe
processed  685
skipped  8
590639137
extracting with thr_boilerpipe
processed  686
skipped  8
590646856
extracting with thr_boilerpipe
processed  687
skipped  8
590647873
extracting with thr_boilerpipe
processed  688
skipped  8
590654819
extracting with thr_boilerpipe
processed  689
skipped  8
590661408
extracting with thr_boilerpipe
processed  690
skipped  8
590685837
extracting with thr_boilerpipe
processed  691
skipped  8
590720858
extracting with thr_boilerpipe
processed  692
skipped  8
590806668
extracting with thr_boilerpipe
processed  693
skipped  8
590841021
extracting with thr_boilerpipe
processed  694
skipped  8
590877233
extracting with thr_boilerpipe
processed  695
skipped  8
590917078
extracting with thr_boilerpipe
processed  696
skipped  8
590918616
extracting with thr_boilerpipe
processed  697
skipped  8
590918621
extracting with thr_boilerpipe
processed  698
skipped  8
590929956
extracting with thr_boilerpipe
processed  699
skipped  8
590957300
extracting with thr_boilerpipe
processed  700
skipped  8
590957745
extracting with thr_boilerpipe
processed  701
skipped  8
590961196
extracting with thr_boilerpipe
processed  702
skipped  8
590962451
extracting with thr_boilerpipe
processed  703
skipped  8
590975183
extracting with thr_boilerpipe
processed  704
skipped  8
590975515
extracting with thr_boilerpipe
processed  705
skipped  8
590975517
extracting with thr_boilerpipe
processed  706
skipped  8
590976484
extracting with thr_boilerpipe
processed  707
skipped  8
591998702
extracting with thr_boilerpipe
processed  708
skipped  8
591998982
extracting with thr_boilerpipe
Total_time 0:05:07.021219
Time per download 0:00:00.428202
  File "<ipython-input-73-5462c770f344>", line 17, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-66-716979e1aa41>", line 17, in get_extraction_results
    ret['thr_boiler_pipe_ArticleExtractor'] =  extract_with_thr_boilerpipe_ArticleExtractor( raw_content)
  File "<ipython-input-68-fdb56e8b1b24>", line 21, in <lambda>
    extract_with_thr_boilerpipe_ArticleExtractor = lambda raw_content: thrift_bp_extract( raw_content, 'ArticleExtractor')
  File "<ipython-input-68-fdb56e8b1b24>", line 4, in thrift_bp_extract
    thrift_ret = client.extract_html( raw_content, extractor_type )
  File "thriftboilerpipe/ExtractorService.py", line 44, in extract_html
    return self.recv_extract_html()
  File "thriftboilerpipe/ExtractorService.py", line 62, in recv_extract_html
    raise x
TApplicationException: Internal error processing extract_html

In [65]:
# Load the pickled per-download extractor comparison records from disk.
# The file is opened in a `with` block so the handle is closed as soon as
# unpickling finishes (the bare file(...) call left it open until garbage collection).
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle
# that this project wrote itself.
with open( os.path.expanduser( "~/Dropbox/mc/extractor_test/comps_downloads.pickle" ), "rb" ) as comps_downloads_file:
    comps_downloads = cPickle.load( comps_downloads_file )

Results

Results Overall


In [74]:
# Peek at the first comparison record to show its layout: the download and
# media ids, a 'story_is_spidered' flag, and one precision/recall/f1 dict
# per extractor (plus the 'gold' baseline).
comps_downloads[0]


Out[74]:
{'downloads_id': 391881020,
 'gold': {'f1': 1.0, 'precision': 1.0, 'recall': 1.0},
 'media_id': 83371,
 'py_boiler_pipe_ArticleExtractor': {'f1': 0.898148148148148,
  'precision': 0.8220338983050848,
  'recall': 0.9897959183673469},
 'py_boiler_pipe_ArticleSentencesExtractor': {'f1': 0,
  'precision': 0.0,
  'recall': 0.0},
 'py_boiler_pipe_CanolaExtractor': {'f1': 0.8729016786570742,
  'precision': 0.8235294117647058,
  'recall': 0.9285714285714286},
 'py_boiler_pipe_DefaultExtractor': {'f1': 0.8656036446469247,
  'precision': 0.7818930041152263,
  'recall': 0.9693877551020408},
 'py_boiler_pipe_KeepEverythingExtractor': {'f1': 0.4714459295261239,
  'precision': 0.3094098883572568,
  'recall': 0.9897959183673469},
 'py_boiler_pipe_LargestContentExtractor': {'f1': 0.9440389294403893,
  'precision': 0.9023255813953488,
  'recall': 0.9897959183673469},
 'py_boiler_pipe_NumWordsRulesExtractor': {'f1': 0.900473933649289,
  'precision': 0.8407079646017699,
  'recall': 0.9693877551020408},
 'python_readibilty': {'f1': 0.9411764705882353,
  'precision': 0.9435897435897436,
  'recall': 0.9387755102040817},
 'story_is_spidered': False,
 'thr_boiler_pipe_ArticleExtractor': {'f1': 0.938875305623472,
  'precision': 0.9014084507042254,
  'recall': 0.9795918367346939},
 'thr_boiler_pipe_DefaultExtractor': {'f1': 0.7840670859538783,
  'precision': 0.6654804270462633,
  'recall': 0.9540816326530612}}

In [75]:
# Summarise every comparison record (both helpers are defined outside this section).
# The printed tables are grouped by measurement type: precision, then recall,
# then f1, with one column per extractor.
df = get_data_frame_from_comparision_objects( comps_downloads )
print_results_by_measurement_type( df )


       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count             709                                 709.000000   
mean                1                                   0.743752   
std                 0                                   0.283451   
min                 1                                   0.000000   
2%                  1                                   0.004002   
5%                  1                                   0.023810   
10%                 1                                   0.190711   
50%                 1                                   0.861842   
max                 1                                   0.990968   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                         709.000000    
mean                                            0.000437    
std                                             0.011624    
min                                             0.000000    
2%                                              0.000000    
5%                                              0.000000    
10%                                             0.000000    
50%                                             0.000000    
max                                             0.309524    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                709.000000   
mean                                   0.559867   
std                                    0.289836   
min                                    0.000000   
2%                                     0.004046   
5%                                     0.053150   
10%                                    0.125632   
50%                                    0.627586   
max                                    0.982085   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                 709.000000   
mean                                    0.633871   
std                                     0.282888   
min                                     0.000000   
2%                                      0.000000   
5%                                      0.028471   
10%                                     0.165822   
50%                                     0.726000   
max                                     0.993469   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                        709.000000   
mean                                           0.351158   
std                                            0.246308   
min                                            0.000000   
2%                                             0.012139   
5%                                             0.039472   
10%                                            0.067484   
50%                                            0.305634   
max                                            0.956033   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                        709.000000   
mean                                           0.744173   
std                                            0.303411   
min                                            0.000000   
2%                                             0.004587   
5%                                             0.018513   
10%                                            0.047491   
50%                                            0.870968   
max                                            0.994749   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                       709.000000   
mean                                          0.654955   
std                                           0.276770   
min                                           0.000000   
2%                                            0.003950   
5%                                            0.069395   
10%                                           0.217952   
50%                                           0.760870   
max                                           0.991265   

       precision_python_readibilty  \
count                   709.000000   
mean                      0.909932   
std                       0.196690   
min                       0.009811   
2%                        0.073263   
5%                        0.409343   
10%                       0.780636   
50%                       0.977199   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                  709.000000   
mean                                     0.628442   
std                                      0.398688   
min                                      0.000000   
2%                                       0.003212   
5%                                       0.008029   
10%                                      0.019322   
50%                                      0.878049   
max                                      1.000000   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                  709.000000  
mean                                     0.555218  
std                                      0.349223  
min                                      0.000000  
2%                                       0.005259  
5%                                       0.012785  
10%                                      0.024934  
50%                                      0.637229  
max                                      1.000000  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count          709                              709.000000   
mean             1                                0.805785   
std              0                                0.274766   
min              1                                0.000000   
2%               1                                0.022585   
5%               1                                0.058824   
10%              1                                0.233867   
50%              1                                0.920000   
max              1                                1.000000   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                       709.000000   
mean                                          0.000029   
std                                           0.000769   
min                                           0.000000   
2%                                            0.000000   
5%                                            0.000000   
10%                                           0.000000   
50%                                           0.000000   
max                                           0.020472   

       recall_py_boiler_pipe_CanolaExtractor  \
count                             709.000000   
mean                                0.809303   
std                                 0.200048   
min                                 0.000000   
2%                                  0.081745   
5%                                  0.333653   
10%                                 0.577662   
50%                                 0.880184   
max                                 0.997429   

       recall_py_boiler_pipe_DefaultExtractor  \
count                              709.000000   
mean                                 0.780991   
std                                  0.249576   
min                                  0.000000   
2%                                   0.000000   
5%                                   0.095378   
10%                                  0.353777   
50%                                  0.874126   
max                                  0.997429   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                     709.000000   
mean                                        0.914876   
std                                         0.118623   
min                                         0.000000   
2%                                          0.538462   
5%                                          0.782886   
10%                                         0.865614   
50%                                         0.938620   
max                                         1.000000   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                     709.000000   
mean                                        0.701460   
std                                         0.317048   
min                                         0.000000   
2%                                          0.011802   
5%                                          0.034231   
10%                                         0.075996   
50%                                         0.857143   
max                                         0.997409   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                    709.000000                709.000000   
mean                                       0.833080                  0.859992   
std                                        0.196529                  0.246466   
min                                        0.000000                  0.002874   
2%                                         0.080144                  0.038143   
5%                                         0.310962                  0.093615   
10%                                        0.641447                  0.596880   
50%                                        0.891654                  0.954128   
max                                        0.997852                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                               709.000000   
mean                                  0.706731   
std                                   0.404080   
min                                   0.000000   
2%                                    0.008620   
5%                                    0.019078   
10%                                   0.040008   
50%                                   0.975962   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                               709.000000  
mean                                  0.751379  
std                                   0.360973  
min                                   0.000000  
2%                                    0.017128  
5%                                    0.032103  
10%                                   0.053490  
50%                                   0.954274  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count      709                          709.000000   
mean         1                            0.756813   
std          0                            0.280207   
min          1                            0.000000   
2%           1                            0.007380   
5%           1                            0.030489   
10%          1                            0.165069   
50%          1                            0.882155   
max          1                            0.994777   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                   709.000000   
mean                                      0.000054   
std                                       0.001442   
min                                       0.000000   
2%                                        0.000000   
5%                                        0.000000   
10%                                       0.000000   
50%                                       0.000000   
max                                       0.038405   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                         709.000000                          709.000000   
mean                            0.623477                            0.670189   
std                             0.268275                            0.271653   
min                             0.000000                            0.000000   
2%                              0.007836                            0.000000   
5%                              0.094016                            0.033595   
10%                             0.203038                            0.187118   
50%                             0.699983                            0.777651   
max                             0.986559                            0.986933   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                 709.000000   
mean                                    0.463295   
std                                     0.257087   
min                                     0.000000   
2%                                      0.023895   
5%                                      0.075663   
10%                                     0.126259   
50%                                     0.458901   
max                                     0.975148   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                 709.000000   
mean                                    0.707175   
std                                     0.307054   
min                                     0.000000   
2%                                      0.008247   
5%                                      0.022703   
10%                                     0.055075   
50%                                     0.848148   
max                                     0.987313   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                709.000000            709.000000   
mean                                   0.704248              0.861987   
std                                    0.254059              0.242245   
min                                    0.000000              0.005525   
2%                                     0.007593              0.040732   
5%                                     0.117909              0.127670   
10%                                    0.318135              0.568254   
50%                                    0.804428              0.950690   
max                                    0.989595              1.000000   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                           709.000000   
mean                              0.639403   
std                               0.402629   
min                               0.000000   
2%                                0.005083   
5%                                0.010944   
10%                               0.025470   
50%                               0.901982   
max                               1.000000   

       f1_thr_boiler_pipe_DefaultExtractor  
count                           709.000000  
mean                              0.590392  
std                               0.349566  
min                               0.000000  
2%                                0.008021  
5%                                0.016681  
10%                               0.033278  
50%                               0.695518  
max                               0.998779  

In [76]:
# Repeat the summary on the subset returned by remove_spidered_downloads
# (presumably the records whose 'story_is_spidered' flag is False -- confirm
# against the helper, which is defined outside this section).
non_spidered_downloads = remove_spidered_downloads( comps_downloads )

# NOTE(review): this rebinds `df`; any frame previously held under that name is no longer reachable.
df = get_data_frame_from_comparision_objects( non_spidered_downloads )
print_results_by_measurement_type( df )


       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count             601                                 601.000000   
mean                1                                   0.740157   
std                 0                                   0.279127   
min                 1                                   0.000000   
2%                  1                                   0.003984   
5%                  1                                   0.023438   
10%                 1                                   0.213873   
50%                 1                                   0.859296   
max                 1                                   0.988089   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                         601.000000    
mean                                            0.000515    
std                                             0.012626    
min                                             0.000000    
2%                                              0.000000    
5%                                              0.000000    
10%                                             0.000000    
50%                                             0.000000    
max                                             0.309524    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                601.000000   
mean                                   0.549850   
std                                    0.285852   
min                                    0.000000   
2%                                     0.002198   
5%                                     0.042590   
10%                                    0.125749   
50%                                    0.609137   
max                                    0.979773   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                 601.000000   
mean                                    0.629166   
std                                     0.277645   
min                                     0.000000   
2%                                      0.000000   
5%                                      0.017544   
10%                                     0.174323   
50%                                     0.713615   
max                                     0.981651   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                        601.000000   
mean                                           0.329332   
std                                            0.233642   
min                                            0.000000   
2%                                             0.010258   
5%                                             0.036174   
10%                                            0.062707   
50%                                            0.285622   
max                                            0.913591   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                        601.000000   
mean                                           0.745410   
std                                            0.296023   
min                                            0.000000   
2%                                             0.003861   
5%                                             0.020619   
10%                                            0.064394   
50%                                            0.862944   
max                                            0.982402   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                       601.000000   
mean                                          0.653895   
std                                           0.270699   
min                                           0.000000   
2%                                            0.003175   
5%                                            0.067548   
10%                                           0.236220   
50%                                           0.755814   
max                                           0.982402   

       precision_python_readibilty  \
count                   601.000000   
mean                      0.924880   
std                       0.165005   
min                       0.013793   
2%                        0.241379   
5%                        0.615385   
10%                       0.849057   
50%                       0.977778   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                  601.000000   
mean                                     0.609960   
std                                      0.404626   
min                                      0.000000   
2%                                       0.003058   
5%                                       0.006623   
10%                                      0.018248   
50%                                      0.870079   
max                                      1.000000   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                  601.000000  
mean                                     0.534960  
std                                      0.350879  
min                                      0.000000  
2%                                       0.005222  
5%                                       0.010448  
10%                                      0.021786  
50%                                      0.608193  
max                                      1.000000  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count          601                              601.000000   
mean             1                                0.812731   
std              0                                0.265081   
min              1                                0.000000   
2%               1                                0.022222   
5%               1                                0.058824   
10%              1                                0.310811   
50%              1                                0.917178   
max              1                                1.000000   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                       601.000000   
mean                                          0.000034   
std                                           0.000835   
min                                           0.000000   
2%                                            0.000000   
5%                                            0.000000   
10%                                           0.000000   
50%                                           0.000000   
max                                           0.020472   

       recall_py_boiler_pipe_CanolaExtractor  \
count                             601.000000   
mean                                0.802074   
std                                 0.203839   
min                                 0.000000   
2%                                  0.052632   
5%                                  0.307692   
10%                                 0.571429   
50%                                 0.871795   
max                                 0.997429   

       recall_py_boiler_pipe_DefaultExtractor  \
count                              601.000000   
mean                                 0.773389   
std                                  0.255432   
min                                  0.000000   
2%                                   0.000000   
5%                                   0.080000   
10%                                  0.319372   
50%                                  0.872222   
max                                  0.997429   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                     601.000000   
mean                                        0.914629   
std                                         0.117758   
min                                         0.000000   
2%                                          0.692308   
5%                                          0.789474   
10%                                         0.865169   
50%                                         0.937050   
max                                         1.000000   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                     601.000000   
mean                                        0.717584   
std                                         0.306372   
min                                         0.000000   
2%                                          0.010778   
5%                                          0.034483   
10%                                         0.093750   
50%                                         0.859649   
max                                         0.997409   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                    601.000000                601.000000   
mean                                       0.830285                  0.867044   
std                                        0.196843                  0.227136   
min                                        0.000000                  0.012255   
2%                                         0.052632                  0.049689   
5%                                         0.392947                  0.166667   
10%                                        0.641026                  0.657534   
50%                                        0.890957                  0.950820   
max                                        0.997852                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                               601.000000   
mean                                  0.692083   
std                                   0.412559   
min                                   0.000000   
2%                                    0.007156   
5%                                    0.017483   
10%                                   0.034446   
50%                                   0.974194   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                               601.000000  
mean                                  0.730151  
std                                   0.376488  
min                                   0.000000  
2%                                    0.016484  
5%                                    0.027933  
10%                                   0.046092  
50%                                   0.954248  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count      601                          601.000000   
mean         1                            0.759977   
std          0                            0.271806   
min          1                            0.000000   
2%           1                            0.007246   
5%           1                            0.029412   
10%          1                            0.256983   
50%          1                            0.879257   
max          1                            0.987313   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                   601.000000   
mean                                      0.000064   
std                                       0.001567   
min                                       0.000000   
2%                                        0.000000   
5%                                        0.000000   
10%                                       0.000000   
50%                                       0.000000   
max                                       0.038405   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                         601.000000                          601.000000   
mean                            0.617464                            0.668067   
std                             0.266504                            0.269835   
min                             0.000000                            0.000000   
2%                              0.004315                            0.000000   
5%                              0.080780                            0.023256   
10%                             0.189921                            0.187135   
50%                             0.691293                            0.772388   
max                             0.974256                            0.986933   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                 601.000000   
mean                                    0.443883   
std                                     0.249808   
min                                     0.000000   
2%                                      0.020263   
5%                                      0.066079   
10%                                     0.117464   
50%                                     0.434783   
max                                     0.946723   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                 601.000000   
mean                                    0.718765   
std                                     0.297337   
min                                     0.000000   
2%                                      0.007194   
5%                                      0.024818   
10%                                     0.062827   
50%                                     0.851711   
max                                     0.987313   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                601.000000            601.000000   
mean                                   0.707795              0.874278   
std                                    0.247907              0.218192   
min                                    0.000000              0.023585   
2%                                     0.006314              0.072539   
5%                                     0.110977              0.181818   
10%                                    0.359116              0.708571   
50%                                    0.803134              0.948718   
max                                    0.987313              1.000000   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                           601.000000   
mean                              0.624692   
std                               0.408753   
min                               0.000000   
2%                                0.004386   
5%                                0.010417   
10%                               0.023077   
50%                               0.893434   
max                               1.000000   

       f1_thr_boiler_pipe_DefaultExtractor  
count                           601.000000  
mean                              0.572334  
std                               0.356012  
min                               0.000000  
2%                                0.007509  
5%                                0.014286  
10%                               0.028986  
50%                               0.663594  
max                               0.998779  

In [77]:
# Summarise extractor performance for the spidered downloads only.
# The printed header labels the tables that follow in the cell output.
print( "spidered" )

# Narrow the comparison objects to the spidered ones, then turn them into
# a data frame.
df = get_data_frame_from_comparision_objects(
    only_spidered_downloads( comps_downloads )
)

# Print the summary tables for this data frame.
print_results_by_measurement_type( df )


spidered
       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count             108                                 108.000000   
mean                1                                   0.763759   
std                 0                                   0.307015   
min                 1                                   0.002899   
2%                  1                                   0.013435   
5%                  1                                   0.029351   
10%                 1                                   0.076326   
50%                 1                                   0.887346   
max                 1                                   0.990968   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                                108    
mean                                                   0    
std                                                    0    
min                                                    0    
2%                                                     0    
5%                                                     0    
10%                                                    0    
50%                                                    0    
max                                                    0    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                108.000000   
mean                                   0.615616   
std                                    0.306512   
min                                    0.002242   
2%                                     0.052741   
5%                                     0.098303   
10%                                    0.134120   
50%                                    0.730356   
max                                    0.982085   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                 108.000000   
mean                                    0.660055   
std                                     0.310547   
min                                     0.000000   
2%                                      0.018757   
5%                                      0.061666   
10%                                     0.125806   
50%                                     0.816143   
max                                     0.993469   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                        108.000000   
mean                                           0.472615   
std                                            0.278852   
min                                            0.002789   
2%                                             0.044233   
5%                                             0.062028   
10%                                            0.090576   
50%                                            0.484823   
max                                            0.956033   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                        108.000000   
mean                                           0.737291   
std                                            0.343068   
min                                            0.003731   
2%                                             0.009898   
5%                                             0.012590   
10%                                            0.030961   
50%                                            0.896185   
max                                            0.994749   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                       108.000000   
mean                                          0.660853   
std                                           0.309699   
min                                           0.003404   
2%                                            0.054765   
5%                                            0.098685   
10%                                           0.154611   
50%                                           0.799741   
max                                           0.991265   

       precision_python_readibilty  \
count                   108.000000   
mean                      0.826749   
std                       0.308338   
min                       0.009811   
2%                        0.027668   
5%                        0.069709   
10%                       0.177386   
50%                       0.974176   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                  108.000000   
mean                                     0.731285   
std                                      0.347983   
min                                      0.003079   
2%                                       0.008216   
5%                                       0.017428   
10%                                      0.041506   
50%                                      0.913215   
max                                      0.996908   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                  108.000000  
mean                                     0.667947  
std                                      0.318537  
min                                      0.003623  
2%                                       0.025591  
5%                                       0.069110  
10%                                      0.134852  
50%                                      0.797931  
max                                      0.998761  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count          108                              108.000000   
mean             1                                0.767135   
std              0                                0.322093   
min              1                                0.012876   
2%               1                                0.046481   
5%               1                                0.060836   
10%              1                                0.087225   
50%              1                                0.929098   
max              1                                1.000000   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                              108   
mean                                                 0   
std                                                  0   
min                                                  0   
2%                                                   0   
5%                                                   0   
10%                                                  0   
50%                                                  0   
max                                                  0   

       recall_py_boiler_pipe_CanolaExtractor  \
count                             108.000000   
mean                                0.849529   
std                                 0.172865   
min                                 0.156566   
2%                                  0.196731   
5%                                  0.439646   
10%                                 0.703911   
50%                                 0.906307   
max                                 0.994856   

       recall_py_boiler_pipe_DefaultExtractor  \
count                              108.000000   
mean                                 0.823295   
std                                  0.210135   
min                                  0.000000   
2%                                   0.109326   
5%                                   0.287291   
10%                                  0.609275   
50%                                  0.886602   
max                                  0.994580   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                     108.000000   
mean                                        0.916251   
std                                         0.123883   
min                                         0.204773   
2%                                          0.509903   
5%                                          0.752874   
10%                                         0.881987   
50%                                         0.943998   
max                                         1.000000   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                     108.000000   
mean                                        0.611730   
std                                         0.359432   
min                                         0.005025   
2%                                          0.013555   
5%                                          0.032341   
10%                                         0.051268   
50%                                         0.719308   
max                                         0.986637   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                    108.000000                108.000000   
mean                                       0.848634                  0.820751   
std                                        0.194951                  0.332890   
min                                        0.079292                  0.002874   
2%                                         0.130391                  0.016990   
5%                                         0.294764                  0.037407   
10%                                        0.710042                  0.081159   
50%                                        0.900348                  0.971610   
max                                        0.987930                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                               108.000000   
mean                                  0.788241   
std                                   0.343611   
min                                   0.014540   
2%                                    0.032170   
5%                                    0.058052   
10%                                   0.089529   
50%                                   0.979877   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                               108.000000  
mean                                  0.869507  
std                                   0.225256  
min                                   0.087533  
2%                                    0.114921  
5%                                    0.271450  
10%                                   0.543603  
50%                                   0.954387  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count      108                          108.000000   
mean         1                            0.739209   
std          0                            0.323862   
min          1                            0.005777   
2%           1                            0.015187   
5%           1                            0.040443   
10%          1                            0.069610   
50%          1                            0.896467   
max          1                            0.994777   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                          108   
mean                                             0   
std                                              0   
min                                              0   
2%                                               0   
5%                                               0   
10%                                              0   
50%                                              0   
max                                              0   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                         108.000000                          108.000000   
mean                            0.656935                            0.681997   
std                             0.276804                            0.282563   
min                             0.004464                            0.000000   
2%                              0.099180                            0.033732   
5%                              0.177400                            0.100176   
10%                             0.232113                            0.200943   
50%                             0.783502                            0.811930   
max                             0.986559                            0.981556   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                 108.000000   
mean                                    0.571323   
std                                     0.271120   
min                                     0.005559   
2%                                      0.084523   
5%                                      0.115850   
10%                                     0.165602   
50%                                     0.602696   
max                                     0.975148   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                 108.000000   
mean                                    0.642674   
std                                     0.350910   
min                                     0.004283   
2%                                      0.013151   
5%                                      0.020266   
10%                                     0.035722   
50%                                     0.796850   
max                                     0.970168   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                108.000000            108.000000   
mean                                   0.684515              0.793592   
std                                    0.286360              0.340243   
min                                    0.006780              0.005525   
2%                                     0.102864              0.018270   
5%                                     0.140361              0.036748   
10%                                    0.218346              0.080691   
50%                                    0.824475              0.961409   
max                                    0.989595              1.000000   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                           108.000000   
mean                              0.721266   
std                               0.357463   
min                               0.006138   
2%                                0.011561   
5%                                0.029779   
10%                               0.062218   
50%                               0.909680   
max                               0.997525   

       f1_thr_boiler_pipe_DefaultExtractor  
count                           108.000000  
mean                              0.690878  
std                               0.292937  
min                               0.007220  
2%                                0.048827  
5%                                0.112937  
10%                               0.187165  
50%                               0.833973  
max                               0.998452  

Results by Subset


In [78]:
# Results by subset: run the same report once per media collection.
#
# Each subset is identified by a set of media tag ids; the non-spidered
# downloads are filtered by those ids before the summary tables are printed.
# NOTE(review): the exact matching rule lives in filter_by_media_tags_id,
# which is defined outside this cell -- confirm there.

# Tag-id sets are kept as module-level names so other cells can still refer
# to them.
regional = { 2453107 }
ap_english_us_top_25 = { 8875027 }
political_blogs = { 125 }
russian = { 7796878 }
arabic = { 8878255 }

# ( printed header, media tag ids ) in the order the reports are emitted.
# The regional header carries the same tag id as the set used for filtering.
subset_reports = [
    ( "region / pew knight study / 2453107 ", regional ),
    ( "ap_english_us_top25 / 8875027 ", ap_english_us_top_25 ),
    ( "political blogs / 125", political_blogs ),
    ( 'russian', russian ),
    ( 'brazil', { 8877968, 8877969, 8877973, 8877970 } ),
    ( 'arabic', arabic ),
]

for subset_label, subset_tags_ids in subset_reports:
    # Single-argument parenthesised print behaves the same as the bare
    # Python 2 print statement used elsewhere in this notebook.
    print( subset_label )

    # `df` is rebound on every pass; after the loop it holds the last
    # (arabic) subset, exactly as the unrolled version left it.
    df = get_data_frame_from_comparision_objects(
        filter_by_media_tags_id( non_spidered_downloads, subset_tags_ids )
    )
    print_results_by_measurement_type( df )


region / pew knight study / 245107 
       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count              99                                  99.000000   
mean                1                                   0.730497   
std                 0                                   0.254126   
min                 1                                   0.021638   
2%                  1                                   0.058144   
5%                  1                                   0.191812   
10%                 1                                   0.339346   
50%                 1                                   0.855634   
max                 1                                   0.977451   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                          99.000000    
mean                                            0.003127    
std                                             0.031108    
min                                             0.000000    
2%                                              0.000000    
5%                                              0.000000    
10%                                             0.000000    
50%                                             0.000000    
max                                             0.309524    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                 99.000000   
mean                                   0.603325   
std                                    0.220402   
min                                    0.068873   
2%                                     0.178570   
5%                                     0.195998   
10%                                    0.239692   
50%                                    0.653451   
max                                    0.911409   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                  99.000000   
mean                                    0.669334   
std                                     0.210591   
min                                     0.087109   
2%                                      0.142523   
5%                                      0.287539   
10%                                     0.334865   
50%                                     0.709030   
max                                     0.951528   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                         99.000000   
mean                                           0.402181   
std                                            0.204563   
min                                            0.058161   
2%                                             0.076351   
5%                                             0.091449   
10%                                            0.112205   
50%                                            0.371308   
max                                            0.773504   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                         99.000000   
mean                                           0.786516   
std                                            0.253222   
min                                            0.033708   
2%                                             0.035928   
5%                                             0.052695   
10%                                            0.401654   
50%                                            0.882629   
max                                            0.977451   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                        99.000000   
mean                                          0.645020   
std                                           0.216599   
min                                           0.075733   
2%                                            0.189406   
5%                                            0.257510   
10%                                           0.300302   
50%                                           0.690299   
max                                           0.963362   

       precision_python_readibilty  \
count                    99.000000   
mean                      0.926673   
std                       0.138609   
min                       0.143822   
2%                        0.438315   
5%                        0.614948   
10%                       0.841220   
50%                       0.968974   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                   99.000000   
mean                                     0.638565   
std                                      0.325689   
min                                      0.022250   
2%                                       0.026711   
5%                                       0.042044   
10%                                      0.061152   
50%                                      0.753870   
max                                      0.989749   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                   99.000000  
mean                                     0.610429  
std                                      0.249978  
min                                      0.087406  
2%                                       0.097743  
5%                                       0.187686  
10%                                      0.236001  
50%                                      0.617523  
max                                      0.980851  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count           99                               99.000000   
mean             1                                0.848680   
std              0                                0.193062   
min              1                                0.067901   
2%               1                                0.097876   
5%               1                                0.409609   
10%              1                                0.676557   
50%              1                                0.914703   
max              1                                0.993341   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                        99.000000   
mean                                          0.000207   
std                                           0.002058   
min                                           0.000000   
2%                                            0.000000   
5%                                            0.000000   
10%                                           0.000000   
50%                                           0.000000   
max                                           0.020472   

       recall_py_boiler_pipe_CanolaExtractor  \
count                              99.000000   
mean                                0.844213   
std                                 0.108269   
min                                 0.422374   
2%                                  0.542033   
5%                                  0.593156   
10%                                 0.705541   
50%                                 0.882083   
max                                 0.987791   

       recall_py_boiler_pipe_DefaultExtractor  \
count                               99.000000   
mean                                 0.860696   
std                                  0.121499   
min                                  0.253968   
2%                                   0.517735   
5%                                   0.579318   
10%                                  0.710183   
50%                                  0.895105   
max                                  0.986681   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                      99.000000   
mean                                        0.924781   
std                                         0.036528   
min                                         0.738562   
2%                                          0.849661   
5%                                          0.866116   
10%                                         0.888651   
50%                                         0.933036   
max                                         0.993341   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                      99.000000   
mean                                        0.728798   
std                                         0.274766   
min                                         0.041379   
2%                                          0.047393   
5%                                          0.069693   
10%                                         0.226484   
50%                                         0.854911   
max                                         0.985572   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                     99.000000                 99.000000   
mean                                       0.876498                  0.915441   
std                                        0.089291                  0.128770   
min                                        0.448276                  0.077586   
2%                                         0.600374                  0.640930   
5%                                         0.697441                  0.685469   
10%                                        0.789726                  0.847998   
50%                                        0.904639                  0.946387   
max                                        0.986681                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                                99.000000   
mean                                  0.813821   
std                                   0.341946   
min                                   0.027972   
2%                                    0.056410   
5%                                    0.079944   
10%                                   0.092131   
50%                                   0.993174   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                                99.000000  
mean                                  0.942651  
std                                   0.115500  
min                                   0.285714  
2%                                    0.510198  
5%                                    0.778831  
10%                                   0.860101  
50%                                   0.985882  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count       99                           99.000000   
mean         1                            0.765140   
std          0                            0.219826   
min          1                            0.035176   
2%           1                            0.065534   
5%           1                            0.311998   
10%          1                            0.488654   
50%          1                            0.868217   
max          1                            0.971837   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                    99.000000   
mean                                      0.000388   
std                                       0.003860   
min                                       0.000000   
2%                                        0.000000   
5%                                        0.000000   
10%                                       0.000000   
50%                                       0.000000   
max                                       0.038405   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                          99.000000                           99.000000   
mean                            0.678045                            0.733985   
std                             0.188601                            0.177721   
min                             0.127827                            0.159309   
2%                              0.293293                            0.223785   
5%                              0.311104                            0.428092   
10%                             0.366320                            0.484591   
50%                             0.717622                            0.791304   
max                             0.933186                            0.959617   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                  99.000000   
mean                                    0.530552   
std                                     0.210729   
min                                     0.109541   
2%                                      0.140531   
5%                                      0.166162   
10%                                     0.200544   
50%                                     0.525896   
max                                     0.851478   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                  99.000000   
mean                                    0.746387   
std                                     0.266300   
min                                     0.038462   
2%                                      0.039443   
5%                                      0.059673   
10%                                     0.251082   
50%                                     0.866142   
max                                     0.979592   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                 99.000000             99.000000   
mean                                   0.720843              0.908288   
std                                    0.179827              0.140196   
min                                    0.140069              0.136364   
2%                                     0.315054              0.384693   
5%                                     0.396596              0.715988   
10%                                    0.441076              0.794279   
50%                                    0.766467              0.950276   
max                                    0.960774              1.000000   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                            99.000000   
mean                              0.693991   
std                               0.325722   
min                               0.025532   
2%                                0.038306   
5%                                0.056527   
10%                               0.070245   
50%                               0.830508   
max                               0.992573   

       f1_thr_boiler_pipe_DefaultExtractor  
count                            99.000000  
mean                              0.709473  
std                               0.212386  
min                               0.143426  
2%                                0.174606  
5%                                0.290379  
10%                               0.376066  
50%                               0.748092  
max                               0.988210  
ap_english_us_top25 / 8875027 
       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count              99                                  99.000000   
mean                1                                   0.730497   
std                 0                                   0.254126   
min                 1                                   0.021638   
2%                  1                                   0.058144   
5%                  1                                   0.191812   
10%                 1                                   0.339346   
50%                 1                                   0.855634   
max                 1                                   0.977451   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                          99.000000    
mean                                            0.003127    
std                                             0.031108    
min                                             0.000000    
2%                                              0.000000    
5%                                              0.000000    
10%                                             0.000000    
50%                                             0.000000    
max                                             0.309524    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                 99.000000   
mean                                   0.603325   
std                                    0.220402   
min                                    0.068873   
2%                                     0.178570   
5%                                     0.195998   
10%                                    0.239692   
50%                                    0.653451   
max                                    0.911409   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                  99.000000   
mean                                    0.669334   
std                                     0.210591   
min                                     0.087109   
2%                                      0.142523   
5%                                      0.287539   
10%                                     0.334865   
50%                                     0.709030   
max                                     0.951528   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                         99.000000   
mean                                           0.402181   
std                                            0.204563   
min                                            0.058161   
2%                                             0.076351   
5%                                             0.091449   
10%                                            0.112205   
50%                                            0.371308   
max                                            0.773504   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                         99.000000   
mean                                           0.786516   
std                                            0.253222   
min                                            0.033708   
2%                                             0.035928   
5%                                             0.052695   
10%                                            0.401654   
50%                                            0.882629   
max                                            0.977451   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                        99.000000   
mean                                          0.645020   
std                                           0.216599   
min                                           0.075733   
2%                                            0.189406   
5%                                            0.257510   
10%                                           0.300302   
50%                                           0.690299   
max                                           0.963362   

       precision_python_readibilty  \
count                    99.000000   
mean                      0.926673   
std                       0.138609   
min                       0.143822   
2%                        0.438315   
5%                        0.614948   
10%                       0.841220   
50%                       0.968974   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                   99.000000   
mean                                     0.638565   
std                                      0.325689   
min                                      0.022250   
2%                                       0.026711   
5%                                       0.042044   
10%                                      0.061152   
50%                                      0.753870   
max                                      0.989749   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                   99.000000  
mean                                     0.610429  
std                                      0.249978  
min                                      0.087406  
2%                                       0.097743  
5%                                       0.187686  
10%                                      0.236001  
50%                                      0.617523  
max                                      0.980851  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count           99                               99.000000   
mean             1                                0.848680   
std              0                                0.193062   
min              1                                0.067901   
2%               1                                0.097876   
5%               1                                0.409609   
10%              1                                0.676557   
50%              1                                0.914703   
max              1                                0.993341   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                        99.000000   
mean                                          0.000207   
std                                           0.002058   
min                                           0.000000   
2%                                            0.000000   
5%                                            0.000000   
10%                                           0.000000   
50%                                           0.000000   
max                                           0.020472   

       recall_py_boiler_pipe_CanolaExtractor  \
count                              99.000000   
mean                                0.844213   
std                                 0.108269   
min                                 0.422374   
2%                                  0.542033   
5%                                  0.593156   
10%                                 0.705541   
50%                                 0.882083   
max                                 0.987791   

       recall_py_boiler_pipe_DefaultExtractor  \
count                               99.000000   
mean                                 0.860696   
std                                  0.121499   
min                                  0.253968   
2%                                   0.517735   
5%                                   0.579318   
10%                                  0.710183   
50%                                  0.895105   
max                                  0.986681   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                      99.000000   
mean                                        0.924781   
std                                         0.036528   
min                                         0.738562   
2%                                          0.849661   
5%                                          0.866116   
10%                                         0.888651   
50%                                         0.933036   
max                                         0.993341   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                      99.000000   
mean                                        0.728798   
std                                         0.274766   
min                                         0.041379   
2%                                          0.047393   
5%                                          0.069693   
10%                                         0.226484   
50%                                         0.854911   
max                                         0.985572   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                     99.000000                 99.000000   
mean                                       0.876498                  0.915441   
std                                        0.089291                  0.128770   
min                                        0.448276                  0.077586   
2%                                         0.600374                  0.640930   
5%                                         0.697441                  0.685469   
10%                                        0.789726                  0.847998   
50%                                        0.904639                  0.946387   
max                                        0.986681                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                                99.000000   
mean                                  0.813821   
std                                   0.341946   
min                                   0.027972   
2%                                    0.056410   
5%                                    0.079944   
10%                                   0.092131   
50%                                   0.993174   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                                99.000000  
mean                                  0.942651  
std                                   0.115500  
min                                   0.285714  
2%                                    0.510198  
5%                                    0.778831  
10%                                   0.860101  
50%                                   0.985882  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count       99                           99.000000   
mean         1                            0.765140   
std          0                            0.219826   
min          1                            0.035176   
2%           1                            0.065534   
5%           1                            0.311998   
10%          1                            0.488654   
50%          1                            0.868217   
max          1                            0.971837   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                    99.000000   
mean                                      0.000388   
std                                       0.003860   
min                                       0.000000   
2%                                        0.000000   
5%                                        0.000000   
10%                                       0.000000   
50%                                       0.000000   
max                                       0.038405   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                          99.000000                           99.000000   
mean                            0.678045                            0.733985   
std                             0.188601                            0.177721   
min                             0.127827                            0.159309   
2%                              0.293293                            0.223785   
5%                              0.311104                            0.428092   
10%                             0.366320                            0.484591   
50%                             0.717622                            0.791304   
max                             0.933186                            0.959617   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                  99.000000   
mean                                    0.530552   
std                                     0.210729   
min                                     0.109541   
2%                                      0.140531   
5%                                      0.166162   
10%                                     0.200544   
50%                                     0.525896   
max                                     0.851478   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                  99.000000   
mean                                    0.746387   
std                                     0.266300   
min                                     0.038462   
2%                                      0.039443   
5%                                      0.059673   
10%                                     0.251082   
50%                                     0.866142   
max                                     0.979592   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                 99.000000             99.000000   
mean                                   0.720843              0.908288   
std                                    0.179827              0.140196   
min                                    0.140069              0.136364   
2%                                     0.315054              0.384693   
5%                                     0.396596              0.715988   
10%                                    0.441076              0.794279   
50%                                    0.766467              0.950276   
max                                    0.960774              1.000000   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                            99.000000   
mean                              0.693991   
std                               0.325722   
min                               0.025532   
2%                                0.038306   
5%                                0.056527   
10%                               0.070245   
50%                               0.830508   
max                               0.992573   

       f1_thr_boiler_pipe_DefaultExtractor  
count                            99.000000  
mean                              0.709473  
std                               0.212386  
min                               0.143426  
2%                                0.174606  
5%                                0.290379  
10%                               0.376066  
50%                               0.748092  
max                               0.988210  
political blogs / 125
       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count             150                                 150.000000   
mean                1                                   0.703533   
std                 0                                   0.360671   
min                 1                                   0.000000   
2%                  1                                   0.000000   
5%                  1                                   0.003984   
10%                 1                                   0.011152   
50%                 1                                   0.898303   
max                 1                                   0.988089   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                                150    
mean                                                   0    
std                                                    0    
min                                                    0    
2%                                                     0    
5%                                                     0    
10%                                                    0    
50%                                                    0    
max                                                    0    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                150.000000   
mean                                   0.515234   
std                                    0.341081   
min                                    0.000000   
2%                                     0.000000   
5%                                     0.003016   
10%                                    0.014508   
50%                                    0.555674   
max                                    0.973795   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                 150.000000   
mean                                    0.522629   
std                                     0.342647   
min                                     0.000000   
2%                                      0.000000   
5%                                      0.000000   
10%                                     0.005803   
50%                                     0.599262   
max                                     0.973963   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                        150.000000   
mean                                           0.369914   
std                                            0.278387   
min                                            0.002865   
2%                                             0.007767   
5%                                             0.015080   
10%                                            0.023499   
50%                                            0.298086   
max                                            0.913591   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                        150.000000   
mean                                           0.677239   
std                                            0.383520   
min                                            0.000000   
2%                                             0.000000   
5%                                             0.004188   
10%                                            0.012684   
50%                                            0.892375   
max                                            0.978162   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                       150.000000   
mean                                          0.572627   
std                                           0.333536   
min                                           0.000000   
2%                                            0.000000   
5%                                            0.002471   
10%                                           0.012913   
50%                                           0.681146   
max                                           0.973301   

       precision_python_readibilty  \
count                   150.000000   
mean                      0.880689   
std                       0.241102   
min                       0.013793   
2%                        0.030579   
5%                        0.186217   
10%                       0.599048   
50%                       0.985371   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                  150.000000   
mean                                     0.616935   
std                                      0.421249   
min                                      0.003302   
2%                                       0.005500   
5%                                       0.008824   
10%                                      0.014676   
50%                                      0.906002   
max                                      0.997970   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                  150.000000  
mean                                     0.484702  
std                                      0.325502  
min                                      0.003189  
2%                                       0.005959  
5%                                       0.012064  
10%                                      0.029434  
50%                                      0.455373  
max                                      0.997809  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count          150                              150.000000   
mean             1                                0.756760   
std              0                                0.342598   
min              1                                0.000000   
2%               1                                0.000000   
5%               1                                0.023901   
10%              1                                0.051914   
50%              1                                0.928193   
max              1                                0.991501   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                              150   
mean                                                 0   
std                                                  0   
min                                                  0   
2%                                                   0   
5%                                                   0   
10%                                                  0   
50%                                                  0   
max                                                  0   

       recall_py_boiler_pipe_CanolaExtractor  \
count                             150.000000   
mean                                0.775613   
std                                 0.281738   
min                                 0.000000   
2%                                  0.000000   
5%                                  0.072667   
10%                                 0.266144   
50%                                 0.898780   
max                                 0.995704   

       recall_py_boiler_pipe_DefaultExtractor  \
count                              150.000000   
mean                                 0.715632   
std                                  0.333288   
min                                  0.000000   
2%                                   0.000000   
5%                                   0.000000   
10%                                  0.077706   
50%                                  0.880511   
max                                  0.991501   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                     150.000000   
mean                                        0.911380   
std                                         0.109811   
min                                         0.050909   
2%                                          0.689231   
5%                                          0.722857   
10%                                         0.779471   
50%                                         0.946547   
max                                         0.998389   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                     150.000000   
mean                                        0.660467   
std                                         0.372133   
min                                         0.000000   
2%                                          0.000000   
5%                                          0.011912   
10%                                         0.034427   
50%                                         0.879248   
max                                         0.991501   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                    150.000000                150.000000   
mean                                       0.781152                  0.807832   
std                                        0.280683                  0.313501   
min                                        0.000000                  0.020690   
2%                                         0.000000                  0.035595   
5%                                         0.084909                  0.069238   
10%                                        0.234492                  0.166667   
50%                                        0.905032                  0.970968   
max                                        0.991501                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                               150.000000   
mean                                  0.769863   
std                                   0.359786   
min                                   0.010331   
2%                                    0.023252   
5%                                    0.036122   
10%                                   0.069009   
50%                                   0.976322   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                               150.000000  
mean                                  0.815129  
std                                   0.282623  
min                                   0.003704  
2%                                    0.031039  
5%                                    0.131712  
10%                                   0.301923  
50%                                   0.962453  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count      150                          150.000000   
mean         1                            0.718112   
std          0                            0.358481   
min          1                            0.000000   
2%           1                            0.000000   
5%           1                            0.006798   
10%          1                            0.017960   
50%          1                            0.912683   
max          1                            0.984093   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                          150   
mean                                             0   
std                                              0   
min                                              0   
2%                                               0   
5%                                               0   
10%                                              0   
50%                                              0   
max                                              0   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                         150.000000                          150.000000   
mean                            0.577217                            0.568006   
std                             0.330633                            0.345523   
min                             0.000000                            0.000000   
2%                              0.000000                            0.000000   
5%                              0.005862                            0.000000   
10%                             0.027886                            0.011238   
50%                             0.674047                            0.682651   
max                             0.969320                            0.971197   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                 150.000000   
mean                                    0.472859   
std                                     0.292861   
min                                     0.005706   
2%                                      0.015378   
5%                                      0.027311   
10%                                     0.045693   
50%                                     0.453552   
max                                     0.944083   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                 150.000000   
mean                                    0.659167   
std                                     0.376714   
min                                     0.000000   
2%                                      0.000000   
5%                                      0.007661   
10%                                     0.020191   
50%                                     0.867735   
max                                     0.980839   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                150.000000            150.000000   
mean                                   0.627502              0.807779   
std                                    0.324330              0.304467   
min                                    0.000000              0.026549   
2%                                     0.000000              0.034729   
5%                                     0.004894              0.072607   
10%                                    0.025091              0.165152   
50%                                    0.756476              0.963556   
max                                    0.973301              1.000000   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                           150.000000   
mean                              0.635670   
std                               0.419068   
min                               0.006577   
2%                                0.009602   
5%                                0.012090   
10%                               0.020877   
50%                               0.933609   
max                               0.998469   

       f1_thr_boiler_pipe_DefaultExtractor  
count                           150.000000  
mean                              0.530499  
std                               0.330138  
min                               0.006349  
2%                                0.010201  
5%                                0.020156  
10%                               0.042055  
50%                               0.580317  
max                               0.994242  
russian
       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count             132                                 132.000000   
mean                1                                   0.718432   
std                 0                                   0.252618   
min                 1                                   0.012500   
2%                  1                                   0.051124   
5%                  1                                   0.185462   
10%                 1                                   0.340725   
50%                 1                                   0.835164   
max                 1                                   0.982402   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                                132    
mean                                                   0    
std                                                    0    
min                                                    0    
2%                                                     0    
5%                                                     0    
10%                                                    0    
50%                                                    0    
max                                                    0    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                132.000000   
mean                                   0.488699   
std                                    0.250472   
min                                    0.054201   
2%                                     0.078920   
5%                                     0.114551   
10%                                    0.175987   
50%                                    0.441274   
max                                    0.925352   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                 132.000000   
mean                                    0.610990   
std                                     0.258515   
min                                     0.022857   
2%                                      0.112440   
5%                                      0.165727   
10%                                     0.247548   
50%                                     0.665667   
max                                     0.981651   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                        132.000000   
mean                                           0.188878   
std                                            0.142571   
min                                            0.010258   
2%                                             0.033324   
5%                                             0.039598   
10%                                            0.048948   
50%                                            0.148603   
max                                            0.835991   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                        132.000000   
mean                                           0.702744   
std                                            0.288666   
min                                            0.012500   
2%                                             0.044430   
5%                                             0.084635   
10%                                            0.158502   
50%                                            0.835874   
max                                            0.982402   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                       132.000000   
mean                                          0.614402   
std                                           0.262220   
min                                           0.067548   
2%                                            0.117832   
5%                                            0.162064   
10%                                           0.238983   
50%                                           0.711282   
max                                           0.982402   

       precision_python_readibilty  \
count                   132.000000   
mean                      0.946815   
std                       0.139890   
min                       0.018519   
2%                        0.503566   
5%                        0.842260   
10%                       0.904014   
50%                       0.980456   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                  132.000000   
mean                                     0.234777   
std                                      0.370754   
min                                      0.000000   
2%                                       0.000000   
5%                                       0.000000   
10%                                      0.003215   
50%                                      0.043435   
max                                      1.000000   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                  132.000000  
mean                                     0.220128  
std                                      0.347734  
min                                      0.001456  
2%                                       0.003306  
5%                                       0.004450  
10%                                      0.006159  
50%                                      0.030528  
max                                      1.000000  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count          132                              132.000000   
mean             1                                0.818267   
std              0                                0.220713   
min              1                                0.032882   
2%               1                                0.092117   
5%               1                                0.201786   
10%              1                                0.581737   
50%              1                                0.898746   
max              1                                1.000000   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                              132   
mean                                                 0   
std                                                  0   
min                                                  0   
2%                                                   0   
5%                                                   0   
10%                                                  0   
50%                                                  0   
max                                                  0   

       recall_py_boiler_pipe_CanolaExtractor  \
count                             132.000000   
mean                                0.804646   
std                                 0.146584   
min                                 0.297297   
2%                                  0.412065   
5%                                  0.503986   
10%                                 0.611248   
50%                                 0.845416   
max                                 0.993498   

       recall_py_boiler_pipe_DefaultExtractor  \
count                              132.000000   
mean                                 0.789282   
std                                  0.184720   
min                                  0.037736   
2%                                   0.222724   
5%                                   0.390545   
10%                                  0.594575   
50%                                  0.840412   
max                                  0.996778   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                     132.000000   
mean                                        0.913321   
std                                         0.078472   
min                                         0.381356   
2%                                          0.785777   
5%                                          0.833346   
10%                                         0.851894   
50%                                         0.921185   
max                                         1.000000   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                     132.000000   
mean                                        0.717450   
std                                         0.302408   
min                                         0.010778   
2%                                          0.035458   
5%                                          0.085721   
10%                                         0.127878   
50%                                         0.875702   
max                                         0.997409   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                    132.000000                132.000000   
mean                                       0.851578                  0.879423   
std                                        0.120377                  0.194934   
min                                        0.310811                  0.020725   
2%                                         0.476644                  0.057854   
5%                                         0.608980                  0.655007   
10%                                        0.714950                  0.754755   
50%                                        0.866012                  0.937048   
max                                        0.997852                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                               132.000000   
mean                                  0.237385   
std                                   0.376244   
min                                   0.000000   
2%                                    0.000000   
5%                                    0.000000   
10%                                   0.007983   
50%                                   0.051471   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                               132.000000  
mean                                  0.301652  
std                                   0.406205  
min                                   0.005319  
2%                                    0.009765  
5%                                    0.014811  
10%                                   0.017790  
50%                                   0.058824  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count      132                          132.000000   
mean         1                            0.749676   
std          0                            0.227920   
min          1                            0.018114   
2%           1                            0.063775   
5%           1                            0.187938   
10%          1                            0.478066   
50%          1                            0.818793   
max          1                            0.987313   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                          132   
mean                                             0   
std                                              0   
min                                              0   
2%                                               0   
5%                                               0   
10%                                              0   
50%                                              0   
max                                              0   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                         132.000000                          132.000000   
mean                            0.573987                            0.661798   
std                             0.222509                            0.225256   
min                             0.092437                            0.036866   
2%                              0.137648                            0.101141   
5%                              0.183537                            0.253035   
10%                             0.280725                            0.340049   
50%                             0.570221                            0.715686   
max                             0.920245                            0.986933   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                 132.000000   
mean                                    0.293188   
std                                     0.173252   
min                                     0.020263   
2%                                      0.062051   
5%                                      0.075902   
10%                                     0.088069   
50%                                     0.254443   
max                                     0.878564   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                 132.000000   
mean                                    0.692104   
std                                     0.285086   
min                                     0.012821   
2%                                      0.033738   
5%                                      0.075729   
10%                                     0.135805   
50%                                     0.802698   
max                                     0.987313   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                132.000000            132.000000   
mean                                   0.684560              0.899931   
std                                    0.222306              0.184317   
min                                    0.110977              0.026781   
2%                                     0.194573              0.098648   
5%                                     0.268575              0.778182   
10%                                    0.365804              0.843508   
50%                                    0.768441              0.944767   
max                                    0.987313              0.999427   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                           132.000000   
mean                              0.231709   
std                               0.371334   
min                               0.000000   
2%                                0.000000   
5%                                0.000000   
10%                               0.005065   
50%                               0.047168   
max                               1.000000   

       f1_thr_boiler_pipe_DefaultExtractor  
count                           132.000000  
mean                              0.241120  
std                               0.357583  
min                               0.002375  
2%                                0.005276  
5%                                0.006912  
10%                               0.009496  
50%                               0.039170  
max                               0.997347  
brazil
       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count               5                                   5.000000   
mean                1                                   0.801957   
std                 0                                   0.153987   
min                 1                                   0.538341   
2%                  1                                   0.561037   
5%                  1                                   0.595080   
10%                 1                                   0.651818   
50%                 1                                   0.828571   
max                 1                                   0.929329   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                                  5    
mean                                                   0    
std                                                    0    
min                                                    0    
2%                                                     0    
5%                                                     0    
10%                                                    0    
50%                                                    0    
max                                                    0    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                  5.000000   
mean                                   0.664670   
std                                    0.273746   
min                                    0.180902   
2%                                     0.224373   
5%                                     0.289580   
10%                                    0.398258   
50%                                    0.767857   
max                                    0.826772   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                   5.000000   
mean                                    0.719327   
std                                     0.319628   
min                                     0.157421   
2%                                      0.207379   
5%                                      0.282316   
10%                                     0.407210   
50%                                     0.840909   
max                                     0.946043   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                          5.000000   
mean                                           0.284243   
std                                            0.107933   
min                                            0.118172   
2%                                             0.128293   
5%                                             0.143474   
10%                                            0.168776   
50%                                            0.309410   
max                                            0.390855   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                          5.000000   
mean                                           0.888106   
std                                            0.039403   
min                                            0.820312   
2%                                             0.826008   
5%                                             0.834552   
10%                                            0.848791   
50%                                            0.902326   
max                                            0.921622   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                         5.000000   
mean                                          0.734233   
std                                           0.323688   
min                                           0.161388   
2%                                            0.215733   
5%                                            0.297252   
10%                                           0.433116   
50%                                           0.840909   
max                                           0.956124   

       precision_python_readibilty  \
count                     5.000000   
mean                      0.962387   
std                       0.030849   
min                       0.933014   
2%                        0.933860   
5%                        0.935129   
10%                       0.937245   
50%                       0.944000   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                    5.000000   
mean                                     0.742769   
std                                      0.388031   
min                                      0.050514   
2%                                       0.116920   
5%                                       0.216530   
10%                                      0.382547   
50%                                      0.901408   
max                                      0.957516   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                    5.000000  
mean                                     0.640169  
std                                      0.355717  
min                                      0.037340  
2%                                       0.087591  
5%                                       0.162968  
10%                                      0.288596  
50%                                      0.690391  
max                                      0.920415  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count            5                                5.000000   
mean             1                                0.979646   
std              0                                0.009266   
min              1                                0.969231   
2%               1                                0.969331   
5%               1                                0.969481   
10%              1                                0.969730   
50%              1                                0.983051   
max              1                                0.989796   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                                5   
mean                                                 0   
std                                                  0   
min                                                  0   
2%                                                   0   
5%                                                   0   
10%                                                  0   
50%                                                  0   
max                                                  0   

       recall_py_boiler_pipe_CanolaExtractor  \
count                               5.000000   
mean                                0.914842   
std                                 0.039027   
min                                 0.882051   
2%                                  0.882674   
5%                                  0.883607   
10%                                 0.885163   
50%                                 0.896679   
max                                 0.977077   

       recall_py_boiler_pipe_DefaultExtractor  \
count                                5.000000   
mean                                 0.949445   
std                                  0.028846   
min                                  0.902579   
2%                                   0.905627   
5%                                   0.910199   
10%                                  0.917818   
50%                                  0.964103   
max                                  0.970480   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                       5.000000   
mean                                        0.978402   
std                                         0.010026   
min                                         0.964103   
2%                                          0.964940   
5%                                          0.966197   
10%                                         0.968292   
50%                                         0.977860   
max                                         0.989796   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                       5.000000   
mean                                        0.870353   
std                                         0.196538   
min                                         0.525830   
2%                                          0.554950   
5%                                          0.598630   
10%                                         0.671430   
50%                                         0.969231   
max                                         0.989796   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                      5.000000                  5.000000   
mean                                       0.947674                  0.979889   
std                                        0.020087                  0.025059   
min                                        0.919771                  0.938776   
2%                                         0.921443                  0.941902   
5%                                         0.923952                  0.946592   
10%                                        0.928134                  0.954409   
50%                                        0.943590                  0.982808   
max                                        0.969388                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                                 5.000000   
mean                                  0.736822   
std                                   0.372712   
min                                   0.169054   
2%                                    0.198777   
5%                                    0.243362   
10%                                   0.317669   
50%                                   0.979592   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                                 5.000000  
mean                                  0.822777  
std                                   0.357876  
min                                   0.183381  
2%                                    0.245037  
5%                                    0.337521  
10%                                   0.491661  
50%                                   0.981550  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count        5                            5.000000   
mean         1                            0.874387   
std          0                            0.101817   
min          1                            0.696356   
2%           1                            0.712500   
5%           1                            0.736715   
10%          1                            0.777073   
50%          1                            0.899225   
max          1                            0.949458   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                            5   
mean                                             0   
std                                              0   
min                                              0   
2%                                               0   
5%                                               0   
10%                                              0   
50%                                              0   
max                                              0   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                           5.000000                            5.000000   
mean                            0.731530                            0.778927   
std                             0.239962                            0.287640   
min                             0.305282                            0.268085   
2%                              0.344965                            0.315887   
5%                              0.404489                            0.387589   
10%                             0.503697                            0.507093   
50%                             0.821002                            0.888000   
max                             0.872902                            0.958106   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                   5.000000   
mean                                    0.430870   
std                                     0.138002   
min                                     0.211043   
2%                                      0.225452   
5%                                      0.247066   
10%                                     0.283088   
50%                                     0.471446   
max                                     0.558483   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                   5.000000   
mean                                    0.868019   
std                                     0.119717   
min                                     0.665111   
2%                                      0.680195   
5%                                      0.702820   
10%                                     0.740530   
50%                                     0.928747   
max                                     0.948540   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                  5.000000              5.000000   
mean                                   0.785997              0.970715   
std                                    0.287227              0.019337   
min                                    0.274594              0.941176   
2%                                     0.323666              0.943110   
5%                                     0.397275              0.946010   
10%                                    0.519956              0.950844   
50%                                    0.900474              0.971193   
max                                    0.960514              0.988806   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                             5.000000   
mean                              0.720446   
std                               0.375738   
min                               0.077785   
2%                                0.126845   
5%                                0.200436   
10%                               0.323086   
50%                               0.936508   
max                               0.958025   

       f1_thr_boiler_pipe_DefaultExtractor  
count                             5.000000  
mean                              0.710296  
std                               0.369795  
min                               0.062046  
2%                                0.119807  
5%                                0.206450  
10%                               0.350854  
50%                               0.815126  
max                               0.950000  
arabic
       precision_gold  precision_py_boiler_pipe_ArticleExtractor  \
count             117                                 117.000000   
mean                1                                   0.741376   
std                 0                                   0.252984   
min                 1                                   0.010726   
2%                  1                                   0.022730   
5%                  1                                   0.060870   
10%                 1                                   0.330385   
50%                 1                                   0.828947   
max                 1                                   0.980379   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                                117    
mean                                                   0    
std                                                    0    
min                                                    0    
2%                                                     0    
5%                                                     0    
10%                                                    0    
50%                                                    0    
max                                                    0    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                117.000000   
mean                                   0.588046   
std                                    0.253096   
min                                    0.001637   
2%                                     0.027450   
5%                                     0.159398   
10%                                    0.235953   
50%                                    0.638584   
max                                    0.976285   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                 117.000000   
mean                                    0.657585   
std                                     0.245890   
min                                     0.005348   
2%                                      0.058647   
5%                                      0.155817   
10%                                     0.283340   
50%                                     0.736264   
max                                     0.978873   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                        117.000000   
mean                                           0.336254   
std                                            0.219627   
min                                            0.001162   
2%                                             0.027610   
5%                                             0.079325   
10%                                            0.100802   
50%                                            0.305410   
max                                            0.910245   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                        117.000000   
mean                                           0.755409   
std                                            0.255409   
min                                            0.000000   
2%                                             0.014231   
5%                                             0.023593   
10%                                            0.490368   
50%                                            0.837349   
max                                            0.980232   

       precision_py_boiler_pipe_NumWordsRulesExtractor  \
count                                       117.000000   
mean                                          0.711242   
std                                           0.224088   
min                                           0.013514   
2%                                            0.058882   
5%                                            0.280358   
10%                                           0.358444   
50%                                           0.785714   
max                                           0.978873   

       precision_python_readibilty  \
count                   117.000000   
mean                      0.929890   
std                       0.134994   
min                       0.241379   
2%                        0.414138   
5%                        0.666667   
10%                       0.857168   
50%                       0.980952   
max                       1.000000   

       precision_thr_boiler_pipe_ArticleExtractor  \
count                                  117.000000   
mean                                     0.756990   
std                                      0.297713   
min                                      0.011182   
2%                                       0.027757   
5%                                       0.043883   
10%                                      0.130529   
50%                                      0.881720   
max                                      0.997958   

       precision_thr_boiler_pipe_DefaultExtractor  
count                                  117.000000  
mean                                     0.674588  
std                                      0.298915  
min                                      0.000000  
2%                                       0.033720  
5%                                       0.120281  
10%                                      0.236147  
50%                                      0.802956  
max                                      1.000000  
       recall_gold  recall_py_boiler_pipe_ArticleExtractor  \
count          117                              117.000000   
mean             1                                0.839014   
std              0                                0.255579   
min              1                                0.012346   
2%               1                                0.024636   
5%               1                                0.037730   
10%              1                                0.632387   
50%              1                                0.936508   
max              1                                1.000000   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                              117   
mean                                                 0   
std                                                  0   
min                                                  0   
2%                                                   0   
5%                                                   0   
10%                                                  0   
50%                                                  0   
max                                                  0   

       recall_py_boiler_pipe_CanolaExtractor  \
count                             117.000000   
mean                                0.766722   
std                                 0.208122   
min                                 0.012346   
2%                                  0.139507   
5%                                  0.195693   
10%                                 0.552758   
50%                                 0.825397   
max                                 0.997429   

       recall_py_boiler_pipe_DefaultExtractor  \
count                              117.000000   
mean                                 0.701666   
std                                  0.293111   
min                                  0.012346   
2%                                   0.026452   
5%                                   0.045974   
10%                                  0.126923   
50%                                  0.819788   
max                                  0.997429   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                     117.000000   
mean                                        0.922749   
std                                         0.133383   
min                                         0.007463   
2%                                          0.590000   
5%                                          0.830303   
10%                                         0.874678   
50%                                         0.952000   
max                                         1.000000   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                     117.000000   
mean                                        0.768446   
std                                         0.282090   
min                                         0.000000   
2%                                          0.020848   
5%                                          0.032891   
10%                                         0.138356   
50%                                         0.868613   
max                                         0.994859   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_python_readibilty  \
count                                    117.000000                117.000000   
mean                                       0.801901                  0.830034   
std                                        0.193662                  0.255519   
min                                        0.012346                  0.012255   
2%                                         0.112381                  0.041673   
5%                                         0.398237                  0.093980   
10%                                        0.607539                  0.509681   
50%                                        0.865079                  0.921986   
max                                        0.997429                  1.000000   

       recall_thr_boiler_pipe_ArticleExtractor  \
count                               117.000000   
mean                                  0.842179   
std                                   0.314171   
min                                   0.023932   
2%                                    0.040051   
5%                                    0.062071   
10%                                   0.151961   
50%                                   1.000000   
max                                   1.000000   

       recall_thr_boiler_pipe_DefaultExtractor  
count                               117.000000  
mean                                  0.759565  
std                                   0.347585  
min                                   0.000000  
2%                                    0.032252  
5%                                    0.060219  
10%                                   0.117895  
50%                                   0.966667  
max                                   1.000000  
       f1_gold  f1_py_boiler_pipe_ArticleExtractor  \
count      117                          117.000000   
mean         1                            0.774978   
std          0                            0.255278   
min          1                            0.014469   
2%           1                            0.028784   
5%           1                            0.036998   
10%          1                            0.405667   
50%          1                            0.874074   
max          1                            0.981043   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                          117   
mean                                             0   
std                                              0   
min                                              0   
2%                                               0   
5%                                               0   
10%                                              0   
50%                                              0   
max                                              0   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                         117.000000                          117.000000   
mean                            0.645211                            0.655535   
std                             0.235679                            0.267798   
min                             0.003236                            0.010309   
2%                              0.024550                            0.025866   
5%                              0.194915                            0.068729   
10%                             0.296855                            0.171742   
50%                             0.707692                            0.765854   
max                             0.973399                            0.971214   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                 117.000000   
mean                                    0.459616   
std                                     0.228261   
min                                     0.002321   
2%                                      0.049776   
5%                                      0.146173   
10%                                     0.182303   
50%                                     0.453368   
max                                     0.946723   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                 117.000000   
mean                                    0.754560   
std                                     0.268122   
min                                     0.000000   
2%                                      0.016508   
5%                                      0.031962   
10%                                     0.192451   
50%                                     0.850000   
max                                     0.980392   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_python_readibilty  \
count                                117.000000            117.000000   
mean                                   0.741682              0.844823   
std                                    0.209849              0.235643   
min                                    0.014599              0.023585   
2%                                     0.056917              0.079781   
5%                                     0.366651              0.171765   
10%                                    0.473788              0.558496   
50%                                    0.820755              0.933902   
max                                    0.971641              1.000000   

       f1_thr_boiler_pipe_ArticleExtractor  \
count                           117.000000   
mean                              0.776713   
std                               0.305971   
min                               0.015242   
2%                                0.032739   
5%                                0.051157   
10%                               0.145678   
50%                               0.927273   
max                               0.997361   

       f1_thr_boiler_pipe_DefaultExtractor  
count                           117.000000  
mean                              0.656418  
std                               0.314089  
min                               0.000000  
2%                                0.037984  
5%                                0.055188  
10%                               0.154126  
50%                               0.735484  
max                               0.998779  

In [67]:
boiler_pipe_extractor_training_objects = cPickle.load( open( "boiler_pipe_google_news_extractor_training_objects.pickle", "rb") )
#eto = extractor_training_objects[ 0 ]
#eto.keys()
#print eto['expected_text']
#get_extraction_results( eto )
#comp_extractors ( eto )

comps_downloads_boiler_pipe = []
processed = 0
skipped = 0
start_time = datetime.datetime.now()
e=None

for extractor_training_object in  boiler_pipe_extractor_training_objects[:]:
    try:
        res = comp_extractors( extractor_training_object )
        #print res
        comps_downloads_boiler_pipe.append( res )
        processed += 1
    except Exception, e:
        print "error on download{}".format( extractor_training_object[ 'downloads_id'] )
        e = sys.exc_info()
        
        import traceback
        
        traceback.print_exc()
        print e
        #raise e
        skipped += 1

    print 'processed', processed, 'skipped', skipped

    #extraction_results.append( er )

end_time = datetime.datetime.now()


print "Total_time", end_time - start_time

print "Time per download", (end_time - start_time)/ (processed + skipped )
    
res.keys()


processed 1 skipped 0
processed 2 skipped 0
processed 3 skipped 0
processed 4 skipped 0
processed 5 skipped 0
processed 6 skipped 0
processed 7 skipped 0
error on download045cb317-60ad-454d-add8-3baa40789258.html
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcdc70248>)
processed 7 skipped 1
processed 8 skipped 1
processed 9 skipped 1
processed 10 skipped 1
error on download065445c6-e5e0-4006-ba4b-31711c4a6a4b.html
Traceback (most recent call last):
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 52, in store_image
    image = self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd4117a0>)
processed 10 skipped 2
processed 11 skipped 2
processed 12 skipped 2
processed 13 skipped 2
processed 14 skipped 2
processed 15 skipped 2
processed 16 skipped 2
processed 17 skipped 2
processed 18 skipped 2
processed 19 skipped 2
processed 20 skipped 2
error on download0acd5213-35e1-4039-adb5-6c7611911b9e.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 52, in store_image
    image = self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de71a79e0>)
processed 20 skipped 3
processed 21 skipped 3
processed 22 skipped 3
processed 23 skipped 3
processed 24 skipped 3
processed 25 skipped 3
processed 26 skipped 3
processed 27 skipped 3
processed 28 skipped 3
error on download0fb4f846-1043-4578-8863-a4bf82dccb74.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4e0f04e3b0>)
processed 28 skipped 4
processed 29 skipped 4
processed 30 skipped 4
processed 31 skipped 4
processed 32 skipped 4
processed 33 skipped 4
processed 34 skipped 4
processed 35 skipped 4
processed 36 skipped 4
processed 37 skipped 4
processed 38 skipped 4
processed 39 skipped 4
processed 40 skipped 4
processed 41 skipped 4
processed 42 skipped 4
processed 43 skipped 4
processed 44 skipped 4
processed 45 skipped 4
processed 46 skipped 4
processed 47 skipped 4
processed 48 skipped 4
processed 49 skipped 4
processed 50 skipped 4
processed 51 skipped 4
processed 52 skipped 4
processed 53 skipped 4
processed 54 skipped 4
processed 55 skipped 4
processed 56 skipped 4
processed 57 skipped 4
processed 58 skipped 4
processed 59 skipped 4
processed 60 skipped 4
processed 61 skipped 4
processed 62 skipped 4
processed 63 skipped 4
processed 64 skipped 4
processed 65 skipped 4
processed 66 skipped 4
processed 67 skipped 4
processed 68 skipped 4
processed 69 skipped 4
processed 70 skipped 4
processed 71 skipped 4
processed 72 skipped 4
processed 73 skipped 4
processed 74 skipped 4
processed 75 skipped 4
processed 76 skipped 4
processed 77 skipped 4
processed 78 skipped 4
processed 79 skipped 4
processed 80 skipped 4
processed 81 skipped 4
processed 82 skipped 4
processed 83 skipped 4
processed 84 skipped 4
processed 85 skipped 4
processed 86 skipped 4
processed 87 skipped 4
processed 88 skipped 4
processed 89 skipped 4
processed 90 skipped 4
processed 91 skipped 4
processed 92 skipped 4
processed 93 skipped 4
processed 94 skipped 4
error on download2d25dd6c-5093-4be6-b801-d671416c2e61.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcdba0098>)
processed 94 skipped 5
processed 95 skipped 5
processed 96 skipped 5
processed 97 skipped 5
error on download2fc045e8-a8a6-4ae4-9269-a8892fe69085.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4dcdc7a248>)
processed 97 skipped 6
error on download2fd3440e-8cf1-422d-8b6b-0ec1082cbebb.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
    doc = self.get_document(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
    doc = self.parser.fromstring(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
    self.doc = lxml.html.fromstring(html)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
    value = etree.fromstring(html, parser, **kw)
  File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
  File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccd12098>)
processed 97 skipped 7
processed 98 skipped 7
processed 99 skipped 7
processed 100 skipped 7
processed 101 skipped 7
processed 102 skipped 7
processed 103 skipped 7
processed 104 skipped 7
processed 105 skipped 7
processed 106 skipped 7
error on download3164baec-188a-4116-9aed-a18041854535.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError("Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-fa.txt",), <traceback object at 0x7f4dcd474560>)
processed 106 skipped 8
processed 107 skipped 8
processed 108 skipped 8
processed 109 skipped 8
processed 110 skipped 8
processed 111 skipped 8
processed 112 skipped 8
processed 113 skipped 8
processed 114 skipped 8
processed 115 skipped 8
processed 116 skipped 8
processed 117 skipped 8
processed 118 skipped 8
processed 119 skipped 8
processed 120 skipped 8
processed 121 skipped 8
processed 122 skipped 8
processed 123 skipped 8
processed 124 skipped 8
processed 125 skipped 8
processed 126 skipped 8
processed 127 skipped 8
processed 128 skipped 8
processed 129 skipped 8
processed 130 skipped 8
processed 131 skipped 8
processed 132 skipped 8
processed 133 skipped 8
processed 134 skipped 8
processed 135 skipped 8
processed 136 skipped 8
processed 137 skipped 8
processed 138 skipped 8
error on download41f7673a-553d-4893-9a51-ef7bb0ce6293.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 131, in crawl
    self.article.cleaned_text = self.formatter.get_formatted_text()
  File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 66, in get_formatted_text
    self.remove_fewwords_paragraphs()
  File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 123, in remove_fewwords_paragraphs
    stop_words = self.stopwords_class(language=self.get_language()).get_stopword_count(text)
  File "/usr/local/lib/python2.7/dist-packages/goose/text.py", line 98, in __init__
    self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines())
  File "/usr/local/lib/python2.7/dist-packages/goose/utils/__init__.py", line 79, in loadResourceFile
    raise IOError("Couldn't open file %s" % path)
IOError: Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-fa.txt
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccce5908>)
processed 138 skipped 9
error on download42abb137-8a90-47db-90e2-3f8013cb9ae8.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da2191b48>)
processed 138 skipped 10
processed 139 skipped 10
error on download43fb5dbe-6f8c-45f8-bd59-7b258df028fb.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4de618fc68>)
processed 139 skipped 11
processed 140 skipped 11
processed 141 skipped 11
processed 142 skipped 11
processed 143 skipped 11
processed 144 skipped 11
processed 145 skipped 11
processed 146 skipped 11
processed 147 skipped 11
processed 148 skipped 11
processed 149 skipped 11
processed 150 skipped 11
processed 151 skipped 11
processed 152 skipped 11
processed 153 skipped 11
processed 154 skipped 11
processed 155 skipped 11
processed 156 skipped 11
processed 157 skipped 11
processed 158 skipped 11
processed 159 skipped 11
processed 160 skipped 11
processed 161 skipped 11
processed 162 skipped 11
processed 163 skipped 11
processed 164 skipped 11
processed 165 skipped 11
processed 166 skipped 11
processed 167 skipped 11
processed 168 skipped 11
processed 169 skipped 11
processed 170 skipped 11
processed 171 skipped 11
processed 172 skipped 11
processed 173 skipped 11
error on download50f75f84-f64b-4d08-87dc-b351742a4e4b.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
    doc = self.get_document(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
    doc = self.parser.fromstring(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
    self.doc = lxml.html.fromstring(html)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
    value = etree.fromstring(html, parser, **kw)
  File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
  File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da21b62d8>)
processed 173 skipped 12
processed 174 skipped 12
processed 175 skipped 12
processed 176 skipped 12
processed 177 skipped 12
processed 178 skipped 12
processed 179 skipped 12
processed 180 skipped 12
processed 181 skipped 12
processed 182 skipped 12
processed 183 skipped 12
processed 184 skipped 12
processed 185 skipped 12
processed 186 skipped 12
processed 187 skipped 12
processed 188 skipped 12
processed 189 skipped 12
processed 190 skipped 12
processed 191 skipped 12
processed 192 skipped 12
processed 193 skipped 12
processed 194 skipped 12
processed 195 skipped 12
processed 196 skipped 12
processed 197 skipped 12
processed 198 skipped 12
processed 199 skipped 12
processed 200 skipped 12
processed 201 skipped 12
processed 202 skipped 12
processed 203 skipped 12
processed 204 skipped 12
processed 205 skipped 12
processed 206 skipped 12
processed 207 skipped 12
processed 208 skipped 12
processed 209 skipped 12
processed 210 skipped 12
processed 211 skipped 12
processed 212 skipped 12
processed 213 skipped 12
processed 214 skipped 12
processed 215 skipped 12
processed 216 skipped 12
processed 217 skipped 12
processed 218 skipped 12
processed 219 skipped 12
processed 220 skipped 12
processed 221 skipped 12
processed 222 skipped 12
processed 223 skipped 12
processed 224 skipped 12
processed 225 skipped 12
processed 226 skipped 12
error on download693c707d-eeab-4486-97d0-2f3d286d1a0e.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4da21aca28>)
processed 226 skipped 13
processed 227 skipped 13
processed 228 skipped 13
processed 229 skipped 13
error on download69df3069-b6f2-4624-8f94-c9aa23298e3b.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
    doc = self.get_document(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
    doc = self.parser.fromstring(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
    self.doc = lxml.html.fromstring(html)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
    value = etree.fromstring(html, parser, **kw)
  File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
  File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4df95eea28>)
processed 229 skipped 14
processed 230 skipped 14
error on download6bb2477c-bb70-4027-8851-d15a51cd9a49.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de72344d0>)
processed 230 skipped 15
error on download6bfd738f-ffe9-4303-948e-c8d75a2944e1.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de618f098>)
processed 230 skipped 16
processed 231 skipped 16
processed 232 skipped 16
processed 233 skipped 16
processed 234 skipped 16
processed 235 skipped 16
processed 236 skipped 16
error on download6e371db7-72dc-4e89-9bc5-2c259cf456e1.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccd0a950>)
processed 236 skipped 17
processed 237 skipped 17
processed 238 skipped 17
processed 239 skipped 17
processed 240 skipped 17
processed 241 skipped 17
processed 242 skipped 17
processed 243 skipped 17
processed 244 skipped 17
processed 245 skipped 17
processed 246 skipped 17
processed 247 skipped 17
processed 248 skipped 17
processed 249 skipped 17
processed 250 skipped 17
processed 251 skipped 17
processed 252 skipped 17
processed 253 skipped 17
processed 254 skipped 17
processed 255 skipped 17
processed 256 skipped 17
processed 257 skipped 17
error on download756ff125-8afa-43d2-a2bb-f9e742109202.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da15cc2d8>)
processed 257 skipped 18
processed 258 skipped 18
processed 259 skipped 18
processed 260 skipped 18
processed 261 skipped 18
processed 262 skipped 18
processed 263 skipped 18
processed 264 skipped 18
processed 265 skipped 18
error on download79f28b11-26b6-4a20-8e9b-de1064b8bc01.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da21ac638>)
processed 265 skipped 19
processed 266 skipped 19
processed 267 skipped 19
processed 268 skipped 19
processed 269 skipped 19
processed 270 skipped 19
processed 271 skipped 19
processed 272 skipped 19
processed 273 skipped 19
processed 274 skipped 19
processed 275 skipped 19
processed 276 skipped 19
processed 277 skipped 19
processed 278 skipped 19
processed 279 skipped 19
processed 280 skipped 19
processed 281 skipped 19
processed 282 skipped 19
processed 283 skipped 19
processed 284 skipped 19
processed 285 skipped 19
processed 286 skipped 19
processed 287 skipped 19
processed 288 skipped 19
processed 289 skipped 19
processed 290 skipped 19
processed 291 skipped 19
processed 292 skipped 19
processed 293 skipped 19
error on download89fc8a0d-3841-4ffc-8366-18fa10cbae64.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcdb816c8>)
processed 293 skipped 20
processed 294 skipped 20
processed 295 skipped 20
processed 296 skipped 20
processed 297 skipped 20
processed 298 skipped 20
processed 299 skipped 20
processed 300 skipped 20
processed 301 skipped 20
processed 302 skipped 20
processed 303 skipped 20
processed 304 skipped 20
processed 305 skipped 20
processed 306 skipped 20
processed 307 skipped 20
processed 308 skipped 20
processed 309 skipped 20
processed 310 skipped 20
processed 311 skipped 20
processed 312 skipped 20
processed 313 skipped 20
error on download931bd5b0-b1c7-4026-b937-a2404e3d8891.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccd12ea8>)
processed 313 skipped 21
processed 314 skipped 21
processed 315 skipped 21
processed 316 skipped 21
processed 317 skipped 21
processed 318 skipped 21
processed 319 skipped 21
processed 320 skipped 21
processed 321 skipped 21
processed 322 skipped 21
processed 323 skipped 21
processed 324 skipped 21
processed 325 skipped 21
processed 326 skipped 21
processed 327 skipped 21
processed 328 skipped 21
processed 329 skipped 21
processed 330 skipped 21
processed 331 skipped 21
processed 332 skipped 21
processed 333 skipped 21
processed 334 skipped 21
processed 335 skipped 21
processed 336 skipped 21
processed 337 skipped 21
processed 338 skipped 21
processed 339 skipped 21
processed 340 skipped 21
processed 341 skipped 21
processed 342 skipped 21
processed 343 skipped 21
processed 344 skipped 21
processed 345 skipped 21
processed 346 skipped 21
processed 347 skipped 21
processed 348 skipped 21
processed 349 skipped 21
processed 350 skipped 21
processed 351 skipped 21
processed 352 skipped 21
processed 353 skipped 21
processed 354 skipped 21
processed 355 skipped 21
processed 356 skipped 21
processed 357 skipped 21
processed 358 skipped 21
processed 359 skipped 21
processed 360 skipped 21
processed 361 skipped 21
processed 362 skipped 21
processed 363 skipped 21
processed 364 skipped 21
processed 365 skipped 21
processed 366 skipped 21
processed 367 skipped 21
processed 368 skipped 21
processed 369 skipped 21
processed 370 skipped 21
processed 371 skipped 21
error on downloada8bbed64-c630-4e89-bfc3-795a7aebe780.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4da15f0248>)
processed 371 skipped 22
processed 372 skipped 22
processed 373 skipped 22
processed 374 skipped 22
processed 375 skipped 22
processed 376 skipped 22
processed 377 skipped 22
processed 378 skipped 22
processed 379 skipped 22
processed 380 skipped 22
processed 381 skipped 22
processed 382 skipped 22
processed 383 skipped 22
processed 384 skipped 22
processed 385 skipped 22
processed 386 skipped 22
processed 387 skipped 22
processed 388 skipped 22
processed 389 skipped 22
processed 390 skipped 22
processed 391 skipped 22
error on downloadb09fe710-8e21-48bb-abf7-1cc842b668ea.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de604b7a0>)
processed 391 skipped 23
processed 392 skipped 23
processed 393 skipped 23
processed 394 skipped 23
processed 395 skipped 23
processed 396 skipped 23
processed 397 skipped 23
processed 398 skipped 23
processed 399 skipped 23
processed 400 skipped 23
processed 401 skipped 23
processed 402 skipped 23
processed 403 skipped 23
processed 404 skipped 23
processed 405 skipped 23
processed 406 skipped 23
processed 407 skipped 23
processed 408 skipped 23
processed 409 skipped 23
processed 410 skipped 23
processed 411 skipped 23
processed 412 skipped 23
processed 413 skipped 23
processed 414 skipped 23
processed 415 skipped 23
processed 416 skipped 23
processed 417 skipped 23
processed 418 skipped 23
error on downloadba66beed-a665-4cce-8ad3-61f7e05f0c47.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 52, in store_image
    image = self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd647d88>)
processed 418 skipped 24
error on downloadbaeee9ce-b4a7-4fdc-ba0c-f5400a0a76ef.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4de71a7518>)
processed 418 skipped 25
processed 419 skipped 25
processed 420 skipped 25
processed 421 skipped 25
processed 422 skipped 25
processed 423 skipped 25
processed 424 skipped 25
processed 425 skipped 25
processed 426 skipped 25
processed 427 skipped 25
processed 428 skipped 25
processed 429 skipped 25
processed 430 skipped 25
processed 431 skipped 25
processed 432 skipped 25
processed 433 skipped 25
processed 434 skipped 25
processed 435 skipped 25
processed 436 skipped 25
processed 437 skipped 25
processed 438 skipped 25
processed 439 skipped 25
processed 440 skipped 25
processed 441 skipped 25
processed 442 skipped 25
processed 443 skipped 25
processed 444 skipped 25
processed 445 skipped 25
processed 446 skipped 25
processed 447 skipped 25
processed 448 skipped 25
processed 449 skipped 25
processed 450 skipped 25
error on downloadc6604e4c-239d-43df-9023-906bc4136622.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccceae18>)
processed 450 skipped 26
processed 451 skipped 26
processed 452 skipped 26
processed 453 skipped 26
processed 454 skipped 26
processed 455 skipped 26
error on downloadc8a6ace8-c4e8-4d05-8e85-5e58284151bb.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 52, in store_image
    image = self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd661c68>)
processed 455 skipped 27
processed 456 skipped 27
processed 457 skipped 27
processed 458 skipped 27
processed 459 skipped 27
processed 460 skipped 27
processed 461 skipped 27
processed 462 skipped 27
error on downloadcbeeb77e-294d-4ac9-9139-ce587ec19626.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4df95ee0e0>)
processed 462 skipped 28
processed 463 skipped 28
processed 464 skipped 28
processed 465 skipped 28
processed 466 skipped 28
processed 467 skipped 28
processed 468 skipped 28
error on downloadceaaf8b9-4739-4250-9d9d-f868e68872fc.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4da2a1db48>)
processed 468 skipped 29
processed 469 skipped 29
error on downloadd00505ef-6e10-4616-88f0-d1165292c417.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
    doc = self.get_document(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
    doc = self.parser.fromstring(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
    self.doc = lxml.html.fromstring(html)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
    value = etree.fromstring(html, parser, **kw)
  File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
  File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError("Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-vi.txt",), <traceback object at 0x7f4de6585d88>)
processed 469 skipped 30
processed 470 skipped 30
processed 471 skipped 30
processed 472 skipped 30
processed 473 skipped 30
processed 474 skipped 30
processed 475 skipped 30
processed 476 skipped 30
processed 477 skipped 30
processed 478 skipped 30
error on downloadd35ceaa6-3fb5-4d60-bc6c-82f1a23bb59c.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 131, in crawl
    self.article.cleaned_text = self.formatter.get_formatted_text()
  File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 66, in get_formatted_text
    self.remove_fewwords_paragraphs()
  File "/usr/local/lib/python2.7/dist-packages/goose/outputformatters.py", line 123, in remove_fewwords_paragraphs
    stop_words = self.stopwords_class(language=self.get_language()).get_stopword_count(text)
  File "/usr/local/lib/python2.7/dist-packages/goose/text.py", line 98, in __init__
    self._cached_stop_words[language] = set(FileHelper.loadResourceFile(path).splitlines())
  File "/usr/local/lib/python2.7/dist-packages/goose/utils/__init__.py", line 79, in loadResourceFile
    raise IOError("Couldn't open file %s" % path)
IOError: Couldn't open file /usr/local/lib/python2.7/dist-packages/goose/resources/text/stopwords-vi.txt
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dccce5f38>)
processed 478 skipped 31
processed 479 skipped 31
processed 480 skipped 31
processed 481 skipped 31
processed 482 skipped 31
processed 483 skipped 31
processed 484 skipped 31
processed 485 skipped 31
processed 486 skipped 31
processed 487 skipped 31
processed 488 skipped 31
processed 489 skipped 31
error on downloaddc9814d5-b1eb-4095-a183-05c3be64c537.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.ValueError'>, ValueError(u'Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.',), <traceback object at 0x7f4dcd474dd0>)
processed 489 skipped 32
processed 490 skipped 32
processed 491 skipped 32
processed 492 skipped 32
processed 493 skipped 32
processed 494 skipped 32
processed 495 skipped 32
processed 496 skipped 32
processed 497 skipped 32
processed 498 skipped 32
processed 499 skipped 32
processed 500 skipped 32
processed 501 skipped 32
processed 502 skipped 32
processed 503 skipped 32
processed 504 skipped 32
processed 505 skipped 32
processed 506 skipped 32
processed 507 skipped 32
processed 508 skipped 32
processed 509 skipped 32
processed 510 skipped 32
processed 511 skipped 32
processed 512 skipped 32
processed 513 skipped 32
processed 514 skipped 32
processed 515 skipped 32
processed 516 skipped 32
processed 517 skipped 32
processed 518 skipped 32
processed 519 skipped 32
processed 520 skipped 32
processed 521 skipped 32
processed 522 skipped 32
processed 523 skipped 32
processed 524 skipped 32
processed 525 skipped 32
processed 526 skipped 32
processed 527 skipped 32
processed 528 skipped 32
processed 529 skipped 32
error on downloadeba86c40-98c9-4af1-bf0a-c26ce4db3536.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 90, in crawl
    doc = self.get_document(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 176, in get_document
    doc = self.parser.fromstring(raw_html)
  File "/usr/local/lib/python2.7/dist-packages/goose/parsers.py", line 54, in fromstring
    self.doc = lxml.html.fromstring(html)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 672, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/usr/local/lib/python2.7/dist-packages/lxml/html/__init__.py", line 568, in document_fromstring
    value = etree.fromstring(html, parser, **kw)
  File "lxml.etree.pyx", line 2997, in lxml.etree.fromstring (src/lxml/lxml.etree.c:63276)
  File "parser.pxi", line 1607, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:93592)
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dcd65ce60>)
processed 529 skipped 33
processed 530 skipped 33
processed 531 skipped 33
processed 532 skipped 33
processed 533 skipped 33
processed 534 skipped 33
processed 535 skipped 33
processed 536 skipped 33
processed 537 skipped 33
processed 538 skipped 33
processed 539 skipped 33
processed 540 skipped 33
processed 541 skipped 33
processed 542 skipped 33
processed 543 skipped 33
processed 544 skipped 33
processed 545 skipped 33
processed 546 skipped 33
processed 547 skipped 33
processed 548 skipped 33
processed 549 skipped 33
processed 550 skipped 33
processed 551 skipped 33
processed 552 skipped 33
processed 553 skipped 33
processed 554 skipped 33
processed 555 skipped 33
processed 556 skipped 33
processed 557 skipped 33
processed 558 skipped 33
processed 559 skipped 33
processed 560 skipped 33
processed 561 skipped 33
processed 562 skipped 33
processed 563 skipped 33
processed 564 skipped 33
processed 565 skipped 33
processed 566 skipped 33
processed 567 skipped 33
processed 568 skipped 33
processed 569 skipped 33
processed 570 skipped 33
processed 571 skipped 33
processed 572 skipped 33
processed 573 skipped 33
error on downloadfbdd64ab-0d00-4a1f-99ec-e21855eae2af.html
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Traceback (most recent call last):
(<type 'exceptions.IOError'>, IOError('cannot identify image file',), <traceback object at 0x7f4dac691128>)
processed 573 skipped 34
processed 574 skipped 34
processed 575 skipped 34
processed 576 skipped 34
processed 577 skipped 34
processed 578 skipped 34
processed 579 skipped 34
processed 580 skipped 34
processed 581 skipped 34
processed 582 skipped 34
processed 583 skipped 34
processed 584 skipped 34
processed 585 skipped 34
processed 586 skipped 34
processed 587 skipped 34
Total_time 1:25:34.219244
Time per download 0:00:08.267663
  File "<ipython-input-67-6114dbc3b5a2>", line 16, in <module>
    res = comp_extractors( extractor_training_object )
  File "<ipython-input-22-91249bf1c046>", line 19, in comp_extractors
    extraction_results = get_extraction_results( eto )
  File "<ipython-input-21-ff5d87d959e4>", line 25, in get_extraction_results
    ret['py_goose']  = { 'extracted_html': extract_with_python_goose( raw_content ) }
  File "<ipython-input-9-da7d07f83e6d>", line 6, in extract_with_python_goose
    r = g.extract( raw_html=raw_content )
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/local/lib/python2.7/dist-packages/goose/__init__.py", line 63, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 125, in crawl
    self.get_image()
  File "/usr/local/lib/python2.7/dist-packages/goose/crawler.py", line 147, in get_image
    self.article.top_image = self.image_extractor.get_best_image(doc, top_node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 88, in get_best_image
    image = self.check_large_images(topNode, 0, 0)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 141, in check_large_images
    depth_obj.parent_depth, depth_obj.sibling_depth)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 122, in check_large_images
    good_images = self.get_image_candidates(node)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 283, in get_image_candidates
    good_images = self.get_images_bytesize_match(filtered_images)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 299, in get_images_bytesize_match
    local_image = self.get_local_image(src)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/extractors.py", line 344, in get_local_image
    self.link_hash, src, self.config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 59, in store_image
    image = self.write_localfile(data, link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 101, in write_localfile
    return self.read_localfile(link_hash, src, config)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 81, in read_localfile
    image_details = self.get_image_dimensions(identify, local_image_name)
  File "/usr/local/lib/python2.7/dist-packages/goose/images/utils.py", line 36, in get_image_dimensions
    image = Image.open(path)
  File "/usr/lib/python2.7/dist-packages/PIL/Image.py", line 2028, in open
    raise IOError("cannot identify image file")
IOError: cannot identify image file
Out[67]:
['justext',
 'heur',
 'media_id',
 'gold',
 'py_boiler_pipe_CanolaExtractor',
 'downloads_id',
 'py_boiler_pipe_DefaultExtractor',
 'py_boiler_pipe_KeepEverythingExtractor',
 'story_is_spidered',
 'py_boiler_pipe_ArticleExtractor',
 'crf',
 'python_readibilty',
 'py_boiler_pipe_NumWordsRulesExtractor',
 'py_goose',
 'py_boiler_pipe_ArticleSentencesExtractor',
 'py_boiler_pipe_LargestContentExtractor',
 'boiler_pipe']

In [69]:
# Results cell: build a data frame from the comparison objects held in
# `comps_downloads_boiler_pipe` and print its summary.
# The output recorded with this cell shows count/mean/std/min/percentile/max
# tables for the precision_*, recall_* and f1_* columns, one column per extractor.
# NOTE(review): in the recorded run every py_boiler_pipe_ArticleSentencesExtractor
# column is all zeros -- presumably that extractor returned nothing usable; confirm
# before drawing conclusions from its scores.
# NOTE(review): `df` lives in the notebook namespace; later cells may read it.
df = get_data_frame_from_comparision_objects( comps_downloads_boiler_pipe )
print_results_by_measurement_type( df )


       precision_boiler_pipe  precision_crf  precision_gold  precision_heur  \
count             587.000000     587.000000             587      587.000000   
mean                0.947119       0.731588               1        0.791331   
std                 0.103217       0.260810               0        0.236207   
min                 0.000000       0.000000               1        0.000000   
2%                  0.682820       0.000000               1        0.000000   
5%                  0.827871       0.100000               1        0.194313   
10%                 0.892255       0.325697               1        0.499278   
50%                 0.974522       0.822270               1        0.871486   
max                 1.000000       1.000000               1        1.000000   

       precision_justext  precision_py_boiler_pipe_ArticleExtractor  \
count         587.000000                                 587.000000   
mean            0.832769                                   0.881723   
std             0.320992                                   0.105574   
min             0.000000                                   0.000000   
2%              0.000000                                   0.660794   
5%              0.000000                                   0.755209   
10%             0.020766                                   0.804754   
50%             0.990854                                   0.905172   
max             1.000000                                   0.990328   

       precision_py_boiler_pipe_ArticleSentencesExtractor  \
count                                                587    
mean                                                   0    
std                                                    0    
min                                                    0    
2%                                                     0    
5%                                                     0    
10%                                                    0    
50%                                                    0    
max                                                    0    

       precision_py_boiler_pipe_CanolaExtractor  \
count                                587.000000   
mean                                   0.773246   
std                                    0.187061   
min                                    0.000000   
2%                                     0.174271   
5%                                     0.390601   
10%                                    0.507277   
50%                                    0.839002   
max                                    0.987124   

       precision_py_boiler_pipe_DefaultExtractor  \
count                                 587.000000   
mean                                    0.817369   
std                                     0.167634   
min                                     0.000000   
2%                                      0.178499   
5%                                      0.472519   
10%                                     0.640371   
50%                                     0.871854   
max                                     0.987685   

       precision_py_boiler_pipe_KeepEverythingExtractor  \
count                                        587.000000   
mean                                           0.562141   
std                                            0.201950   
min                                            0.000000   
2%                                             0.146591   
5%                                             0.212001   
10%                                            0.288187   
50%                                            0.579137   
max                                            0.973542   

       precision_py_boiler_pipe_LargestContentExtractor  \
count                                        587.000000   
mean                                           0.874354   
std                                            0.145622   
min                                            0.000000   
2%                                             0.056011   
5%                                             0.754319   
10%                                            0.804667   
50%                                            0.907063   
max                                            0.991265   

       precision_py_boiler_pipe_NumWordsRulesExtractor  precision_py_goose  \
count                                       587.000000          587.000000   
mean                                          0.821370            0.918389   
std                                           0.161591            0.185575   
min                                           0.000000            0.000000   
2%                                            0.264646            0.218827   
5%                                            0.471592            0.489141   
10%                                           0.632772            0.698180   
50%                                           0.871508            0.990566   
max                                           0.987117            1.000000   

       precision_python_readibilty  
count                   587.000000  
mean                      0.918135  
std                       0.166788  
min                       0.000000  
2%                        0.174974  
5%                        0.671403  
10%                       0.827754  
50%                       0.967532  
max                       1.000000  
       recall_boiler_pipe  recall_crf  recall_gold  recall_heur  \
count          587.000000  587.000000          587   587.000000   
mean             0.951461    0.864146            1     0.876037   
std              0.110789    0.264172            0     0.202423   
min              0.000000    0.000000            1     0.000000   
2%               0.583534    0.000000            1     0.000000   
5%               0.796251    0.012519            1     0.529092   
10%              0.899326    0.533212            1     0.796895   
50%              0.983333    0.956835            1     0.928230   
max              1.000000    1.000000            1     1.000000   

       recall_justext  recall_py_boiler_pipe_ArticleExtractor  \
count      587.000000                              587.000000   
mean         0.797343                                0.914903   
std          0.306030                                0.112087   
min          0.000000                                0.000000   
2%           0.000000                                0.574515   
5%           0.000000                                0.752301   
10%          0.006784                                0.851506   
50%          0.932203                                0.932927   
max          1.000000                                1.000000   

       recall_py_boiler_pipe_ArticleSentencesExtractor  \
count                                              587   
mean                                                 0   
std                                                  0   
min                                                  0   
2%                                                   0   
5%                                                   0   
10%                                                  0   
50%                                                  0   
max                                                  0   

       recall_py_boiler_pipe_CanolaExtractor  \
count                             587.000000   
mean                                0.882045   
std                                 0.106958   
min                                 0.000000   
2%                                  0.611500   
5%                                  0.737168   
10%                                 0.792529   
50%                                 0.903988   
max                                 0.999087   

       recall_py_boiler_pipe_DefaultExtractor  \
count                              587.000000   
mean                                 0.885074   
std                                  0.127238   
min                                  0.000000   
2%                                   0.532425   
5%                                   0.699313   
10%                                  0.781537   
50%                                  0.911950   
max                                  1.000000   

       recall_py_boiler_pipe_KeepEverythingExtractor  \
count                                     587.000000   
mean                                        0.934558   
std                                         0.089161   
min                                         0.000000   
2%                                          0.831792   
5%                                          0.869596   
10%                                         0.896111   
50%                                         0.940937   
max                                         1.000000   

       recall_py_boiler_pipe_LargestContentExtractor  \
count                                     587.000000   
mean                                        0.858449   
std                                         0.178183   
min                                         0.000000   
2%                                          0.067892   
5%                                          0.494472   
10%                                         0.657884   
50%                                         0.916667   
max                                         1.000000   

       recall_py_boiler_pipe_NumWordsRulesExtractor  recall_py_goose  \
count                                    587.000000       587.000000   
mean                                       0.907176         0.881010   
std                                        0.101053         0.197481   
min                                        0.000000         0.000000   
2%                                         0.645752         0.031562   
5%                                         0.792025         0.512564   
10%                                        0.847497         0.769361   
50%                                        0.920863         0.938462   
max                                        1.000000         1.000000   

       recall_python_readibilty  
count                587.000000  
mean                   0.901399  
std                    0.194292  
min                    0.000000  
2%                     0.030122  
5%                     0.658983  
10%                    0.860628  
50%                    0.947047  
max                    1.000000  
       f1_boiler_pipe      f1_crf  f1_gold     f1_heur  f1_justext  \
count      587.000000  587.000000      587  587.000000  587.000000   
mean         0.945981    0.773358        1    0.818156    0.801968   
std          0.103467    0.259278        0    0.216310    0.307921   
min          0.000000    0.000000        1    0.000000    0.000000   
2%           0.696706    0.000000        1    0.000000    0.000000   
5%           0.808139    0.018326        1    0.294773    0.000000   
10%          0.881987    0.380801        1    0.617992    0.010730   
50%          0.974648    0.875940        1    0.890909    0.938389   
max          1.000000    1.000000        1    0.999224    1.000000   

       f1_py_boiler_pipe_ArticleExtractor  \
count                          587.000000   
mean                             0.895349   
std                              0.103379   
min                              0.000000   
2%                               0.684377   
5%                               0.777131   
10%                              0.824159   
50%                              0.915521   
max                              0.992034   

       f1_py_boiler_pipe_ArticleSentencesExtractor  \
count                                          587   
mean                                             0   
std                                              0   
min                                              0   
2%                                               0   
5%                                               0   
10%                                              0   
50%                                              0   
max                                              0   

       f1_py_boiler_pipe_CanolaExtractor  f1_py_boiler_pipe_DefaultExtractor  \
count                         587.000000                          587.000000   
mean                            0.811977                            0.840813   
std                             0.154431                            0.148167   
min                             0.000000                            0.000000   
2%                              0.291535                            0.297843   
5%                              0.519562                            0.599921   
10%                             0.626885                            0.696688   
50%                             0.863076                            0.886466   
max                             0.987124                            0.985744   

       f1_py_boiler_pipe_KeepEverythingExtractor  \
count                                 587.000000   
mean                                    0.681243   
std                                     0.178612   
min                                     0.000000   
2%                                      0.254176   
5%                                      0.341140   
10%                                     0.437733   
50%                                     0.715528   
max                                     0.982999   

       f1_py_boiler_pipe_LargestContentExtractor  \
count                                 587.000000   
mean                                    0.861334   
std                                     0.156952   
min                                     0.000000   
2%                                      0.056688   
5%                                      0.622468   
10%                                     0.740102   
50%                                     0.904841   
max                                     0.992034   

       f1_py_boiler_pipe_NumWordsRulesExtractor  f1_py_goose  \
count                                587.000000   587.000000   
mean                                   0.853158     0.884124   
std                                    0.136401     0.204271   
min                                    0.000000     0.000000   
2%                                     0.405971     0.048493   
5%                                     0.616763     0.506277   
10%                                    0.716640     0.681772   
50%                                    0.894185     0.953125   
max                                    0.986814     1.000000   

       f1_python_readibilty  
count            587.000000  
mean               0.894271  
std                0.200252  
min                0.000000  
2%                 0.038057  
5%                 0.407503  
10%                0.817409  
50%                0.949510  
max                1.000000