extractor_test


Extractor evaluation

This notebook evaluates both Media Cloud's internal extractors and third-party FLOSS extractor libraries against a corpus of hand-annotated articles.

Readers may wish to skip to the results section at the end.

Set up / Methods


In [1]:
import cPickle
import os.path

# Load the Media Cloud API key from a pickle in the user's home directory.
# The file is opened in binary mode (it is written with 'wb') and closed
# deterministically by the with-block instead of being left open.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) as api_key_file:
    api_key = cPickle.load( api_key_file )

In [2]:
import cPickle
import os.path

# Write the API key back to the pickle in the user's home directory.
# The with-block guarantees the file is flushed and closed.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) as api_key_file:
    cPickle.dump( api_key, api_key_file )

In [3]:
import sys
# Add '../../foreign_modules/python/' to the module search path.
# NOTE(review): the path is relative, so this presumably assumes the kernel's
# working directory is the notebook's own directory — confirm.
sys.path.append('../../foreign_modules/python/')

In [4]:
# Key sent as the 'key' field to the extractor web service at http://0:3000
# (used by extract_story, get_story_lines and remove_duplicate_sentences).
# NOTE(review): this is a credential hard-coded in the notebook; consider
# rotating it and loading it from the environment or a local file instead.
loc_key = 'f66a50230d54afaf18822808aed649f1d6ca72b08fb06d5efb6247afe9fbae52'

In [5]:
import mediacloud, requests, csv, sys, os, json, cPickle

def get_download( downloads_id ):
    """Fetch a single download record from the Media Cloud API.

    Args:
        downloads_id: id of the download; converted with str() to build the URL.

    Returns:
        The first element of the JSON list returned by the
        downloads/single endpoint.

    Raises:
        requests.exceptions.HTTPError: if the API answers with an error status.
    """
    response = requests.get( 'https://api.mediacloud.org/api/v2/downloads/single/' + str(downloads_id),
                             params={ 'key': api_key } )

    # Fail loudly on HTTP errors, as the other request helpers in this notebook
    # do, instead of failing later while indexing an error payload.
    response.raise_for_status()
    return response.json()[0]

def extract_story( preprocessed_lines, title, description, extractor_method ):
    """Run the local extractor web service on already preprocessed lines.

    The lines, story title, story description and extractor method name are
    sent as a JSON document in a PUT request to the extractlines/extract
    endpoint on http://0:3000.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: on an error status from the service.
    """
    payload = json.dumps( {
        'key': loc_key,
        'preprocessed_lines': preprocessed_lines,
        'story_title': title,
        'story_description': description,
        'extractor_method': extractor_method,
    } )

    response = requests.put( 'http://0:3000/api/v2/extractlines/extract',
                             data=payload,
                             headers={'Content-type': 'application/json'} )
    response.raise_for_status()

    return response.json()

def get_story_lines( raw_content ):
    """Ask the local extractor web service to split raw HTML into story lines.

    The HTML is sent as 'body_html' in a JSON PUT request; the key is passed
    both in the JSON body and as a query parameter.

    Returns:
        The requests response object itself (callers call .json() on it).

    Raises:
        requests.exceptions.HTTPError: on an error status from the service.
    """
    body = json.dumps( {'key':loc_key, 'body_html':raw_content } )

    response = requests.put( 'http://0:3000/api/v2/extractlines/story_lines',
                             data=body,
                             params={ 'key': loc_key },
                             headers={'Content-type': 'application/json'} )
    response.raise_for_status()

    return response

In [6]:
import subprocess
import tempfile
import codecs
import time
from lxml import html

#download = get_download( downloads_id )
#raw_content = download[u'raw_content']

def extract_with_boilerpipe( raw_content,
                             jar_path='/home/dlarochelle/dev_scratch/boilerpipe_test/out/artifacts/boilerpipe_test_jar/boilerpipe_test.jar' ):
    """Extract article content by running the boilerpipe test jar with java.

    raw_content is written UTF-8 encoded to a temporary .html file, the jar is
    invoked as `java -jar <jar_path> <input> <output>`, and every
    <span class="x-boilerpipe-mark1"> element of the output file is collected.

    Args:
        raw_content: HTML of the page to extract.
        jar_path: location of the boilerpipe test jar; defaults to the path
            this notebook was originally run with.

    Returns:
        dict with one key, 'extracted_html': the serialized marked spans
        joined by blank lines.

    Raises:
        subprocess.CalledProcessError: if the java process exits non-zero.
    """
    # Imported locally so the function does not depend on a later cell having
    # already run `from lxml import etree`.
    from lxml import etree

    input_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )
    output_tmp = tempfile.NamedTemporaryFile( suffix='.html', delete=False )

    try:
        try:
            codecs.getwriter('utf8')( input_tmp ).write( raw_content )
        finally:
            input_tmp.close()

        # Only the output file's name is needed; the jar writes its content.
        output_tmp.close()

        subprocess.check_output( ['java', '-jar', jar_path, input_tmp.name, output_tmp.name ] )

        with open( output_tmp.name, 'rb' ) as annotated_file:
            annotated_file_str = annotated_file.read()
    finally:
        # Both files are created with delete=False, so remove them explicitly;
        # otherwise every call leaves two files behind in the temp directory.
        for tmp_name in ( input_tmp.name, output_tmp.name ):
            if os.path.exists( tmp_name ):
                os.remove( tmp_name )

    tree = html.fromstring( annotated_file_str )
    spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
    boiler_pipe_lines = [ etree.tostring(s) for s in spans ]

    return { 'extracted_html': "\n\n".join(boiler_pipe_lines) }

In [7]:
#f = open( '/tmp/tmp01CV6F.html' )
#annotated_file_str = f.read()
#tree = html.fromstring( annotated_file_str )   
#spans = tree.xpath('//span[@class="x-boilerpipe-mark1"]')
#span = spans[0]
#etree.tostring( span )

In [8]:
import readability

def extract_with_python_readability( raw_content ):
    """Extract an article with python-readability.

    Returns:
        The document's short title and its summary, separated by a blank line.
    """
    document = readability.Document( raw_content )
    title = document.short_title()
    body = document.summary()

    return title + "\n\n" + body

In [9]:
import goose

def extract_with_python_goose( raw_content ):
    """Extract an article with python-goose.

    Returns:
        The extracted title and cleaned text, separated by a blank line.
    """
    article = goose.Goose().extract( raw_html=raw_content )

    return article.title + "\n\n" +  article.cleaned_text

In [10]:
import justext

def extract_with_justext( raw_content ):
    """Extract an article with jusText using its 'English' stoplist.

    Returns:
        The text of every paragraph not flagged as boilerplate, joined by
        blank lines.
    """
    stoplist = justext.get_stoplist('English')

    kept_texts = [ paragraph.text
                   for paragraph in justext.justext( raw_content, stoplist )
                   if not paragraph.is_boilerplate ]

    return "\n\n".join(kept_texts)

#extract_with_justext( raw_content )
#raw_html

#justext.get_stoplists()

In [11]:
import operator

def get_extractor_training_text( downloads_id, preprocessed_lines ):
    """Return the preprocessed lines that annotators marked for a download.

    The annotated line numbers are fetched from the Media Cloud
    extractor_training_lines endpoint and used to index preprocessed_lines.

    Args:
        downloads_id: id of the download; converted with str() for the URL.
        preprocessed_lines: indexable sequence of the download's lines.

    Returns:
        A tuple of the selected lines in ascending line-number order. The
        tuple has one element for a single annotated line and is empty when
        there are none.

    Raises:
        requests.exceptions.HTTPError: on an error status from the API.
    """
    response = requests.get(
        'https://api.mediacloud.org/api/v2/extractlines/extractor_training_lines/' + str(downloads_id),
        headers={'Content-type': 'application/json'},
        params={'key': api_key} )

    response.raise_for_status()

    line_numbers = sorted( entry['line_number'] for entry in response.json() )

    # Index explicitly instead of operator.itemgetter(*line_numbers):
    # itemgetter returns a bare item (not a tuple) for a single index and
    # raises TypeError when given no indices at all.
    return tuple( preprocessed_lines[ line_number ] for line_number in line_numbers )

import operator

def get_extracted_text( extractor_results ):
    """Return the download lines that an extractor result marks as included.

    Args:
        extractor_results: mapping with 'included_line_numbers' (indices) and
            'download_lines' (the indexable lines).

    Returns:
        [] when no line numbers are included, otherwise a tuple of the
        selected lines in the order of 'included_line_numbers'.
    """
    included_line_numbers = extractor_results['included_line_numbers']
    download_lines = extractor_results['download_lines']

    if len( included_line_numbers ) == 0:
        return []

    # operator.itemgetter(*numbers) returns a bare item rather than a
    # one-element tuple when there is exactly one number, so index explicitly.
    return tuple( download_lines[ line_number ] for line_number in included_line_numbers )

In [12]:
#import Levenshtein

def lines_to_comparable_text( lines ):
    """Clean every line with clean_for_comparison and join them.

    Returns:
        The cleaned lines joined by blank lines (u'' for no lines).
    """
    cleaned_lines = []
    for line in lines:
        cleaned_lines.append( clean_for_comparison(line) )

    joined = u"\n\n".join( cleaned_lines )

    return u'' if joined == '' else joined

def html_to_comparable_text( html_text ):
    """Clean an HTML string with clean_for_comparison.

    Returns:
        The cleaned text, or u'' when cleaning yields '' or None.
    """
    cleaned = clean_for_comparison( html_text )

    return u'' if ( cleaned == '' or cleaned == None ) else cleaned
    
    
#def compare_accuracy( lines, lines_expected ):
#    return Levenshtein.distance( lines_to_comparable_text( lines ) , lines_to_comparable_text( lines_expected ) )

In [13]:
def get_anncestors( element ):
    """Return element followed by all of its ancestors, nearest first.

    Args:
        element: object exposing getparent(), which returns the parent or
            None at the root.

    Returns:
        A list starting with element itself and ending with the root.
    """
    anncestors = [ element ]
    anncestor = element.getparent()

    # Identity test: `!= None` would go through any comparison operators the
    # element type defines, which can end the walk early.
    while anncestor is not None:
        anncestors.append( anncestor )
        anncestor = anncestor.getparent()

    return anncestors

In [14]:
def text_from_lxml_object( obj):
    """Return the text of an lxml XPath result.

    lxml string results are returned as text directly; anything else is
    serialized with etree.tostring(method='text', encoding="UTF-8").

    Raises:
        Whatever etree.tostring raises for an object it cannot serialize; the
        object's type and value are printed first.
    """
    if type(obj) is etree._ElementStringResult:
        return u'' + obj
    if type(obj) ==  etree._ElementUnicodeResult:
        return u'' + obj

    try:
        return etree.tostring( obj , method='text', encoding="UTF-8")
    except Exception:
        print(type(obj))
        print(obj)

        # Re-raise the original error. The previous `raise ''` is itself a
        # TypeError and hid the real exception.
        raise

In [15]:
from lxml import etree

# Fetch one hard-coded download and save its raw content under /tmp, in a
# file named after the download id.
downloads_id =  582817308 
download = get_download( downloads_id )
raw_content = download[ 'raw_content' ]
# NOTE(review): the file is opened in binary mode; if raw_content is a unicode
# object with non-ASCII characters this write presumably fails under
# Python 2 — confirm.
with open( '/tmp/' + str(downloads_id) , 'wb' ) as f:
    f.write( raw_content )

In [16]:
from nltk import tokenize

def remove_duplicate_sentences( article_html, story ):
    """Split HTML into sentences and drop those flagged as duplicates.

    The local extractor web service (sentences_from_html endpoint) does the
    sentence splitting; sentence_is_duplicate decides which sentences to drop.

    Returns:
        The remaining sentences joined by newlines.

    Raises:
        requests.exceptions.HTTPError: on an error status from the service.
    """
    request_body = json.dumps( {'key':loc_key, 'story_html': article_html } )

    response = requests.put( 'http://0:3000/api/v2/extractlines/sentences_from_html',
                             data=request_body,
                             headers={'Content-type': 'application/json'} )
    response.raise_for_status()

    kept_sentences = []
    for sentence in response.json():
        if sentence_is_duplicate( sentence, story ):
            continue
        kept_sentences.append( sentence )

    return u"\n".join( kept_sentences )

In [17]:
def text_children( element):
    """Return the text nodes whose parent is element.

    All text nodes matched by the "//text()" XPath are filtered down to those
    whose getparent() equals element. Assertions check that there are at most
    two, that two nodes come as text then tail, and that each node equals
    element.text or element.tail respectively.
    """
    owned = []
    for node in element.xpath("//text()" ):
        if node.getparent() == element:
            owned.append( node )

    assert len( owned ) <= 2

    if len( owned ) == 2:
        first, second = owned
        assert first.is_text
        assert second.is_tail

    for node in owned:
        if node.is_text:
            assert element.text == node
        else:
            assert node.is_tail
            assert element.tail == node

    return owned

In [18]:
def html_strip( str ):
    """Return the text content of an HTML fragment.

    Empty or whitespace-only input and the lone string '<' yield a single
    space. If parsing fails, a message is printed and u'' is returned.
    """
    if str.isspace() or str == '':
        return u' '

    if str == '<':
        return u' '

    try:
        return html.fromstring(str).text_content()
    except Exception:
        # Best effort: report the failure and return nothing. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        print( "%s %s" % ( "Unexpected error on string '" + str + "'", sys.exc_info()[0] ) )
        return u''

def clean_for_comparison( str ):
    """Strip HTML from a string; zero-length input is returned unchanged."""
    if len(str) == 0:
        return str

    return html_strip( str )

In [19]:
def extract_with_mc_extractor( eto, method ):
    """Run one of Media Cloud's extractors on an extractor training object.

    Args:
        eto: mapping with 'story' (providing u'title' and u'description') and
            'preprocessed_lines'.
        method: extractor method name passed to extract_story, e.g.
            'HeuristicExtractor' or 'CrfExtractor'.

    Returns:
        dict with one key, 'extracted_html', taken from the extractor result.
    """
    story = eto['story']

    # Pass the requested method through. It used to be hard-coded to
    # 'HeuristicExtractor', so extract_with_crf silently ran the heuristic
    # extractor as well.
    extract_result = extract_story( eto['preprocessed_lines'],
                                    story[u'title'],
                                    story[u'description'],
                                    method )

    return { 'extracted_html': extract_result[ 'extracted_html' ] }

def extract_with_heur( eto ):
    """Run extract_with_mc_extractor with the 'HeuristicExtractor' method."""
    method_name = 'HeuristicExtractor'
    return extract_with_mc_extractor( eto, method_name )

def extract_with_crf( eto ):
    """Run extract_with_mc_extractor with the 'CrfExtractor' method."""
    method_name = 'CrfExtractor'
    return extract_with_mc_extractor( eto, method_name )

In [20]:
import difflib
from IPython.display import HTML

from collections import Counter

def ratcliff_obershelp_compare( actual_text, expected_text ):
    """Score extracted text against expected text at the word level.

    Both texts are split on whitespace and diffed with difflib.Differ. Words
    common to both count as true positives, words only in actual_text as
    false positives, and words only in expected_text as false negatives.

    Args:
        actual_text: text produced by an extractor.
        expected_text: gold-standard text.

    Returns:
        dict with float values for 'precision', 'recall' and 'f1'. A measure
        whose denominator is zero is reported as 0.0.
    """
    words_expected = expected_text.split()
    words_actual   = actual_text.split()

    # A single diff pass; the previous version also ran a second compare whose
    # result was thrown away. Differ lines start with ' ' (common), '-' (only
    # in expected) or '+' (only in actual); '?' hint lines are not counted.
    diff_lines = difflib.Differ().compare( words_expected, words_actual )
    counts = Counter( line[0] for line in diff_lines )

    tp = counts[' ']
    fp = counts['+']
    fn = counts['-']

    precision = tp / float( tp + fp ) if ( tp + fp ) > 0 else 0.0
    recall    = tp / float( tp + fn ) if ( tp + fn ) > 0 else 0.0

    if ( precision + recall ) > 0:
        f1 = 2 * ( precision * recall ) / ( precision + recall )
    else:
        f1 = 0.0

    return { 'precision': precision,
             'recall': recall,
             'f1': f1 }

def compare_with_expected( extractor_name, actual_text, actual_html, expected_text, story ):
    """Score one extractor's output with and without duplicate sentences.

    Returns:
        dict with two entries: extractor_name maps to the scores of
        actual_text, and extractor_name + "_dedup" maps to the scores of
        actual_html after remove_duplicate_sentences.
    """
    plain_scores = ratcliff_obershelp_compare( actual_text, expected_text )

    deduplicated_text = remove_duplicate_sentences( actual_html, story )
    dedup_scores = ratcliff_obershelp_compare( deduplicated_text, expected_text )

    return { extractor_name: plain_scores,
             extractor_name + "_dedup": dedup_scores }

In [21]:
# Sanity check: one of the four expected words is extracted, nothing extra.
ratcliff_obershelp_compare( actual_text='foo', expected_text='bar foo baz BAST')


Out[21]:
{'f1': 0.4, 'precision': 1.0, 'recall': 0.25}

In [22]:
def get_extraction_results( eto ):
    """Run every extractor on one extractor training object.

    Returns:
        dict keyed by extractor name. Each value is a dict holding
        'extracted_html' and 'extracted_text'; the text is derived from the
        HTML with html_to_comparable_text when the extractor did not set it.
    """
    raw_content = eto[ 'raw_content' ]

    results = {
        'heur': extract_with_heur( eto ),
        'crf': extract_with_crf( eto ),
        'boiler_pipe': extract_with_boilerpipe( raw_content),
        'python_readibilty': { 'extracted_html': extract_with_python_readability( raw_content ) },
        'py_goose': { 'extracted_html': extract_with_python_goose( raw_content ) },
        'justext': { 'extracted_html': extract_with_justext( raw_content ) },
    }

    for result in results.values():
        if 'extracted_text' in result:
            continue
        result['extracted_text'] = html_to_comparable_text( result['extracted_html' ] )

    return results

In [23]:
def compare_extractors_for_download( downloads_id ):
    """Build the extractor training object for a download and compare all extractors on it."""
    return comp_extractors( create_extractor_training_object( downloads_id ) )
    
def comp_extractors( eto ):
    """Run all extractors on a training object and score each against the gold text.

    Args:
        eto: extractor training object; 'downloads_id', 'media_id', 'story',
            'story_is_spidered' and 'expected_text' are read here, and the
            whole object is passed to get_extraction_results.

    Returns:
        dict with 'downloads_id', 'media_id' and 'story_is_spidered', plus the
        entries produced by compare_with_expected for every extractor and for
        'gold' (the expected text compared with itself).
    """
    story = eto['story']
    expected_text = eto['expected_text']

    comp_results = {
        'downloads_id': eto['downloads_id'],
        'media_id': eto['media_id' ],
        'story_is_spidered': eto['story_is_spidered'],
    }

    # Locals that were read but never used (raw_content, preprocessed_lines,
    # title, description, url) are gone, so those keys are no longer required.
    extraction_results = get_extraction_results( eto )

    for name, value in extraction_results.items():
        comp_results.update( compare_with_expected( name, value['extracted_text'], value['extracted_html'],
                                                    expected_text, story ) )

    comp_results.update( compare_with_expected( 'gold', expected_text, expected_text, expected_text, story ) )

    return comp_results

In [24]:
def create_extractor_training_object( downloads_id, expected_text=None ):
    """Assemble everything needed to evaluate the extractors on one download.

    Fetches the download and its story from the Media Cloud API and splits
    the raw content into lines through get_story_lines.

    Args:
        downloads_id: id of the download to load.
        expected_text: gold-standard text. When it is empty or None, it is
            built from the annotated training lines of the download.

    Returns:
        dict with 'downloads_id', 'raw_content', 'media_id', 'story',
        'story_is_spidered', 'preprocessed_lines' and 'expected_text'.

    Raises:
        requests.exceptions.HTTPError: if the story request returns an error status.
    """
    download = get_download( downloads_id )

    raw_content = download[u'raw_content']
    stories_id = download[u'stories_id']

    story_response = requests.get( 'https://api.mediacloud.org/api/v2/stories/single/' + str(stories_id),
                                   params={ 'key': api_key } )
    # Surface HTTP errors here rather than failing on an error payload below.
    story_response.raise_for_status()
    story = story_response.json()[0]

    preprocessed_lines = get_story_lines( raw_content ).json()

    if not expected_text:
        expected_lines = get_extractor_training_text( downloads_id, preprocessed_lines  )
        expected_text  = lines_to_comparable_text( expected_lines )

    return { 'downloads_id': downloads_id,
             'raw_content': raw_content,
             'media_id': story['media_id'],
             'story': story,
             'story_is_spidered': story_is_spidered( story ),
             'preprocessed_lines': preprocessed_lines,
             'expected_text': expected_text
             }

In [25]:
import sys

# Add the parent directory to the module search path.
# NOTE(review): presumably needed so that `import mc_config` below resolves — confirm.
sys.path.append('../')

import mc_config

def get_db_info():
    """Return the first 'database' entry of the config whose port is '6000'.

    Raises:
        StopIteration: if no entry has that port.
    """
    databases = mc_config.read_config()['database']

    for candidate in databases:
        if candidate['port'] == '6000':
            return candidate

    # Same exception that next() raises on an exhausted generator.
    raise StopIteration

import psycopg2
#import solr_reimport
import psycopg2.extras

#db_info = get_db_info()

#conn = psycopg2.connect( database=db_info['db'], user=db_info['user'], 
#                        password=db_info['pass'], host=db_info['host'], port=db_info['port'] )

# Database connection shared by get_sentence_counts; opened on first use.
conn = None

# Cache of sentence-count lookups: stories_id -> { sentence -> row dict or None }.
story_sentence_counts_cache = {}

def get_sentence_counts( sentence, story ):
    """Look up the story_sentence_counts row for a sentence of a story.

    Results are cached in story_sentence_counts_cache per stories_id and
    sentence, so each pair is queried at most once. The database connection
    is opened on first use from get_db_info() and kept in the global `conn`.

    Args:
        sentence: sentence text; matched in SQL through md5(sentence).
        story: mapping providing 'stories_id', 'media_id' and 'publish_date'.

    Returns:
        A dict of the first fetched row, or None when the query returns no rows.
    """

    stories_id = story['stories_id']
    
    if not stories_id in story_sentence_counts_cache:
        story_sentence_counts_cache[ stories_id ] = {}
        
    # Cache hit (a cached None means "looked up before, not found").
    if sentence in story_sentence_counts_cache[ stories_id ]:
        return story_sentence_counts_cache[stories_id ][sentence]

    global conn 
    
    # Connect lazily so that merely defining this function needs no database.
    if conn == None:        
        db_info = get_db_info()
        
        conn = psycopg2.connect( database=db_info['db'], user=db_info['user'], 
                                password=db_info['pass'], host=db_info['host'], port=db_info['port'] )

        
    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    # The sentence is matched by md5 within the story's media source and the
    # week of its publish date.
    query = '''               
                   SELECT MIN( story_sentence_counts_id) AS story_sentence_counts_id, sentence_count, first_stories_id,
                   sentence_md5
            FROM story_sentence_counts
            WHERE sentence_md5  = md5(%(sentence)s)
              AND media_id = %(media_id)s
              AND publish_week =  DATE_TRUNC( 'week', %(publish_date)s::date )
            GROUP BY story_sentence_counts_id
    '''
    
    #print sentence
    #md5_sum = md5.new( sentence ).hexdigest()
    
    params = { 'sentence': sentence,
                            'media_id': story['media_id'], 
                            'publish_date': story['publish_date']
                            } 
    
    #print params
    
    #print eto[ 'story'] ['stories_id' ]
    cursor.execute( query, params )
    
    fetched = cursor.fetchall()
    
    # Only the first row is kept; any further rows are ignored.
    if len( fetched ) == 0:
        story_sentence_counts_cache[ stories_id ][sentence] = None
    else:
        story_sentence_counts_cache[ stories_id ][sentence] = dict(fetched[0])
        
    return story_sentence_counts_cache[stories_id ][sentence]
    
def sentence_is_duplicate( sentence, story ):
    """Decide whether a sentence counts as a duplicate for a story.

    Args:
        sentence: sentence text to look up with get_sentence_counts.
        story: mapping providing at least 'stories_id'.

    Returns:
        True when the stored sentence_count is greater than 1, or when the
        stored first_stories_id equals the story's id; False otherwise,
        including when no counts are stored for the sentence.
    """
    sentence_counts = get_sentence_counts( sentence, story )

    if sentence_counts is None:
        # sentence not found
        return False

    if sentence_counts['sentence_count'] > 1:
        return True

    # NOTE(review): the original trailing comment on this branch reads
    # "duplicate sentence (diff first_stories_id)", which suggests `!=` may
    # have been intended here — confirm before relying on the dedup scores.
    if sentence_counts['first_stories_id'] == story['stories_id']:
        return True

    # Always return a bool; this path used to fall off the end and return None.
    return False

In [26]:
def get_extractor_training_objects_legacy ( downloads_ids ):
    print downloads_ids
    extractor_training_objects = []
    for downloads_id in downloads_ids[:]:
        print 'downloads_id:', downloads_id
        extractor_training_objects.append( create_extractor_training_object( downloads_id ) )
        
    return extractor_training_objects

In [27]:
import sqlite3

def get_extractor_training_objects_sqlite( db_file ):
    """Build extractor training objects from a sqlite annotation database.

    Every row of dlannotations that has selected_texts_json is read in
    downloads_id order. The selected texts, joined by newlines, become the
    expected text of the training object created for that download.

    Args:
        db_file: path of the sqlite database file.

    Returns:
        A list of extractor training objects, one per row.

    Raises:
        AssertionError: if a row's selected texts decode to None or are empty.
    """
    db = sqlite3.connect( db_file )
    try:
        db.row_factory = sqlite3.Row
        cursor = db.cursor()
        cursor.execute( "SELECT * from dlannotations  where selected_texts_json is not null order by downloads_id" )
        sqlite_rows = cursor.fetchall()
    finally:
        # All rows are in memory now, so the connection can be released.
        db.close()

    extractor_training_objects = []

    # Reported below; nothing in this function skips a download.
    skipped_downloads = 0

    for sqlite_row in sqlite_rows:
        row = dict([ (k, sqlite_row[k]) for k in sqlite_row.keys() ])

        row['annotations'] = json.loads( row['annotations_json'] )
        row['raw_content'] = u'' + row['raw_content']
        row['selected_texts'] = json.loads( row['selected_texts_json'] )

        assert row['selected_texts'] is not None
        # Was `row['selected_texts'] > 0`, which compares a list with an int
        # and is always true under Python 2.
        assert len( row['selected_texts'] ) > 0

        # create_extractor_training_object fetches the download itself, so the
        # extra get_download call that used to precede it has been dropped.
        eto = create_extractor_training_object( row['downloads_id'], expected_text=u"\n".join(row['selected_texts']) )

        if eto['raw_content'] != row['raw_content']:
            #TODO figure out why these may differ
            pass

        extractor_training_objects.append( eto )

    print( "skipped %s" % skipped_downloads )
    print( "processed %s" % len(extractor_training_objects) )

    return extractor_training_objects

In [28]:
import pandas as pd

def get_data_frame_from_comparision_objects( comparison_objects ):
    """Flatten per-download comparison results into a DataFrame.

    Args:
        comparison_objects: iterable of dicts as produced by comp_extractors.
            'downloads_id', 'media_id' and 'story_is_spidered' are metadata;
            every other key maps an extractor name to a dict of measures.

    Returns:
        DataFrame indexed by downloads_id with one column per
        '<measure>_<extractor>' pair. An empty input gives an empty frame
        with the same index name instead of raising.
    """
    non_extractor_keys = { 'downloads_id', 'media_id', 'story_is_spidered' }

    records = []
    for comp in comparison_objects:
        record = { 'downloads_id': comp['downloads_id'] }

        for extractor_type, scores in comp.items():
            if extractor_type in non_extractor_keys:
                continue
            for measure, value in scores.items():
                record[ measure + '_' + extractor_type ] = value

        records.append( record )

    if not records:
        # set_index would raise KeyError on a frame without columns.
        return pd.DataFrame( index=pd.Index( [], name='downloads_id' ) )

    return pd.DataFrame( records ).set_index( 'downloads_id' )

In [29]:
import boilerpipe.extract

def extract_with_py_boilerpipe( raw_content ):
    """Extract an article with the python boilerpipe binding's 'ArticleExtractor'.

    Returns:
        dict with one key, 'extracted_html': the HTML reported by the extractor.
    """
    extractor = boilerpipe.extract.Extractor( extractor='ArticleExtractor', html=raw_content )

    return { 'extracted_html': extractor.getHTML() }

In [30]:
def print_results_by_measurement_type( df ):
    """Print summary statistics for the precision, recall and f1 columns.

    For each measure, the columns whose names start with that measure are
    selected and their describe() output (2nd, 5th, 10th and 50th
    percentiles) is printed.

    Args:
        df: DataFrame whose column names start with 'precision', 'recall' or 'f1'.
    """
    # The earlier whole-frame describe() call was dropped: its result was
    # discarded. .loc replaces the deprecated .ix indexer for the same
    # label-based column selection.
    for result_type in ( 'precision', 'recall', 'f1' ):
        res_columns = [ col for col in df.columns if col.startswith( result_type ) ]
        print( df.loc[:, res_columns].describe( percentiles=[0.02, 0.05, .1, 0.5] ) )

In [31]:
def filter_by_media_tags_id( comps_downloads, media_tags_ids ):
    """Keep the comparison results whose media source has one of the given tags.

    A media source from the global media_id_media_map matches when its
    'media_source_tags_ids' set shares at least one id with media_tags_ids.

    Returns:
        A list of the entries of comps_downloads whose 'media_id' matches.
    """
    matching_media_ids = set( media_id
                              for media_id, media in media_id_media_map.items()
                              if not media[ 'media_source_tags_ids'].isdisjoint( media_tags_ids ) )

    return [ comp for comp in comps_downloads if comp['media_id'] in matching_media_ids ]

In [32]:
def remove_spidered_downloads( comps_downloads ):
    """Return the comparison results whose 'story_is_spidered' is falsy."""
    kept = []
    for comp in comps_downloads:
        if comp['story_is_spidered']:
            continue
        kept.append( comp )
    return kept

def only_spidered_downloads( comps_downloads ):
    """Return the comparison results whose 'story_is_spidered' is truthy."""
    kept = []
    for comp in comps_downloads:
        if comp['story_is_spidered']:
            kept.append( comp )
    return kept

In [33]:
def story_is_spidered( story ):
    """Return True when the story has a 'spidered' tag in the 'spidered' tag set."""
    return any( taginfo['tag'] == 'spidered' and taginfo['tag_set'] == 'spidered'
                for taginfo in story['story_tags'] )

Flags


In [34]:
# When a flag is True, the corresponding cell below rebuilds its data and
# overwrites the pickle; each pickle is then loaded unconditionally by the
# cell that follows.
regenerate_extractor_training_objects = False
regenerate_media_id_media_map         = False
regenerate_comps_downloads            = False

Constants


In [35]:
# Download ids passed to get_extractor_training_objects_legacy.
brazil_downloads_ids = [391881020,401370599,412896439,412952145,412977048,413024519,413657081,413835576,414040102, 
                        414257623,414377428,414480464,414818749,414983458,415185946,415186582,415197547,415424551,
                        415978069,416026460,416026587,416047494,416047513,416210404,416263840,416306952,416426245,
                        416655019,416730837,416802690,417347290,417347524,417368539,417389613,417477837,417653177,
                        418489742,418544762,418574641,418648698,418661859,419404469,419440474,419483895,419873979,
                        420430754,420599387,420666122,421520860,421834553,422181106,422280595,422910963,423318170,
                        424080271,424369085,424796346,424840366,425206279,426405203,426560018,426632784,426709900,
                        428449440,429607289,430363249,430995428,433457459,435624796,435659593,461175103,461175549,
                        461176415,461176844,461177487,461178557,461178590,461179203,461179222,461179441,461179762,
                        461179818,461179954,461179956,461180307,461181039,461181597,461186137,461186258,461186833,
                        461187188,461187261,461187577,461188549,461189069,461190586,461193383]

# sqlite annotation database passed to get_extractor_training_objects_sqlite.
sqlite_db_file = 'extractor_train_dbs/dev_2014-12-09T12_27_40-0500.db'

Data Analysis

Load Data


In [36]:
# Rebuild the extractor training objects from both sources and cache them.
extractor_training_objects = []
if regenerate_extractor_training_objects:
    eto_sqlite = get_extractor_training_objects_sqlite( sqlite_db_file )
    eto_brazil = get_extractor_training_objects_legacy( brazil_downloads_ids )
    extractor_training_objects.extend( eto_brazil  )
    extractor_training_objects.extend( eto_sqlite )
    # The with-block flushes and closes the pickle before it is read back.
    with open("extractor_traning_objects.pickle", "wb") as eto_pickle_file:
        cPickle.dump( extractor_training_objects, eto_pickle_file )

In [37]:
# Load the cached training objects. Only unpickle files you created yourself:
# unpickling can execute arbitrary code.
with open( "extractor_traning_objects.pickle", "rb") as eto_pickle_file:
    extractor_training_objects = cPickle.load( eto_pickle_file )

Look up Media Tags


In [38]:
import itertools
from collections import Counter

mc = mediacloud.api.MediaCloud(api_key)

if regenerate_media_id_media_map:
    media_id_media_map = {}
    
    media_ids = sorted(list(set([ eto['media_id'] for eto in extractor_training_objects ])))
    
    # NOTE(review): only the first 10 media ids are looked up, so the map does
    # not cover every media source of the training objects — confirm that this
    # limit is intended.
    for media_id in list(media_ids)[:10]:
        media = mc.media( media_id )
        # Set of tag ids, used by filter_by_media_tags_id for disjointness tests.
        media[ 'media_source_tags_ids' ] = set( [ media_source_tag['tags_id'] 
                                                 for media_source_tag in media['media_source_tags'] ] )
        media_id_media_map[ media_id ] = media
    
    # The with-block flushes and closes the pickle before it is read back.
    with open("media_id_media_map.pickle", "wb") as media_map_file:
        cPickle.dump( media_id_media_map, media_map_file )

In [39]:
# Load the cached media map, closing the file once it has been read.
with open( "media_id_media_map.pickle", "rb") as media_map_file:
    media_id_media_map = cPickle.load( media_map_file )

# Number of media sources carrying each tag id.
media_tag_counts = Counter(list ( itertools.chain.from_iterable( media_source['media_source_tags_ids'] for media_source in media_id_media_map.values() )) ) 

# tags_id -> tag record, collected from every media source's tags.
tags_id_to_media_tags_map = {}
for media_tag in media_id_media_map.values():
    source_tags = media_tag[ 'media_source_tags' ]
    for source_tag in source_tags:
        tags_id_to_media_tags_map[ source_tag[ 'tags_id' ] ] = source_tag

In [40]:
# The 15 most common media tags, each paired with the number of media sources carrying it.
[ (tags_id_to_media_tags_map[tag_id], count) for tag_id, count in media_tag_counts.most_common( 15 ) ]


Out[40]:
[({u'description': u'A site that is a mainstream media outlet, such as The New York Times and The Washington Post; an online-only news outlet, such as Slate, Salon, or the Huffington Post; or a citizen journalism or non-profit news outlet, such as Global Voices or ProPublica',
   u'label': u'General News',
   u'show_on_media': 1,
   u'show_on_stories': None,
   u'tag': u'General Online News Media',
   u'tag_set': u'media_type',
   u'tag_sets_id': 1099,
   u'tags_id': 8878416},
  9),
 ({u'description': u"Top U.S. mainstream media according Google Ad Planner's measure of unique monthly users.",
   u'label': u'U.S. Mainstream Media',
   u'show_on_media': 1,
   u'show_on_stories': None,
   u'tag': u'ap_english_us_top25_20100110',
   u'tag_set': u'collection',
   u'tag_sets_id': 5,
   u'tags_id': 8875027},
  8),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'include',
   u'tag_set': u'word_cloud',
   u'tag_sets_id': 17,
   u'tags_id': 6071565},
  5),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'default',
   u'tag_set': u'word_cloud',
   u'tag_sets_id': 17,
   u'tags_id': 6729599},
  5),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'pmcheck',
   u'tag_set': u'workflow',
   u'tag_sets_id': 4,
   u'tags_id': 6},
  4),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'oped_project',
   u'tag_set': u'collection',
   u'tag_sets_id': 5,
   u'tags_id': 8878390},
  3),
 ({u'description': u'List of sites related to New York State politics.  Created by manually spidering related sites in 2012-10.',
   u'label': u'New York State Politics (manual)',
   u'show_on_media': 1,
   u'show_on_stories': None,
   u'tag': u'newyork_jessie_manualspidering_20121023',
   u'tag_set': u'collection',
   u'tag_sets_id': 5,
   u'tags_id': 8875676},
  2),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'largemetroarea',
   u'tag_set': u'pkgeog-type',
   u'tag_sets_id': 16,
   u'tags_id': 2496423},
  2),
 ({u'description': u'Large collection of local California newspapers, collected from http://www.usnpl.com/canews.php in 2012-09.',
   u'label': u'2012 California Newspapers',
   u'show_on_media': 1,
   u'show_on_stories': None,
   u'tag': u'california_newspapers_20120914',
   u'tag_set': u'collection',
   u'tag_sets_id': 5,
   u'tags_id': 8875460},
  2),
 ({u'description': u'Large list of regional TC and newspapers sites, collected by Pew in 2010.',
   u'label': u'U.S. Regional Mainstream Media',
   u'show_on_media': 1,
   u'show_on_stories': None,
   u'tag': u'pew_knight_study',
   u'tag_set': u'collection',
   u'tag_sets_id': 5,
   u'tags_id': 2453107},
  2),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'1',
   u'tag_set': u'usnewspapercirculation',
   u'tag_sets_id': 3,
   u'tags_id': 5},
  1),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'hrcheck',
   u'tag_set': u'workflow',
   u'tag_sets_id': 4,
   u'tags_id': 7},
  1),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'27',
   u'tag_set': u'usnewspapercirculation',
   u'tag_sets_id': 3,
   u'tags_id': 11},
  1),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'4',
   u'tag_set': u'usnewspapercirculation',
   u'tag_sets_id': 3,
   u'tags_id': 14},
  1),
 ({u'description': None,
   u'label': None,
   u'show_on_media': 0,
   u'show_on_stories': None,
   u'tag': u'5',
   u'tag_set': u'usnewspapercirculation',
   u'tag_sets_id': 3,
   u'tags_id': 16},
  1)]

Run extractors


In [41]:
#eto = extractor_training_objects[ 0 ]
#eto.keys()
#print eto['expected_text']
#get_extraction_results( eto )
#comp_extractors ( eto )

extraction_results = []

# Run every extractor on the first two training objects only, keeping a copy
# of each object together with its extractor results.
for eto in  extractor_training_objects[:2]:
    er = dict( eto )
    er[ 'extractor_results'] = get_extraction_results( eto )
    
    extraction_results.append( er )

# Show the keys of the last training object processed above.
eto.keys()    
#er.keys()


Out[41]:
['story',
 'preprocessed_lines',
 'media_id',
 'downloads_id',
 'raw_content',
 'story_is_spidered',
 'expected_text']

In [42]:
if regenerate_comps_downloads:
    
    comps_downloads = []
    processed = 0
    skipped = 0
    
    e=None
    for extractor_training_object in extractor_training_objects[:]:
        print 'processed ', processed
        print 'skipped ', skipped
        print extractor_training_object[ 'downloads_id']
        try:
            res = comp_extractors( extractor_training_object )
            #print res
            comps_downloads.append( res )
            processed += 1
        except Exception, e:
            print "error on download{}".format( extractor_training_object[ 'downloads_id'] )
            e = sys.exc_info()
            
            import traceback
            
            traceback.print_exc()
            print e
            #raise e
            skipped += 1
            
    cPickle.dump( comps_downloads, open("comps_downloads.pickle", "wb"))
    
    e
#extractor_training_objects

In [43]:
# Reload the cached comparison results written by the regeneration cell.
# The context manager closes the file handle once loading finishes.
# NOTE: unpickling executes arbitrary code; only load a pickle this notebook
# produced itself.
with open( "comps_downloads.pickle", "rb" ) as comps_file:
    comps_downloads = cPickle.load( comps_file )

Results

Results Overall


In [44]:
# Peek at the first comparison record: one f1/precision/recall dict per
# extractor variant, plus downloads_id, media_id and story_is_spidered.
comps_downloads[0]


Out[44]:
{'boiler_pipe': {'f1': 0.7136752136752136,
  'precision': 0.6139705882352942,
  'recall': 0.8520408163265306},
 'boiler_pipe_dedup': {'f1': 0.7109207708779444,
  'precision': 0.6125461254612546,
  'recall': 0.8469387755102041},
 'crf': {'f1': 0.8988235294117648,
  'precision': 0.834061135371179,
  'recall': 0.9744897959183674},
 'crf_dedup': {'f1': 0.8985507246376812,
  'precision': 0.8532110091743119,
  'recall': 0.9489795918367347},
 'downloads_id': 391881020,
 'gold': {'f1': 1.0, 'precision': 1.0, 'recall': 1.0},
 'gold_dedup': {'f1': 0.9821882951653944,
  'precision': 0.9796954314720813,
  'recall': 0.9846938775510204},
 'heur': {'f1': 0.8988235294117648,
  'precision': 0.834061135371179,
  'recall': 0.9744897959183674},
 'heur_dedup': {'f1': 0.8985507246376812,
  'precision': 0.8532110091743119,
  'recall': 0.9489795918367347},
 'justext': {'f1': 0, 'precision': 0.0, 'recall': 0.0},
 'justext_dedup': {'f1': 0, 'precision': 0.0, 'recall': 0.0},
 'media_id': 83371,
 'py_goose': {'f1': 0.926208651399491,
  'precision': 0.9238578680203046,
  'recall': 0.9285714285714286},
 'py_goose_dedup': {'f1': 0.926208651399491,
  'precision': 0.9238578680203046,
  'recall': 0.9285714285714286},
 'python_readibilty': {'f1': 0.9411764705882353,
  'precision': 0.9435897435897436,
  'recall': 0.9387755102040817},
 'python_readibilty_dedup': {'f1': 0.9187817258883249,
  'precision': 0.9141414141414141,
  'recall': 0.923469387755102},
 'story_is_spidered': False}

In [45]:
# Overall report: build one frame from every comparison record, then print
# the summary tables for each measurement type.
df = get_data_frame_from_comparision_objects( comps_downloads )
print_results_by_measurement_type( df )


       precision_boiler_pipe  precision_boiler_pipe_dedup  precision_crf  \
count             644.000000                   644.000000     644.000000   
mean                0.673146                     0.675358       0.748868   
std                 0.377755                     0.376360       0.297527   
min                 0.000000                     0.000000       0.000000   
2%                  0.000000                     0.000000       0.000000   
5%                  0.004954                     0.005719       0.082641   
10%                 0.022343                     0.023343       0.251831   
50%                 0.882041                     0.878578       0.881513   
max                 1.000000                     1.000000       1.000000   

       precision_crf_dedup  precision_gold  precision_gold_dedup  \
count           644.000000             644            644.000000   
mean              0.804301               1              0.997567   
std               0.311642               0              0.008295   
min               0.000000               1              0.909091   
2%                0.000000               1              0.977524   
5%                0.000000               1              0.984289   
10%               0.198751               1              0.992714   
50%               0.966172               1              1.000000   
max               1.000000               1              1.000000   

       precision_heur  precision_heur_dedup  precision_justext  \
count      644.000000            644.000000         644.000000   
mean         0.748868              0.804301           0.415551   
std          0.297527              0.311642           0.444839   
min          0.000000              0.000000           0.000000   
2%           0.000000              0.000000           0.000000   
5%           0.082641              0.000000           0.000000   
10%          0.251831              0.198751           0.000000   
50%          0.881513              0.966172           0.141032   
max          1.000000              1.000000           1.000000   

       precision_justext_dedup  precision_py_goose  precision_py_goose_dedup  \
count               644.000000          644.000000                644.000000   
mean                  0.421475            0.890629                  0.858846   
std                   0.450568            0.235992                  0.281457   
min                   0.000000            0.000000                  0.000000   
2%                    0.000000            0.040846                  0.000000   
5%                    0.000000            0.312500                  0.023970   
10%                   0.000000            0.500000                  0.384198   
50%                   0.135243            0.998836                  0.995829   
max                   1.000000            1.000000                  1.000000   

       precision_python_readibilty  precision_python_readibilty_dedup  
count                   644.000000                         644.000000  
mean                      0.903364                           0.896494  
std                       0.213072                           0.222137  
min                       0.008499                           0.000000  
2%                        0.051373                           0.030111  
5%                        0.314462                           0.297013  
10%                       0.750000                           0.702025  
50%                       0.977971                           0.981174  
max                       1.000000                           1.000000  
       recall_boiler_pipe  recall_boiler_pipe_dedup  recall_crf  \
count          644.000000                644.000000  644.000000   
mean             0.726914                  0.684892    0.824376   
std              0.374434                  0.376018    0.239798   
min              0.000000                  0.000000    0.000000   
2%               0.000000                  0.000000    0.000000   
5%               0.018198                  0.018198    0.166667   
10%              0.040000                  0.039773    0.466211   
50%              0.943349                  0.903600    0.915143   
max              1.000000                  1.000000    1.000000   

       recall_crf_dedup  recall_gold  recall_gold_dedup  recall_heur  \
count        644.000000          644         644.000000   644.000000   
mean           0.743098            1           0.924960     0.824376   
std            0.315359            0           0.199021     0.239798   
min            0.000000            1           0.021583     0.000000   
2%             0.000000            1           0.111819     0.000000   
5%             0.000000            1           0.403178     0.166667   
10%            0.056685            1           0.757895     0.466211   
50%            0.894517            1           1.000000     0.915143   
max            1.000000            1           1.000000     1.000000   

       recall_heur_dedup  recall_justext  recall_justext_dedup  \
count         644.000000      644.000000            644.000000   
mean            0.743098        0.452075              0.408899   
std             0.315359        0.445453              0.429813   
min             0.000000        0.000000              0.000000   
2%              0.000000        0.000000              0.000000   
5%              0.000000        0.000000              0.000000   
10%             0.056685        0.000000              0.000000   
50%             0.894517        0.338979              0.161884   
max             1.000000        1.000000              1.000000   

       recall_py_goose  recall_py_goose_dedup  recall_python_readibilty  \
count       644.000000             644.000000                644.000000   
mean          0.576674               0.526721                  0.863263   
std           0.408982               0.408359                  0.253866   
min           0.000000               0.000000                  0.002874   
2%            0.003655               0.000000                  0.035259   
5%            0.010523               0.002412                  0.082882   
10%           0.024552               0.014510                  0.579457   
50%           0.793689               0.647658                  0.960937   
max           1.000000               1.000000                  1.000000   

       recall_python_readibilty_dedup  
count                      644.000000  
mean                         0.795957  
std                          0.310906  
min                          0.000000  
2%                           0.013497  
5%                           0.032396  
10%                          0.114791  
50%                          0.949153  
max                          1.000000  
       f1_boiler_pipe  f1_boiler_pipe_dedup      f1_crf  f1_crf_dedup  \
count      644.000000            644.000000  644.000000    644.000000   
mean         0.677385              0.653520    0.748647      0.740478   
std          0.375707              0.374177    0.270094      0.315791   
min          0.000000              0.000000    0.000000      0.000000   
2%           0.000000              0.000000    0.000000      0.000000   
5%           0.008285              0.008306    0.103632      0.000000   
10%          0.025495              0.026259    0.292901      0.093874   
50%          0.887676              0.838040    0.864991      0.895367   
max          1.000000              1.000000    1.000000      1.000000   

       f1_gold  f1_gold_dedup     f1_heur  f1_heur_dedup  f1_justext  \
count      644     644.000000  644.000000     644.000000  644.000000   
mean         1       0.942635    0.748647       0.740478    0.406961   
std          0       0.169775    0.270094       0.315791    0.426200   
min          1       0.042254    0.000000       0.000000    0.000000   
2%           1       0.201146    0.000000       0.000000    0.000000   
5%           1       0.561269    0.103632       0.000000    0.000000   
10%          1       0.857520    0.292901       0.093874    0.000000   
50%          1       1.000000    0.864991       0.895367    0.200358   
max          1       1.000000    1.000000       1.000000    1.000000   

       f1_justext_dedup  f1_py_goose  f1_py_goose_dedup  f1_python_readibilty  \
count        644.000000   644.000000         644.000000            644.000000   
mean           0.386846     0.605757           0.563397              0.861178   
std            0.420299     0.398192           0.402577              0.253648   
min            0.000000     0.000000           0.000000              0.005525   
2%             0.000000     0.007274           0.000000              0.035096   
5%             0.000000     0.020292           0.004808              0.085328   
10%            0.000000     0.045969           0.026535              0.533585   
50%            0.111429     0.839000           0.734851              0.957090   
max            1.000000     1.000000           1.000000              1.000000   

       f1_python_readibilty_dedup  
count                  644.000000  
mean                     0.808616  
std                      0.300368  
min                      0.000000  
2%                       0.020957  
5%                       0.042998  
10%                      0.156222  
50%                      0.951220  
max                      1.000000  

In [46]:
# Same report as above, restricted to the records that
# only_spidered_downloads() keeps.
print "spidered"
df = get_data_frame_from_comparision_objects( only_spidered_downloads( comps_downloads ) )
print_results_by_measurement_type( df )


spidered
       precision_boiler_pipe  precision_boiler_pipe_dedup  precision_crf  \
count             115.000000                   115.000000     115.000000   
mean                0.759524                     0.760001       0.728986   
std                 0.310086                     0.310471       0.345630   
min                 0.000000                     0.000000       0.000000   
2%                  0.006306                     0.012074       0.000000   
5%                  0.025225                     0.025225       0.000154   
10%                 0.111641                     0.085428       0.105578   
50%                 0.901591                     0.904762       0.941392   
max                 1.000000                     1.000000       1.000000   

       precision_crf_dedup  precision_gold  precision_gold_dedup  \
count           115.000000             115            115.000000   
mean              0.721599               1              0.998975   
std               0.372825               0              0.002797   
min               0.000000               1              0.983015   
2%                0.000000               1              0.989518   
5%                0.000000               1              0.994159   
10%               0.000000               1              0.996115   
50%               0.956772               1              1.000000   
max               1.000000               1              1.000000   

       precision_heur  precision_heur_dedup  precision_justext  \
count      115.000000            115.000000         115.000000   
mean         0.728986              0.721599           0.502603   
std          0.345630              0.372825           0.443725   
min          0.000000              0.000000           0.000000   
2%           0.000000              0.000000           0.000000   
5%           0.000154              0.000000           0.000000   
10%          0.105578              0.000000           0.000000   
50%          0.941392              0.956772           0.468547   
max          1.000000              1.000000           1.000000   

       precision_justext_dedup  precision_py_goose  precision_py_goose_dedup  \
count               115.000000          115.000000                115.000000   
mean                  0.513274            0.879815                  0.883859   
std                   0.446751            0.260398                  0.258326   
min                   0.000000            0.007173                  0.007173   
2%                    0.000000            0.033548                  0.035865   
5%                    0.000000            0.166667                  0.166667   
10%                   0.000000            0.442045                  0.442045   
50%                   0.468547            0.995851                  0.995349   
max                   1.000000            1.000000                  1.000000   

       precision_python_readibilty  precision_python_readibilty_dedup  
count                   115.000000                         115.000000  
mean                      0.832070                           0.822126  
std                       0.301095                           0.304594  
min                       0.008499                           0.007067  
2%                        0.030500                           0.023559  
5%                        0.081429                           0.086565  
10%                       0.223742                           0.193379  
50%                       0.977679                           0.972727  
max                       1.000000                           1.000000  
       recall_boiler_pipe  recall_boiler_pipe_dedup  recall_crf  \
count          115.000000                115.000000  115.000000   
mean             0.826639                  0.775159    0.727501   
std              0.275779                  0.292861    0.321322   
min              0.000000                  0.000000    0.000000   
2%               0.075197                  0.075197    0.000000   
5%               0.082956                  0.082956    0.005018   
10%              0.296784                  0.239981    0.117013   
50%              0.956081                  0.936053    0.889344   
max              1.000000                  1.000000    1.000000   

       recall_crf_dedup  recall_gold  recall_gold_dedup  recall_heur  \
count        115.000000          115         115.000000   115.000000   
mean           0.644234            1           0.917858     0.727501   
std            0.370118            0           0.220640     0.321322   
min            0.000000            1           0.059172     0.000000   
2%             0.000000            1           0.147813     0.000000   
5%             0.000000            1           0.294415     0.005018   
10%            0.000000            1           0.686249     0.117013   
50%            0.829710            1           1.000000     0.889344   
max            1.000000            1           1.000000     1.000000   

       recall_heur_dedup  recall_justext  recall_justext_dedup  \
count         115.000000      115.000000            115.000000   
mean            0.644234        0.580008              0.523605   
std             0.370118        0.430435              0.423792   
min             0.000000        0.000000              0.000000   
2%              0.000000        0.000000              0.000000   
5%              0.000000        0.000000              0.000000   
10%             0.000000        0.000000              0.000000   
50%             0.829710        0.838407              0.747788   
max             1.000000        1.000000              1.000000   

       recall_py_goose  recall_py_goose_dedup  recall_python_readibilty  \
count       115.000000             115.000000                115.000000   
mean          0.682953               0.625534                  0.835127   
std           0.378934               0.392403                  0.317199   
min           0.002874               0.002874                  0.002874   
2%            0.005576               0.005576                  0.017247   
5%            0.014152               0.008901                  0.033962   
10%           0.026731               0.024740                  0.089791   
50%           0.908081               0.864865                  0.976285   
max           1.000000               1.000000                  1.000000   

       recall_python_readibilty_dedup  
count                      115.000000  
mean                         0.768521  
std                          0.362741  
min                          0.004310  
2%                           0.007016  
5%                           0.011913  
10%                          0.042088  
50%                          0.969555  
max                          1.000000  
       f1_boiler_pipe  f1_boiler_pipe_dedup      f1_crf  f1_crf_dedup  \
count      115.000000            115.000000  115.000000    115.000000   
mean         0.743924              0.720247    0.681098      0.638833   
std          0.308386              0.310287    0.330442      0.369206   
min          0.000000              0.000000    0.000000      0.000000   
2%           0.011767              0.021945    0.000000      0.000000   
5%           0.038741              0.038741    0.000307      0.000000   
10%          0.148844              0.120862    0.064914      0.000000   
50%          0.895199              0.851391    0.826990      0.799078   
max          1.000000              0.996663    0.998779      0.999504   

       f1_gold  f1_gold_dedup     f1_heur  f1_heur_dedup  f1_justext  \
count      115     115.000000  115.000000     115.000000  115.000000   
mean         1       0.935951    0.681098       0.638833    0.477664   
std          0       0.183827    0.330442       0.369206    0.409128   
min          1       0.111732    0.000000       0.000000    0.000000   
2%           1       0.256728    0.000000       0.000000    0.000000   
5%           1       0.454631    0.000307       0.000000    0.000000   
10%          1       0.813788    0.064914       0.000000    0.000000   
50%          1       1.000000    0.826990       0.799078    0.485185   
max          1       1.000000    0.998779       0.999504    1.000000   

       f1_justext_dedup  f1_py_goose  f1_py_goose_dedup  f1_python_readibilty  \
count        115.000000   115.000000         115.000000            115.000000   
mean           0.455847     0.704697           0.655273              0.800922   
std            0.400042     0.373744           0.384180              0.330705   
min            0.000000     0.005698           0.005698              0.005525   
2%             0.000000     0.010997           0.010802              0.018968   
5%             0.000000     0.015531           0.014115              0.038364   
10%            0.000000     0.041221           0.041409              0.081796   
50%            0.436764     0.926154           0.895636              0.966102   
max            1.000000     1.000000           1.000000              1.000000   

       f1_python_readibilty_dedup  
count                  115.000000  
mean                     0.746244  
std                      0.361362  
min                      0.008264  
2%                       0.011768  
5%                       0.016233  
10%                      0.049514  
50%                      0.957265  
max                      0.997980  

Results by Subset


In [47]:
regional = { 2453107 }
print "region / pew knight sutdy / 245107 "
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( comps_downloads, regional ) )
print_results_by_measurement_type( df )

ap_english_us_top_25 = { 2453107 }
print "ap_english_us_top25 / 8875027 "
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( comps_downloads, ap ) )
print_results_by_measurement_type( df )

political_blogs = { 125 }
print "political blogs / 125"
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( comps_downloads, political_blogs ) )
print_results_by_measurement_type( df )


russian = { 7796878 }
print 'russian'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( comps_downloads, russian ) )
print_results_by_measurement_type( df )

print 'brazil'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( comps_downloads, {8877968,  8877969, 8877973, 8877970 } ) )
print_results_by_measurement_type( df )

arabic = { 8878255 }
print 'arabic'
df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( comps_downloads, arabic ) )
print_results_by_measurement_type( df )


region / pew knight sutdy / 245107 
       precision_boiler_pipe  precision_boiler_pipe_dedup  precision_crf  \
count              16.000000                    16.000000      16.000000   
mean                0.725938                     0.736816       0.638475   
std                 0.302335                     0.297586       0.224373   
min                 0.045918                     0.046154       0.301170   
2%                  0.145964                     0.137417       0.311373   
5%                  0.296032                     0.274312       0.326677   
10%                 0.384241                     0.373789       0.337376   
50%                 0.860206                     0.845775       0.686694   
max                 0.991114                     1.000000       0.941935   

       precision_crf_dedup  precision_gold  precision_gold_dedup  \
count            16.000000              16             16.000000   
mean              0.758878               1              0.999633   
std               0.242286               0              0.001204   
min               0.323615               1              0.995241   
2%                0.334289               1              0.996335   
5%                0.350300               1              0.997977   
10%               0.369875               1              0.999444   
50%               0.825397               1              1.000000   
max               1.000000               1              1.000000   

       precision_heur  precision_heur_dedup  precision_justext  \
count       16.000000             16.000000          16.000000   
mean         0.638475              0.758878           0.537627   
std          0.224373              0.242286           0.338963   
min          0.301170              0.323615           0.020161   
2%           0.311373              0.334289           0.023989   
5%           0.326677              0.350300           0.029732   
10%          0.337376              0.369875           0.036285   
50%          0.686694              0.825397           0.683204   
max          0.941935              1.000000           0.995455   

       precision_justext_dedup  precision_py_goose  precision_py_goose_dedup  \
count                16.000000           16.000000                 16.000000   
mean                  0.587437            0.992751                  0.991805   
std                   0.366665            0.008386                  0.008850   
min                   0.021552            0.975000                  0.975000   
2%                    0.024963            0.977143                  0.975523   
5%                    0.030079            0.980357                  0.976308   
10%                   0.045873            0.982222                  0.979523   
50%                   0.715038            0.997057                  0.995598   
max                   0.995455            1.000000                  1.000000   

       precision_python_readibilty  precision_python_readibilty_dedup  
count                    16.000000                          16.000000  
mean                      0.970112                           0.994401  
std                       0.026198                           0.009272  
min                       0.884058                           0.974194  
2%                        0.904597                           0.974979  
5%                        0.935406                           0.976157  
10%                       0.957743                           0.978202  
50%                       0.974920                           1.000000  
max                       1.000000                           1.000000  
       recall_boiler_pipe  recall_boiler_pipe_dedup  recall_crf  \
count           16.000000                 16.000000   16.000000   
mean             0.918452                  0.832067    0.916917   
std              0.228239                  0.225957    0.054466   
min              0.066667                  0.066667    0.793103   
2%               0.322098                  0.257621    0.807622   
5%               0.705244                  0.544052    0.829400   
10%              0.928135                  0.703571    0.850819   
50%              0.976690                  0.923678    0.933668   
max              1.000000                  0.993341    0.993341   

       recall_crf_dedup  recall_gold  recall_gold_dedup  recall_heur  \
count         16.000000           16          16.000000    16.000000   
mean           0.859778            1           0.929605     0.916917   
std            0.118175            0           0.096666     0.054466   
min            0.630742            1           0.710247     0.793103   
2%             0.649212            1           0.729341     0.807622   
5%             0.676916            1           0.757981     0.829400   
10%            0.707825            1           0.792075     0.850819   
50%            0.881066            1           0.985154     0.933668   
max            0.998934            1           1.000000     0.993341   

       recall_heur_dedup  recall_justext  recall_justext_dedup  \
count          16.000000       16.000000             16.000000   
mean            0.859778        0.793756              0.709709   
std             0.118175        0.368278              0.342666   
min             0.630742        0.034483              0.034483   
2%              0.649212        0.043019              0.034627   
5%              0.676916        0.055823              0.034844   
10%             0.707825        0.065657              0.051671   
50%             0.881066        0.964091              0.822389   
max             0.998934        1.000000              0.993341   

       recall_py_goose  recall_py_goose_dedup  recall_python_readibilty  \
count        16.000000              16.000000                 16.000000   
mean          0.959655               0.865618                  0.930970   
std           0.040620               0.112344                  0.047700   
min           0.866667               0.643110                  0.827586   
2%            0.868571               0.650177                  0.832731   
5%            0.871429               0.660777                  0.840449   
10%           0.905473               0.685315                  0.870644   
50%           0.974939               0.902299                  0.944134   
max           1.000000               0.986681                  1.000000   

       recall_python_readibilty_dedup  
count                       16.000000  
mean                         0.885925  
std                          0.099482  
min                          0.699647  
2%                           0.700941  
5%                           0.702884  
10%                          0.732934  
50%                          0.916769  
max                          0.988901  
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-47-3971c36169d0> in <module>()
      6 ap_english_us_top_25 = { 2453107 }
      7 print "ap_english_us_top25 / 8875027 "
----> 8 df = get_data_frame_from_comparision_objects( filter_by_media_tags_id( comps_downloads, ap ) )
      9 print_results_by_measurement_type( df )
     10 

NameError: name 'ap' is not defined
       f1_boiler_pipe  f1_boiler_pipe_dedup     f1_crf  f1_crf_dedup  f1_gold  \
count       16.000000             16.000000  16.000000     16.000000       16   
mean         0.789421              0.758594   0.733554      0.776456        1   
std          0.263828              0.245140   0.176351      0.161895        0   
min          0.054381              0.054545   0.448802      0.482609        1   
2%           0.201491              0.182182   0.460532      0.493106        1   
5%           0.422155              0.373636   0.478128      0.508851        1   
10%          0.550422              0.522178   0.488653      0.530086        1   
50%          0.910472              0.855466   0.792578      0.815749        1   
max          0.991870              0.992790   0.941176      0.978228        1   

       f1_gold_dedup    f1_heur  f1_heur_dedup  f1_justext  f1_justext_dedup  \
count      16.000000  16.000000      16.000000   16.000000         16.000000   
mean        0.960756   0.733554       0.776456    0.615955          0.611985   
std         0.054902   0.176351       0.161895    0.349292          0.349589   
min         0.830579   0.448802       0.482609    0.025445          0.026525   
2%          0.843166   0.460532       0.493106    0.031145          0.031726   
5%          0.862047   0.478128       0.508851    0.039695          0.039526   
10%         0.883860   0.488653       0.530086    0.046547          0.044152   
50%         0.992521   0.792578       0.815749    0.805163          0.789394   
max         1.000000   0.941176       0.978228    0.976000          0.963696   

       f1_py_goose  f1_py_goose_dedup  f1_python_readibilty  \
count    16.000000          16.000000             16.000000   
mean      0.975627           0.920619              0.949469   
std       0.024955           0.068276              0.030260   
min       0.917647           0.781116              0.895105   
2%        0.919664           0.784517              0.895193   
5%        0.922689           0.789619              0.895324   
10%       0.942750           0.809359              0.909820   
50%       0.986642           0.942581              0.951892   
max       1.000000           0.992187              1.000000   

       f1_python_readibilty_dedup  
count                   16.000000  
mean                     0.934173  
std                      0.058683  
min                      0.823285  
2%                       0.824179  
5%                       0.825520  
10%                      0.841704  
50%                      0.951215  
max                      0.993865  
ap_english_us_top25 / 8875027