In [1]:
import cPickle
import os.path

# Load the MediaCloud API key from a pickle file in the user's home directory.
# The file is opened in binary mode (the key is written with 'wb') and the
# `with` block closes the handle instead of leaving it open.
with open(os.path.expanduser('~/mediacloud_api_key.pickle'), 'rb') as key_file:
    api_key = cPickle.load(key_file)

In [ ]:
import cPickle
import os.path

# Persist the API key to a pickle file in the user's home directory.
# `api_key` must already be defined in the kernel when this cell runs.
# The `with` block guarantees the file is flushed and closed after the dump.
with open(os.path.expanduser('~/mediacloud_api_key.pickle'), 'wb') as key_file:
    cPickle.dump(api_key, key_file)

In [2]:
import sys
# Make modules under ../../foreign_modules/python/ importable. The membership
# check keeps re-runs of this cell from appending duplicate sys.path entries.
_FOREIGN_MODULES_DIR = '../../foreign_modules/python/'
if _FOREIGN_MODULES_DIR not in sys.path:
    sys.path.append(_FOREIGN_MODULES_DIR)

In [6]:
def gen_data(downloads_id, included_line_numbers):
    """Collect (label, include_probability) pairs for one download.

    Fetches the download and its story from the MediaCloud API, asks the
    `extractlines/story_lines` endpoint to preprocess the raw content, then
    scores the preprocessed lines with both the heuristic and the CRF
    extractor.

    Relies on the notebook globals `requests` and `api_key`.

    Args:
        downloads_id: id of the download to fetch; converted with str() for
            the request URL.
        included_line_numbers: container tested with `str(line_index) in ...`
            to decide whether a line is labelled 1 (included) or 0.
            Presumably a list of line numbers as strings -- verify against
            the training file.

    Returns:
        A tuple `(h_t_ip, c_t_ip)` of lists of `(label, include_probability)`
        tuples for the heuristic and CRF extractor respectively. Lines the
        heuristic marks with `autoexcluded == 1`, and CRF lines that carry an
        `autoexcluded` key at all, are skipped.

    Any exception (network error, unexpected response shape, ...) is reported
    on stderr and whatever was collected up to that point is returned, so a
    single bad download does not abort a batch run.
    """
    # Local import so the function does not depend on another cell's imports.
    import sys

    base_url = 'https://api.mediacloud.org/api/v2/'
    key_params = {'key': api_key}

    h_t_ip = []
    c_t_ip = []

    try:
        # Parse each JSON response once instead of once per field.
        download = requests.get(base_url + 'downloads/single/' + str(downloads_id),
                                params=key_params).json()[0]
        raw_content = download[u'raw_content']
        stories_id = download[u'stories_id']

        story = requests.get(base_url + 'stories/single/' + str(stories_id),
                             params=key_params).json()[0]
        title = story[u'title']
        description = story[u'description']

        story_lines = requests.get(base_url + 'extractlines/story_lines',
                                   params={'key': api_key, 'body_html': raw_content})
        preprocessed_lines = story_lines.text

        def extract_scores(extractor_method):
            # Score the preprocessed lines with the named extractor.
            extract_params = {'key': api_key,
                              'preprocessed_lines': preprocessed_lines,
                              'story_title': title,
                              'story_description': description,
                              'extractor_method': extractor_method}
            response = requests.get(base_url + 'extractlines/extract', params=extract_params)
            return response.json()[u'scores']

        heur_scores = extract_scores('HeuristicExtractor')
        # Parsed once here rather than on every loop iteration.
        crf_scores = extract_scores('CrfExtractor')

        for ln, hscore in enumerate(heur_scores):

            t = 1 if str(ln) in included_line_numbers else 0

            if hscore[u'autoexcluded'] != 1:
                h_t_ip.append((t, hscore[u'include_probability']))

            cscore = crf_scores[ln]
            # NOTE(review): the CRF check tests for key presence while the
            # heuristic check tests the value -- kept as in the original;
            # confirm this matches the API's response format.
            if u'autoexcluded' not in cscore:
                c_t_ip.append((t, cscore[u'include_probability']))

    except Exception as e:
        # Best effort: keep whatever was collected, but make the failure
        # visible instead of silently returning short or empty lists.
        sys.stderr.write('gen_data: download %s failed: %r\n' % (downloads_id, e))

    return h_t_ip, c_t_ip

In [13]:
import mediacloud, requests, csv, sys, os, json, cPickle
from pyroc import *

# extractor_training_lines_checked.json has the training lines for downloads
# for which the highest line listed as 'included' was less than the number of
# lines in the download (max(included_line_numbers) < len(story_lines.json())).
N_DOWNLOADS = 30  # number of training rows processed in this run


def _save_pickle(obj, path):
    """Pickle `obj` to `path`, closing the file handle afterwards."""
    with open(path, "wb") as out_file:
        cPickle.dump(obj, out_file)


with open("extractor_training_lines_checked.json") as training_file:
    f = training_file.read()
reader = json.loads(f)

heur = []
crf = []
done = []

# Overwrite any previous result files with empty lists before the run starts.
_save_pickle(heur, "heur.p")
_save_pickle(crf, "crf.p")
_save_pickle(done, "done.p")

for row in reader[:N_DOWNLOADS]:

    did = row[u'downloads_id']
    lns = row[u'included_line_numbers']

    # gen_data returns the heuristic and CRF (label, probability) pairs.
    curh, curc = gen_data(did, lns)

    heur += curh
    crf += curc
    done.append(did)

# Write the accumulated results once all rows have been processed.
_save_pickle(done, "done.p")
_save_pickle(heur, "heur.p")
_save_pickle(crf, "crf.p")

In [14]:
import cPickle
from pyroc import *

# Load the pickled score lists from heur.p and crf.p; the `with` blocks close
# the file handles once each list has been read.
with open("heur.p", "rb") as heur_file:
    heur = cPickle.load(heur_file)
with open("crf.p", "rb") as crf_file:
    crf = cPickle.load(crf_file)

rocheur = ROCData(heur)
roccrf = ROCData(crf)

# The parenthesised single-argument form prints the same text under the
# Python 2 print statement and the Python 3 print() function.
print(rocheur.auc())

plot_multiple_roc(rocList=(rocheur, roccrf), title='Extractor ROC Curve', labels=("heuristic curve", "crf curve"))


0.0