In [1]:
import cPickle
import os.path
# Load the Media Cloud API key from the pickle cached in the user's home directory.
# NOTE: unpickling can execute arbitrary code -- only load a file you wrote yourself.
# Binary mode matches the 'wb' mode the key is written with, and the `with`
# block closes the handle instead of leaking it.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) as key_file:
    api_key = cPickle.load( key_file )
In [ ]:
import cPickle
import os.path
# Cache the API key in the user's home directory so later sessions can reload it.
# `api_key` must already be defined in the kernel before this cell is run.
# The `with` block guarantees the pickle is flushed and the handle closed.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) as key_file:
    cPickle.dump( api_key, key_file )
In [2]:
import sys
# Add the repository's bundled Python modules to the import search path
# (presumably where `pyroc` is picked up from -- TODO confirm).
sys.path.extend(['../../foreign_modules/python/'])
In [6]:
def gen_data(downloads_id, included_line_numbers):
    """Collect (label, include_probability) pairs for one download.

    Fetches the download and its story from the Media Cloud API, splits the
    raw HTML into story lines, and scores those lines with both the
    HeuristicExtractor and the CrfExtractor.

    Args:
        downloads_id: id of the download to fetch; converted with str().
        included_line_numbers: collection of line numbers marked as included
            in the training data. Membership is tested with ``str(ln)``, so
            the entries are presumably strings -- TODO confirm against the
            training JSON.

    Returns:
        A tuple ``(h_t_ip, c_t_ip)`` of lists of ``(label, include_probability)``
        tuples for the heuristic and CRF extractors respectively. ``label`` is
        1 when the line index is in ``included_line_numbers``, else 0.
        Heuristic lines whose ``autoexcluded`` value is 1 are skipped; CRF
        lines that carry an ``autoexcluded`` key at all are skipped.

    Any exception (network error, unexpected response shape, ...) is reported
    on stderr and whatever was collected so far is returned, so a single bad
    download does not abort a batch run.

    Relies on the notebook-level names ``api_key``, ``requests`` and ``sys``.
    """
    c_t_ip = []
    h_t_ip = []
    api_root = 'https://api.mediacloud.org/api/v2/'
    try:
        loc_key = api_key
        key_params = {'key': loc_key}

        # Parse each response once instead of calling .json() per field.
        download = requests.get(api_root + 'downloads/single/' + str(downloads_id),
                                params=key_params).json()[0]
        raw_content = download[u'raw_content']
        stories_id = download[u'stories_id']

        story = requests.get(api_root + 'stories/single/' + str(stories_id),
                             params=key_params).json()[0]
        title = story[u'title']
        description = story[u'description']

        story_lines_params = {'key': loc_key, 'body_html': raw_content}
        story_lines = requests.get(api_root + 'extractlines/story_lines',
                                   params=story_lines_params)
        preprocessed_lines = story_lines.text

        def extract_scores(extractor_method):
            # Both extractors take identical inputs; only the method name differs.
            extract_params = {'key': loc_key,
                              'preprocessed_lines': preprocessed_lines,
                              'story_title': title,
                              'story_description': description,
                              'extractor_method': extractor_method}
            response = requests.get(api_root + 'extractlines/extract',
                                    params=extract_params)
            return response.json()[u'scores']

        heur_scores = extract_scores('HeuristicExtractor')
        # Hoisted out of the loop: the CRF response used to be re-parsed for
        # every single line.
        crf_scores = extract_scores('CrfExtractor')

        for ln, hscore in enumerate(heur_scores):
            t = 1 if str(ln) in included_line_numbers else 0
            if hscore[u'autoexcluded'] != 1:
                h_t_ip.append((t, hscore[u'include_probability']))
            cscore = crf_scores[ln]
            if u'autoexcluded' not in cscore:
                c_t_ip.append((t, cscore[u'include_probability']))
    except Exception as e:
        # Best effort: keep the batch going, but say what went wrong rather
        # than silently returning empty or partial results.
        sys.stderr.write('gen_data: download %s failed: %r\n' % (downloads_id, e))
    return h_t_ip, c_t_ip
In [13]:
import mediacloud, requests, csv, sys, os, json, cPickle
from pyroc import *
# extractor_training_lines_checked.json holds the training lines for downloads
# whose highest 'included' line number was below the number of lines in the
# download (max(included_line_numbers) < len(story_lines.json())).
def _save_pickle(obj, path):
    """Pickle obj to path, closing the file so the data is flushed to disk."""
    with open(path, "wb") as out:
        cPickle.dump(obj, out)

with open("extractor_training_lines_checked.json") as training_file:
    f = training_file.read()
reader = json.loads(f)

# Start every run from empty result files.
heur = []
_save_pickle(heur, "heur.p")
crf = []
_save_pickle(crf, "crf.p")
done = []
_save_pickle(done, "done.p")

# Only the first 30 training rows are scored.
for row in reader[:30]:
    did = row[u'downloads_id']
    lns = row[u'included_line_numbers']
    curh, curc = gen_data(did, lns)
    heur += curh
    crf += curc
    done.append(did)
    # Checkpoint after each download so an interrupted run keeps its progress.
    _save_pickle(done, "done.p")
    _save_pickle(heur, "heur.p")
    _save_pickle(crf, "crf.p")
In [14]:
import cPickle
from pyroc import *
# Reload the (label, include_probability) pairs written by the collection cell.
# The `with` blocks close the pickle files instead of leaking the handles.
with open("heur.p","rb") as heur_file:
    heur = cPickle.load(heur_file)
with open("crf.p","rb") as crf_file:
    crf = cPickle.load(crf_file)
rocheur = ROCData(heur)
roccrf = ROCData(crf)
# Only the heuristic extractor's AUC is printed. The parenthesised form
# prints the same value under Python 2 and also parses under Python 3.
print(rocheur.auc())
plot_multiple_roc(rocList=(rocheur,roccrf), title='Extractor ROC Curve', labels=("heuristic curve","crf curve"))