# In[ ]:
import mediacloud, requests, csv, sys, os, json
from pyroc import *
# MediaCloud API key; gen_data sends it with the download/story requests.
api_key = ''
#mc = mediacloud.api.MediaCloud(api_key)
# Key that gen_data sends to the extractlines service at http://0:3000.
loc_key = ''

# The file's top-level JSON value is a sequence; its first element is the
# collection of hand-checked downloads iterated below.  Use a context manager
# so the file handle is closed as soon as the data is parsed.
with open("extractor_training_lines_checked.json") as checked_file:
    checked = json.load(checked_file)[0]

# Accumulated (truth, include_probability) pairs across all checked
# downloads: h_* from the heuristic extractor, c_* from the CRF extractor.
h_t_ip = []
c_t_ip = []
# NOTE: gen_data is defined in a later cell; that cell must have been run
# before this loop executes.
for dload in checked:
    cur_h_t_ip, cur_c_t_ip = gen_data(
        dload['downloads_id'],
        dload['included_line_numbers'],
        dload['required_line_numbers'],
    )
    h_t_ip += cur_h_t_ip
    c_t_ip += cur_c_t_ip
# In[ ]:
def gen_data(downloads_id, included_line_numbers, required_line_numbers):
    """Score one download's lines with the heuristic and CRF extractors.

    Fetches the download and its story from the MediaCloud API, has the
    extractlines service at http://0:3000 split the raw content into lines,
    then requests per-line scores from both ``HeuristicExtractor`` and
    ``CrfExtractor``.

    Args:
        downloads_id: ID of the download to fetch from the MediaCloud API.
        included_line_numbers: collection of line indices labelled as
            included; a line's truth value is 1 when its 0-based position in
            the score list is in this collection, else 0.
        required_line_numbers: accepted for interface compatibility; not
            used by this function.

    Returns:
        A pair ``(h_t_ip, c_t_ip)`` of lists holding one
        ``(truth, include_probability)`` tuple per scored line, for the
        heuristic and the CRF extractor respectively.  CRF entries carrying
        an ``auto_excluded`` key get a probability of 0.
    """
    download = requests.get(
        'https://api.mediacloud.org/api/v2/downloads/single/' + str(downloads_id)
        + '?key=' + api_key)
    # Parse each response body once instead of on every field access.
    download_info = download.json()[0]
    raw_content = download_info[u'raw_content']
    stories_id = download_info[u'stories_id']

    story = requests.get(
        'https://api.mediacloud.org/api/v2/stories/single/' + str(stories_id)
        + '?key=' + api_key)
    story_info = story.json()[0]
    title = story_info[u'title']
    description = story_info[u'description']

    # Split the raw content into lines (a single request; the original issued
    # this identical call twice).
    story_lines_params = {'key': loc_key, 'body_html': raw_content}
    story_lines = requests.get(
        'http://0:3000/api/v2/extractlines/story_lines',
        params=story_lines_params)
    preprocessed_lines = story_lines.text

    extract_url = 'http://0:3000/api/v2/extractlines/extract'
    heur_extract_params = {
        'key': loc_key,
        'preprocessed_lines': preprocessed_lines,
        'story_title': title,
        'story_description': description,
        'extractor_method': 'HeuristicExtractor',
    }
    heur_extract = requests.get(extract_url, params=heur_extract_params)
    crf_extract_params = {
        'key': loc_key,
        'preprocessed_lines': preprocessed_lines,
        'story_title': title,
        'story_description': description,
        'extractor_method': 'CrfExtractor',
    }
    crf_extract = requests.get(extract_url, params=crf_extract_params)

    heur_scores = heur_extract.json()[u'scores']
    crf_scores = crf_extract.json()[u'scores']
    # Build the lookup set once so each membership test is O(1).
    included = set(included_line_numbers)

    h_t_ip = []
    c_t_ip = []
    for ln, hscore in enumerate(heur_scores):
        t = 1 if ln in included else 0
        h_t_ip.append((t, hscore[u'include_probability']))
        # Use the CRF score for *this* line; the original bound the whole
        # score list here, so the key lookups below could never work.
        # Indexing (rather than zip) keeps a length mismatch between the two
        # extractors loud instead of silently truncating.
        cscore = crf_scores[ln]
        if u'auto_excluded' in cscore:
            c_t_ip.append((t, 0))
        else:
            c_t_ip.append((t, cscore[u'include_probability']))
    return h_t_ip, c_t_ip