extractor_backup



In [ ]:
import mediacloud, requests, csv, sys, os, json
from pyroc import *

# Credentials: `api_key` is sent to api.mediacloud.org, `loc_key` to the
# extractor service on port 3000 (both are read by gen_data).
api_key = ''
#mc = mediacloud.api.MediaCloud(api_key)

loc_key = ''

# Load the checked training downloads. Only the first element of the
# top-level JSON array is used. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("extractor_training_lines_checked.json") as checked_file:
    checked = json.load(checked_file)[0]

# Accumulated (t, include_probability) pairs over every checked download:
# h_t_ip for the heuristic extractor, c_t_ip for the CRF extractor.
h_t_ip = []
c_t_ip = []

# NOTE: gen_data is defined in a later notebook cell; that cell has to be
# executed before this loop runs.
for dload in checked:

    cur_h_t_ip, cur_c_t_ip = gen_data(
        dload['downloads_id'],
        dload['included_line_numbers'],
        dload['required_line_numbers'],
    )

    h_t_ip += cur_h_t_ip
    c_t_ip += cur_c_t_ip

In [ ]:
def gen_data(downloads_id, included_line_numbers, required_line_numbers):
    """Score one download with both extractors and label every line.

    Fetches the download's raw content and its story's title and
    description from the Media Cloud API, has the extractor service split
    the content into preprocessed lines, then runs both the
    ``HeuristicExtractor`` and the ``CrfExtractor`` over those lines.

    Args:
        downloads_id: id of the download to fetch; converted with ``str``
            when building the request URL.
        included_line_numbers: collection of 0-based line indices that are
            labelled as included (label ``1``); all other lines get ``0``.
        required_line_numbers: accepted for interface compatibility; not
            used by this function.

    Returns:
        A pair ``(h_t_ip, c_t_ip)`` of lists holding one
        ``(t, include_probability)`` tuple per line scored by the heuristic
        extractor. ``h_t_ip`` carries the heuristic probabilities and
        ``c_t_ip`` the CRF probabilities. A CRF line whose score entry has
        an ``auto_excluded`` key is given probability ``0``.
    """
    mc_params = {'key': api_key}

    # Parse each JSON response once instead of re-decoding it per field.
    download = requests.get(
        'https://api.mediacloud.org/api/v2/downloads/single/' + str(downloads_id),
        params=mc_params,
    ).json()[0]
    raw_content = download[u'raw_content']
    stories_id = download[u'stories_id']

    story = requests.get(
        'https://api.mediacloud.org/api/v2/stories/single/' + str(stories_id),
        params=mc_params,
    ).json()[0]
    title = story[u'title']
    description = story[u'description']

    # NOTE(review): the full HTML body travels as a GET query parameter;
    # confirm the extractor service has no URL length limit that matters.
    story_lines_params = {'key': loc_key, 'body_html': raw_content}
    story_lines = requests.get(
        'http://0:3000/api/v2/extractlines/story_lines',
        params=story_lines_params,
    )
    preprocessed_lines = story_lines.text

    # Both extractor calls share everything except the method name.
    extract_params = {
        'key': loc_key,
        'preprocessed_lines': preprocessed_lines,
        'story_title': title,
        'story_description': description,
    }

    heur_extract = requests.get(
        'http://0:3000/api/v2/extractlines/extract',
        params=dict(extract_params, extractor_method='HeuristicExtractor'),
    )
    crf_extract = requests.get(
        'http://0:3000/api/v2/extractlines/extract',
        params=dict(extract_params, extractor_method='CrfExtractor'),
    )

    heur_scores = heur_extract.json()[u'scores']
    crf_scores = crf_extract.json()[u'scores']

    # Build the lookup once so the per-line membership test is O(1).
    included = set(included_line_numbers)

    h_t_ip = []
    c_t_ip = []

    for ln, hscore in enumerate(heur_scores):

        t = 1 if ln in included else 0

        h_t_ip.append((t, hscore[u'include_probability']))

        # Use the CRF score entry for this specific line; taking the whole
        # `scores` list here would make the key lookups below fail.
        cscore = crf_scores[ln]
        if u'auto_excluded' in cscore:
            c_t_ip.append((t, 0))
        else:
            c_t_ip.append((t, cscore[u'include_probability']))

    return h_t_ip, c_t_ip