In [3]:
import cPickle as pickle
import pandas as pd
from sqlalchemy import create_engine
import db_connect
from collections import defaultdict, Counter

from sqlalchemy.sql import func, select, and_, or_, not_, desc
from db_tables import metadata, TrialPublications

Trial scores


In [34]:
# Load the pickled trial_scores mapping. The `with` block closes the file
# handle as soon as unpickling finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open('trial_scores.pkl', 'rb') as scores_file:
    trial_scores = pickle.load(scores_file)

In [35]:
# Tally how many entries of trial_scores carry each distinct score value;
# as the cell's last expression, the resulting Counter is displayed as output.
Counter(trial_scores.values())


Out[35]:
Counter({4: 15604, 5: 831, 6: 227, 7: 20, 8: 7, 9: 1})

In [11]:
# Load the pickled link mapping. The `with` block closes the file handle as
# soon as unpickling finishes instead of leaving it open.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open('pubmed_trial_links.pkl', 'rb') as links_file:
    trial_links = pickle.load(links_file)

# Keep only the entries whose value is not the empty string.
trial_links = dict(
    (key, linked) for key, linked in trial_links.items() if linked != ''
)

Assign a confidence score to every matched publication. Publications that are explicitly linked to a trial get a confidence of 1.0; the remaining matches are graded as Likely (0.9), Probable (0.6), or Possible (0.3).


In [41]:
# Connect to the MySQL database through the PyMySQL driver; the remainder of
# the URL comes from db_connect.conn.
db_url = 'mysql+pymysql://' + db_connect.conn
engine = create_engine(db_url)

# Emit CREATE statements for the tables registered on `metadata`
# (SQLAlchemy checks for existing tables first by default).
metadata.create_all(engine)

In [42]:
# Fetch every row of the publication table and keep the first six columns of
# each row as a plain tuple, so the rows can be rebuilt with new scores later.
publication_rows = engine.execute(select([TrialPublications]))
trial_pubs_table = [
    tuple(row[col] for col in range(6))
    for row in publication_rows
]

In [63]:
# Explicitly linked (value, key) pairs taken from trial_links, stored as a set
# so that each membership test below is a hash lookup. The previous list built
# by zip() turned every test into a linear scan (rows x links comparisons), and
# under Python 3 zip() is a one-shot iterator that the first `in` test would
# silently exhaust.
# NOTE(review): assumes the values of trial_links are hashable (they are
# compared against '' when the mapping is loaded) -- confirm.
trial_link_lookup = set((linked, key) for key, linked in trial_links.items())

# Raw trial_scores value -> confidence. Any score other than 4 or 5 falls
# through to 0.9 via the .get() default below.
score_to_confidence = {4: 0.3, 5: 0.6}

# Rebuild the publication table: the first five columns of each row are kept
# and the sixth column (index 5) is replaced by the new confidence.
new_pub_table = []
for trial in trial_pubs_table:
    pair = (trial[0], trial[1])
    if pair in trial_link_lookup:
        # Explicitly linked pair: full confidence.
        confidence = 1.0
    elif pair in trial_scores:
        # Scored match: translate the raw score into a confidence tier.
        confidence = score_to_confidence.get(trial_scores[pair], 0.9)
    elif trial[5] == 0.0:
        # Neither linked nor scored, and the stored confidence is 0.0:
        # raise it to the lowest tier.
        confidence = 0.3
    else:
        # Nothing to update: carry the row over unchanged.
        new_pub_table.append(trial)
        continue
    new_pub_table.append(trial[:5] + (confidence,))

In [64]:
import csv

# Write the rebuilt publication table as tab-separated values.
# The file is opened in binary mode because Python 2's csv module requires it:
# in text mode the dialect's '\r\n' line terminator is translated to '\r\r\n'
# on Windows, which shows up as a blank line after every row.
# The handle is named `out_file` so it does not shadow the Python 2 builtin `file`.
with open('data/new_pub_table.txt', 'wb') as out_file:
    writer = csv.writer(out_file, dialect='excel-tab')
    writer.writerows(new_pub_table)

In [ ]: