In [1]:
import codecs, requests, nltk, json, string, cPickle as pickle, Levenshtein as L, random, collections
import pandas as pd
from string import punctuation
from connect import gkey

Loading data

Facilities


In [2]:
# CTTI facilities table: pipe-delimited, no header row in the file.
# quoting=3 is csv.QUOTE_NONE — quote characters are treated literally.
column_names = [
    "facility_id",
    "nct_id",
    "status",
    "facility_name",
    "city",
    "state",
    "zipcode",
    "country",
]

facilities = pd.read_csv('../data/facilities.txt',
                         names=column_names,
                         sep="|",
                         encoding='utf-8',
                         quoting=3)

Sponsors


In [3]:
# CTTI sponsors table: same pipe-delimited, headerless format as facilities.
# quoting=3 is csv.QUOTE_NONE — quote characters are treated literally.
column_names = [
    "sponsor_id",
    "nct_id",
    "sponsor_type",
    "agency",
    "agency_class",
]

sponsors = pd.read_csv('../data/sponsors.txt',
                       names=column_names,
                       sep="|",
                       encoding='utf-8',
                       quoting=3)

Freebase lookups


In [43]:
# procedure to get data from a URL, with retries
def query_url(url, retries=5):
    '''
    url: full URL from which to request data
    retries: maximum number of attempts before giving up

    Returns the JSON-decoded response body on HTTP 200, or None if
    every attempt fails (connection error or non-200 status).
    '''
    r = None
    while retries > 0:
        retries -= 1  # count every attempt, including connection errors
        try:
            r = requests.get(url)
        except requests.exceptions.RequestException:
            # NOTE: the original bare `except: pass` left `r` unbound on a
            # first-attempt failure (NameError below) and never decremented
            # retries, so a persistent connection error looped forever.
            print('  Got error querying %s. Retrying.' % url)
            continue
        if r.status_code == 200:
            return json.loads(r.text)

    # exhausted retries: report the problem and implicitly return None
    if r is not None:
        print('  Unable to query %s. Status code %d.' % (url, r.status_code))
    else:
        print('  Unable to query %s. No response received.' % url)

In [69]:
# procedure to get Freebase data for sponsors
def query_freebase(org):
    '''
    org: string representing name of sponsor in CTTI database

    Returns the parsed JSON search response from query_url
    (None when the request ultimately fails).
    '''
    # remove any trailing parenthesized abbreviation, e.g. "Acme Corp (AC)"
    word_list = org.split()
    # guard against empty/whitespace-only names: the original indexed
    # word_list[-1] unconditionally and raised IndexError on them
    if word_list and word_list[-1].startswith('(') and word_list[-1].endswith(')'):
        org_noabbr = ' '.join(word_list[:-1])
    else:
        org_noabbr = org

    # percent-encode the name so characters like '&', '#', '+' cannot
    # corrupt the query string (the original interpolated it raw)
    try:
        from urllib.parse import quote  # Python 3
    except ImportError:
        from urllib import quote        # Python 2
    query = quote(org_noabbr.encode('utf-8'))

    # construct url and query
    url = ('https://www.googleapis.com/freebase/v1/search?query=%s&key=%s'
           % (query, gkey))
    return query_url(url)

In [46]:
# Build a list of (sponsor name, trial count) pairs, most frequent first,
# and an empty dict to accumulate Freebase lookup results by sponsor name.
sponsor_counts = sponsors.agency.value_counts().to_dict()
sponsor_list = sorted(sponsor_counts.items(), key=lambda kv: kv[1], reverse=True)
sponsor_possibilities = {}

In [140]:
# Look up Freebase entries for sponsors not already resolved.
# NOTE(review): with the cnt >= 1 threshold this queries every sponsor
# (value_counts never yields 0); raise the threshold to restrict the
# lookup to frequently appearing sponsors.
for org, cnt in sponsor_list:
    if cnt >= 1 and org not in sponsor_possibilities:
        r = query_freebase(org)
        # query_url returns None once its retries are exhausted; the
        # original indexed r['result'] unconditionally, so one dead
        # lookup crashed this long-running (~30k iteration) loop.
        if r is None or 'result' not in r:
            continue
        sponsor_possibilities[org] = r['result']

In [170]:
len(sponsor_possibilities)


Out[170]:
29989

In [145]:
pickle.dump(sponsor_possibilities,open('../data/sponsor_guess.pickle','wb'))

In [ ]: