In [1]:
import codecs, requests, nltk, json, string, cPickle as pickle, Levenshtein as L, random, collections
import pandas as pd
from string import punctuation
from connect import gkey
## Facilities
In [2]:
# Facility records exported from CTTI: pipe-delimited, no header row.
column_names = ["facility_id", "nct_id", "status", "facility_name",
                "city", "state", "zipcode", "country"]
# quoting=3 is csv.QUOTE_NONE: quote characters in facility names are literal data.
facilities = pd.read_csv('../data/facilities.txt', sep="|", names=column_names,
                         encoding='utf-8', quoting=3)
## Sponsors
In [3]:
# Sponsor records exported from CTTI: pipe-delimited, no header row.
column_names = ["sponsor_id", "nct_id", "sponsor_type", "agency", "agency_class"]
# quoting=3 is csv.QUOTE_NONE: quote characters in agency names are literal data.
sponsors = pd.read_csv('../data/sponsors.txt', sep="|", names=column_names,
                       encoding='utf-8', quoting=3)
In [43]:
# procedure to get data
def query_url(url, retries=5):
    '''
    Request a URL and return its JSON-decoded body, retrying on failure.

    url: full URL from which to request data
    retries: maximum number of attempts before giving up

    Returns the parsed JSON on a 200 response, or None once all
    attempts are exhausted (a message is printed in that case).
    '''
    last_status = None
    while retries > 0:
        try:
            r = requests.get(url)
        except Exception:
            # Network-level failure: there is no response object to inspect,
            # so count this attempt and try again. (Previously the exception
            # was swallowed and the loop went on to read a possibly-unbound
            # or stale `r`, and `retries` was never decremented.)
            print(' Got error querying %s. Retrying.' % url)
            retries -= 1
            continue
        if r.status_code == 200:
            return json.loads(r.text)
        last_status = r.status_code
        retries -= 1
    # if we've tried enough times, print problem and return nothing
    # (%s because last_status is None when every attempt raised before a response)
    print(' Unable to query %s. Status code %s.' % (url, last_status))
In [69]:
# procedure to get Freebase data for sponsors
def query_freebase(org):
    '''
    Search the Freebase API for a sponsor name.

    org: string representing name of sponsor in CTTI database

    Returns the parsed JSON search response from query_url, or None
    if the request ultimately failed.
    '''
    # remove any trailing parenthesized abbreviation, e.g. "Acme Corp (AC)"
    word_list = org.split()
    # guard against an empty/whitespace-only name (word_list[-1] would raise IndexError)
    if word_list and word_list[-1][0] == '(' and word_list[-1][-1] == ')':
        org_noabbr = ' '.join(word_list[:-1])
    else:
        org_noabbr = org
    # construct url and query
    # NOTE(review): org_noabbr is interpolated raw; names containing '&', '#',
    # or '%' should really be percent-encoded before going into the query string.
    url = 'https://www.googleapis.com/freebase/v1/search?query=%s&key=%s' % (org_noabbr, gkey)
    return query_url(url)
In [46]:
# Build (agency name, trial count) pairs ordered by descending frequency,
# and initialize the dictionary that will hold Freebase candidate matches.
agency_counts = dict(sponsors.agency.value_counts())
sponsor_list = sorted(agency_counts.items(), key=lambda pair: pair[1], reverse=True)
sponsor_possibilities = {}
In [140]:
# procedure to get Freebase entries for commonly appearing sponsors
for org, cnt in sponsor_list:
    # cnt >= 1 currently matches every sponsor (value_counts never yields 0);
    # it is kept as an easy-to-raise threshold for limiting to frequent sponsors.
    if cnt >= 1 and org not in sponsor_possibilities:
        r = query_freebase(org)
        # query_url returns None after exhausting retries; indexing it would
        # raise TypeError and abort the whole loop, so skip and report instead.
        if r is not None:
            sponsor_possibilities[org] = r['result']
        else:
            print('NO RESPONSE: %s' % org)
In [170]:
# number of sponsors with stored Freebase candidate results so far
len(sponsor_possibilities)
Out[170]:
In [145]:
# Persist candidate matches so the expensive Freebase queries need not be rerun.
# `with` guarantees the file handle is flushed and closed (the original
# open(...) was never closed, leaking the handle).
with open('../data/sponsor_guess.pickle', 'wb') as f:
    pickle.dump(sponsor_possibilities, f)
In [ ]: