In [1]:
import cPickle as pickle, codecs, requests, json, re
from bs4 import BeautifulSoup
from connect import mongoip, gkey
In [2]:
# Load previously matched sponsor records keyed by sponsor name.
# Use a context manager so the file handle is closed deterministically
# (the original left the handle open until GC).
# NOTE: pickle.load runs arbitrary code for untrusted input; this file is
# local project data we wrote ourselves, so that risk is accepted here.
with open('../data/facility_match_good.pkl', 'rb') as pkl_in:
    sponsors_good = pickle.load(pkl_in)
In [3]:
# procedure get data
def query_url(url, param_data=None, retries=5):
'''
url: full URL from which to request data
retries: maximum number of retries before moving on
'''
while retries > 0:
try:
if param_data:
r = requests.get(url, params=param_data)
else:
r = requests.get(url)
except:
print ' Got error querying %s. Retrying.' % url
pass
if r.status_code == 200:
return json.loads(r.text)
else:
retries -= 1
# if we've tried enough times, print problem and return nothing
print ' Unable to query %s. Status code %d.' % (url, r.status_code)
return None
In [ ]:
# Enrich each matched sponsor in sponsors_good (in place) with a text summary,
# a human-readable location string, and an image URL, pulled from the Freebase
# topic API, with a Wikipedia-infobox fallback for images.
# NOTE(review): the notebook export stripped this cell's indentation; the
# nesting must be restored before the cell can execute.
# NOTE(review): Google retired the Freebase API — these endpoints presumably
# no longer respond; confirm before re-running.
# Freebase property paths, used as ?filter= values in the topic API calls.
desc = '/common/topic/description'
hq = '/organization/organization/headquarters'
city = '/location/mailing_address/citytown'
state = '/location/mailing_address/state_province_region'
country = '/location/mailing_address/country'
image = '/common/topic/image'
article = '/common/topic/article'
for s in sponsors_good.keys():
# reinitialize variables
summary = None
loc = None
image_url = None
print s
# stored mid looks like '/m/xxxxx'; keep only the id segment after the slash
mid = sponsors_good[s]['mid'].split('/')[-1]
# summary from Freebase (skipped if already cached on the record)
if 'summary' not in sponsors_good[s]:
url = 'https://www.googleapis.com/freebase/v1/topic/m/%s?filter=%s&key=%s' % (mid, desc, gkey)
summary_info = query_url(url)
if summary_info and 'property' in summary_info:
# flatten newlines/tabs to spaces, then collapse runs of spaces to one
summary = summary_info['property'][desc]['values'][0]['value'].replace('\n',' ').replace('\t',' ')
summary = re.sub(' +',' ',summary)
else:
summary = None
sponsors_good[s]['summary'] = summary
print summary
# city location from Freebase (skipped if geo/loc already cached)
if 'geo' not in sponsors_good[s] or 'loc' not in sponsors_good[s]['geo']:
url = 'https://www.googleapis.com/freebase/v1/topic/m/%s?filter=%s&key=%s' % (mid, hq, gkey)
addr_info = query_url(url)
if addr_info and 'property' in addr_info:
# scan headquarters entries; the `not loc` guard keeps the first entry
# that has both a non-empty city and a non-empty country
for v in addr_info['property'][hq]['values']:
if not loc and country in v['property'] and len(v['property'][country]['values']) > 0 and \
city in v['property'] and len(v['property'][city]['values']) > 0:
cur_country = v['property'][country]
cur_city = v['property'][city]
if state in v['property']:
cur_state = v['property'][state]
if len(cur_country['values']) > 0:
# US addresses render as "City, State"; others as "City, Country"
if cur_country['values'][0]['text'] == 'United States of America':
if state in v['property'] and len(cur_state['values']) > 0:
loc = '%s, %s' % (v['property'][city]['values'][0]['text'],
v['property'][state]['values'][0]['text'])
else:
# US entry with no state: fall back to the city alone
loc = cur_city['values'][0]['text']
else:
loc = '%s, %s' % (v['property'][city]['values'][0]['text'],
v['property'][country]['values'][0]['text'])
else:
loc = None
# lazily create the geo sub-dict before caching the location string
if 'geo' not in sponsors_good[s]: sponsors_good[s]['geo'] = {}
sponsors_good[s]['geo']['loc'] = loc
print loc
# image url data (skipped if already cached on the record)
if 'image' not in sponsors_good[s]:
image_url = None
url = 'https://www.googleapis.com/freebase/v1/topic/m/%s?filter=%s&key=%s' % (mid, image, gkey)
image_info = query_url(url)
if image_info and 'property' in image_info and len(image_info['property'][image]['values']) > 0:
# Freebase serves the image itself; build the direct URL from its id
image_url = 'https://www.googleapis.com/freebase/v1/image%s' % image_info['property'][image]['values'][0]['id']
else:
# fallback: locate the topic's source article, then scrape the first
# image out of the Wikipedia infobox table
url = 'https://www.googleapis.com/freebase/v1/topic/m/%s?filter=%s&key=%s' % (mid, article, gkey)
article_info = query_url(url)
if article_info and 'property' in article_info:
uri_pot = article_info['property'][article]['values'][0]['property']
if '/common/document/source_uri' in uri_pot:
uri= uri_pot['/common/document/source_uri']['values'][0]['text']
# last path segment of the source URI is used as the Wikipedia
# curid — NOTE(review): assumed, not verified against the API
wiki_id = uri.split('/')[-1]
url = 'http://en.wikipedia.org/wiki/index.html?curid=%s' % wiki_id
wiki_page = requests.get(url)
if wiki_page:
soup = BeautifulSoup(wiki_page.text)
infobox = soup.find('table', {'class': 'infobox'})
if infobox:
top_img = infobox.find('img')
if top_img:
# img src is protocol-relative ('//...'); prepend the scheme
image_url = 'http:%s' % top_img['src']
sponsors_good[s]['image'] = image_url
print image_url
print
In [6]:
# Persist the enriched records back to the same pickle file.
# Use a context manager so the handle is flushed and closed even on error
# (the original left the write handle open, risking a truncated file).
with open('../data/facility_match_good.pkl', 'wb') as pkl_out:
    pickle.dump(sponsors_good, pkl_out)
In [ ]: