In [1]:
import cPickle as pickle, codecs, requests, json, re
from bs4 import BeautifulSoup
from connect import mongoip, gkey

Load good institution data


In [2]:
# Load the matched-institution records. The with-block closes the file handle
# as soon as unpickling finishes instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that this project itself produced.
with open('../data/facility_match_good.pkl', 'rb') as pkl_file:
    sponsors_good = pickle.load(pkl_file)

Get Freebase summary, location, and Wikipedia image


In [3]:
# Helper: GET a URL and decode the JSON body, retrying on failure.
def query_url(url, param_data=None, retries=5):
    '''
    Request `url` and return its decoded JSON body.

    url: full URL from which to request data
    param_data: optional dict of query-string parameters passed to requests.get
    retries: maximum number of attempts before giving up

    Returns the parsed JSON of the first response with status code 200, or
    None when every attempt either raised a requests error or came back with
    a non-200 status. A 200 response whose body is not valid JSON still
    raises ValueError from json.loads.
    '''
    # Status of the most recent response; stays None if no request ever completed.
    last_status = None

    while retries > 0:
        # Every attempt consumes a retry, whether it fails by exception or by
        # status code, so the loop is guaranteed to terminate.
        retries -= 1
        try:
            # An empty/None param_data is sent as "no params", as before.
            r = requests.get(url, params=param_data or None)
        except requests.RequestException:
            # Single-argument print(...) behaves the same as the print statement
            # under Python 2.
            print('  Got error querying %s. Retrying.' % url)
            continue

        if r.status_code == 200:
            return json.loads(r.text)
        last_status = r.status_code

    # if we've tried enough times, print problem and return nothing
    # (%s rather than %d so that "no response at all" can be reported as None)
    print('  Unable to query %s. Status code %s.' % (url, last_status))
    return None

In [ ]:
# Freebase property ids. Each is used both as the `filter` value in the topic
# request and as the key under 'property' in the JSON that comes back.
desc = '/common/topic/description'
hq = '/organization/organization/headquarters'
city = '/location/mailing_address/citytown'
state = '/location/mailing_address/state_province_region'
country = '/location/mailing_address/country'
image = '/common/topic/image'
article = '/common/topic/article'


def _topic_url(mid, prop):
    '''Build the Freebase topic URL that asks only for property `prop` of topic `mid`.'''
    return 'https://www.googleapis.com/freebase/v1/topic/m/%s?filter=%s&key=%s' % (mid, prop, gkey)


def _topic_values(info, prop):
    '''Return info['property'][prop]['values'], or [] when the response or any level is missing.'''
    if not info:
        return []
    return info.get('property', {}).get(prop, {}).get('values', [])


def _first_text(props, key):
    '''Return the 'text' of the first value stored under `key` in `props`, or None.'''
    values = props.get(key, {}).get('values', [])
    if not values:
        return None
    return values[0].get('text')


def _headquarters_location(addr_info):
    '''Format the first headquarters address that has both a city and a country, or return None.'''
    for address in _topic_values(addr_info, hq):
        props = address.get('property', {})
        city_name = _first_text(props, city)
        country_name = _first_text(props, country)
        if not city_name or not country_name:
            continue
        # Non-US addresses are shown as "city, country".
        if country_name != 'United States of America':
            return '%s, %s' % (city_name, country_name)
        # US addresses are shown as "city, state", or just the city when no state is listed.
        state_name = _first_text(props, state)
        if state_name:
            return '%s, %s' % (city_name, state_name)
        return city_name
    return None


def _wikipedia_infobox_image(article_info):
    '''Return the URL of the first image in the topic's Wikipedia infobox, or None.'''
    articles = _topic_values(article_info, article)
    if not articles:
        return None
    uri = _first_text(articles[0].get('property', {}), '/common/document/source_uri')
    if not uri:
        return None

    # The last path segment of the source URI is used as the Wikipedia page id.
    wiki_id = uri.split('/')[-1]
    url = 'http://en.wikipedia.org/wiki/index.html?curid=%s' % wiki_id
    try:
        wiki_page = requests.get(url)
    except requests.RequestException:
        # A single unreachable page should not abort the whole enrichment run.
        print('  Got error querying %s. Skipping image.' % url)
        return None
    if not wiki_page:
        # A requests Response is falsy for 4xx/5xx status codes.
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(wiki_page.text, 'html.parser')
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        return None
    top_img = infobox.find('img')
    if top_img is None:
        return None
    src = top_img.get('src')
    if not src:
        return None
    # Only protocol-relative sources ("//host/path") need a scheme prepended.
    if src.startswith('//'):
        return 'http:%s' % src
    return src


# Fill in summary, headquarters location and image for every sponsor that does
# not have them yet. Fields already present are left alone, so the cell can be
# re-run after an interruption without repeating finished lookups.
# Single-argument print(...) behaves the same as the print statement under Python 2.
for s, sponsor in sponsors_good.items():
    # reinitialize variables
    summary = None
    loc = None
    image_url = None

    print(s)
    mid = sponsor['mid'].split('/')[-1]

    # summary from Freebase
    if 'summary' not in sponsor:
        descriptions = _topic_values(query_url(_topic_url(mid, desc)), desc)
        raw_summary = descriptions[0].get('value') if descriptions else None
        if raw_summary is not None:
            # Flatten newlines/tabs, then collapse runs of spaces into one.
            summary = raw_summary.replace('\n', ' ').replace('\t', ' ')
            summary = re.sub(' +', ' ', summary)
        sponsor['summary'] = summary
        print(summary)

    # city location from Freebase
    if 'geo' not in sponsor or 'loc' not in sponsor['geo']:
        loc = _headquarters_location(query_url(_topic_url(mid, hq)))
        if 'geo' not in sponsor:
            sponsor['geo'] = {}
        sponsor['geo']['loc'] = loc
        print(loc)

    # image url data
    if 'image' not in sponsor:
        images = _topic_values(query_url(_topic_url(mid, image)), image)
        if images and images[0].get('id'):
            image_url = 'https://www.googleapis.com/freebase/v1/image%s' % images[0]['id']
        else:
            # No Freebase image: fall back to the linked Wikipedia article's infobox.
            image_url = _wikipedia_infobox_image(query_url(_topic_url(mid, article)))
        sponsor['image'] = image_url
        print(image_url)

    # blank line between sponsors
    print('')

In [6]:
# Write sponsors_good back to the pickle it was loaded from. The with-block
# guarantees the data is flushed and the handle closed even if dump raises.
# NOTE(review): this overwrites the input file in place, so an interrupted
# write leaves a truncated pickle; keep a backup if the data is hard to rebuild.
with open('../data/facility_match_good.pkl', 'wb') as pkl_file:
    pickle.dump(sponsors_good, pkl_file)

In [ ]: