In [2]:
import requests
import json
import prettytable
import csv
import codecs

In [3]:
from bs4 import BeautifulSoup
import requests

In [4]:
url = 'http://www.nobelprize.org/nobel_prizes/lists/universities.html'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
place_acquired = soup.find_all(name="div", attrs={"class": "by_year"})
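
A quick look at one of the parsed `by_year` blocks (the exact markup depends on the live page) shows the h3/h6/a structure that the parsing functions below rely on:

In [ ]:
# Peek at the first parsed block (output depends on the live page).
print place_acquired[0].prettify()[:500]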

In [7]:
def grab_field_and_number(string):
    '''
    >>> grab_field_and_number("The Nobel Prize in Physics 2000")
    ('2000', 'Physics')

    >>> grab_field_and_number("The Prize in Economic Sciences 2010")
    ('2010', 'Economic Sciences')

    >>> grab_field_and_number("The Nobel Prize in Physiology or Medicine 2000")
    ('2000', 'Physiology or Medicine')

    >>> grab_field_and_number("The Nobel Peace Prize 2010")
    ('2010', 'Peace')
    '''
    temp_string = string.split()
    year = temp_string.pop()
    if "Economic" in string:
        # Two-word field: "Economic Sciences"
        field = temp_string[-2] + " " + temp_string[-1]
    elif "Physiology or Medicine" in string:
        # Three-word field: "Physiology or Medicine"
        field = temp_string[-3] + " " + temp_string[-2] + " " + temp_string[-1]
    elif "Peace" in string:
        # The Peace title ends in "Peace Prize", so the field is second-to-last
        field = temp_string[-2]
    else:
        # Physics, Chemistry, and Literature all end with a one-word field
        field = temp_string[-1]
    return year, field

In [8]:
grab_field_and_number("The Nobel in Peace Prize 2010")


Out[8]:
('2010', 'Peace')

In [9]:
#grab_field_and_number("The Nobel Prize in Physics 2000")

In [10]:
#grab_field_and_number("The Prize in Economic Sciences 2010")

In [11]:
#grab_field_and_number("The Nobel Prize in Physiology or Medicine 2000")

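The commented-out checks above can also be run in one pass with the standard library's doctest module against the examples in grab_field_and_number's docstring:

In [ ]:
# Run the >>> examples from the docstring as doctests.
import doctest
doctest.run_docstring_examples(grab_field_and_number, globals(), verbose=True)
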
In [12]:
def grab_inst_country_citystate(string):
    '''
    >>> grab_inst_country_citystate("Edinburgh University, Edinburgh, United Kingdom")
    ('Edinburgh University', ' United Kingdom', ' Edinburgh', '', '')

    >>> grab_inst_country_citystate("Fred Hutchinson Cancer Research Center, Seattle, WA, USA")
    ('Fred Hutchinson Cancer Research Center', ' USA', ' WA', ' Seattle', '')

    >>> grab_inst_country_citystate("Columbia University Division, Cardio-Pulmonary Laboratory, Bellevue Hospital, New York, NY, USA")
    ('Columbia University Division', ' USA', ' NY', ' New York', ' Cardio-Pulmonary Laboratory,  Bellevue Hospital')
    '''
    pieces = string.split(",")
    # The institution is always the first comma-separated piece and the country the
    # last; everything in between is city/state/extra location detail.
    # Note: for US-style strings the last middle piece is the state abbreviation, so
    # the 'city' slot ends up holding " NY"/" WA", as the examples above show.
    institution = pieces[0]
    country = pieces[-1]
    city_state = pieces[1:-1]
    city, state, extra_loc = grab_city_state(city_state)
    return institution, country, city, state, extra_loc

In [13]:
#grab_inst_country_citystate("Edinburgh University, Edinburgh, United Kingdom")

In [14]:
#grab_inst_country_citystate("Fred Hutchinson Cancer Research Center, Seattle, WA, USA")

In [15]:
#grab_inst_country_citystate("Columbia University Division, Cardio-Pulmonary Laboratory, Bellevue Hospital, New York, NY, USA")

In [16]:
def grab_city_state(a_list):
    '''
    >>> grab_city_state(["Cardio-Pulmonary Laboratory", "Bellevue Hospital", "New York", "NY"])
    ('NY', 'New York', 'Cardio-Pulmonary Laboratory, Bellevue Hospital')

    >>> grab_city_state(["Bellevue Hospital", "New York", "NY"])
    ('NY', 'New York', 'Bellevue Hospital')

    >>> grab_city_state(['New York', 'NY'])
    ('NY', 'New York', '')

    >>> grab_city_state(['New York'])
    ('New York', '', '')
    '''
    # The last piece fills the "city" slot (for US addresses this is the state
    # abbreviation, as the examples above show), the next-to-last fills "state",
    # and anything left over is joined into "other".
    city = a_list.pop()
    state = ""
    other = ""
    if len(a_list) >= 1:
        state = a_list.pop()
        other = ", ".join(a_list)
    return city, state, other

In [17]:
#grab_city_state(["Cardio-Pulmonary Laboratory", "Bellevue Hospital", "New York", "NY"])

In [18]:
#grab_city_state(["Bellevue Hospital", "New York", "NY"])

In [19]:
#grab_city_state(['New York', 'NY'])

In [20]:
#grab_city_state(['New York'])

In [223]:
def separate_old_country_names(country):
    '''
    Split strings of the form "Old Name (now New Name)" into old and new names;
    if there is no "(now ...)" suffix, return the same name for both.
    '''
    old = ""
    new = ""
    if " (now " in country:
        old_and_new = country.split(' (now ')
        old = old_and_new[0]
        new = old_and_new[1][:-1]  # drop the trailing ")"
    else:
        old = country
        new = country
    return old, new
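
A quick check of the splitter (the input string is a hypothetical example of the site's "Old Name (now New Name)" format):

In [ ]:
# Hypothetical example in the "Old Name (now New Name)" format used on the site.
separate_old_country_names("Austria-Hungary (now Austria)")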

In [274]:
def find_country_acq(bs4_html):
    all_names = [["name", "institution",
                  "old_country_name_acquired","current_country_name_acquired",
                  "city","state","year","field"]]
    place_acq = ""
    for i in bs4_html:
        # Only place-acquired entries have an 'h3' sub-class; remember the most
        # recent one so it can be attached to the laureate entries that follow.
        if i.find_all('h3'):
            place_acq = i.h3.text
        # Only field_year/name entries have an 'a' link and an 'h6' sub-class.
        if i.find_all('a'):
            field_year = i.a.text
            name = i.h6.text
            year, field = grab_field_and_number(field_year)
            institution, country, city, state, extra_loc = grab_inst_country_citystate(place_acq)
            
            old_country_name, new_country_name = separate_old_country_names(country)
            
            all_names.append([name.encode('utf-8'),
                              institution.encode('utf-8'),
                              old_country_name.encode('utf-8'),
                              new_country_name.encode('utf-8'),
                              city.encode('utf-8'), 
                              state.encode('utf-8'),
                              year.encode('utf-8'),
                              field.encode('utf-8')])
            
    return all_names

In [275]:
len(find_country_acq(place_acquired))


Out[275]:
698

In [23]:
url = 'http://www.nobelprize.org/nobel_prizes/lists/countries.html'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
birth_html = soup.find_all(name="div", attrs={"class": "by_year"})

In [283]:
def find_country_birth(bs4_html):
    all_names = [["name","birth_country_old_name",
                  "birth_country_current_name",
                  "year","field"]]
    place_acq = ""
    for i in bs4_html:
        # Only place acquired entries have an 'h3' sub-class
        if i.find_all('h3'):
            place_acq = i.h3.text
        # Only field_year/name entries have an 'h6' sub-class.
        if i.find_all('h6'):
            field_year = i.a.text
            name = i.h6.text
            year, field = grab_field_and_number(field_year)
            old_country_name, new_country_name = separate_old_country_names(place_acq)
            
            all_names.append([name.encode('utf-8'), 
                              old_country_name.encode('utf-8'),
                              new_country_name.encode('utf-8'),
                              year.encode('utf-8'),
                              field.encode('utf-8')])
            
    return all_names

In [229]:
len(find_country_birth(birth_html))


Out[229]:
865

In [26]:
url = 'http://www.nobelprize.org/nobel_prizes/lists/age.html'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
age_html = soup.find_all(name="div", attrs={"class": "large-12 columns"})

In [85]:
def find_age(bs4_html):
    all_names = [["name", "age"]]
    age = ""
    # Index 6 of the parsed divs is the block that holds the age list; 'h3' headers
    # read "Age NN" and the 'h6' entries that follow are the laureates of that age.
    for i in bs4_html[6].find_all(['h3', 'h6']):
        if "Age" in i.string:
            age = i.string.split()[-1]
        else:
            name = i.string
            all_names.append([name.encode('utf-8'), age.encode('utf-8')])
    return all_names

In [86]:
len(find_age(age_html))


Out[86]:
865

In [88]:
nobel_ages = find_age(age_html)

In [89]:
with open('nobel_ages.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(nobel_ages)

In [226]:
country_acquired = find_country_acq(place_acquired)

In [111]:
#country_acquired

In [119]:
#country_birth

In [314]:
import pandas as pd

country_birth = find_country_birth(birth_html)
headers = country_birth.pop(0)
df = pd.DataFrame(country_birth, columns=headers)
df.head()


Out[314]:
name birth_country_old_name birth_country_current_name year field
0 Bernardo Houssay Argentina Argentina 1947 Physiology or Medicine
1 César Milstein Argentina Argentina 1984 Physiology or Medicine
2 Carlos Saavedra Lamas Argentina Argentina 1936 Peace
3 Adolfo Pérez Esquivel Argentina Argentina 1980 Peace
4 Lawrence Bragg Australia Australia 1915 Physics

In [236]:
countries = list(set(df.birth_country_current_name))

In [238]:
google_api_key = "AIzaSyDAJxRxTE-ZC5M7qGN5Bg_FXwgc5e_TqdU"

In [239]:
def lookup_lat_lon(city="", state="", country="", key=""):
    return "https://maps.googleapis.com/maps/api/geocode/json?"+"address="+country+"&key="+key

In [288]:
lookup_lat_lon(country=countries[38], key=google_api_key)


Out[288]:
"https://maps.googleapis.com/maps/api/geocode/json?address=People's Republic of China&key=AIzaSyDAJxRxTE-ZC5M7qGN5Bg_FXwgc5e_TqdU"

In [289]:
url2 = lookup_lat_lon(country=countries[38], key=google_api_key)

In [290]:
r2 = requests.get(url2)

In [291]:
country_json = r2.json()

In [292]:
birth_lat = country_json['results'][0]['geometry']['location']['lat']
birth_lon = country_json['results'][0]['geometry']['location']['lng']
birth_country_long_name = country_json['results'][0]['address_components'][0]['long_name']
birth_country_short_name = country_json['results'][0]['address_components'][0]['short_name']

In [296]:
print birth_lat
print birth_lon
#birth_country_long_name


6.9044087
79.8714212

In [295]:
#country_json

In [315]:
def get_long_lat(country_list, birth_countries=True):
    
    output = [['birth_lat', 
               'birth_lon', 
               'birth_country_current_name',
               'birth_country_short_name']]
    if birth_countries == False:
        output = [['acquired_lat', 
                   'acquired_lon', 
                   'current_country_name_acquired',
                   'acquired_country_short_name']]
    # https://console.developers.google.com
    # https://developers.google.com/maps/documentation/geocoding/?csw=1
    google_api_key = "AIzaSyDAJxRxTE-ZC5M7qGN5Bg_FXwgc5e_TqdU"
    
    for each_country in country_list:
        url = lookup_lat_lon(country=each_country, key=google_api_key)
        r = requests.get(url)
        country_json = r.json()
        lat = country_json['results'][0]['geometry']['location']['lat']
        lon = country_json['results'][0]['geometry']['location']['lng']
        # Keep the country name as it appears in the Nobel list (rather than the
        # API's long_name) so it lines up with the DataFrame for merging later.
        country_long_name = each_country
        country_short_name = country_json['results'][0]['address_components'][0]['short_name']
        
        output.append([lat,
                       lon,
                       country_long_name,
                       country_short_name])
    return output
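
The Geocoding API returns an empty results list (status ZERO_RESULTS) when it cannot match a name, which would make the indexing inside get_long_lat raise an IndexError; a minimal defensive sketch of that extraction step:

In [ ]:
# Defensive sketch of the extraction step in get_long_lat: return Nones when the
# geocoder finds no match instead of raising an IndexError.
def safe_extract(country_json):
    results = country_json.get('results', [])
    if not results:
        return None, None, None
    location = results[0]['geometry']['location']
    short_name = results[0]['address_components'][0]['short_name']
    return location['lat'], location['lng'], short_name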

In [307]:
# Get the lat/lon from the Google API!
lat_lon_birth_countries = get_long_lat(countries, birth_countries=True)

In [308]:
headers = lat_lon_birth_countries.pop(0)
birth_countries_df = pd.DataFrame(lat_lon_birth_countries, columns=headers)

In [313]:
birth_countries_df.head()


Out[313]:
birth_lat birth_lon birth_country_current_name birth_country_short_name
0 -14.235004 -51.925280 Brazil BR
1 56.130366 -106.346771 Canada CA
2 -18.766947 46.869107 Madagascar MG
3 41.608635 21.745275 Republic of Macedonia MK
4 41.871940 12.567380 Italy IT

In [312]:
df = pd.merge(df, birth_countries_df)
df.head()


Out[312]:
name birth_country_old_name birth_country_current_name year field birth_lat birth_lon birth_country_short_name
0 Bernardo Houssay Argentina Argentina 1947 Physiology or Medicine -38.416097 -63.616672 AR
1 César Milstein Argentina Argentina 1984 Physiology or Medicine -38.416097 -63.616672 AR
2 Carlos Saavedra Lamas Argentina Argentina 1936 Peace -38.416097 -63.616672 AR
3 Adolfo Pérez Esquivel Argentina Argentina 1980 Peace -38.416097 -63.616672 AR
4 Lawrence Bragg Australia Australia 1915 Physics -25.274398 133.775136 AU

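Finally, a sketch for saving the merged frame to disk; the filename is hypothetical, mirroring the nobel_ages.csv export above:

In [ ]:
# Save the merged birth-country data (hypothetical filename).
df.to_csv('nobel_birth_countries.csv', index=False, encoding='utf-8')
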
In [ ]: