In [97]:
import requests
import json
import prettytable
import csv
import codecs

In [ ]:


In [108]:
from bs4 import BeautifulSoup
import requests

In [111]:
url = 'http://www.nobelprize.org/nobel_prizes/lists/universities.html'
r = requests.get(url)
soup = BeautifulSoup(r.text, from_encoding=r.encoding)
place_acquired = soup.find_all(name="div", attrs={"class": "by_year"})

In [113]:
place_acquired
#soup = unicode(soup)
#soup = soup.encode('ascii', 'ignore')


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-113-3e8953726478> in <module>()
----> 1 place_acquired
      2 #soup = unicode(soup)
      3 #soup = soup.encode('ascii', 'ignore')

c:\Anaconda\lib\site-packages\IPython\core\displayhook.pyc in __call__(self, result)
    236                 self.write_format_data(format_dict, md_dict)
    237                 self.log_output(format_dict)
--> 238             self.finish_displayhook()
    239 
    240     def cull_cache(self):

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\displayhook.pyc in finish_displayhook(self)
     70         sys.stderr.flush()
     71         if self.msg['content']['data']:
---> 72             self.session.send(self.pub_socket, self.msg, ident=self.topic)
     73         self.msg = None
     74 

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\session.pyc in send(self, stream, msg_or_type, content, parent, ident, buffers, track, header, metadata)
    647         if self.adapt_version:
    648             msg = adapt(msg, self.adapt_version)
--> 649         to_send = self.serialize(msg, ident)
    650         to_send.extend(buffers)
    651         longest = max([ len(s) for s in to_send ])

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\session.pyc in serialize(self, msg, ident)
    551             content = self.none
    552         elif isinstance(content, dict):
--> 553             content = self.pack(content)
    554         elif isinstance(content, bytes):
    555             # content is already packed, as in a relayed message

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\session.pyc in <lambda>(obj)
     83 # disallow nan, because it's not actually valid JSON
     84 json_packer = lambda obj: jsonapi.dumps(obj, default=date_default,
---> 85     ensure_ascii=False, allow_nan=False,
     86 )
     87 json_unpacker = lambda s: jsonapi.loads(s)

c:\Anaconda\lib\site-packages\zmq\utils\jsonapi.pyc in dumps(o, **kwargs)
     38         kwargs['separators'] = (',', ':')
     39 
---> 40     s = jsonmod.dumps(o, **kwargs)
     41 
     42     if isinstance(s, unicode):

c:\Anaconda\lib\json\__init__.pyc in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, encoding, default, sort_keys, **kw)
    248         check_circular=check_circular, allow_nan=allow_nan, indent=indent,
    249         separators=separators, encoding=encoding, default=default,
--> 250         sort_keys=sort_keys, **kw).encode(obj)
    251 
    252 

c:\Anaconda\lib\json\encoder.pyc in encode(self, o)
    208         if not isinstance(chunks, (list, tuple)):
    209             chunks = list(chunks)
--> 210         return ''.join(chunks)
    211 
    212     def iterencode(self, o, _one_shot=False):

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 8957: ordinal not in range(128)

In [6]:
def grab_field_and_number(string):
    '''
    >>>grab_field_and_number("The Nobel Prize in Physics 2000")
    ('2000', 'Physics')
    
    >>>grab_field_and_number("The Prize in Economic Sciences 2010")
    ('2010', 'Economic Sciences')
    
    >>>grab_field_and_number("The Nobel Prize in Physiology or Medicine 2000")
    ('2000', 'Physiology or Medicine')
    
    >>>grab_field_and_number("The Nobel in Peace Prize 2010")
    ('2010', 'Peace')
    '''
    
    if "Economic" in string:
        temp_string = string.split()
        year = temp_string.pop()
        field = temp_string[-2] + " " + temp_string[-1]
    elif "Physiology or Medicine" in string:
        temp_string = string.split()
        year = temp_string.pop()
        field = temp_string[-3] + " " + temp_string[-2] + " " + temp_string[-1]
    elif "Peace" in string:
        temp_string = string.split()
        year = temp_string.pop()
        field = temp_string[-2]
    else:
        temp_string = string.split()
        year = temp_string.pop()
        field = temp_string[-1]
    return year, field

In [7]:
grab_field_and_number("The Nobel in Peace Prize 2010")


Out[7]:
('2010', 'Peace')

In [8]:
#grab_field_and_number("The Nobel Prize in Physics 2000")

In [9]:
#grab_field_and_number("The Prize in Economic Sciences 2010")

In [10]:
#grab_field_and_number("The Nobel Prize in Physiology or Medicine 2000")

In [11]:
def grab_inst_country_citystate(string):
    '''
    >>>grab_inst_citystate_country("Edinburgh University, Edinburgh, United Kingdom")
    ('Edinburgh University', ' United Kingdom', ' Edinburgh', '', '')
    
    >>>grab_inst_country_citystate("Fred Hutchinson Cancer Research Center, Seattle, WA, USA")
    ('Fred Hutchinson Cancer Research Center', ' USA', ' WA', ' Seattle', '')
    
    >>>grab_inst_country_citystate("Columbia University Division, Cardio-Pulmonary Laboratory, Bellevue Hospital, New York, NY, USA")
    ('Columbia University Division',
 ' USA',
 ' NY',
 ' New York',
 ' Cardio-Pulmonary Laboratory,  Bellevue Hospital')
    '''
    pieces = string.split(",")
    institution = pieces[0]
    country = pieces[-1]
    city_state = pieces[1:-1]
    city, state, extra_loc = grab_city_state(city_state)
    return institution, country, city, state, extra_loc

In [12]:
#grab_inst_citystate_country("Edinburgh University, Edinburgh, United Kingdom")

In [13]:
#grab_inst_country_citystate("Fred Hutchinson Cancer Research Center, Seattle, WA, USA")

In [14]:
#grab_inst_country_citystate("Columbia University Division, Cardio-Pulmonary Laboratory, Bellevue Hospital, New York, NY, USA")

In [15]:
def grab_city_state(a_list):
    '''
    >>>grab_city_state(["Cardio-Pulmonary Laboratory", "Bellevue Hospital", "New York", "NY"])
    ('NY', 'New York', 'Cardio-Pulmonary Laboratory, Bellevue Hospital')
    
    >>>grab_city_state(["Bellevue Hospital", "New York", "NY"])
    ('NY', 'New York', 'Bellevue Hospital')
    
    >>>grab_city_state(['New York', 'NY'])
    grab_city_state(['New York', 'NY'])
    
    >>>grab_city_state(['New York'])
    ('New York', '', '')    
    '''
    city = a_list.pop()
    state = ""    
    other = ""
    if len(a_list) >= 1:
        state = a_list.pop()
        other = ", ".join(a_list)
    return city, state, other

In [16]:
#grab_city_state(["Cardio-Pulmonary Laboratory", "Bellevue Hospital", "New York", "NY"])

In [17]:
#grab_city_state(["Bellevue Hospital", "New York", "NY"])

In [18]:
#grab_city_state(['New York', 'NY'])

In [19]:
#grab_city_state(['New York'])

In [ ]:


In [20]:
def separate_old_country_names(country):
    old = ""
    new = ""
    if " (now " in country:
        old_and_new = country.split(' (now ')
        old = old_and_new[0]
        new = old_and_new[1][:-1]
    else:
        old = country
        new = country
    return old, new

In [114]:
def find_country_acq(bs4_html):
    all_names = [["name", "institution",
                  "old_country_name_acquired","current_country_name_acquired",
                  "city","state","year","field"]]
    place_acq = ""
    for i in bs4_html:
        #pprint.pprint(i) 
        #print "*"*80
        #print i
        if i.find_all('h3'):
            #print "i.TEXT: ", i.text
            place_acq = i.h3.text
        if i.find_all('a'):
            #print ""
            #print "i.a.TEXT: ", i.a.text
            #print "i.h6.TEXT: ", i.h6.text
            #print "PLACE_ACQ: ", place_acq
            #print "field_year: ", field_year
            field_year = i.a.text
            name = i.h6.text
            year, field = grab_field_and_number(field_year)
            institution, country, city, state, extra_loc = grab_inst_country_citystate(place_acq)
            
            old_country_name, new_country_name = separate_old_country_names(country)
            
            all_names.append([name.encode('utf-8'),
                              institution.encode('utf-8'),
                              old_country_name.encode('utf-8'),
                              new_country_name.encode('utf-8'),
                              city.encode('utf-8'), 
                              state.encode('utf-8'),
                              year.encode('utf-8'),
                              field.encode('utf-8')])
            
            #print ""
            #print "*"*80
    return all_names

In [117]:
len(find_country_acq(place_acquired))


Out[117]:
698

In [ ]:


In [23]:
url = 'http://www.nobelprize.org/nobel_prizes/lists/countries.html'
r = requests.get(url)
soup = BeautifulSoup(r.text)
birth_html = soup.find_all(name="div", attrs={"class": "by_year"})

In [ ]:


In [24]:
def find_country_birth(bs4_html):
    all_names = [["name","birth_country_old_name",
                  "birth_country_current_name",
                  "year","field"]]
    place_acq = ""
    for i in bs4_html:
        # Only place acquired entries have an 'h3' sub-class
        if i.find_all('h3'):
            place_acq = i.h3.text
        # Only field_year/name entries have an 'h6' sub-class.
        if i.find_all('h6'):
            field_year = i.a.text
            name = i.h6.text
            year, field = grab_field_and_number(field_year)
            old_country_name, new_country_name = separate_old_country_names(place_acq)
            
            all_names.append([name.encode('utf-8'), 
                              old_country_name.encode('utf-8'),
                              new_country_name.encode('utf-8'),
                              year.encode('utf-8'),
                              field.encode('utf-8')])
            
    return all_names

In [42]:
len(find_country_birth(birth_html))


Out[42]:
865

In [ ]:


In [ ]:


In [26]:
url = 'http://www.nobelprize.org/nobel_prizes/lists/age.html'
r = requests.get(url)
soup = BeautifulSoup(r.text)
age_html = soup.find_all(name="div", attrs={"class": "large-12 columns"})

In [27]:
def find_age(bs4_html):
    all_names = [["name", "age"]]
    place_acq = ""
    for i in age_html[6].find_all(['h3', 'h6']):
        
        if "Age" in i.string:
            age = i.string.split()[-1]
        if "Age" not in i.string:
            name = i.string
            all_names.append([name.encode('utf-8'), age.encode('utf-8')])
    return all_names

In [28]:
len(find_age(age_html))


Out[28]:
865

In [29]:
nobel_ages = find_age(age_html)

In [30]:
with open('nobel_ages.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(nobel_ages)

In [118]:
country_acquired = find_country_acq(place_acquired)

In [35]:
#country_acquired

In [120]:
import pandas as pd

headers = country_acquired.pop(0)
df = pd.DataFrame(country_acquired, columns=headers)
df.head()


Out[120]:
name institution old_country_name_acquired current_country_name_acquired city state year field
0 Zhores I. Alferov A.F. Ioffe Physico-Technical Institute Russia Russia St. Petersburg 2000 Physics
1 Jens C. Skou Aarhus University Denmark Denmark Aarhus 1997 Chemistry
2 Dale T. Mortensen Aarhus University Denmark Denmark Aarhus 2010 Economic Sciences
3 Lev Landau Academy of Sciences USSR USSR Moscow 1962 Physics
4 Pyotr Kapitsa Academy of Sciences USSR USSR Moscow 1978 Physics

In [38]:
#country_birth

In [44]:
import pandas as pd

country_birth = find_country_birth(birth_html)
headers = country_birth.pop(0)
df = pd.DataFrame(country_birth, columns=headers)
df.head()


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-44-85da9c4b1a68> in <module>()
      4 headers = country_birth.pop(0)
      5 df = pd.DataFrame(country_birth, columns=headers)
----> 6 df.head()

c:\Anaconda\lib\site-packages\IPython\core\displayhook.pyc in __call__(self, result)
    236                 self.write_format_data(format_dict, md_dict)
    237                 self.log_output(format_dict)
--> 238             self.finish_displayhook()
    239 
    240     def cull_cache(self):

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\displayhook.pyc in finish_displayhook(self)
     70         sys.stderr.flush()
     71         if self.msg['content']['data']:
---> 72             self.session.send(self.pub_socket, self.msg, ident=self.topic)
     73         self.msg = None
     74 

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\session.pyc in send(self, stream, msg_or_type, content, parent, ident, buffers, track, header, metadata)
    647         if self.adapt_version:
    648             msg = adapt(msg, self.adapt_version)
--> 649         to_send = self.serialize(msg, ident)
    650         to_send.extend(buffers)
    651         longest = max([ len(s) for s in to_send ])

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\session.pyc in serialize(self, msg, ident)
    551             content = self.none
    552         elif isinstance(content, dict):
--> 553             content = self.pack(content)
    554         elif isinstance(content, bytes):
    555             # content is already packed, as in a relayed message

c:\Anaconda\lib\site-packages\IPython\kernel\zmq\session.pyc in <lambda>(obj)
     83 # disallow nan, because it's not actually valid JSON
     84 json_packer = lambda obj: jsonapi.dumps(obj, default=date_default,
---> 85     ensure_ascii=False, allow_nan=False,
     86 )
     87 json_unpacker = lambda s: jsonapi.loads(s)

c:\Anaconda\lib\site-packages\zmq\utils\jsonapi.pyc in dumps(o, **kwargs)
     38         kwargs['separators'] = (',', ':')
     39 
---> 40     s = jsonmod.dumps(o, **kwargs)
     41 
     42     if isinstance(s, unicode):

c:\Anaconda\lib\json\__init__.pyc in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, encoding, default, sort_keys, **kw)
    248         check_circular=check_circular, allow_nan=allow_nan, indent=indent,
    249         separators=separators, encoding=encoding, default=default,
--> 250         sort_keys=sort_keys, **kw).encode(obj)
    251 
    252 

c:\Anaconda\lib\json\encoder.pyc in encode(self, o)
    208         if not isinstance(chunks, (list, tuple)):
    209             chunks = list(chunks)
--> 210         return ''.join(chunks)
    211 
    212     def iterencode(self, o, _one_shot=False):

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 171: ordinal not in range(128)

In [48]:
countries = list(set(df.birth_country_new_name))


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-48-a7a78aa7a2c0> in <module>()
----> 1 countries = list(set(df.birth_country_new_name))

c:\Anaconda\lib\site-packages\pandas\core\generic.pyc in __getattr__(self, name)
   1841                 return self[name]
   1842             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1843                                  (type(self).__name__, name))
   1844 
   1845     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'birth_country_new_name'

In [238]:
#google_api_key = "AIzaSyDAJxRxTE-ZC5M7qGN5Bg_FXwgc5e_TqdU"

In [239]:
def lookup_lat_lon(city="", state="", country="", key=""):
    return "https://maps.googleapis.com/maps/api/geocode/json?"+"address="+country+"&key="+key

In [288]:
lookup_lat_lon(country=countries[38], key=google_api_key)


Out[288]:
"https://maps.googleapis.com/maps/api/geocode/json?address=People's Republic of China&key=AIzaSyDAJxRxTE-ZC5M7qGN5Bg_FXwgc5e_TqdU"

In [289]:
url2 = lookup_lat_lon(country=countries[38], key=google_api_key)

In [290]:
r2 = requests.get(url2)

In [291]:
country_json = r2.json()

In [292]:
birth_lat = country_json['results'][0]['geometry']['location']['lat']
birth_lon = country_json['results'][0]['geometry']['location']['lng']
birth_country_long_name = country_json['results'][0]['address_components'][0]['long_name']
birth_country_short_name = country_json['results'][0]['address_components'][0]['short_name']

In [45]:
print birth_lat
print birth_lon
#birth_country_long_name


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-45-0d2c4c7e0c91> in <module>()
----> 1 print birth_lat
      2 print birth_lon
      3 #birth_country_long_name

NameError: name 'birth_lat' is not defined

In [295]:
#country_json

In [46]:
def get_long_lat(country_list, birth_countries=True):
    
    output = [['birth_lat', 
               'birth_lon', 
               'birth_country_current_name',
               'birth_country_short_name']]
    if birth_countries == False:
        output = [['acquired_lat', 
                   'acquired_lon', 
                   'current_country_name_acquired',
                   'acquired_country_short_name']]
    # https://console.developers.google.com
    # https://developers.google.com/maps/documentation/geocoding/?csw=1
    google_api_key = "AIzaSyDAJxRxTE-ZC5M7qGN5Bg_FXwgc5e_TqdU"
    
    for each_country in country_list:
        url = lookup_lat_lon(country=each_country, key=google_api_key)
        r = requests.get(url)
        country_json = r.json()
        lat = country_json['results'][0]['geometry']['location']['lat']
        lon = country_json['results'][0]['geometry']['location']['lng']
        #country_long_name = country_json['results'][0]['address_components'][0]['long_name']
        country_long_name = each_country
        country_short_name = country_json['results'][0]['address_components'][0]['short_name']
        
        output.append([lat,
                       lon,
                       country_long_name,
                       country_short_name])
    return output

In [47]:
# Get the lat/lon from the Google API!
lat_lon_birth_countries = get_long_lat(countries, birth_countries=True)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-47-a641c1a16df4> in <module>()
      1 # Get the lat/lon from the Google API!
----> 2 lat_lon_birth_countries = get_long_lat(countries, birth_countries=True)

NameError: name 'countries' is not defined

In [308]:
headers = lat_lon_birth_countries.pop(0)
birth_countries_df = pd.DataFrame(lat_lon_birth_countries, columns=headers)

In [313]:
birth_countries_df.head()


Out[313]:
birth_lat birth_lon birth_country_current_name birth_country_short_name
0 -14.235004 -51.925280 Brazil BR
1 56.130366 -106.346771 Canada CA
2 -18.766947 46.869107 Madagascar MG
3 41.608635 21.745275 Republic of Macedonia MK
4 41.871940 12.567380 Italy IT

In [397]:
df = pd.merge(df, birth_countries_df)
df.tail()


Out[397]:
name birth_country_old_name birth_country_current_name year field birth_lat birth_lon birth_country_short_name
859 Robert J. Shiller USA USA 2013 Economic Sciences 37.090240 -95.712891 US
860 Lars Peter Hansen USA USA 2013 Economic Sciences 37.090240 -95.712891 US
861 Baruj Benacerraf Venezuela Venezuela 1980 Physiology or Medicine 6.423750 -66.589730 VE
862 Le Duc Tho Vietnam Vietnam 1973 Peace 14.058324 108.277199 VN
863 Tawakkol Karman Yemen Yemen 2011 Peace 15.552727 48.516388 YE

In [398]:
# df.to_csv('data/temp.csv')

In [319]:
headers = nobel_ages.pop(0)
nobel_ages_df = pd.DataFrame(nobel_ages, columns=headers)

In [384]:
#pd.merge(df, nobel_ages_df).tail(20)

Since 4 people won Nobel Prizes twice (!) at different ages, these dataframes can't just be merged on the 'name' column. Instead, we can sort/reorder each dataframe by the names and year/age, resetting the index to get them aligned.

Now, we can merge(or join()) them in pandas on the indices of each dataframe. Now, we can see Marie Curie was age 36 when recieving the nobel when recieving the Nobel Prize in 1903 in Physics, then 44 in 1911 when recieving the Nobel Prize in Chemistry.


In [399]:
sorted1 = df.sort(columns=['name', 'year']).reset_index(drop=True)
sorted2 = nobel_ages_df.sort(columns=['name', 'age']).reset_index(drop=True)
merged = pd.merge(sorted1, sorted2, left_index=True, right_index=True, how='outer', on='name')
merged[merged.name=="Marie Curie"]


Out[399]:
name birth_country_old_name birth_country_current_name year field birth_lat birth_lon birth_country_short_name age
521 Marie Curie Russian Empire Poland 1903 Physics 51.919438 19.145136 PL 36
522 Marie Curie Russian Empire Poland 1911 Chemistry 51.919438 19.145136 PL 44

In [401]:
merged.to_csv('data/temp.csv', encoding='utf-8')

In [402]:
merged.head()


Out[402]:
name birth_country_old_name birth_country_current_name year field birth_lat birth_lon birth_country_short_name age
0 A. Michael Spence USA USA 2001 Economic Sciences 37.090240 -95.712891 US 58
1 Aage N. Bohr Denmark Denmark 1975 Physics 56.263920 9.501785 DK 53
2 Aaron Ciechanover British Protectorate of Palestine Israel 2004 Chemistry 31.046051 34.851612 IL 57
3 Aaron Klug Lithuania Lithuania 1982 Chemistry 55.169438 23.881275 LT 56
4 Abdus Salam India Pakistan 1979 Physics 30.375321 69.345116 PK 53

In [403]:



Out[403]:
[[0, -1, -2, -3], [1, 0, -1, -2], [2, 1, 0, -1]]

In [404]:



Out[404]:
[[0, 1, 2], [-1, 0, 1], [-2, -1, 0], [-3, -2, -1]]

In [405]:



The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!

In [ ]:


In [ ]: