In [136]:
import pandas as pd
import os
from unidecode import unidecode
from uuid import uuid4
from collections import Counter
import string
import re
import numpy as np
import Levenshtein
In [2]:
def strip_punctuation(s):
    """Return `s` with every ASCII punctuation character removed."""
    punctuation = set(string.punctuation)
    kept = [ch for ch in s if ch not in punctuation]
    return ''.join(kept)
In [3]:
def strip_clean(obj):
    """Normalize a name fragment: trim, lowercase, and drop periods.

    Values without string methods (e.g. NaN from pandas) yield ''.
    """
    try:
        trimmed = obj.strip()
        lowered = trimmed.lower()
        return lowered.replace('.', '')
    except AttributeError:
        return ''
In [4]:
def strip_nopunc(obj):
    """Lowercase then trim `obj`; '' for values without string methods.

    NOTE(review): despite the name, punctuation is NOT removed here.
    """
    try:
        lowered = obj.lower()
        return lowered.strip()
    except AttributeError:
        return ''
In [5]:
# Global lookup tables keyed by cleaned institution name. They are
# re-populated from the institution authority file in later cells and
# extended on the fly by normalized_institutions().
institutions_lookup = {}
institution_ids = {}
In [6]:
def cleaned_institutions(inames):
    """Normalize a raw 'Affiliation' value for matching.

    The value is stripped of padding whitespace, lowercased, has
    ampersands rewritten as 'and' and hyphens treated as spaces, and is
    finally transliterated to ASCII with unidecode.
    """
    text = unicode(inames).strip().lower()
    text = text.replace('&', 'and').replace('-', ' ')
    return unidecode(text)
In [7]:
def cleaned_person(last, first):
    """Return a (lastname, firstname) pair normalized for matching.

    The forename is split on periods and runs of non-word characters,
    empty pieces are dropped, and the remainder is re-joined with single
    spaces before a final cleaning pass.
    """
    lastname = unidecode(strip_clean(unicode(last)))
    pieces = re.split('\.|\W+', unidecode(strip_clean(unicode(first))))
    joined = ' '.join(n.strip(' ') for n in pieces if n != '')
    firstname = strip_clean(joined.strip().replace('.', ''))
    return lastname, firstname
In [8]:
def normalized_person(last, first):
    """
    Generates a normalized representation of a personal name.

    Applies the disambiguation rules accumulated in ``person_map`` and
    returns the canonical (last, first) tuple plus its URI, minting and
    registering a fresh URI for names seen for the first time.
    """
    name = cleaned_person(last, first)
    normed_name = person_map.get(name, name)
    if normed_name in person_ids:
        uri = person_ids[normed_name]
    else:
        uri = 'http://history.archives.mbl.edu/concepts/person/{0}'.format(uuid4())
        person_ids[normed_name] = uri
    return normed_name, uri
In [9]:
def normalized_institutions(inames):
    """
    Generates a normalized representation of an institutional name.

    Returns a list of normalized names -- one entry per '/'-separated
    institution in the raw value -- registering a URI for any name not
    yet seen in ``institution_ids``.
    """
    anames = []
    affs = cleaned_institutions(inames)
    # The 'Affiliation' field can contain multiple institutions, separated by a slash ('/').
    for aff in affs.split('/'):
        # All punctuation is removed, and runs of spaces are collapsed to one.
        # BUG FIX: the original called .replace(' ', ' ') -- a no-op with the
        # same character on both sides -- so double spaces survived (the
        # inline comment said they should be converted to single spaces).
        aff = re.sub(' +', ' ', strip_punctuation(aff.strip()))
        # One source of variation in names is the inclusion of 'the'.
        # We simply remove 'the' from all names.
        aff = ' '.join(word for word in aff.split(' ') if word != 'the')
        # In a previous step, we generated aggregation rules for some names.
        # If there were multiple similar names, this retrieves the most likely.
        if aff in institutions_lookup:
            aff = institutions_lookup[aff]
        anames.append(aff)
        if aff not in institution_ids:
            uri = 'http://history.archives.mbl.edu/concepts/institution/{0}'.format(uuid4())
            institution_ids[aff] = uri
    return anames
In [10]:
def get_location_uri(location):
    """Return (minting and registering if needed) the URI for a location.

    Empty or null locations yield None.
    """
    if location == '' or pd.isnull(location):
        return None
    if location in location_ids:
        return location_ids[location]
    uri = 'http://history.archives.mbl.edu/concepts/location/{0}'.format(uuid4())
    location_ids[location] = uri
    return uri
In [11]:
def get_coursegroup_uri(coursegroup):
    """Return the URI for a course group, minting one on first sight."""
    if coursegroup in coursegroup_ids:
        return coursegroup_ids[coursegroup]
    uri = 'http://history.archives.mbl.edu/concepts/coursegroup/{0}'.format(uuid4())
    coursegroup_ids[coursegroup] = uri
    return uri
In [12]:
def get_course_uri(cname):
    """Return the URI for a course name, minting one on first sight."""
    if cname in course_ids:
        return course_ids[cname]
    uri = 'http://history.archives.mbl.edu/concepts/course/{0}'.format(uuid4())
    course_ids[cname] = uri
    return uri
In [13]:
def clean_coursename(cname):
    """Normalize a raw course name: trim, lowercase, rewrite '&' as 'and',
    and spell out course numbers ('1' -> 'i', '2' -> 'ii')."""
    cleaned = cname.strip().lower()
    for old, new in (('&', 'and'), ('1', 'i'), ('2', 'ii')):
        cleaned = cleaned.replace(old, new)
    return cleaned
In [14]:
def normalize_coursename(cname):
    """Return (canonical course name, course group) for a raw name.

    Typos are corrected via ``course_map``; group membership comes from
    ``coursegroup_map``, defaulting to the course's own (corrected) name.
    """
    cleaned = clean_coursename(unidecode(cname))
    cleaned = course_map.get(cleaned, cleaned)
    group = coursegroup_map.get(cleaned, cleaned)
    return cleaned, group
In [15]:
# Load the authority files produced by an earlier pass; each maps an
# entity name onto a stable URI.
# NOTE(review): absolute local path -- will break on any other machine.
apath = '/Users/erickpeirson/Projects/MBL-Data/Authority Files/'
courseAuthority = pd.read_csv(os.path.join(apath, 'course.csv'))
courseGroupAuthority = pd.read_csv(os.path.join(apath, 'coursegroup.csv'))
institutionAuthority = pd.read_csv(os.path.join(apath, 'institution.csv'))
locationAuthority = pd.read_csv(os.path.join(apath, 'location.csv'))
personAuthority = pd.read_csv(os.path.join(apath, 'person.csv'))
In [16]:
# Peek at the authority tables.
courseAuthority[:5]
Out[16]:
In [17]:
personAuthority[:5]
Out[17]:
In [18]:
# Map raw (Last Name, First Name) pairs straight to their authority URIs.
# NOTE(review): personLookup does not appear to be used again below.
personLookup = dict(zip(zip(personAuthority['Last Name'].values, personAuthority['First Name'].values),
personAuthority['Person URI'].values))
In [19]:
# Re-clean the authority names so they key on the same normal form that
# cleaned_person() produces for the raw spreadsheets.
cleaned_names = [cleaned_person(last, first) for last, first in zip(personAuthority['Last Name'], personAuthority['First Name'])]
In [20]:
person_ids = dict(zip(cleaned_names, personAuthority['Person URI'].values))
In [21]:
# Peek at the institution authority table.
institutionAuthority[:5]
Out[21]:
In [22]:
# Normalize each authority institution name with the same cleaning used on
# the raw data. BUG FIX: the original indexed `[0]` into the result, but
# cleaned_institutions() returns a string, so the lookup keys built in the
# next cells were single characters and could never match a full cleaned
# name in normalized_institutions().
inormed = [cleaned_institutions(i) for i in institutionAuthority.Institution.values]
In [23]:
# Identity map for now; aggregation rules (variant -> canonical) would go here.
institutions_lookup = dict(zip(inormed, inormed))
In [24]:
institution_ids = dict(zip(inormed, institutionAuthority['Institution URI'].values))
In [25]:
locationAuthority[:5]
Out[25]:
In [26]:
location_ids = dict(zip([strip_nopunc(v) for v in locationAuthority['Location'].values], locationAuthority['Location URI']))
In [96]:
courseAuthority[:5]
Out[96]:
In [97]:
# NOTE(review): course names are keyed with strip_nopunc() here but looked
# up elsewhere via lowercased/clean_coursename() forms -- confirm the two
# normal forms agree for all authority rows.
course_ids = dict(zip([strip_nopunc(v) for v in courseAuthority['Course Name'].values], courseAuthority['Course URI']))
In [98]:
courseGroupAuthority[:5]
Out[98]:
In [99]:
coursegroup_ids = dict(zip([clean_coursename(v) for v in courseGroupAuthority['Course Group'].values], courseGroupAuthority['Course Group URI']))
In [31]:
# NOTE(review): absolute local path -- will break on any other machine.
datapath = '/Users/erickpeirson/Projects/MBL-Data/Cleaned Data/'
In [32]:
# Previously-cleaned datasets, to which the Wes spreadsheets are appended below.
affiliations = pd.read_csv(os.path.join(datapath, 'cleaned_affiliations.csv'))
In [33]:
affiliations[:5]
Out[33]:
In [34]:
attendance = pd.read_csv(os.path.join(datapath, 'cleaned_attendance.csv'))
In [125]:
coursegroups = pd.read_csv(os.path.join(datapath, 'cleaned_coursegroups.csv'))
In [36]:
investigators = pd.read_csv(os.path.join(datapath, 'cleaned_investigators.csv'))
In [37]:
locations = pd.read_csv(os.path.join(datapath, 'cleaned_locations.csv'))
In [38]:
# Original (uncleaned) spreadsheets from Wes.
wdpath = '/Users/erickpeirson/Projects/MBL-Data/Original Data/Wes - Updated/'
In [39]:
investigators_2012 = pd.read_csv(os.path.join(wdpath, 'MBL INVESTIGATOR WES 2012.csv'), encoding='utf-8')
investigators_2013 = pd.read_csv(os.path.join(wdpath, 'MBL INVESTIGATOR WES 2013.csv'), encoding='utf-8')
attendance_2009 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2009.csv'), encoding='utf-8')
attendance_2010 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2010.csv'), encoding='utf-8')
attendance_2011 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2011.csv'), encoding='utf-8')
attendance_2012 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2012.csv'), encoding='utf-8')
attendance_2013 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2013.csv'), encoding='utf-8')
In [41]:
# All frames that carry personal names (and affiliations).
withNames = [attendance_2009, attendance_2010, attendance_2011, attendance_2012, attendance_2013, investigators_2012, investigators_2013]
In [42]:
# Accumulators filled by the harvesting loop below.
personal_names = set([])
f_personal_names = Counter()
personal_affiliations = {}
In [43]:
withCourses = [attendance_2009, attendance_2010, attendance_2011, attendance_2012, attendance_2013]
courseYears = [2009, 2010, 2011, 2012, 2013]
In [44]:
# Harvest every (last, first) name, its frequency, and its institutional
# affiliations from the cleaned affiliation file plus all raw spreadsheets.
for df in [affiliations] + withNames:
    last, first = zip(*[cleaned_person(l, f) for l, f in zip(df['Last Name'].values, df['First Name'].values)])
    # The affiliation column is named inconsistently across spreadsheets.
    if 'Affiliation' in df.columns:
        affs = [normalized_institutions(inst) for inst in df['Affiliation'].values]
    elif 'Institution' in df.columns:
        affs = [normalized_institutions(inst) for inst in df['Institution'].values]
    else:
        # BUG FIX: previously `affs` silently carried over from the prior
        # frame when neither column was present; give each person an empty
        # affiliation list instead.
        affs = [[] for _ in xrange(len(df))]
    names = zip(last, first)
    personal_names |= set(names)
    for n, v in Counter(names).items():
        f_personal_names[n] += v
    for name, aff in zip(names, affs):
        if name not in personal_affiliations:
            personal_affiliations[name] = set([])
        for affiliation in aff:
            personal_affiliations[name].add(affiliation)
In [45]:
# Group forenames under each surname so that candidate duplicate names can
# be compared surname-by-surname (A and B share an identical surname).
personal_names_list = list(personal_names)
N_names = len(personal_names)
by_last = {}
for lastname, firstname in personal_names_list:
    by_last.setdefault(lastname, set()).add(firstname)
In [46]:
# Build person_map: (last, variantFirst) -> (last, canonicalFirst) for pairs
# of names judged to refer to the same person (matching forename parts AND
# at least one shared institutional affiliation).
person_map = {}
for last, firsts in by_last.iteritems(): # We assume that surnames are not misspelled.
N_firsts = len(firsts) # This is not strictly true, but it is not
if N_firsts > 1: # quite clear how to proceed otherwise.
lfirsts = list(firsts) # Consider cases in which two names, I and J,
for i in xrange(N_firsts): # have a common surname.
iname = lfirsts[i]
inames = iname.split(' ')
# NOTE(review): an empty forename makes inames == [''] and the initial
# extraction below would raise IndexError -- presumably empty forenames
# cannot reach this point; verify upstream cleaning.
iinits = [f[0] for f in inames ]
for j in xrange(i+1, N_firsts):
jname = lfirsts[j]
jnames = jname.split(' ')
jinits = [f[0] for f in jnames ]
# For each such pair, I and J, we compare the X parts of their forenames,
# where X is the minimum number of forename parts for I and J.
match = True
for x in xrange(min( [len(inames), len(jnames)] )):
# If the x part if either forename is an initial, we evaluate
# only the first character of the two parts.
if len(inames[x]) == 1 or len(jnames[x]) == 1:
if iinits[x] != jinits[x]:
match = False
# Otherwise, the x part of the two forenames must be identical.
else:
if inames[x] != jnames[x]:
match = False
if match:
# If the forenames of I and J match, as described above, we check
# to see whether they share at least one institutional affiliation.
shared = personal_affiliations[(last, iname)] & personal_affiliations[(last, jname)]
if len(shared) > 0:
# If they share at least one institutional affiliation, then
# we believe that I and J both refer to the same person.
if len(iname) > len(jname): # Use the longest name (most complete).
key = jname
alt = iname
else:
key = iname
alt = jname
# If the canonical candidate is itself already an alias, follow
# the chain of aliases until it terminates (KeyError) so every
# variant points at the terminal forename.
if (last, alt) in person_map:
top = False
while not top:
try:
alt = person_map[(last,alt)][1]
except KeyError:
top = True
person_map[(last, key)] = (last, alt)
# If the conditions above are not satisfied, then we lack sufficient evidence to
# assert that the names I and J refer to the same person.
print len(person_map)
In [47]:
# Register location URIs for the 2011 attendance sheet.
# NOTE(review): the resulting list is overwritten later (In [100]); this
# cell seems to matter only for its side effect on location_ids -- confirm.
loc = [get_location_uri(l) for l in attendance_2011['Location'].values]
In [48]:
attendance[:5]
Out[48]:
In [49]:
def mean(seq):
    """Arithmetic mean of a numeric sequence (raises on empty input)."""
    return sum(seq) / float(len(seq))
In [50]:
# Collect the set of distinct (cleaned) course names with their frequencies,
# then flag near-duplicate pairs by normalized Levenshtein distance.
course_names = set([])
f_course_names = Counter()
for df in withCourses:
name = [clean_coursename(c) for c in df['Course Name'].values]
course_names |= set(name)
for n in name:
f_course_names[n] += 1
print len(course_names)
# Pairwise edit distances, normalized by the mean length of the two names.
distances = []
course_names_list = list(course_names)
for i in xrange(len(course_names_list)):
for j in xrange(i+1, len(course_names_list)):
a = course_names_list[i]
b = course_names_list[j]
d = Levenshtein.distance(a,b)
dnorm = float(d)/mean([float(len(a)), float(len(b))])
distances.append( (i,j,d, dnorm) )
# 0.17 is an empirically chosen similarity threshold; candidates are printed
# for manual review (these informed the hand-built course_map below).
for d in distances:
if d[3] < 0.17:
a = course_names_list[d[0]]
b = course_names_list[d[1]]
dnorm = d[3]
print d[3], d[2]
print f_course_names[a], '\t', a
print f_course_names[b], '\t', b
print '-'*40
In [76]:
# The course_map handles typographical errors in the dataset. There are remarkably
# few typos. We specify the appropriate spellings manually, below.
# BUG FIX: the original dict listed the key
# 'summer program in neuroscience, ethics $ survival' twice with two
# DIFFERENT canonical spellings ('...ethics and survival' vs.
# '...ethics, and survival'); the later entry silently won, leaving the
# 'spines--' variant mapped to a different canonical than the '$' variant.
# Both variants now map to one spelling -- confirm the preferred canonical
# form against the course authority file.
course_map = {
    'optimal microscopy': 'optical microscopy',
    'optimal microscopy and imaging in the biomedical sciences': 'optical microscopy and imaging in the biomedical sciences',
    'nasa planetary biology inernship': 'nasa planetary biology internship',
    'summer program in neuroscience, ethics $ survival': 'summer program in neuroscience, ethics and survival',
    'spines--summer program in neuroscience, ethics and survival': 'summer program in neuroscience, ethics and survival',
    'physiology: modern cell biology using microscopic, biochemical and computational approaches': 'physiology: modern cell biology using microscopic, biochemical, and computational approaches',
    'physiology: cell and molecular biology': 'physiology: cellular and molecular biology',
    'parthogenesis of neuroimmunologic diseases': 'pathogenesis of neuroimmunologic diseases',
    'zebrafish developmental and genetics': 'zebrafish development and genetics',
    'analystical and quantitative light microscopy': 'analytical and quantitative light microscopy',
    'strategies and techniques for analyzing microbial population structure': 'strategies and techniques for analyzing microbial population structures',
    'embryology: concepts and techniques in modern development': 'embryology: concepts and techniques in modern developmental biology',
}
# If two courses share the same name, then we generally consider them to belong to the
# same course group. For example, the 'Ecology' course in 1934 (say) belongs to the
# same group (or series) as the 'Ecology' course in 1965 (say).
#
# In some cases, however, courses with slightly (or perhaps very) different names
# may belong to the same group. For example, an Embryology course might have some
# subtitle, like: "Embryology: Some great new theme for this course". Or they might
# be numbered, like "Biomedical informatics I" and "Biomedical informatics II".
#
# The coursegroup_map handles the latter cases. Keys are specific course names that
# occur in the dataset, and values are the group names that should be used.
# Some of these mappings are given manually, based on inspection of the dataset.
# Other mappings are generated by looking for colons (':') in course names; the
# part of the name before the colon is assumed to be the proper group name.
# (Exact-duplicate entries in the original dict have been removed; they had
# identical values and were harmless but misleading.)
coursegroup_map = {
    'small computers in biomedical research, i': 'small computers in biomedical research',
    'small computers in biomedical research, ii': 'small computers in biomedical research',
    'medical informatics': 'biomedical informatics',
    'medical informatics i': 'biomedical informatics',
    'medical informatics ii': 'biomedical informatics',
    'biomedical informatics': 'biomedical informatics',
    'biomedical informatics i': 'biomedical informatics',
    'biomedical informatics ii': 'biomedical informatics',
    'biomedical informatics (fall)': 'biomedical informatics',
    'biomedical informatics (spring)': 'biomedical informatics',
    'advanced workshop on recombinant dna methodology': 'workshop on recombinant dna methodology',
    'basic workshop on recombinant dna methodology': 'workshop on recombinant dna methodology',
    'optical microscopy and imaging in the biomedical sciences': 'optical microscopy',
    'gene regulatory networks for development': 'gene regulatory networks'
}
In [77]:
for cname in list(course_names): # Here we look for course names with subtitles,
parts = cname.split(':') # characterized by a colon (':') in their names.
if len(parts) > 1:
coursegroup_map.update({cname:parts[0]})
print len(coursegroup_map)
In [100]:
# Flatten all attendance spreadsheets into parallel lists, one entry per row.
coursegroup = []
role = []
cname = []
year = []
name = []
aff = []
loc = []
position = []
for df, years in zip(withCourses, courseYears):
role += [strip_clean(r) for r in df['Role']]
cname_this, coursegroup_this = zip(*[normalize_coursename(c.lower()) for c in df['Course Name'].values])
coursegroup += coursegroup_this
cname += cname_this
year += [years]*len(cname_this)
name += zip(df['Last Name'], df['First Name'])
aff += [normalized_institutions(i) for i in df['Affiliation']]
loc += [strip_nopunc(l) for l in df['Location'].values]
position += [strip_clean(p) for p in df['Position at Affiliation'].values]
# Course names become '<Name> <Year>' in title case; URIs are keyed on the
# lowercased form. Note: this rebinds `cname` to the formatted names.
cname = ['{0} {1}'.format(c, y).title() for c, y in zip(cname, year)]
course_uri = [get_course_uri(c.lower()) for c in cname]
coursegroup_uri = [get_coursegroup_uri(c) for c in coursegroup]
name, name_uri = zip(*[normalized_person(n[0], n[1]) for n in name])
coursegroup = [c.title() for c in coursegroup]
lastname, firstname = zip(*name)
lastname = [n.title() for n in lastname]
firstname = [n.title() for n in firstname]
role = [r.title() for r in role]
position = [p.title() for p in position]
# A person can hold several affiliations per row; keep lists of URIs/names.
aff_uris = [[institution_ids[i] for i in ai] for ai in aff]
loc_uris = [get_location_uri(l) for l in loc]
aff = [[a.title() for a in ai] for ai in aff]
In [101]:
# Distribution of affiliation counts per row (sanity check).
Counter([len(a) for a in aff])
Out[101]:
In [102]:
attendance_wes = pd.DataFrame(data=zip(year, lastname, name_uri, cname, firstname, role, course_uri),
columns=['Year', 'Last Name', 'Person URI', 'Course Name', 'First Name',
'Role', 'Course URI'])
In [103]:
attendance_wes[:5]
Out[103]:
In [104]:
# Location URI,Last Name,Person URI,First Name,Location,Year
# Keep only rows with a non-empty location and a minted location URI.
locations_raw = []
for i in xrange(len(loc)):
if loc[i] != '' and loc[i] is not None and loc_uris[i] is not None:
locations_raw.append((loc_uris[i], lastname[i], name_uri[i], firstname[i], loc[i], year[i]))
locations_wes = pd.DataFrame(data=locations_raw,
columns=['Location URI', 'Last Name', 'Person URI', 'First Name', 'Location', 'Year'])
In [105]:
locations_wes[:5]
Out[105]:
In [106]:
# Explode per-row affiliation lists into one record per (person, institution).
affiliations_raw = []
for i in xrange(len(aff)):
a = aff[i]
for x in xrange(len(a)):
affiliations_raw.append((aff_uris[i][x], lastname[i], year[i], firstname[i], name_uri[i], position[i], aff[i][x]))
In [107]:
# Institution URI,Last Name,Year,First Name,Person URI,Position,Institution
affiliations_wes = pd.DataFrame(data=affiliations_raw,
columns=['Institution URI', 'Last Name', 'Year', 'First Name',
'Person URI', 'Position', 'Institution'])
In [108]:
# Repeat the flattening for the investigator spreadsheets (2012-2013).
# NOTE(review): this clobbers role/year/name/aff/loc/position from the
# attendance pass above -- intentional here, but order-sensitive on re-run.
role = []
year = []
name = []
aff = []
loc = []
for df, years in zip([investigators_2012, investigators_2013], [2012, 2013]):
    role += [strip_clean(r) for r in df['Role']]
    # BUG FIX: the original used len(role), which is CUMULATIVE across
    # iterations, so `year` grew longer than the other lists on the second
    # frame (masked downstream only because zip() truncates). Use the
    # current frame's row count instead.
    year += [years]*len(df)
    name += zip(df['Last Name'], df['First Name'])
    aff += [normalized_institutions(i) for i in df['Affiliation']]
    loc += [strip_nopunc(l) for l in df['Location'].values]
name, name_uri = zip(*[normalized_person(n[0], n[1]) for n in name])
lastname, firstname = zip(*name)
lastname = [n.title() for n in lastname]
firstname = [n.title() for n in firstname]
role = [r.title() for r in role]
aff_uris = [[institution_ids[i] for i in ai] for ai in aff]
loc_uris = [get_location_uri(l) for l in loc]
aff = [[a.title() for a in ai] for ai in aff]
# The investigator sheets carry no 'Position at Affiliation' column.
position = ['']*len(aff)
In [109]:
# Explode investigator affiliations and append them to affiliations_wes.
affiliations_raw = []
for i in xrange(len(aff)):
a = aff[i]
for x in xrange(len(a)):
affiliations_raw.append((aff_uris[i][x], lastname[i], year[i], firstname[i], name_uri[i], position[i], aff[i][x]))
In [110]:
# NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is
# the modern equivalent if this notebook is ever re-run on newer pandas.
print len(affiliations_wes),
affiliations_wes = affiliations_wes.append(pd.DataFrame(data=affiliations_raw,
columns=['Institution URI', 'Last Name', 'Year', 'First Name',
'Person URI', 'Position', 'Institution']))
print len(affiliations_wes)
In [111]:
affiliations_wes[:5]
Out[111]:
In [112]:
# Investigators have no subject recorded; fill with empty strings.
investigators_wes = pd.DataFrame(data=zip(lastname, name_uri, firstname, role, year, ['']*len(year)),
columns=['Last Name', 'Person URI', 'First Name', 'Role', 'Year', 'Subject'])
In [113]:
investigators_wes[:5]
Out[113]:
In [114]:
# Location URI,Last Name,Person URI,First Name,Location,Year
# Same filtering as for the attendance rows: drop empty/unminted locations.
locations_raw = []
for i in xrange(len(loc)):
if loc[i] != '' and loc[i] is not None and loc_uris[i] is not None:
locations_raw.append((loc_uris[i], lastname[i], name_uri[i], firstname[i], loc[i], year[i]))
print len(locations_wes),
locations_wes = locations_wes.append(pd.DataFrame(data=locations_raw,
columns=['Location URI', 'Last Name', 'Person URI', 'First Name', 'Location', 'Year']))
print len(locations_wes)
In [115]:
# Sanity check: formatted course names end with their four-digit year.
int('Biomedical Informatics 2012'[-4:])
Out[115]:
In [120]:
# One record per distinct course: (year, course URI, group URI, group, name).
cg_raw = []
for c, cg in dict(zip(cname, coursegroup)).items():
y = int(c[-4:])
curi = get_course_uri(c.lower())
cguri = get_coursegroup_uri(cg.lower())
cg_raw.append((y, curi, cguri, cg, c))
In [126]:
coursegroups_wes = pd.DataFrame(data=cg_raw, columns=['Year', 'Course URI', 'Course Group URI', 'Course Group', 'Course Name'])
In [127]:
coursegroups_wes[:5]
Out[127]:
In [128]:
coursegroups[:5]
Out[128]:
In [129]:
# Merge the newly processed Wes data with the previously cleaned tables.
affiliations_combined = affiliations.append(affiliations_wes)
attendance_combined = attendance.append(attendance_wes)
coursegroups_combined = coursegroups.append(coursegroups_wes)
investigators_combined = investigators.append(investigators_wes)
locations_combined = locations.append(locations_wes)
In [132]:
# Before/after row counts for each combined table.
print len(affiliations), len(affiliations_combined)
print len(attendance), len(attendance_combined)
print len(coursegroups), len(coursegroups_combined)
print len(investigators), len(investigators_combined)
print len(locations), len(locations_combined)
In [144]:
import matplotlib.pyplot as plt
In [152]:
# NOTE(review): %pylab does a wildcard import into the namespace and is
# discouraged; the explicit pyplot import above is what is used below.
%pylab inline
In [140]:
locations_finite = locations_combined[pd.notnull(locations_combined['Location'])]
In [153]:
# Rows per year in each combined table (quick coverage check).
plt.bar(locations_finite.Year.value_counts().index, locations_finite.Year.value_counts().values)
Out[153]:
In [154]:
plt.bar(attendance_combined.Year.value_counts().index, attendance_combined.Year.value_counts().values)
Out[154]:
In [155]:
plt.bar(affiliations_combined.Year.value_counts().index, affiliations_combined.Year.value_counts().values)
Out[155]:
In [ ]: