In [136]:
import pandas as pd
import os
from unidecode import unidecode
from uuid import uuid4
from collections import Counter
import string
import re
import numpy as np
import Levenshtein

In [2]:
def strip_punctuation(s):
    """Return ``s`` with every character found in ``string.punctuation`` removed."""
    punctuation = frozenset(string.punctuation)
    kept = [ch for ch in s if ch not in punctuation]
    return ''.join(kept)

In [3]:
def strip_clean(obj):
    """
    Trim surrounding whitespace from ``obj``, lowercase it, and delete every period.

    Objects lacking the string methods used here (e.g. ``None`` or a float NaN)
    yield the empty string.
    """
    try:
        trimmed = obj.strip()
        lowered = trimmed.lower()
        return lowered.replace('.', '')
    except AttributeError:
        return ''

In [4]:
def strip_nopunc(obj):
    """
    Lowercase ``obj`` and trim surrounding whitespace; punctuation is left alone.

    Objects lacking the string methods used here (e.g. ``None`` or a float NaN)
    yield the empty string.
    """
    try:
        lowered = obj.lower()
        return lowered.strip()
    except AttributeError:
        return ''

Cleaning logic


In [5]:
# Maps a normalized institution name to its preferred form; consulted by
#  normalized_institutions(). It starts out empty and is rebuilt once the
#  institution authority file has been loaded.
institutions_lookup = {}
# Maps a normalized institution name to its URI. normalized_institutions()
#  registers a new URI here for every name it has not seen before.
institution_ids = {}

In [6]:
def cleaned_institutions(inames):
    """
    Lightly clean the raw value of an 'Affiliation' field.

    The value is coerced to unicode, stripped of padding whitespace (e.g.
    spaces) and lowercased; ampersands are spelled out as 'and' and hyphens
    are interpreted as spaces. The result is transliterated with ``unidecode``.
    """
    text = unicode(inames)
    text = text.strip()
    text = text.lower()
    text = text.replace('&', 'and')
    text = text.replace('-', ' ')
    return unidecode(text)

In [7]:
def cleaned_person(last, first):
    """
    Clean a surname / forename pair.

    Both values are coerced to unicode, passed through ``strip_clean`` and
    transliterated with ``unidecode``. The forename is additionally split on
    periods and runs of non-word characters; its non-empty parts are re-joined
    with single spaces and cleaned once more.

    Returns a ``(lastname, firstname)`` tuple.
    """
    lastname = unidecode(strip_clean(unicode(last)))

    forename = unidecode(strip_clean(unicode(first)))
    parts = []
    for part in re.split(r'\.|\W+', forename):
        if part != '':
            parts.append(part.strip(' '))
    rejoined = ' '.join(parts).strip().replace('.', '')
    firstname = strip_clean(rejoined)
    return lastname, firstname

In [8]:
def normalized_person(last, first):
    """
    Generates a normalized representation of a personal name.

    The cleaned ``(last, first)`` pair is replaced by its entry in
    ``person_map`` when one exists. The URI already recorded for the resulting
    name in ``person_ids`` is reused; otherwise a fresh UUID-based URI is
    minted and stored there.

    Returns ``(normed_name, uri)``.
    """
    name = cleaned_person(last, first)
    normed_name = person_map.get(name, name)
    try:
        uri = person_ids[normed_name]
    except KeyError:
        uri = 'http://history.archives.mbl.edu/concepts/person/{0}'.format(uuid4())
        person_ids[normed_name] = uri
    return normed_name, uri

In [9]:
def normalized_institutions(inames):
    """
    Generates a normalized representation of an institutional name.

    Parameters
    ----------
    inames
        Raw value of an 'Affiliation' field; it may name several institutions
        separated by a slash ('/').

    Returns
    -------
    list
        One normalized name per slash-separated institution, in order.

    Side effects
    ------------
    Every returned name that is not yet a key of ``institution_ids`` is
    registered there with a newly minted UUID-based URI.
    """
    anames = []
    affs = cleaned_institutions(inames)

    # The 'Affiliation' field can contain multiple institutions, separated by a slash ('/').
    for aff in affs.split('/'):
        # All punctuation is removed. Splitting on whitespace and re-joining
        #  with single spaces collapses every run of spaces, however long
        #  (' - ' becomes three spaces once hyphens are turned into spaces),
        #  and drops the padding that removed punctuation can leave behind.
        words = strip_punctuation(aff).split()

        # One source of variation in names is the inclusion of 'the'. We simply
        #  remove 'the' from all names.
        aff = ' '.join([word for word in words if word != 'the'])

        if aff in institutions_lookup:          # In a previous step, we generated aggregation
            aff = institutions_lookup[aff]      #  rules for some names. If there were multiple
                                                #  similar names, this retrieves the most likely.
        anames.append(aff)

        if aff not in institution_ids:
            uri = 'http://history.archives.mbl.edu/concepts/institution/{0}'.format(uuid4())
            institution_ids[aff] = uri
    return anames

In [10]:
def get_location_uri(location):
    """
    Return the URI registered for ``location``, minting one when necessary.

    Empty-string and null (per ``pd.isnull``) locations have no URI; ``None``
    is returned for them and ``location_ids`` is left untouched. Newly minted
    URIs are stored in ``location_ids``.
    """
    if location == '' or pd.isnull(location):
        return None
    try:
        return location_ids[location]
    except KeyError:
        uri = 'http://history.archives.mbl.edu/concepts/location/{0}'.format(uuid4())
        location_ids[location] = uri
        return uri

In [11]:
def get_coursegroup_uri(coursegroup):
    """
    Return the URI registered for ``coursegroup`` in ``coursegroup_ids``.

    An unseen course group gets a fresh UUID-based URI, which is stored in
    ``coursegroup_ids`` before being returned.
    """
    try:
        return coursegroup_ids[coursegroup]
    except KeyError:
        uri = 'http://history.archives.mbl.edu/concepts/coursegroup/{0}'.format(uuid4())
        coursegroup_ids[coursegroup] = uri
        return uri

In [12]:
def get_course_uri(cname):
    """
    Return the URI registered for the course ``cname`` in ``course_ids``.

    An unseen course name gets a fresh UUID-based URI, which is stored in
    ``course_ids`` before being returned.
    """
    if cname in course_ids:
        return course_ids[cname]
    uri = 'http://history.archives.mbl.edu/concepts/course/{0}'.format(uuid4())
    course_ids[cname] = uri
    return uri

In [13]:
def clean_coursename(cname):
    """
    Normalize a course name for comparison.

    Trims whitespace, lowercases, spells ampersands out as 'and', and rewrites
    the digits '1' and '2' as 'i' and 'ii'. Every occurrence of those digits is
    rewritten, wherever it appears in the name.
    """
    cleaned = cname.strip().lower()
    for old, new in (('&', 'and'), ('1', 'i'), ('2', 'ii')):
        cleaned = cleaned.replace(old, new)
    return cleaned

In [14]:
def normalize_coursename(cname):
    """
    Return ``(course name, course group)`` for a raw course name.

    The name is transliterated with ``unidecode`` and cleaned with
    ``clean_coursename``; a known misspelling is then replaced through
    ``course_map``. The group comes from ``coursegroup_map`` and defaults to
    the corrected course name itself.
    """
    cname = clean_coursename(unidecode(cname))
    cname = course_map.get(cname, cname)
    group = coursegroup_map.get(cname, cname)
    return cname, group

Load authority files


In [15]:
# Directory holding the authority CSV files. The project root defaults to the
#  original author's checkout and can be redirected by setting the MBL_DATA_DIR
#  environment variable, so the notebook runs on other machines unedited.
#  The trailing '' keeps the trailing path separator of the original value.
apath = os.path.join(os.environ.get('MBL_DATA_DIR', '/Users/erickpeirson/Projects/MBL-Data'),
                     'Authority Files', '')
# One authority table per concept type.
courseAuthority = pd.read_csv(os.path.join(apath, 'course.csv'))
courseGroupAuthority = pd.read_csv(os.path.join(apath, 'coursegroup.csv'))
institutionAuthority = pd.read_csv(os.path.join(apath, 'institution.csv'))
locationAuthority = pd.read_csv(os.path.join(apath, 'location.csv'))
personAuthority = pd.read_csv(os.path.join(apath, 'person.csv'))

In [16]:
courseAuthority[:5]


Out[16]:
Course Name Course URI
0 Embryology 1919 http://history.archives.mbl.edu/concepts/cours...
1 Embryology 1918 http://history.archives.mbl.edu/concepts/cours...
2 Zoology 1952 http://history.archives.mbl.edu/concepts/cours...
3 Zoology 1953 http://history.archives.mbl.edu/concepts/cours...
4 Zoology 1954 http://history.archives.mbl.edu/concepts/cours...

In [17]:
personAuthority[:5]


Out[17]:
Last Name First Name Person URI
0 Sunley Daniel http://history.archives.mbl.edu/concepts/perso...
1 Delmar Mario http://history.archives.mbl.edu/concepts/perso...
2 Bray Dennis http://history.archives.mbl.edu/concepts/perso...
3 Mumm Jeffrey http://history.archives.mbl.edu/concepts/perso...
4 Taylor Martha http://history.archives.mbl.edu/concepts/perso...

In [18]:
# Person URI keyed by the (last name, first name) pair exactly as it is written
#  in the authority file.
personLookup = {
    (surname, forename): person_uri
    for surname, forename, person_uri in zip(personAuthority['Last Name'].values,
                                             personAuthority['First Name'].values,
                                             personAuthority['Person URI'].values)
}

In [19]:
# Authority names run through cleaned_person(), one (last, first) tuple per row.
cleaned_names = [
    cleaned_person(*authority_name)
    for authority_name in zip(personAuthority['Last Name'], personAuthority['First Name'])
]

In [20]:
# Person URI keyed by cleaned (last, first) name; normalized_person() reads and
#  extends this mapping.
person_ids = {
    cleaned_name: person_uri
    for cleaned_name, person_uri in zip(cleaned_names, personAuthority['Person URI'].values)
}

In [21]:
institutionAuthority[:5]


Out[21]:
Institution Institution URI
0 St Luc Hospital http://history.archives.mbl.edu/concepts/insti...
1 New England Institute For Medical Research http://history.archives.mbl.edu/concepts/insti...
2 Station Biologique http://history.archives.mbl.edu/concepts/insti...
3 Albany College Of Pharmacy http://history.archives.mbl.edu/concepts/insti...
4 New York College Of Medicine And Believue Hosp... http://history.archives.mbl.edu/concepts/insti...

In [22]:
# Normalize the authority names exactly as incoming affiliations are normalized,
#  so that both produce the same dictionary keys. normalized_institutions()
#  returns a list (one name per slash-separated institution); the first entry
#  is the authority name. Indexing the string returned by cleaned_institutions()
#  instead would keep only the first character of every name.
# As a side effect normalized_institutions() registers URIs in institution_ids;
#  that mapping is rebuilt from the authority URIs in a following cell.
inormed = [normalized_institutions(i)[0] for i in institutionAuthority.Institution.values]

In [23]:
# Every authority name maps to itself.
institutions_lookup = {authority_name: authority_name for authority_name in inormed}

In [24]:
# Institution URI keyed by the entries of `inormed`, paired row by row with the
#  authority file's URI column.
institution_ids = {
    authority_name: institution_uri
    for authority_name, institution_uri in zip(inormed, institutionAuthority['Institution URI'].values)
}

In [25]:
locationAuthority[:5]


Out[25]:
Location Location URI
0 catawissa, pa http://history.archives.mbl.edu/concepts/locat...
1 warsaw, ny http://history.archives.mbl.edu/concepts/locat...
2 chile http://history.archives.mbl.edu/concepts/locat...
3 trenton, nj http://history.archives.mbl.edu/concepts/locat...
4 copenhagen, denmark http://history.archives.mbl.edu/concepts/locat...

In [26]:
# Location URI keyed by the lowercased, whitespace-trimmed location string.
location_ids = {
    strip_nopunc(place): location_uri
    for place, location_uri in zip(locationAuthority['Location'].values,
                                   locationAuthority['Location URI'])
}

In [96]:
courseAuthority[:5]


Out[96]:
Course Name Course URI
0 Embryology 1919 http://history.archives.mbl.edu/concepts/cours...
1 Embryology 1918 http://history.archives.mbl.edu/concepts/cours...
2 Zoology 1952 http://history.archives.mbl.edu/concepts/cours...
3 Zoology 1953 http://history.archives.mbl.edu/concepts/cours...
4 Zoology 1954 http://history.archives.mbl.edu/concepts/cours...

In [97]:
# Course URI keyed by the lowercased, whitespace-trimmed course name.
course_ids = {
    strip_nopunc(course_name): course_uri_value
    for course_name, course_uri_value in zip(courseAuthority['Course Name'].values,
                                             courseAuthority['Course URI'])
}

In [98]:
courseGroupAuthority[:5]


Out[98]:
Course Group Course Group URI
0 Botanical Museum Development http://history.archives.mbl.edu/concepts/cours...
1 Research Program In History Of Biology http://history.archives.mbl.edu/concepts/cours...
2 Small Computers In Biomedical Research http://history.archives.mbl.edu/concepts/cours...
3 Biomedical Informatics 2 http://history.archives.mbl.edu/concepts/cours...
4 Microbiology http://history.archives.mbl.edu/concepts/cours...

In [99]:
# Course-group URI keyed by the group name as cleaned by clean_coursename().
coursegroup_ids = {
    clean_coursename(group_name): group_uri
    for group_name, group_uri in zip(courseGroupAuthority['Course Group'].values,
                                     courseGroupAuthority['Course Group URI'])
}

Load existing cleaned data

See './Code/MBL Course and Investigator Data.ipynb' for cleaning/disambiguation procedure.


In [31]:
# Directory holding the previously cleaned CSV files. The project root defaults
#  to the original author's checkout and can be redirected by setting the
#  MBL_DATA_DIR environment variable. The trailing '' keeps the trailing path
#  separator of the original value.
datapath = os.path.join(os.environ.get('MBL_DATA_DIR', '/Users/erickpeirson/Projects/MBL-Data'),
                        'Cleaned Data', '')

In [32]:
# Previously cleaned person/institution affiliation records.
affiliations = pd.read_csv(os.path.join(datapath, 'cleaned_affiliations.csv'))

In [33]:
affiliations[:5]


Out[33]:
Institution URI Last Name Year First Name Person URI Position Institution
0 http://history.archives.mbl.edu/concepts/insti... Gardiner 1888 Edward G http://history.archives.mbl.edu/concepts/perso... NaN Institute Of Technology
1 http://history.archives.mbl.edu/concepts/insti... Jordan 1888 Edwin O http://history.archives.mbl.edu/concepts/perso... NaN Institute Of Technology
2 http://history.archives.mbl.edu/concepts/insti... Washburn 1888 F L http://history.archives.mbl.edu/concepts/perso... NaN University Of Michigan
3 http://history.archives.mbl.edu/concepts/insti... Clapp 1888 Cornelia Maria http://history.archives.mbl.edu/concepts/perso... NaN Mt Holyoke Seminary And College
4 http://history.archives.mbl.edu/concepts/insti... O'Grady 1888 Marcella I http://history.archives.mbl.edu/concepts/perso... NaN Bryn Mawr College

In [34]:
# Previously cleaned course attendance records.
attendance = pd.read_csv(os.path.join(datapath, 'cleaned_attendance.csv'))

In [125]:
# Previously cleaned course / course-group assignments.
coursegroups = pd.read_csv(os.path.join(datapath, 'cleaned_coursegroups.csv'))

In [36]:
# Previously cleaned investigator records.
investigators = pd.read_csv(os.path.join(datapath, 'cleaned_investigators.csv'))

In [37]:
# Previously cleaned person/location records.
locations = pd.read_csv(os.path.join(datapath, 'cleaned_locations.csv'))

Load and integrate Wes' updated data

Located in './Original Data/Wes - Updated'.


In [38]:
# Directory holding Wes' updated source files. The project root defaults to the
#  original author's checkout and can be redirected by setting the MBL_DATA_DIR
#  environment variable. The trailing '' keeps the trailing path separator of
#  the original value.
wdpath = os.path.join(os.environ.get('MBL_DATA_DIR', '/Users/erickpeirson/Projects/MBL-Data'),
                      'Original Data', 'Wes - Updated', '')

In [39]:
def _read_wes_csv(filename):
    """Read one UTF-8 encoded CSV file from the directory named by ``wdpath``."""
    return pd.read_csv(os.path.join(wdpath, filename), encoding='utf-8')

# Investigator lists, one file per year.
investigators_2012 = _read_wes_csv('MBL INVESTIGATOR WES 2012.csv')
investigators_2013 = _read_wes_csv('MBL INVESTIGATOR WES 2013.csv')
# Course lists, one file per year.
attendance_2009 = _read_wes_csv('MBL COURSE LIST WES 2009.csv')
attendance_2010 = _read_wes_csv('MBL COURSE LIST WES 2010.csv')
attendance_2011 = _read_wes_csv('MBL COURSE LIST WES 2011.csv')
attendance_2012 = _read_wes_csv('MBL COURSE LIST WES 2012.csv')
attendance_2013 = _read_wes_csv('MBL COURSE LIST WES 2013.csv')

In [41]:
# Every newly loaded frame that carries personal names.
withNames = [
    attendance_2009,
    attendance_2010,
    attendance_2011,
    attendance_2012,
    attendance_2013,
    investigators_2012,
    investigators_2013,
]

In [42]:
personal_names = set()        # distinct cleaned (last, first) names
f_personal_names = Counter()  # number of occurrences of each cleaned name
personal_affiliations = {}    # cleaned name -> set of normalized institution names

In [43]:
# Course-list frames and, position by position, the year each one covers.
withCourses = [
    attendance_2009,
    attendance_2010,
    attendance_2011,
    attendance_2012,
    attendance_2013,
]
courseYears = list(range(2009, 2014))

In [44]:
# Gather, over the existing affiliations and all new frames, every cleaned name,
#  how often it occurs, and the institutions it has been seen with.
for df in [affiliations] + withNames:
    names = [cleaned_person(l, f) for l, f in zip(df['Last Name'].values, df['First Name'].values)]
    last, first = zip(*names)

    # The institution column is called 'Affiliation' in some frames and
    #  'Institution' in others; the former takes precedence.
    for column in ('Affiliation', 'Institution'):
        if column in df.columns:
            affs = [normalized_institutions(inst) for inst in df[column].values]
            break

    personal_names.update(names)
    f_personal_names.update(names)

    for name, aff in zip(names, affs):
        personal_affiliations.setdefault(name, set()).update(aff)

In [45]:
personal_names_list = list(personal_names)
N_names = len(personal_names)
# Group forenames by surname: all names filed under one key have an identical
#  surname.
by_last = {}
for lastname, firstname in personal_names_list:
    by_last.setdefault(lastname, set()).add(firstname)

In [46]:
# person_map records which cleaned (last, first) names are believed to be
#  variants of the same person: each key is a variant, each value the fuller
#  (last, first) name that should be used in its place.
person_map = {}
for last, firsts in by_last.iteritems():    # We assume that surnames are not misspelled.
    N_firsts = len(firsts)                  #  This is not strictly true, but it is not
    if N_firsts > 1:                        #  quite clear how to proceed otherwise.
        lfirsts = list(firsts)              # Consider cases in which two names, I and J,
        for i in xrange(N_firsts):          #  have a common surname.
            iname = lfirsts[i]
            inames = iname.split(' ')
            # First character of every forename part of I.
            iinits = [f[0] for f in inames ]
            
            for j in xrange(i+1, N_firsts):     
                jname = lfirsts[j]
                jnames = jname.split(' ')
                # First character of every forename part of J.
                jinits = [f[0] for f in jnames ]

                # For each such pair, I and J, we compare the X parts of their forenames,
                #  where X is the minimum number of forename parts for I and J.
                match = True               
                for x in xrange(min( [len(inames), len(jnames)] )):
                    # If the x part of either forename is an initial, we evaluate
                    #  only the first character of the two parts.
                    if len(inames[x]) == 1 or len(jnames[x]) == 1:
                        if iinits[x] != jinits[x]:
                            match = False
                    # Otherwise, the x part of the two forenames must be identical.
                    else:
                        if inames[x] != jnames[x]:
                            match = False
                if match:     
                    # If the forenames of I and J match, as described above, we check
                    # to see whether they share at least one institutional affiliation.
                    shared = personal_affiliations[(last, iname)] & personal_affiliations[(last, jname)]
                    if len(shared) > 0:
                        # If they share at least one institutional affiliation, then
                        #  we believe that I and J both refer to the same person.
                        if len(iname) > len(jname):    # Use the longest name (most complete).
                            key = jname
                            alt = iname
                        else:
                            key = iname
                            alt = jname
                        # If the chosen name is itself mapped to another name, follow
                        #  the chain of mappings until a name with no mapping is reached.
                        if (last, alt) in person_map:
                            top = False
                            while not top:
                                try:
                                    alt = person_map[(last,alt)][1]
                                except KeyError:
                                    top = True
                        person_map[(last, key)] = (last, alt)      
                        
    # If the conditions above are not satisfied, then we lack sufficient evidence to
    #  assert that the names I and J refer to the same person.
print len(person_map)


47

In [47]:
# NOTE(review): looks like a scratch lookup -- `loc` is reset to an empty list
#  by the cell that assembles the attendance records. Because get_location_uri()
#  registers every value it has not seen, this call also adds the raw (not
#  strip_nopunc-normalized) 2011 locations to location_ids; confirm that this is
#  intended.
loc = [get_location_uri(l) for l in attendance_2011['Location'].values]

In [48]:
attendance[:5]


Out[48]:
Year Last Name Person URI Course Name First Name Role Course URI
0 1890 Mcmurrich http://history.archives.mbl.edu/concepts/perso... Coelenterates 1890 J Playfair Instructor http://history.archives.mbl.edu/concepts/cours...
1 1891 Setchell http://history.archives.mbl.edu/concepts/perso... Botany 1891 William Albert Instructor http://history.archives.mbl.edu/concepts/cours...
2 1891 Rankin http://history.archives.mbl.edu/concepts/perso... Zoology 1891 Walter M Instructor http://history.archives.mbl.edu/concepts/cours...
3 1891 Bumpus http://history.archives.mbl.edu/concepts/perso... Zoology 1891 Hermon C Instructor http://history.archives.mbl.edu/concepts/cours...
4 1891 Mcmurrich http://history.archives.mbl.edu/concepts/perso... Zoology 1891 J Playfair Instructor http://history.archives.mbl.edu/concepts/cours...

In [49]:
def mean(seq):
    """Arithmetic mean of ``seq`` as a float; an empty ``seq`` raises ZeroDivisionError."""
    total = float(sum(seq))
    return total / len(seq)

In [50]:
course_names = set([])
f_course_names = Counter()

for df in withCourses:
    name = [clean_coursename(c) for c in df['Course Name'].values]
    course_names |= set(name)
    for n in name:
        f_course_names[n] += 1

print len(course_names)

distances = []
course_names_list = list(course_names)
for i in xrange(len(course_names_list)):
    for j in xrange(i+1, len(course_names_list)):
        a = course_names_list[i]
        b = course_names_list[j]
        d = Levenshtein.distance(a,b)
        dnorm = float(d)/mean([float(len(a)), float(len(b))])
        distances.append( (i,j,d, dnorm) )
        
for d in distances:
    if d[3] < 0.17:
        a = course_names_list[d[0]]
        b = course_names_list[d[1]]
        dnorm = d[3]

        print d[3], d[2]
        print f_course_names[a], '\t', a
        print f_course_names[b], '\t', b   
        print '-'*40


38
0.0571428571429 2
244 	zebrafish development and genetics
57 	zebrafish developmental and genetics
----------------------------------------
0.0224719101124 1
111 	analystical and quantitative light microscopy
165 	analytical and quantitative light microscopy
----------------------------------------
0.0141843971631 1
154 	strategies and techniques for analyzing microbial population structures
90 	strategies and techniques for analyzing microbial population structure
----------------------------------------
0.161290322581 10
97 	embryology: concepts and techniques in modern development
173 	embryology: concepts and techniques in modern developmental biology
----------------------------------------
0.0792079207921 4
40 	summer program in neuroscience, ethics $ survival
151 	summer program in neuroscience, ethics, and survival
----------------------------------------
0.0869565217391 2
137 	biomedical informatics i
90 	biomedical informatics
----------------------------------------
0.0408163265306 1
137 	biomedical informatics i
93 	biomedical informatics ii
----------------------------------------
0.127659574468 3
90 	biomedical informatics
93 	biomedical informatics ii
----------------------------------------

In [76]:
# The course_map handles typographical errors in the dataset. There are remarkably
#  few typos. We specify the appropriate spellings manually, below.
#
# Each key appears exactly once: a repeated key in a dict literal silently
#  discards the earlier entry.
course_map = { 
    'optimal microscopy': 'optical microscopy',
    'optimal microscopy and imaging in the biomedical sciences': 'optical microscopy and imaging in the biomedical sciences',
    'nasa planetary biology inernship': 'nasa planetary biology internship',
    # NOTE(review): this target has no comma before 'and', whereas the
    #  '... ethics $ survival' entry below maps to '... ethics, and survival'.
    #  Confirm which spelling is canonical.
    'spines--summer program in neuroscience, ethics and survival': 'summer program in neuroscience, ethics and survival',
    'physiology: modern cell biology using microscopic, biochemical and computational approaches': 'physiology: modern cell biology using microscopic, biochemical, and computational approaches',
    'physiology: cell and molecular biology': 'physiology: cellular and molecular biology',
    'parthogenesis of neuroimmunologic diseases': 'pathogenesis of neuroimmunologic diseases',
    'zebrafish developmental and genetics': 'zebrafish development and genetics',
    'analystical and quantitative light microscopy': 'analytical and quantitative light microscopy',
    'strategies and techniques for analyzing microbial population structure': 'strategies and techniques for analyzing microbial population structures',
    'embryology: concepts and techniques in modern development': 'embryology: concepts and techniques in modern developmental biology',
    'summer program in neuroscience, ethics $ survival': 'summer program in neuroscience, ethics, and survival',
}

# If two courses share the same name, then we generally consider them to belong to the 
#  same course group. For example, the 'Ecology' course in 1934 (say) belongs to the
#  same group (or series) as the 'Ecology' course in 1965 (say).
#
# In some cases, however, courses with slightly (or perhaps very) different names
#  may belong to the same group. For example, an Embryology course might have some
#  subtitle, like: "Embryology: Some great new theme for this course". Or they might
#  be numbered, like "Biomedical informatics I" and "Biomedical informatics II".
#
# The coursegroup_map handles the latter cases. Keys are specific course names that
#  occur in the dataset, and values are the group names that should be used.
#  Some of these mappings are given manually, based on inspection of the dataset.
#  Other mappings are generated by looking for colons (':') in course names; the
#  part of the name before the colon is assumed to be the proper group name.

coursegroup_map = { 
    'small computers in biomedical research, i': 'small computers in biomedical research',
    'small computers in biomedical research, ii': 'small computers in biomedical research',
    'medical informatics': 'biomedical informatics',
    'medical informatics i': 'biomedical informatics',
    'medical informatics ii': 'biomedical informatics',
    'biomedical informatics': 'biomedical informatics',
    'biomedical informatics i': 'biomedical informatics',
    'biomedical informatics ii': 'biomedical informatics',
    'biomedical informatics (fall)': 'biomedical informatics',
    'biomedical informatics (spring)': 'biomedical informatics',
    'advanced workshop on recombinant dna methodology': 'workshop on recombinant dna methodology',
    'basic workshop on recombinant dna methodology': 'workshop on recombinant dna methodology',
    'optical microscopy and imaging in the biomedical sciences': 'optical microscopy',
    'gene regulatory networks for development': 'gene regulatory networks'
}

In [77]:
for cname in list(course_names):   # Here we look for course names with subtitles,
    parts = cname.split(':')       #  characterized by a colon (':') in their names.
    if len(parts) > 1:
        coursegroup_map.update({cname:parts[0]})
print len(coursegroup_map)


17

In [100]:
# Build parallel lists, one entry per row of the yearly course lists, from which
#  the attendance, affiliation and location tables are assembled further down.
coursegroup = []
role = []
cname = []
year = []
name = []
aff = []
loc = []
position = []
for df, years in zip(withCourses, courseYears):
    role += [strip_clean(r) for r in df['Role']]
    # normalize_coursename() returns (course name, course group) for each row.
    cname_this, coursegroup_this = zip(*[normalize_coursename(c.lower()) for c in df['Course Name'].values])
    coursegroup += coursegroup_this
    cname += cname_this
    # Every row of this frame belongs to the same year.
    year += [years]*len(cname_this)
    name += zip(df['Last Name'], df['First Name'])
    # One list of institution names per row; this call also registers a URI in
    #  institution_ids for each institution it has not seen before.
    aff += [normalized_institutions(i) for i in df['Affiliation']]
    loc += [strip_nopunc(l) for l in df['Location'].values]
    position += [strip_clean(p) for p in df['Position at Affiliation'].values]
    
# A course is identified by its name followed by its year, in title case.
cname = ['{0} {1}'.format(c, y).title() for c, y in zip(cname, year)]
# URIs are looked up (and minted when missing) under lowercase keys.
course_uri = [get_course_uri(c.lower()) for c in cname]
coursegroup_uri = [get_coursegroup_uri(c) for c in coursegroup]
# normalized_person() returns ((last, first), uri) for each row.
name, name_uri = zip(*[normalized_person(n[0], n[1]) for n in name])
coursegroup = [c.title() for c in coursegroup]
lastname, firstname = zip(*name)
lastname = [n.title() for n in lastname]
firstname = [n.title() for n in firstname]
role = [r.title() for r in role]
position = [p.title() for p in position]
# The URIs must be looked up before `aff` is title-cased below, because
#  institution_ids is keyed by the names as returned by normalized_institutions().
aff_uris = [[institution_ids[i] for i in ai] for ai in aff]
loc_uris = [get_location_uri(l) for l in loc]
aff = [[a.title() for a in ai] for ai in aff]

In [101]:
Counter([len(a) for a in aff])


Out[101]:
Counter({1: 6490, 2: 182, 3: 3})

In [102]:
# One attendance record per course-list row, in the column order listed here.
attendance_columns = ['Year', 'Last Name', 'Person URI', 'Course Name', 'First Name',
                      'Role', 'Course URI']
attendance_records = zip(year, lastname, name_uri, cname, firstname, role, course_uri)
attendance_wes = pd.DataFrame(data=attendance_records, columns=attendance_columns)

In [103]:
attendance_wes[:5]


Out[103]:
Year Last Name Person URI Course Name First Name Role Course URI
0 2009 Elmendorf http://history.archives.mbl.edu/concepts/perso... Biology Of Parasitism 2009 Heidi Course Director http://history.archives.mbl.edu/concepts/cours...
1 2009 Goldberg http://history.archives.mbl.edu/concepts/perso... Biology Of Parasitism 2009 Daniel Course Director http://history.archives.mbl.edu/concepts/cours...
2 2009 Belkaid http://history.archives.mbl.edu/concepts/perso... Biology Of Parasitism 2009 Yasmine Faculty http://history.archives.mbl.edu/concepts/cours...
3 2009 Deitsch http://history.archives.mbl.edu/concepts/perso... Biology Of Parasitism 2009 Kirk Faculty http://history.archives.mbl.edu/concepts/cours...
4 2009 Heiges http://history.archives.mbl.edu/concepts/perso... Biology Of Parasitism 2009 Mark Faculty http://history.archives.mbl.edu/concepts/cours...

In [104]:
# Location URI,Last Name,Person URI,First Name,Location,Year
# Rows with an empty or missing location, or without a location URI, are left out.
locations_raw = [
    (place_uri, surname, person_uri, forename, place, yr)
    for place_uri, surname, person_uri, forename, place, yr
    in zip(loc_uris, lastname, name_uri, firstname, loc, year)
    if place != '' and place is not None and place_uri is not None
]
locations_wes = pd.DataFrame(data=locations_raw,
                             columns=['Location URI', 'Last Name', 'Person URI', 'First Name', 'Location', 'Year'])

In [105]:
locations_wes[:5]


Out[105]:
Location URI Last Name Person URI First Name Location Year
0 http://history.archives.mbl.edu/concepts/locat... Pell http://history.archives.mbl.edu/concepts/perso... Jonathan denver 2009
1 http://history.archives.mbl.edu/concepts/locat... Bryan http://history.archives.mbl.edu/concepts/perso... Susan irvine 2009
2 http://history.archives.mbl.edu/concepts/locat... Gardiner http://history.archives.mbl.edu/concepts/perso... David irvine 2009
3 http://history.archives.mbl.edu/concepts/locat... Levine http://history.archives.mbl.edu/concepts/perso... Michael berkeley 2009
4 http://history.archives.mbl.edu/concepts/locat... Grieco http://history.archives.mbl.edu/concepts/perso... Theresa berkeley 2009

In [106]:
# One record per (person, institution) pair: a row naming several institutions
#  yields several records.
affiliations_raw = []
for i, (inst_names, inst_uris) in enumerate(zip(aff, aff_uris)):
    for inst_name, inst_uri in zip(inst_names, inst_uris):
        affiliations_raw.append((inst_uri, lastname[i], year[i], firstname[i],
                                 name_uri[i], position[i], inst_name))

In [107]:
# Institution URI,Last Name,Year,First Name,Person URI,Position,Institution
affiliation_columns = ['Institution URI', 'Last Name', 'Year', 'First Name',
                       'Person URI', 'Position', 'Institution']
affiliations_wes = pd.DataFrame(data=affiliations_raw, columns=affiliation_columns)

In [108]:
# Build parallel lists, one entry per row of the yearly investigator lists, from
#  which the investigator, affiliation and location tables are assembled.
role = []
year = []
name = []
aff = []
loc = []
for df, years in zip([investigators_2012, investigators_2013], [2012, 2013]):
    role += [strip_clean(r) for r in df['Role']]
    # One year entry per row of *this* frame. `role` accumulates across frames,
    #  so its length would over-count from the second frame onwards and leave
    #  `year` longer than the other lists.
    year += [years]*len(df)
    name += zip(df['Last Name'], df['First Name'])
    # One list of institution names per row; this call also registers a URI in
    #  institution_ids for each institution it has not seen before.
    aff += [normalized_institutions(i) for i in df['Affiliation']]
    loc += [strip_nopunc(l) for l in df['Location'].values]
    
# normalized_person() returns ((last, first), uri) for each row.
name, name_uri = zip(*[normalized_person(n[0], n[1]) for n in name])
lastname, firstname = zip(*name)
lastname = [n.title() for n in lastname]
firstname = [n.title() for n in firstname]
role = [r.title() for r in role]
# The URIs must be looked up before `aff` is title-cased below, because
#  institution_ids is keyed by the names as returned by normalized_institutions().
aff_uris = [[institution_ids[i] for i in ai] for ai in aff]
loc_uris = [get_location_uri(l) for l in loc]
aff = [[a.title() for a in ai] for ai in aff]
# No position is read for investigators; use an empty value for every row.
position = ['']*len(aff)

In [109]:
# One record per (investigator, institution) pair: a row naming several
#  institutions yields several records.
affiliations_raw = []
for i, (inst_names, inst_uris) in enumerate(zip(aff, aff_uris)):
    for inst_name, inst_uri in zip(inst_names, inst_uris):
        affiliations_raw.append((inst_uri, lastname[i], year[i], firstname[i],
                                 name_uri[i], position[i], inst_name))

In [110]:
print len(affiliations_wes),
affiliations_wes = affiliations_wes.append(pd.DataFrame(data=affiliations_raw, 
                                           columns=['Institution URI', 'Last Name', 'Year', 'First Name', 
                                                  'Person URI', 'Position', 'Institution']))
print len(affiliations_wes)


6863 7243

In [111]:
affiliations_wes[:5]


Out[111]:
Institution URI Last Name Year First Name Person URI Position Institution
0 http://history.archives.mbl.edu/concepts/insti... Elmendorf 2009 Heidi http://history.archives.mbl.edu/concepts/perso... Georgetown University
1 http://history.archives.mbl.edu/concepts/insti... Goldberg 2009 Daniel http://history.archives.mbl.edu/concepts/perso... Washington University In St Louis School Of Me...
2 http://history.archives.mbl.edu/concepts/insti... Belkaid 2009 Yasmine http://history.archives.mbl.edu/concepts/perso... National Institutes Of Health
3 http://history.archives.mbl.edu/concepts/insti... Deitsch 2009 Kirk http://history.archives.mbl.edu/concepts/perso... Weill Medical College Of Cornell University
4 http://history.archives.mbl.edu/concepts/insti... Heiges 2009 Mark http://history.archives.mbl.edu/concepts/perso... University Of Georgia

In [112]:
# One investigator record per row; 'Subject' is filled with empty strings.
investigator_columns = ['Last Name', 'Person URI', 'First Name', 'Role', 'Year', 'Subject']
investigator_records = zip(lastname, name_uri, firstname, role, year, ['']*len(year))
investigators_wes = pd.DataFrame(data=investigator_records, columns=investigator_columns)

In [113]:
investigators_wes[:5]


Out[113]:
Last Name Person URI First Name Role Year Subject
0 Goldman http://history.archives.mbl.edu/concepts/perso... Robert Director 2012
1 Akkin http://history.archives.mbl.edu/concepts/perso... Taner Investigator 2012
2 Andersen http://history.archives.mbl.edu/concepts/perso... Bruce Investigator 2012
3 Armstrong http://history.archives.mbl.edu/concepts/perso... Peter Investigator 2012
4 Augustine http://history.archives.mbl.edu/concepts/perso... George Investigator 2012

In [114]:
# Location URI,Last Name,Person URI,First Name,Location,Year
locations_raw = []
for i in xrange(len(loc)):
    if loc[i] != '' and loc[i] is not None and loc_uris[i] is not None:
        locations_raw.append((loc_uris[i], lastname[i], name_uri[i], firstname[i], loc[i], year[i]))
print len(locations_wes),
locations_wes = locations_wes.append(pd.DataFrame(data=locations_raw,
                                     columns=['Location URI', 'Last Name', 'Person URI', 'First Name', 'Location', 'Year']))
print len(locations_wes)


435 555

In [115]:
int('Biomedical Informatics 2012'[-4:])


Out[115]:
2012

In [120]:
# One record per distinct course name: (year, course URI, group URI, group, course).
#  The year is read from the last four characters of the course name.
course_to_group = dict(zip(cname, coursegroup))
cg_raw = []
for c, cg in course_to_group.items():
    cg_raw.append((int(c[-4:]), get_course_uri(c.lower()), get_coursegroup_uri(cg.lower()), cg, c))

In [126]:
# Course / course-group assignments derived from the new course lists.
coursegroups_wes = pd.DataFrame(
    data=cg_raw,
    columns=['Year', 'Course URI', 'Course Group URI', 'Course Group', 'Course Name'],
)

In [127]:
coursegroups_wes[:5]


Out[127]:
Year Course URI Course Group URI Course Group Course Name
0 2012 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Biomedical Informatics Biomedical Informatics 2012
1 2010 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Biology Of Parasitism Biology Of Parasitism 2010
2 2009 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Biomedical Informatics Biomedical Informatics I 2009
3 2009 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Workshop On Molecular Evolution Workshop On Molecular Evolution 2009
4 2009 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Biomedical Informatics Biomedical Informatics Ii 2009

In [128]:
coursegroups[:5]


Out[128]:
Year Course URI Course Group URI Course Group Course Name
0 1890 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Coelenterates Coelenterates 1890
1 1891 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Botany Botany 1891
2 1891 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Zoology Zoology 1891
3 1892 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Botany Botany 1892
4 1892 http://history.archives.mbl.edu/concepts/cours... http://history.archives.mbl.edu/concepts/cours... Embryology Embryology 1892

In [129]:
# Stack each newly built table underneath its previously cleaned counterpart.
affiliations_combined = pd.concat([affiliations, affiliations_wes])
attendance_combined = pd.concat([attendance, attendance_wes])
coursegroups_combined = pd.concat([coursegroups, coursegroups_wes])
investigators_combined = pd.concat([investigators, investigators_wes])
locations_combined = pd.concat([locations, locations_wes])

In [132]:
print len(affiliations), len(affiliations_combined)
print len(attendance), len(attendance_combined)
print len(coursegroups), len(coursegroups_combined)
print len(investigators), len(investigators_combined)
print len(locations), len(locations_combined)


52500 59743
46909 53584
1095 1201
7996 8374
47248 47803

Summary


In [144]:
import matplotlib.pyplot as plt

In [152]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['f', 'mean']
`%matplotlib` prevents importing * from pylab and numpy

In [140]:
# Keep only the records that actually carry a location.
has_location = locations_combined['Location'].notnull()
locations_finite = locations_combined[has_location]

In [153]:
# Number of location records per year.
location_year_counts = locations_finite.Year.value_counts()
plt.bar(location_year_counts.index, location_year_counts.values)


Out[153]:
<Container object of 119 artists>

In [154]:
# Number of attendance records per year.
attendance_year_counts = attendance_combined.Year.value_counts()
plt.bar(attendance_year_counts.index, attendance_year_counts.values)


Out[154]:
<Container object of 122 artists>

In [155]:
# Number of affiliation records per year.
affiliation_year_counts = affiliations_combined.Year.value_counts()
plt.bar(affiliation_year_counts.index, affiliation_year_counts.values)


Out[155]:
<Container object of 124 artists>

In [ ]: