In [136]:
import pandas as pd
import os
from unidecode import unidecode
from uuid import uuid4
from collections import Counter
import string
import re
import numpy as np
import Levenshtein
In [2]:
def strip_punctuation(s):
    """Return `s` with every ASCII punctuation character removed."""
    punctuation = set(string.punctuation)
    kept = [ch for ch in s if ch not in punctuation]
    return ''.join(kept)
In [3]:
def strip_clean(obj):
    """Normalize a name fragment: trim, lowercase, and drop periods.

    Values without string methods (e.g. NaN from pandas) yield ''.
    """
    try:
        trimmed = obj.strip()
        lowered = trimmed.lower()
        return lowered.replace('.', '')
    except AttributeError:
        return ''
In [4]:
def strip_nopunc(obj):
    """Lowercase then trim `obj`; '' for values without string methods.

    NOTE(review): despite the name, punctuation is NOT removed here.
    """
    try:
        lowered = obj.lower()
        return lowered.strip()
    except AttributeError:
        return ''
In [5]:
# Global lookup tables keyed by cleaned institution name. They are
# re-populated from the institution authority file in later cells and
# extended on the fly by normalized_institutions().
institutions_lookup = {}
institution_ids = {}
In [6]:
def cleaned_institutions(inames):
    """Normalize a raw 'Affiliation' value for matching.

    The value is stripped of padding whitespace, lowercased, has
    ampersands rewritten as 'and' and hyphens treated as spaces, and is
    finally transliterated to ASCII with unidecode.
    """
    text = unicode(inames).strip().lower()
    text = text.replace('&', 'and').replace('-', ' ')
    return unidecode(text)
In [7]:
def cleaned_person(last, first):
    """Return a (lastname, firstname) pair normalized for matching.

    The forename is split on periods and runs of non-word characters,
    empty pieces are dropped, and the remainder is re-joined with single
    spaces before a final cleaning pass.
    """
    lastname = unidecode(strip_clean(unicode(last)))
    pieces = re.split('\.|\W+', unidecode(strip_clean(unicode(first))))
    joined = ' '.join(n.strip(' ') for n in pieces if n != '')
    firstname = strip_clean(joined.strip().replace('.', ''))
    return lastname, firstname
In [8]:
def normalized_person(last, first):
    """
    Generates a normalized representation of a personal name.

    Applies the disambiguation rules accumulated in ``person_map`` and
    returns the canonical (last, first) tuple plus its URI, minting and
    registering a fresh URI for names seen for the first time.
    """
    name = cleaned_person(last, first)
    normed_name = person_map.get(name, name)
    if normed_name in person_ids:
        uri = person_ids[normed_name]
    else:
        uri = 'http://history.archives.mbl.edu/concepts/person/{0}'.format(uuid4())
        person_ids[normed_name] = uri
    return normed_name, uri
In [9]:
def normalized_institutions(inames):
    """
    Generates a normalized representation of an institutional name.

    Returns a list of normalized names -- one entry per '/'-separated
    institution in the raw value -- registering a URI for any name not
    yet seen in ``institution_ids``.
    """
    anames = []
    affs = cleaned_institutions(inames)
    # The 'Affiliation' field can contain multiple institutions, separated by a slash ('/').
    for aff in affs.split('/'):
        # All punctuation is removed, and runs of spaces are collapsed to one.
        # BUG FIX: the original called .replace(' ', ' ') -- a no-op with the
        # same character on both sides -- so double spaces survived (the
        # inline comment said they should be converted to single spaces).
        aff = re.sub(' +', ' ', strip_punctuation(aff.strip()))
        # One source of variation in names is the inclusion of 'the'.
        # We simply remove 'the' from all names.
        aff = ' '.join(word for word in aff.split(' ') if word != 'the')
        # In a previous step, we generated aggregation rules for some names.
        # If there were multiple similar names, this retrieves the most likely.
        if aff in institutions_lookup:
            aff = institutions_lookup[aff]
        anames.append(aff)
        if aff not in institution_ids:
            uri = 'http://history.archives.mbl.edu/concepts/institution/{0}'.format(uuid4())
            institution_ids[aff] = uri
    return anames
In [10]:
def get_location_uri(location):
    """Return (minting and registering if needed) the URI for a location.

    Empty or null locations yield None.
    """
    if location == '' or pd.isnull(location):
        return None
    if location in location_ids:
        return location_ids[location]
    uri = 'http://history.archives.mbl.edu/concepts/location/{0}'.format(uuid4())
    location_ids[location] = uri
    return uri
In [11]:
def get_coursegroup_uri(coursegroup):
    """Return the URI for a course group, minting one on first sight."""
    if coursegroup in coursegroup_ids:
        return coursegroup_ids[coursegroup]
    uri = 'http://history.archives.mbl.edu/concepts/coursegroup/{0}'.format(uuid4())
    coursegroup_ids[coursegroup] = uri
    return uri
In [12]:
def get_course_uri(cname):
    """Return the URI for a course name, minting one on first sight."""
    if cname in course_ids:
        return course_ids[cname]
    uri = 'http://history.archives.mbl.edu/concepts/course/{0}'.format(uuid4())
    course_ids[cname] = uri
    return uri
In [13]:
def clean_coursename(cname):
    """Normalize a raw course name: trim, lowercase, rewrite '&' as 'and',
    and spell out course numbers ('1' -> 'i', '2' -> 'ii')."""
    cleaned = cname.strip().lower()
    for old, new in (('&', 'and'), ('1', 'i'), ('2', 'ii')):
        cleaned = cleaned.replace(old, new)
    return cleaned
In [14]:
def normalize_coursename(cname):
    """Return (canonical course name, course group) for a raw name.

    Typos are corrected via ``course_map``; group membership comes from
    ``coursegroup_map``, defaulting to the course's own (corrected) name.
    """
    cleaned = clean_coursename(unidecode(cname))
    cleaned = course_map.get(cleaned, cleaned)
    group = coursegroup_map.get(cleaned, cleaned)
    return cleaned, group
In [15]:
# Load the authority files produced by an earlier pass; each maps an
# entity name onto a stable URI.
# NOTE(review): absolute local path -- will break on any other machine.
apath = '/Users/erickpeirson/Projects/MBL-Data/Authority Files/'
courseAuthority = pd.read_csv(os.path.join(apath, 'course.csv'))
courseGroupAuthority = pd.read_csv(os.path.join(apath, 'coursegroup.csv'))
institutionAuthority = pd.read_csv(os.path.join(apath, 'institution.csv'))
locationAuthority = pd.read_csv(os.path.join(apath, 'location.csv'))
personAuthority = pd.read_csv(os.path.join(apath, 'person.csv'))
In [16]:
# Peek at the authority tables.
courseAuthority[:5]
Out[16]:
In [17]:
personAuthority[:5]
Out[17]:
In [18]:
# Map raw (Last Name, First Name) pairs straight to their authority URIs.
# NOTE(review): personLookup does not appear to be used again below.
personLookup = dict(zip(zip(personAuthority['Last Name'].values, personAuthority['First Name'].values),
personAuthority['Person URI'].values))
In [19]:
# Re-clean the authority names so they key on the same normal form that
# cleaned_person() produces for the raw spreadsheets.
cleaned_names = [cleaned_person(last, first) for last, first in zip(personAuthority['Last Name'], personAuthority['First Name'])]
In [20]:
person_ids = dict(zip(cleaned_names, personAuthority['Person URI'].values))
In [21]:
# Peek at the institution authority table.
institutionAuthority[:5]
Out[21]:
In [22]:
# Normalize each authority institution name with the same cleaning used on
# the raw data. BUG FIX: the original indexed `[0]` into the result, but
# cleaned_institutions() returns a string, so the lookup keys built in the
# next cells were single characters and could never match a full cleaned
# name in normalized_institutions().
inormed = [cleaned_institutions(i) for i in institutionAuthority.Institution.values]
In [23]:
# Identity map for now; aggregation rules (variant -> canonical) would go here.
institutions_lookup = dict(zip(inormed, inormed))
In [24]:
institution_ids = dict(zip(inormed, institutionAuthority['Institution URI'].values))
In [25]:
locationAuthority[:5]
Out[25]:
In [26]:
location_ids = dict(zip([strip_nopunc(v) for v in locationAuthority['Location'].values], locationAuthority['Location URI']))
In [96]:
courseAuthority[:5]
Out[96]:
In [97]:
# NOTE(review): course names are keyed with strip_nopunc() here but looked
# up elsewhere via lowercased/clean_coursename() forms -- confirm the two
# normal forms agree for all authority rows.
course_ids = dict(zip([strip_nopunc(v) for v in courseAuthority['Course Name'].values], courseAuthority['Course URI']))
In [98]:
courseGroupAuthority[:5]
Out[98]:
In [99]:
coursegroup_ids = dict(zip([clean_coursename(v) for v in courseGroupAuthority['Course Group'].values], courseGroupAuthority['Course Group URI']))
In [31]:
# NOTE(review): absolute local path -- will break on any other machine.
datapath = '/Users/erickpeirson/Projects/MBL-Data/Cleaned Data/'
In [32]:
# Previously-cleaned datasets, to which the Wes spreadsheets are appended below.
affiliations = pd.read_csv(os.path.join(datapath, 'cleaned_affiliations.csv'))
In [33]:
affiliations[:5]
Out[33]:
In [34]:
attendance = pd.read_csv(os.path.join(datapath, 'cleaned_attendance.csv'))
In [125]:
coursegroups = pd.read_csv(os.path.join(datapath, 'cleaned_coursegroups.csv'))
In [36]:
investigators = pd.read_csv(os.path.join(datapath, 'cleaned_investigators.csv'))
In [37]:
locations = pd.read_csv(os.path.join(datapath, 'cleaned_locations.csv'))
In [38]:
# Original (uncleaned) spreadsheets from Wes.
wdpath = '/Users/erickpeirson/Projects/MBL-Data/Original Data/Wes - Updated/'
In [39]:
investigators_2012 = pd.read_csv(os.path.join(wdpath, 'MBL INVESTIGATOR WES 2012.csv'), encoding='utf-8')
investigators_2013 = pd.read_csv(os.path.join(wdpath, 'MBL INVESTIGATOR WES 2013.csv'), encoding='utf-8')
attendance_2009 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2009.csv'), encoding='utf-8')
attendance_2010 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2010.csv'), encoding='utf-8')
attendance_2011 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2011.csv'), encoding='utf-8')
attendance_2012 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2012.csv'), encoding='utf-8')
attendance_2013 = pd.read_csv(os.path.join(wdpath, 'MBL COURSE LIST WES 2013.csv'), encoding='utf-8')
In [41]:
# All frames that carry personal names (and affiliations).
withNames = [attendance_2009, attendance_2010, attendance_2011, attendance_2012, attendance_2013, investigators_2012, investigators_2013]
In [42]:
# Accumulators filled by the harvesting loop below.
personal_names = set([])
f_personal_names = Counter()
personal_affiliations = {}
In [43]:
withCourses = [attendance_2009, attendance_2010, attendance_2011, attendance_2012, attendance_2013]
courseYears = [2009, 2010, 2011, 2012, 2013]
In [44]:
# Harvest every (last, first) name, its frequency, and its institutional
# affiliations from the cleaned affiliation file plus all raw spreadsheets.
for df in [affiliations] + withNames:
    last, first = zip(*[cleaned_person(l, f) for l, f in zip(df['Last Name'].values, df['First Name'].values)])
    # The affiliation column is named inconsistently across spreadsheets.
    if 'Affiliation' in df.columns:
        affs = [normalized_institutions(inst) for inst in df['Affiliation'].values]
    elif 'Institution' in df.columns:
        affs = [normalized_institutions(inst) for inst in df['Institution'].values]
    else:
        # BUG FIX: previously `affs` silently carried over from the prior
        # frame when neither column was present; give each person an empty
        # affiliation list instead.
        affs = [[] for _ in xrange(len(df))]
    names = zip(last, first)
    personal_names |= set(names)
    for n, v in Counter(names).items():
        f_personal_names[n] += v
    for name, aff in zip(names, affs):
        if name not in personal_affiliations:
            personal_affiliations[name] = set([])
        for affiliation in aff:
            personal_affiliations[name].add(affiliation)
In [45]:
# Group forenames under each surname so that candidate duplicate names can
# be compared surname-by-surname (A and B share an identical surname).
personal_names_list = list(personal_names)
N_names = len(personal_names)
by_last = {}
for lastname, firstname in personal_names_list:
    by_last.setdefault(lastname, set()).add(firstname)
In [46]:
# Build person_map: (last, variantFirst) -> (last, canonicalFirst) for pairs
# of names judged to refer to the same person (matching forename parts AND
# at least one shared institutional affiliation).
person_map = {}
for last, firsts in by_last.iteritems(): # We assume that surnames are not misspelled.
N_firsts = len(firsts) # This is not strictly true, but it is not
if N_firsts > 1: # quite clear how to proceed otherwise.
lfirsts = list(firsts) # Consider cases in which two names, I and J,
for i in xrange(N_firsts): # have a common surname.
iname = lfirsts[i]
inames = iname.split(' ')
# NOTE(review): an empty forename makes inames == [''] and the initial
# extraction below would raise IndexError -- presumably empty forenames
# cannot reach this point; verify upstream cleaning.
iinits = [f[0] for f in inames ]
for j in xrange(i+1, N_firsts):
jname = lfirsts[j]
jnames = jname.split(' ')
jinits = [f[0] for f in jnames ]
# For each such pair, I and J, we compare the X parts of their forenames,
# where X is the minimum number of forename parts for I and J.
match = True
for x in xrange(min( [len(inames), len(jnames)] )):
# If the x part if either forename is an initial, we evaluate
# only the first character of the two parts.
if len(inames[x]) == 1 or len(jnames[x]) == 1:
if iinits[x] != jinits[x]:
match = False
# Otherwise, the x part of the two forenames must be identical.
else:
if inames[x] != jnames[x]:
match = False
if match:
# If the forenames of I and J match, as described above, we check
# to see whether they share at least one institutional affiliation.
shared = personal_affiliations[(last, iname)] & personal_affiliations[(last, jname)]
if len(shared) > 0:
# If they share at least one institutional affiliation, then
# we believe that I and J both refer to the same person.
if len(iname) > len(jname): # Use the longest name (most complete).
key = jname
alt = iname
else:
key = iname
alt = jname
# If the canonical candidate is itself already an alias, follow
# the chain of aliases until it terminates (KeyError) so every
# variant points at the terminal forename.
if (last, alt) in person_map:
top = False
while not top:
try:
alt = person_map[(last,alt)][1]
except KeyError:
top = True
person_map[(last, key)] = (last, alt)
# If the conditions above are not satisfied, then we lack sufficient evidence to
# assert that the names I and J refer to the same person.
print len(person_map)
In [47]:
# Register location URIs for the 2011 attendance sheet.
# NOTE(review): the resulting list is overwritten later (In [100]); this
# cell seems to matter only for its side effect on location_ids -- confirm.
loc = [get_location_uri(l) for l in attendance_2011['Location'].values]
In [48]:
attendance[:5]
Out[48]:
In [49]:
def mean(seq):
    """Arithmetic mean of a numeric sequence (raises on empty input)."""
    return sum(seq) / float(len(seq))
In [50]:
# Collect the set of distinct (cleaned) course names with their frequencies,
# then flag near-duplicate pairs by normalized Levenshtein distance.
course_names = set([])
f_course_names = Counter()
for df in withCourses:
name = [clean_coursename(c) for c in df['Course Name'].values]
course_names |= set(name)
for n in name:
f_course_names[n] += 1
print len(course_names)
# Pairwise edit distances, normalized by the mean length of the two names.
distances = []
course_names_list = list(course_names)
for i in xrange(len(course_names_list)):
for j in xrange(i+1, len(course_names_list)):
a = course_names_list[i]
b = course_names_list[j]
d = Levenshtein.distance(a,b)
dnorm = float(d)/mean([float(len(a)), float(len(b))])
distances.append( (i,j,d, dnorm) )
# 0.17 is an empirically chosen similarity threshold; candidates are printed
# for manual review (these informed the hand-built course_map below).
for d in distances:
if d[3] < 0.17:
a = course_names_list[d[0]]
b = course_names_list[d[1]]
dnorm = d[3]
print d[3], d[2]
print f_course_names[a], '\t', a
print f_course_names[b], '\t', b
print '-'*40
In [76]:
# The course_map handles typographical errors in the dataset. There are remarkably
# few typos. We specify the appropriate spellings manually, below.
# BUG FIX: the original dict listed the key
# 'summer program in neuroscience, ethics $ survival' twice with two
# DIFFERENT canonical spellings ('...ethics and survival' vs.
# '...ethics, and survival'); the later entry silently won, leaving the
# 'spines--' variant mapped to a different canonical than the '$' variant.
# Both variants now map to one spelling -- confirm the preferred canonical
# form against the course authority file.
course_map = {
    'optimal microscopy': 'optical microscopy',
    'optimal microscopy and imaging in the biomedical sciences': 'optical microscopy and imaging in the biomedical sciences',
    'nasa planetary biology inernship': 'nasa planetary biology internship',
    'summer program in neuroscience, ethics $ survival': 'summer program in neuroscience, ethics and survival',
    'spines--summer program in neuroscience, ethics and survival': 'summer program in neuroscience, ethics and survival',
    'physiology: modern cell biology using microscopic, biochemical and computational approaches': 'physiology: modern cell biology using microscopic, biochemical, and computational approaches',
    'physiology: cell and molecular biology': 'physiology: cellular and molecular biology',
    'parthogenesis of neuroimmunologic diseases': 'pathogenesis of neuroimmunologic diseases',
    'zebrafish developmental and genetics': 'zebrafish development and genetics',
    'analystical and quantitative light microscopy': 'analytical and quantitative light microscopy',
    'strategies and techniques for analyzing microbial population structure': 'strategies and techniques for analyzing microbial population structures',
    'embryology: concepts and techniques in modern development': 'embryology: concepts and techniques in modern developmental biology',
}
# If two courses share the same name, then we generally consider them to belong to the
# same course group. For example, the 'Ecology' course in 1934 (say) belongs to the
# same group (or series) as the 'Ecology' course in 1965 (say).
#
# In some cases, however, courses with slightly (or perhaps very) different names
# may belong to the same group. For example, an Embryology course might have some
# subtitle, like: "Embryology: Some great new theme for this course". Or they might
# be numbered, like "Biomedical informatics I" and "Biomedical informatics II".
#
# The coursegroup_map handles the latter cases. Keys are specific course names that
# occur in the dataset, and values are the group names that should be used.
# Some of these mappings are given manually, based on inspection of the dataset.
# Other mappings are generated by looking for colons (':') in course names; the
# part of the name before the colon is assumed to be the proper group name.
# (Exact-duplicate entries in the original dict have been removed; they had
# identical values and were harmless but misleading.)
coursegroup_map = {
    'small computers in biomedical research, i': 'small computers in biomedical research',
    'small computers in biomedical research, ii': 'small computers in biomedical research',
    'medical informatics': 'biomedical informatics',
    'medical informatics i': 'biomedical informatics',
    'medical informatics ii': 'biomedical informatics',
    'biomedical informatics': 'biomedical informatics',
    'biomedical informatics i': 'biomedical informatics',
    'biomedical informatics ii': 'biomedical informatics',
    'biomedical informatics (fall)': 'biomedical informatics',
    'biomedical informatics (spring)': 'biomedical informatics',
    'advanced workshop on recombinant dna methodology': 'workshop on recombinant dna methodology',
    'basic workshop on recombinant dna methodology': 'workshop on recombinant dna methodology',
    'optical microscopy and imaging in the biomedical sciences': 'optical microscopy',
    'gene regulatory networks for development': 'gene regulatory networks'
}
In [77]:
for cname in list(course_names): # Here we look for course names with subtitles,
parts = cname.split(':') # characterized by a colon (':') in their names.
if len(parts) > 1:
coursegroup_map.update({cname:parts[0]})
print len(coursegroup_map)
In [100]:
# Flatten all attendance spreadsheets into parallel lists, one entry per row.
coursegroup = []
role = []
cname = []
year = []
name = []
aff = []
loc = []
position = []
for df, years in zip(withCourses, courseYears):
role += [strip_clean(r) for r in df['Role']]
cname_this, coursegroup_this = zip(*[normalize_coursename(c.lower()) for c in df['Course Name'].values])
coursegroup += coursegroup_this
cname += cname_this
year += [years]*len(cname_this)
name += zip(df['Last Name'], df['First Name'])
aff += [normalized_institutions(i) for i in df['Affiliation']]
loc += [strip_nopunc(l) for l in df['Location'].values]
position += [strip_clean(p) for p in df['Position at Affiliation'].values]
# Course names become '<Name> <Year>' in title case; URIs are keyed on the
# lowercased form. Note: this rebinds `cname` to the formatted names.
cname = ['{0} {1}'.format(c, y).title() for c, y in zip(cname, year)]
course_uri = [get_course_uri(c.lower()) for c in cname]
coursegroup_uri = [get_coursegroup_uri(c) for c in coursegroup]
name, name_uri = zip(*[normalized_person(n[0], n[1]) for n in name])
coursegroup = [c.title() for c in coursegroup]
lastname, firstname = zip(*name)
lastname = [n.title() for n in lastname]
firstname = [n.title() for n in firstname]
role = [r.title() for r in role]
position = [p.title() for p in position]
# A person can hold several affiliations per row; keep lists of URIs/names.
aff_uris = [[institution_ids[i] for i in ai] for ai in aff]
loc_uris = [get_location_uri(l) for l in loc]
aff = [[a.title() for a in ai] for ai in aff]
In [101]:
# Distribution of affiliation counts per row (sanity check).
Counter([len(a) for a in aff])
Out[101]:
In [102]:
attendance_wes = pd.DataFrame(data=zip(year, lastname, name_uri, cname, firstname, role, course_uri),
columns=['Year', 'Last Name', 'Person URI', 'Course Name', 'First Name',
'Role', 'Course URI'])
In [103]:
attendance_wes[:5]
Out[103]:
In [104]:
# Location URI,Last Name,Person URI,First Name,Location,Year
# Keep only rows with a non-empty location and a minted location URI.
locations_raw = []
for i in xrange(len(loc)):
if loc[i] != '' and loc[i] is not None and loc_uris[i] is not None:
locations_raw.append((loc_uris[i], lastname[i], name_uri[i], firstname[i], loc[i], year[i]))
locations_wes = pd.DataFrame(data=locations_raw,
columns=['Location URI', 'Last Name', 'Person URI', 'First Name', 'Location', 'Year'])
In [105]:
locations_wes[:5]
Out[105]:
In [106]:
# Explode per-row affiliation lists into one record per (person, institution).
affiliations_raw = []
for i in xrange(len(aff)):
a = aff[i]
for x in xrange(len(a)):
affiliations_raw.append((aff_uris[i][x], lastname[i], year[i], firstname[i], name_uri[i], position[i], aff[i][x]))
In [107]:
# Institution URI,Last Name,Year,First Name,Person URI,Position,Institution
affiliations_wes = pd.DataFrame(data=affiliations_raw,
columns=['Institution URI', 'Last Name', 'Year', 'First Name',
'Person URI', 'Position', 'Institution'])
In [108]:
# Repeat the flattening for the investigator spreadsheets (2012-2013).
# NOTE(review): this clobbers role/year/name/aff/loc/position from the
# attendance pass above -- intentional here, but order-sensitive on re-run.
role = []
year = []
name = []
aff = []
loc = []
for df, years in zip([investigators_2012, investigators_2013], [2012, 2013]):
    role += [strip_clean(r) for r in df['Role']]
    # BUG FIX: the original used len(role), which is CUMULATIVE across
    # iterations, so `year` grew longer than the other lists on the second
    # frame (masked downstream only because zip() truncates). Use the
    # current frame's row count instead.
    year += [years]*len(df)
    name += zip(df['Last Name'], df['First Name'])
    aff += [normalized_institutions(i) for i in df['Affiliation']]
    loc += [strip_nopunc(l) for l in df['Location'].values]
name, name_uri = zip(*[normalized_person(n[0], n[1]) for n in name])
lastname, firstname = zip(*name)
lastname = [n.title() for n in lastname]
firstname = [n.title() for n in firstname]
role = [r.title() for r in role]
aff_uris = [[institution_ids[i] for i in ai] for ai in aff]
loc_uris = [get_location_uri(l) for l in loc]
aff = [[a.title() for a in ai] for ai in aff]
# The investigator sheets carry no 'Position at Affiliation' column.
position = ['']*len(aff)
In [109]:
# Explode investigator affiliations and append them to affiliations_wes.
affiliations_raw = []
for i in xrange(len(aff)):
a = aff[i]
for x in xrange(len(a)):
affiliations_raw.append((aff_uris[i][x], lastname[i], year[i], firstname[i], name_uri[i], position[i], aff[i][x]))
In [110]:
# NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is
# the modern equivalent if this notebook is ever re-run on newer pandas.
print len(affiliations_wes),
affiliations_wes = affiliations_wes.append(pd.DataFrame(data=affiliations_raw,
columns=['Institution URI', 'Last Name', 'Year', 'First Name',
'Person URI', 'Position', 'Institution']))
print len(affiliations_wes)
In [111]:
affiliations_wes[:5]
Out[111]:
In [112]:
# Investigators have no subject recorded; fill with empty strings.
investigators_wes = pd.DataFrame(data=zip(lastname, name_uri, firstname, role, year, ['']*len(year)),
columns=['Last Name', 'Person URI', 'First Name', 'Role', 'Year', 'Subject'])
In [113]:
investigators_wes[:5]
Out[113]:
In [114]:
# Location URI,Last Name,Person URI,First Name,Location,Year
# Same filtering as for the attendance rows: drop empty/unminted locations.
locations_raw = []
for i in xrange(len(loc)):
if loc[i] != '' and loc[i] is not None and loc_uris[i] is not None:
locations_raw.append((loc_uris[i], lastname[i], name_uri[i], firstname[i], loc[i], year[i]))
print len(locations_wes),
locations_wes = locations_wes.append(pd.DataFrame(data=locations_raw,
columns=['Location URI', 'Last Name', 'Person URI', 'First Name', 'Location', 'Year']))
print len(locations_wes)
In [115]:
# Sanity check: formatted course names end with their four-digit year.
int('Biomedical Informatics 2012'[-4:])
Out[115]:
In [120]:
# One record per distinct course: (year, course URI, group URI, group, name).
cg_raw = []
for c, cg in dict(zip(cname, coursegroup)).items():
y = int(c[-4:])
curi = get_course_uri(c.lower())
cguri = get_coursegroup_uri(cg.lower())
cg_raw.append((y, curi, cguri, cg, c))
In [126]:
coursegroups_wes = pd.DataFrame(data=cg_raw, columns=['Year', 'Course URI', 'Course Group URI', 'Course Group', 'Course Name'])
In [127]:
coursegroups_wes[:5]
Out[127]:
In [128]:
coursegroups[:5]
Out[128]:
In [129]:
# Merge the newly processed Wes data with the previously cleaned tables.
affiliations_combined = affiliations.append(affiliations_wes)
attendance_combined = attendance.append(attendance_wes)
coursegroups_combined = coursegroups.append(coursegroups_wes)
investigators_combined = investigators.append(investigators_wes)
locations_combined = locations.append(locations_wes)
In [132]:
# Before/after row counts for each combined table.
print len(affiliations), len(affiliations_combined)
print len(attendance), len(attendance_combined)
print len(coursegroups), len(coursegroups_combined)
print len(investigators), len(investigators_combined)
print len(locations), len(locations_combined)
In [144]:
import matplotlib.pyplot as plt
In [152]:
# NOTE(review): %pylab does a wildcard import into the namespace and is
# discouraged; the explicit pyplot import above is what is used below.
%pylab inline
In [140]:
locations_finite = locations_combined[pd.notnull(locations_combined['Location'])]
In [153]:
# Rows per year in each combined table (quick coverage check).
plt.bar(locations_finite.Year.value_counts().index, locations_finite.Year.value_counts().values)
Out[153]:
In [154]:
plt.bar(attendance_combined.Year.value_counts().index, attendance_combined.Year.value_counts().values)
Out[154]:
In [155]:
plt.bar(affiliations_combined.Year.value_counts().index, affiliations_combined.Year.value_counts().values)
Out[155]:
In [ ]: