In [1]:
%pylab inline
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.interpolate import spline
In [3]:
# Directory holding the cleaned MBL CSV exports.
# Set the MBL_DATA_DIR environment variable to point at another location;
# the original author's local path is kept as the fallback so existing
# setups behave exactly as before.
datapath = os.environ.get(
    'MBL_DATA_DIR',
    '/Users/erickpeirson/Projects/MBL-Data/Cleaned Data')
In [4]:
# Read the four cleaned CSV tables found under `datapath` into DataFrames.
attendance = pd.read_csv(os.path.join(datapath, 'cleaned_attendance.csv'))
investigators = pd.read_csv(os.path.join(datapath, 'cleaned_investigators.csv'))
affiliations = pd.read_csv(os.path.join(datapath, 'cleaned_affiliations.csv'))
coursegroups = pd.read_csv(os.path.join(datapath, 'cleaned_coursegroups.csv'))
In [33]:
# Read the institution authority file into a DataFrame.
# NOTE(review): this is an absolute path on one machine and does not go
# through `datapath`; consider deriving it from a configurable base directory.
institutions = pd.read_csv('/Users/erickpeirson/Projects/MBL-Data/Authority Files/institution.csv')
In [84]:
# Keep only the affiliation rows whose 'Institution URI' equals this URI.
# NOTE(review): presumably the University of Chicago record, judging by the
# variable name — confirm against the institution authority file.
chicagoAff = affiliations[affiliations['Institution URI'] == 'http://history.archives.mbl.edu/concepts/institution/966a06a9-34d2-4fa1-ac01-e41887f55ff2']
In [92]:
# Number of distinct 'Person URI' values among the rows in `chicagoAff`.
len(set(chicagoAff['Person URI'].values))
Out[92]:
In [93]:
# Total number of rows in the affiliations table.
len(affiliations)
Out[93]:
In [88]:
# Bar chart of how many `chicagoAff` rows fall in each year.
# The per-year tally is computed once and reused for both bar coordinates.
chicagoAffPerYear = chicagoAff.Year.value_counts()
plt.figure(figsize=(40, 10))
plt.bar(chicagoAffPerYear.index, chicagoAffPerYear.values)
Out[88]:
In [5]:
# Largest value in the attendance table's Year column.
attendance.Year.max()
Out[5]:
In [6]:
# Group the attendance rows by their 'Year' value.
byyear = attendance.groupby('Year')
In [7]:
def isInstructor(role):
    """
    Report whether a role label names teaching staff.

    Parameters
    ----------
    role : str, or an iterable of role labels
        A single role label, or a collection (list, array, Series) of them.

    Returns
    -------
    bool or list of bool
        For a single label: True when its lowercase form contains any of the
        staff keywords, False otherwise. Values without a ``lower`` method
        (e.g. ``None`` or a float NaN) yield False. For a collection: a list
        with one result per element, in order.
    """
    keywords = ('instructor', 'assistant', 'lecturer', 'faculty',
                'coordinator', 'director')

    # Strings expose ``__iter__`` on Python 3, so they must be excluded
    # explicitly; otherwise a scalar label would be iterated character by
    # character and recurse without end.
    if hasattr(role, '__iter__') and not isinstance(role, str):
        return [isInstructor(o) for o in role]

    try:
        lowered = role.lower()
    except AttributeError:
        # Not a text label (missing value etc.): not an instructor.
        return False
    return any(word in lowered for word in keywords)
In [8]:
def isStudent(role):
    """
    Report whether a role label names a student.

    Parameters
    ----------
    role : str, or an iterable of role labels
        A single role label, or a collection (list, array, Series) of them.

    Returns
    -------
    bool or list of bool
        For a single label: True when its lowercase form contains
        ``'student'``. Values without a ``lower`` method (e.g. ``None`` or a
        float NaN) are assumed to be students and yield True. For a
        collection: a list with one result per element, in order.
    """
    # Strings expose ``__iter__`` on Python 3, so they must be excluded
    # explicitly; otherwise a scalar label would be iterated character by
    # character and recurse without end.
    if hasattr(role, '__iter__') and not isinstance(role, str):
        return [isStudent(o) for o in role]

    try:
        return 'student' in role.lower()
    except AttributeError:
        # Assume student by default when the role is missing / non-textual.
        return True
In [56]:
def isChicago(aff):
    """
    Report whether an institution name is exactly 'University Of Chicago'.

    Parameters
    ----------
    aff : str, or an iterable of institution names
        A single name, or a collection (list, array, Series) of them.

    Returns
    -------
    bool or list of bool
        For a single value: True only on an exact, case-sensitive match.
        For a collection: a list with one result per element, in order.
    """
    # Strings expose ``__iter__`` on Python 3, so they must be excluded
    # explicitly; otherwise a scalar name would be iterated character by
    # character and recurse without end.
    if hasattr(aff, '__iter__') and not isinstance(aff, str):
        return [isChicago(o) for o in aff]

    # A plain equality test is sufficient; missing values (None, NaN)
    # simply compare unequal.
    return bool(aff == 'University Of Chicago')
In [57]:
# Build one boolean per attendance row from the Role column, then use the
# masks to split the table into student rows and instructor rows.
studentMask = isStudent(attendance.Role)
instructorMask = isInstructor(attendance.Role.values)
students = attendance[studentMask]
instructors = attendance[instructorMask]
In [58]:
# Investigator rows per year, drawn as a bar chart in ascending year order.
investigatorsByYear = investigators.Year.value_counts()
vOrder = np.argsort(investigatorsByYear.index)  # positions that sort the years
investigatorYears = investigatorsByYear.index[vOrder]
investigatorCounts = investigatorsByYear.values[vOrder]
plt.figure(figsize=(40, 10))
plt.bar(investigatorYears, investigatorCounts)
Out[58]:
In [59]:
# Largest year label present in the investigator counts.
max(investigatorsByYear.index)
Out[59]:
In [60]:
# Instructor rows per year, drawn as a bar chart in ascending year order.
instructorsByYear = instructors.Year.value_counts()
iOrder = np.argsort(instructorsByYear.index)  # positions that sort the years
instructorYears = instructorsByYear.index[iOrder]
instructorCounts = instructorsByYear.values[iOrder]
plt.figure(figsize=(40, 10))
plt.bar(instructorYears, instructorCounts)
Out[60]:
In [61]:
# Largest year label present in the instructor counts.
max(instructorsByYear.index)
Out[61]:
In [62]:
# Student rows per year, drawn as a bar chart in ascending year order.
studentsByYear = students.Year.value_counts()
sOrder = np.argsort(studentsByYear.index)  # positions that sort the years
studentYears = studentsByYear.index[sOrder]
studentCounts = studentsByYear.values[sOrder]
plt.figure(figsize=(40, 10))
plt.bar(studentYears, studentCounts)
Out[62]:
In [63]:
# Largest year label present in the student counts.
max(studentsByYear.index)
Out[63]:
In [64]:
# Affiliation rows whose Institution is exactly 'University Of Chicago',
# their per-year row counts, and the positions that sort those years.
chicagoMask = isChicago(affiliations.Institution)
chicago = affiliations[chicagoMask]
chicagoByYear = chicago.Year.value_counts()
cOrder = np.argsort(chicagoByYear.index)
In [65]:
# Year -> count lookup tables for each population, built from the
# year-sorted index/value pairs of the corresponding value_counts Series.
studentsD = {
    year: count
    for year, count in zip(studentsByYear.index[sOrder], studentsByYear.values[sOrder])
}
instructorsD = {
    year: count
    for year, count in zip(instructorsByYear.index[iOrder], instructorsByYear.values[iOrder])
}
investigatorsD = {
    year: count
    for year, count in zip(investigatorsByYear.index[vOrder], investigatorsByYear.values[vOrder])
}
chicagoD = {
    year: count
    for year, count in zip(chicagoByYear.index[cOrder], chicagoByYear.values[cOrder])
}
In [66]:
# Overall year span covered by instructors, students and investigators.
# `domain` lists every year from `start` to `end`, inclusive.
yearIndexes = (instructorsByYear.index,
               studentsByYear.index,
               investigatorsByYear.index)
start = min(min(years) for years in yearIndexes)
end = max(max(years) for years in yearIndexes)
domain = range(start, end + 1)
In [67]:
# One value per year in `domain` for each population; years absent from a
# lookup table are filled with 0. so all four lists share the same length.
studentsDense = []
instructorsDense = []
investigatorsDense = []
chicagoDense = []
for i in domain:
    studentsDense.append(studentsD.get(i, 0.))
    instructorsDense.append(instructorsD.get(i, 0.))
    investigatorsDense.append(investigatorsD.get(i, 0.))
    chicagoDense.append(chicagoD.get(i, 0.))
In [68]:
# Stacked bar chart of students, instructors and investigators per year,
# annotated with the Porter milestones and a shaded missing-data band.
plt.figure(figsize=(40, 20))
font = {'family': 'georgia', 'size': 32}
matplotlib.rc('font', **font)

# Investigators sit on top of students + instructors.
investigatorBase = np.array(studentsDense) + np.array(instructorsDense)
plt.bar(domain, studentsDense, color='g', alpha=0.4, label='Students')
plt.bar(domain, instructorsDense, bottom=studentsDense,
        color='b', alpha=0.4, label='Instructors')
plt.bar(domain, investigatorsDense, bottom=investigatorBase,
        color='#ff9933', alpha=0.6, label='Investigators')

# Porter highlights: (year, marker height, caption). Each gets a vertical
# line at the year and a right-aligned caption anchored one year earlier.
porterEvents = [
    (1937, 1000, 'Porter hired as Beginning Instructor (1937)'),
    (1975, 1600, 'Porter becomes MBL Director (1975)'),
    (1976, 1800, 'Porter gives special lecture in Neurobiology course (1976)'),
    (1977, 2000, 'Porter gives special lecture in Developmental Biology course (1977)'),
    (1979, 2400, 'Porter is faculty for Electron Microscopy in the Biological Sciences course (1979)'),
]
for eventYear, markerTop, caption in porterEvents:
    plt.plot([eventYear, eventYear], [0., markerTop], color='k', lw=2)
    plt.text(eventYear - 1, markerTop, caption,
             horizontalalignment='right', verticalalignment='top', fontsize=26)

# Missing data: grey band starting at 1903, 5.8 years wide.
ax1 = plt.gca()
ax1.add_patch(plt.Rectangle((1903, 0), 5.8, 3000, color='gray', alpha=0.2, lw=2))

plt.title('Participation in the Marine Biological Laboratory: 1888-2013')
plt.legend(loc=2, fontsize=32)
plt.xlim(start, end - 1)
plt.show()
In [69]:
def rollingAverage(series, yLeft=1, yRight=1):
    """
    Moving average of ``series`` over a window around each position.

    Parameters
    ----------
    series : sequence of numbers
        Input values (list, array or similar).
    yLeft : int, optional
        Number of values to the left of the centre included in the window.
    yRight : int, optional
        Number of values to the right of the centre included in the window.

    Returns
    -------
    numpy.ndarray
        One mean per position that has a complete window, i.e. positions
        ``yLeft`` through ``len(series) - yRight - 1``. With the default
        window the result has ``len(series) - 2`` entries, aligned with
        ``series[1:-1]``. Empty when the series is shorter than the window.
    """
    values = np.asarray(series, dtype=float)
    wSize = yLeft + yRight + 1.
    # Only visit centres whose whole window lies inside the series; the
    # bounds follow the window size so non-default windows never produce
    # wrapped (negative-start) or truncated slices.
    centres = range(yLeft, len(values) - yRight)
    return np.array([values[i - yLeft:i + yRight + 1].sum() / wSize
                     for i in centres])
In [70]:
# Smoothed line plot of the rolling-average Chicago affiliation counts.
plt.figure(figsize=(40, 10))
font = {'family' : 'georgia',
'size' : 32}
matplotlib.rc('font', **font)
# X drops the first and last year so it lines up with the values returned by
# rollingAverage when called with its default window.
X = array(domain[1:-1])
Y = rollingAverage(chicagoDense)
# Evaluate the interpolant on 400 evenly spaced points across X's range.
x_smooth = np.linspace(X.min(), X.max(), 400)
# NOTE(review): scipy.interpolate.spline was deprecated in SciPy 0.19 and
# removed in 1.3; on newer SciPy this call (and the import) needs a
# replacement such as make_interp_spline — confirm the installed version.
y_smooth = spline(X, Y, x_smooth)
plt.plot(x_smooth, y_smooth, lw=2)
plt.xlim(start, end+1)
plt.ylim(0, 16)
plt.show()
In [71]:
# Distinct 'Person URI' values among the rows in `chicago`.
chicagoans = set(chicago['Person URI'])
def isChicagoan(uri, members=None):
    """
    Report whether a person URI belongs to the Chicago-affiliated set.

    Parameters
    ----------
    uri : str, or an iterable of URIs
        A single person URI, or a collection (list, array, Series) of them.
    members : container, optional
        Collection of URIs to test membership against. Defaults to the
        notebook-level ``chicagoans`` set.

    Returns
    -------
    bool or list of bool
        For a single URI: True when it is in ``members``. For a collection:
        a list with one result per element, in order.
    """
    if members is None:
        members = chicagoans

    # Strings expose ``__iter__`` on Python 3, so they must be excluded
    # explicitly; otherwise a scalar URI would be iterated character by
    # character and recurse without end.
    if hasattr(uri, '__iter__') and not isinstance(uri, str):
        return [isChicagoan(o, members) for o in uri]

    return uri in members
In [72]:
# Distinct values of the 'Course Group' column.
# NOTE(review): a later cell filters keys with pd.isnull, so this set
# presumably can contain a missing value — confirm against the data.
allCourseGroups = set(coursegroups['Course Group'].values)
In [73]:
# Restrict the attendance and investigator tables to rows whose 'Person URI'
# is accepted by isChicagoan (i.e. is in the `chicagoans` set).
chicagoAttendance = attendance[isChicagoan(attendance['Person URI'])]
chicagoInvestigators = investigators[isChicagoan(investigators['Person URI'])]
In [75]:
# Per-year row counts for Chicago-affiliated instructors, students and
# investigators.
chicagoInstructorRows = chicagoAttendance[isInstructor(chicagoAttendance.Role)]
chicagoStudentRows = chicagoAttendance[isStudent(chicagoAttendance.Role)]
chicagoInstructorByYear = chicagoInstructorRows.Year.value_counts()
chicagoStudentByYear = chicagoStudentRows.Year.value_counts()
chicagoInvestigatorByYear = chicagoInvestigators.Year.value_counts()
In [127]:
# Distinct people per year ("pared" counts): a person who appears in several
# rows of the same year is counted once.
invYearGrouped = chicagoInvestigators.groupby('Year')
stuYearGrouped = chicagoAttendance[isStudent(chicagoAttendance.Role)].groupby('Year')
insYearGrouped = chicagoAttendance[isInstructor(chicagoAttendance.Role)].groupby('Year')

# Compute the per-year distinct-person tallies once, instead of rebuilding
# the grouped value_counts on every loop iteration.
invPeoplePerYear = invYearGrouped['Person URI'].nunique()
stuPeoplePerYear = stuYearGrouped['Person URI'].nunique()
insPeoplePerYear = insYearGrouped['Person URI'].nunique()

# Years with no rows get 0. Using .get avoids depending on which exception
# (IndexError vs KeyError) a missing label raises in the installed pandas.
cV_pared = [int(invPeoplePerYear.get(year, 0)) for year in domain]
cS_pared = [int(stuPeoplePerYear.get(year, 0)) for year in domain]
cI_pared = [int(insPeoplePerYear.get(year, 0)) for year in domain]
In [76]:
# Positions that sort each Chicago per-year Series by year.
cIorder = np.argsort(chicagoInstructorByYear.index)
cSorder = np.argsort(chicagoStudentByYear.index)
cVorder = np.argsort(chicagoInvestigatorByYear.index)

# Year -> count lookup tables (built from the unsorted index/value pairs).
cID = {
    year: count
    for year, count in zip(chicagoInstructorByYear.index, chicagoInstructorByYear.values)
}
cSD = {
    year: count
    for year, count in zip(chicagoStudentByYear.index, chicagoStudentByYear.values)
}
cVD = {
    year: count
    for year, count in zip(chicagoInvestigatorByYear.index, chicagoInvestigatorByYear.values)
}
In [77]:
# One value per year in `domain` for each Chicago population; years absent
# from a lookup table are filled with 0.
cIDense = []
cSDense = []
cVDense = []
for i in domain:
    cIDense.append(cID.get(i, 0.))
    cSDense.append(cSD.get(i, 0.))
    cVDense.append(cVD.get(i, 0.))
In [128]:
# Stacked bar chart of distinct Chicago-affiliated students, instructors and
# investigators per year, with a shaded missing-data band.
plt.figure(figsize=(40, 20))
font = {'family': 'georgia', 'size': 32}
matplotlib.rc('font', **font)

# Investigators sit on top of students + instructors.
investigatorBase = np.array(cS_pared) + np.array(cI_pared)
plt.bar(domain, cS_pared, color='g', alpha=0.4, label='Students')
plt.bar(domain, cI_pared, bottom=cS_pared, color='b', alpha=0.4, label='Instructors')
plt.bar(domain, cV_pared, bottom=investigatorBase,
        color='#ff9933', alpha=0.6, label='Investigators')

# Missing data: grey band starting at 1903, 5.8 years wide.
ax1 = plt.gca()
ax1.add_patch(plt.Rectangle((1903, 0), 5.8, 3000, color='gray', alpha=0.2, lw=2))

plt.title('University of Chicago at the Marine Biological Laboratory: 1888-2013')
plt.legend(loc=2, fontsize=32)
plt.xlim(start, end + 1)
plt.ylim(0, 60)
plt.show()
In [446]:
# For each year, count the Chicago attendance rows per 'Course URI'.
chicagoCourses = chicagoAttendance.groupby('Year')['Course URI'].value_counts()
In [450]:
# Lookup table from 'Course URI' to its 'Course Group'.
cgLookup = dict(zip(coursegroups['Course URI'].values, coursegroups['Course Group']))
In [452]:
# cgValues maps each course group to a list with one attendance count per
# year in `domain` (0. where the group had no Chicago attendees that year).
cgValues = {cg: [] for cg in allCourseGroups}
for y in domain:
    # Attendance per course group for this year. Several course URIs can
    # belong to the same group, so counts are accumulated rather than
    # overwritten.
    yearTotals = {}
    if y in chicagoCourses:
        coursesThisYear = chicagoCourses[y]
        for courseUri, count in zip(coursesThisYear.index, coursesThisYear.values):
            group = cgLookup[courseUri]
            yearTotals[group] = yearTotals.get(group, 0) + count
    for k in cgValues:
        cgValues[k].append(yearTotals.get(k, 0.))
In [456]:
# Stacked, spline-smoothed area chart of Chicago course attendance by course
# group. Groups with fewer than 10 attendees in total (and the null group)
# are pooled into a grey "Other" band at the bottom of the stack.
c = plt.get_cmap('jet')
plt.figure(figsize=(40,20))
x_smooth = np.linspace(np.array(domain).min(), np.array(domain).max(), 1000)

# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
subset = [k for k, v in cgValues.items() if sum(v) >= 10 and not pd.isnull(k)]
other = np.array([V for k, V in cgValues.items() if k not in subset]).sum(axis=0)

# `last` / `last_smooth` hold the running top of the stack (raw / smoothed).
# The spline can undershoot below zero, so smoothed values are clamped at 0.
last = np.array(other)
last_smooth = np.maximum(spline(domain, other, x_smooth), 0.)
plt.fill_between(x_smooth, last_smooth, np.zeros_like(x_smooth), color='k', alpha=0.2)

# Proxy rectangles for the legend (filled areas are labelled through them).
legendArtists = [plt.Rectangle((0, 0), 1, 1, fc='k', alpha=0.2)]
legendLabels = ['Other (<10 attendees)']

for position, key in enumerate(subset):
    Values = np.array(cgValues[key])
    Values_smooth = np.maximum(spline(domain, Values, x_smooth), 0.)
    # Spread the groups evenly across the colormap.
    color = c(float(position) / len(subset))

    plt.fill_between(x_smooth, last_smooth, Values_smooth + last_smooth, alpha=0.5, color=color)
    plt.plot(x_smooth, Values_smooth + last_smooth, color=color, label=key, lw=1)

    last = Values + last
    last_smooth = Values_smooth + last_smooth
    legendArtists.append(plt.Rectangle((0, 0), 1, 1, fc=color, alpha=0.5))
    legendLabels.append(key)

plt.xlim(start, end)
plt.ylim(0, 35)
plt.legend(legendArtists, legendLabels, loc=2, fontsize=16)
plt.title('University of Chicago Participation in MBL Courses (1888-2013)')
plt.show()
In [280]:
# Non-null course groups whose per-year counts sum to more than 5.
# .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
[k for k, v in cgValues.items() if sum(v) > 5 and not pd.isnull(k)]
Out[280]:
In [ ]:
# Scratch cell left inert: a bare `spline()` call passes no arguments and
# would raise TypeError, stopping a Restart & Run All at this point.