In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.interpolate import spline

In [3]:
# Directory holding the cleaned MBL CSV exports. Set the MBL_DATA_DIR
# environment variable to point at another copy of the data; the original
# author's location stays as the fallback so existing runs are unaffected.
datapath = os.environ.get('MBL_DATA_DIR',
                          '/Users/erickpeirson/Projects/MBL-Data/Cleaned Data')

In [4]:
# Read the four cleaned tables that sit directly under ``datapath``.
(attendance,
 investigators,
 affiliations,
 coursegroups) = [
    pd.read_csv(os.path.join(datapath, csvName))
    for csvName in ('cleaned_attendance.csv',
                    'cleaned_investigators.csv',
                    'cleaned_affiliations.csv',
                    'cleaned_coursegroups.csv')
]

In [33]:
# The authority files live in a sibling directory of the cleaned data, so
# build their location from ``datapath`` instead of repeating the absolute
# path a second time.
institutions = pd.read_csv(os.path.join(os.path.dirname(os.path.normpath(datapath)),
                                        'Authority Files', 'institution.csv'))

In [84]:
# Keep only the affiliation rows that carry this institution URI
# (presumably the University of Chicago, going by the variable name).
hasChicagoURI = (affiliations['Institution URI'] ==
                 'http://history.archives.mbl.edu/concepts/institution/966a06a9-34d2-4fa1-ac01-e41887f55ff2')
chicagoAff = affiliations[hasChicagoURI]

In [92]:
# Number of distinct 'Person URI' values among the selected affiliation rows.
chicagoAffPeople = set(chicagoAff['Person URI'].values)
len(chicagoAffPeople)


Out[92]:
592

In [93]:
# Total number of affiliation rows, for comparison with the subset above.
affiliations.shape[0]


Out[93]:
85190

In [88]:
# Affiliation rows per year for the selected institution; the counts are
# computed once and reused for both bar positions and bar heights.
chicagoAffPerYear = chicagoAff.Year.value_counts()

plt.figure(figsize=(40, 10))
plt.bar(chicagoAffPerYear.index, chicagoAffPerYear.values)


Out[88]:
<Container object of 121 artists>

In [5]:
# Most recent year present in the attendance table.
attendance['Year'].max()


Out[5]:
2013

In [6]:
# Attendance rows grouped by their 'Year' column.
byyear = attendance.groupby(by='Year')

In [7]:
def isInstructor(role):
    """
    Decide whether a role description denotes teaching staff.

    Parameters
    ----------
    role : str or iterable
        A single role label, or a collection of labels (list, array,
        Series, ...). Non-string scalars such as NaN/None are accepted.

    Returns
    -------
    bool or list of bool
        For a single label: True when it contains, case-insensitively, one
        of the teaching keywords; False otherwise, including for values
        that are not strings. For a collection: a list holding one such
        result per element.
    """
    # On Python 3 a str exposes ``__iter__``, so it has to be treated as a
    # scalar *before* the generic iterable branch; otherwise a single label
    # would recurse character by character without ever terminating.
    if not isinstance(role, str) and hasattr(role, '__iter__'):
        return [isInstructor(o) for o in role]

    keywords = ('instructor', 'assistant', 'lecturer', 'faculty',
                'coordinator', 'director')
    try:
        lowered = role.lower()
    except AttributeError:    # Not a string (e.g. NaN): not an instructor.
        return False
    return any(word in lowered for word in keywords)

In [8]:
def isStudent(role):
    """
    Decide whether a role description denotes a student.

    Parameters
    ----------
    role : str or iterable
        A single role label, or a collection of labels (list, array,
        Series, ...). Non-string scalars such as NaN/None are accepted.

    Returns
    -------
    bool or list of bool
        For a single label: True when it contains 'student'
        (case-insensitively), and also True when the value is not a string
        at all, because a missing role is assumed to be a student. For a
        collection: a list holding one such result per element.
    """
    # On Python 3 a str exposes ``__iter__``, so it has to be treated as a
    # scalar *before* the generic iterable branch; otherwise a single label
    # would recurse character by character without ever terminating.
    if not isinstance(role, str) and hasattr(role, '__iter__'):
        return [isStudent(o) for o in role]
    try:
        return 'student' in role.lower()
    except AttributeError:    # Assume student by default.
        return True

In [56]:
def isChicago(aff):
    """
    Decide whether an institution name is exactly 'University Of Chicago'.

    Parameters
    ----------
    aff : str or iterable
        A single institution name, or a collection of names (list, array,
        Series, ...). Non-string scalars such as NaN/None are accepted.

    Returns
    -------
    bool or list of bool
        For a single name: True only on an exact, case-sensitive match.
        For a collection: a list holding one such result per element.
    """
    # On Python 3 a str exposes ``__iter__``, so it has to be treated as a
    # scalar *before* the generic iterable branch; otherwise a single name
    # would recurse character by character without ever terminating.
    if not isinstance(aff, str) and hasattr(aff, '__iter__'):
        return [isChicago(o) for o in aff]
    # A plain equality test cannot raise AttributeError, so no try/except
    # is needed; bool() keeps the return type a builtin bool.
    return bool(aff == 'University Of Chicago')

In [57]:
# One boolean per attendance row, derived from its Role label, drives each
# row selection.
studentMask = isStudent(attendance.Role)
instructorMask = isInstructor(attendance.Role.values)

students = attendance[studentMask]
instructors = attendance[instructorMask]

In [58]:
# Investigator rows per year; vOrder puts the years in ascending order.
investigatorsByYear = investigators.Year.value_counts()
vOrder = argsort(investigatorsByYear.index)
vYears = investigatorsByYear.index[vOrder]
vCounts = investigatorsByYear.values[vOrder]

plt.figure(figsize=(40, 10))
plt.bar(vYears, vCounts)


Out[58]:
<Container object of 126 artists>

In [59]:
# Latest year that has investigator rows.
latestInvestigatorYear = max(investigatorsByYear.index)
latestInvestigatorYear


Out[59]:
2013

In [60]:
# Instructor rows per year; iOrder puts the years in ascending order.
instructorsByYear = instructors.Year.value_counts()
iOrder = argsort(instructorsByYear.index)
iYears = instructorsByYear.index[iOrder]
iCounts = instructorsByYear.values[iOrder]

plt.figure(figsize=(40, 10))
plt.bar(iYears, iCounts)


Out[60]:
<Container object of 119 artists>

In [61]:
# Latest year that has instructor rows.
latestInstructorYear = max(instructorsByYear.index)
latestInstructorYear


Out[61]:
2013

In [62]:
# Student rows per year; sOrder puts the years in ascending order.
studentsByYear = students.Year.value_counts()
sOrder = argsort(studentsByYear.index)
sYears = studentsByYear.index[sOrder]
sCounts = studentsByYear.values[sOrder]

plt.figure(figsize=(40, 10))
plt.bar(sYears, sCounts)


Out[62]:
<Container object of 115 artists>

In [63]:
# Latest year that has student rows.
latestStudentYear = max(studentsByYear.index)
latestStudentYear


Out[63]:
2013

In [64]:
# Affiliation rows whose Institution name matches isChicago, counted per
# year; cOrder puts those years in ascending order.
chicagoMask = isChicago(affiliations.Institution)
chicago = affiliations[chicagoMask]
chicagoByYear = chicago.Year.value_counts()
cOrder = argsort(chicagoByYear.index)

In [65]:
def _yearLookup(counts, order):
    """Build a {year: count} dict from ``counts``, reading entries in ``order``."""
    return dict(zip(counts.index[order], counts.values[order]))


studentsD = _yearLookup(studentsByYear, sOrder)
instructorsD = _yearLookup(instructorsByYear, iOrder)
investigatorsD = _yearLookup(investigatorsByYear, vOrder)
chicagoD = _yearLookup(chicagoByYear, cOrder)

In [66]:
# The plotted year range spans the earliest and latest year found in any of
# the instructor, student or investigator series (both ends inclusive).
yearIndexes = (instructorsByYear.index,
               studentsByYear.index,
               investigatorsByYear.index)
start = min(min(years) for years in yearIndexes)
end = max(max(years) for years in yearIndexes)
domain = range(start, end + 1)

In [67]:
# Expand each sparse {year: count} lookup into a list aligned with
# ``domain``, using 0. for every year that has no entry.
studentsDense = [studentsD.get(year, 0.) for year in domain]
instructorsDense = [instructorsD.get(year, 0.) for year in domain]
investigatorsDense = [investigatorsD.get(year, 0.) for year in domain]
chicagoDense = [chicagoD.get(year, 0.) for year in domain]

In [68]:
plt.figure(figsize=(40, 20))
font = dict(family='georgia', size=32)
matplotlib.rc('font', **font)

# Stacked bars: students at the bottom, instructors on top of them, and
# investigators on top of both.
investigatorBase = array(studentsDense) + array(instructorsDense)
plt.bar(domain, studentsDense, color='g', alpha=0.4, label='Students')
plt.bar(domain, instructorsDense, bottom=studentsDense, color='b', alpha=0.4, label='Instructors')
plt.bar(domain, investigatorsDense, bottom=investigatorBase,
        color='#ff9933', alpha=0.6, label='Investigators')

# Porter highlights, as (year, marker height, caption). Each one is drawn as
# a vertical line at the year, with its caption right-aligned one year to
# the left of the line's top.
porterHighlights = [
    (1937, 1000, 'Porter hired as Beginning Instructor (1937)'),
    (1975, 1600, 'Porter becomes MBL Director (1975)'),
    (1976, 1800, 'Porter gives special lecture in Neurobiology course (1976)'),
    (1977, 2000, 'Porter gives special lecture in Developmental Biology course (1977)'),
    (1979, 2400, 'Porter is faculty for Electron Microscopy in the Biological Sciences course (1979)'),
]
for eventYear, markerHeight, caption in porterHighlights:
    plt.plot([eventYear, eventYear], [0., markerHeight], color='k', lw=2)
    plt.text(eventYear - 1, markerHeight, caption,
             horizontalalignment='right', verticalalignment='top', fontsize=26)

# Missing data.
ax1 = plt.gca()
missingDataBand = plt.Rectangle((1903,0), 5.8, 3000, color='gray', alpha=0.2, lw=2)
ax1.add_patch(missingDataBand)

plt.title('Participation in the Marine Biological Laboratory: 1888-2013')
plt.legend(loc=2, fontsize=32)
plt.xlim(start, end - 1)

plt.show()



In [69]:
def rollingAverage(series, yLeft=1, yRight=1):
    """
    Moving average over a window of ``yLeft + yRight + 1`` points.

    Parameters
    ----------
    series : sequence of numbers
        The values to smooth.
    yLeft : int, optional
        Number of points included before each averaged position.
    yRight : int, optional
        Number of points included after each averaged position.

    Returns
    -------
    numpy.ndarray
        One average per position that has a complete window, i.e.
        ``len(series) - yLeft - yRight`` values (empty when the series is
        shorter than the window). With the defaults the first and last
        positions are dropped.

    Raises
    ------
    ValueError
        If ``yLeft`` or ``yRight`` is negative.
    """
    if yLeft < 0 or yRight < 0:
        raise ValueError('yLeft and yRight must be non-negative')

    values = np.asarray(series, dtype=float)
    wSize = float(yLeft + yRight + 1)
    # Start and stop where a full window fits. Previously the bounds were
    # fixed at 1 and len-1, which for wider windows produced negative slice
    # starts (silently wrapping around) and truncated windows at the end.
    centres = range(yLeft, len(values) - yRight)
    return np.array([values[i - yLeft:i + yRight + 1].sum() / wSize
                     for i in centres])

In [70]:
plt.figure(figsize=(40, 10))
font = dict(family='georgia', size=32)
matplotlib.rc('font', **font)

# Rolling mean of the Chicago affiliation counts (default window). The
# first and last entries of ``domain`` are trimmed to match its length.
Y = rollingAverage(chicagoDense)
X = array(domain[1:-1])

# Resample onto a 400-point grid so the plotted curve looks smooth.
# NOTE(review): scipy.interpolate.spline is not available in recent SciPy
# releases -- confirm the SciPy version this notebook is pinned to.
x_smooth = np.linspace(X.min(), X.max(), num=400)
y_smooth = spline(X, Y, x_smooth)

plt.plot(x_smooth, y_smooth, lw=2)
plt.xlim(start, end + 1)
plt.ylim(0, 16)
plt.show()



In [71]:
# Every distinct 'Person URI' that appears in the Chicago affiliation rows.
chicagoans = {personURI for personURI in chicago['Person URI']}

def isChicagoan(uri):
    """
    Decide whether a person URI belongs to the module-level ``chicagoans`` set.

    Parameters
    ----------
    uri : str or iterable
        A single person URI, or a collection of URIs (list, array,
        Series, ...).

    Returns
    -------
    bool or list of bool
        For a single URI: True when it is a member of ``chicagoans``.
        For a collection: a list holding one such result per element.
    """
    # On Python 3 a str exposes ``__iter__``, so it has to be treated as a
    # scalar *before* the generic iterable branch; otherwise a single URI
    # would recurse character by character without ever terminating.
    if not isinstance(uri, str) and hasattr(uri, '__iter__'):
        return [isChicagoan(o) for o in uri]
    return uri in chicagoans

In [72]:
# Every distinct value found in the 'Course Group' column.
allCourseGroups = {group for group in coursegroups['Course Group'].values}

In [73]:
# Keep only the attendance / investigator rows whose person appears in
# ``chicagoans``.
attendanceIsChicagoan = isChicagoan(attendance['Person URI'])
investigatorIsChicagoan = isChicagoan(investigators['Person URI'])

chicagoAttendance = attendance[attendanceIsChicagoan]
chicagoInvestigators = investigators[investigatorIsChicagoan]

In [75]:
# Rows per year for each role, within the Chicago subsets.
chicagoInstructorRows = chicagoAttendance[isInstructor(chicagoAttendance.Role)]
chicagoStudentRows = chicagoAttendance[isStudent(chicagoAttendance.Role)]

chicagoInstructorByYear = chicagoInstructorRows.Year.value_counts()
chicagoStudentByYear = chicagoStudentRows.Year.value_counts()
chicagoInvestigatorByYear = chicagoInvestigators.Year.value_counts()

In [127]:
# Per-year counts of *distinct* people (by 'Person URI') in each role,
# aligned with ``domain``.
invYearGrouped = chicagoInvestigators.groupby('Year')
stuYearGrouped = chicagoAttendance[isStudent(chicagoAttendance.Role)].groupby('Year')
insYearGrouped = chicagoAttendance[isInstructor(chicagoAttendance.Role)].groupby('Year')


def _distinctPeoplePerYear(yearGrouped, years):
    """
    Count distinct 'Person URI' values per year, aligned with ``years``.

    The per-year counts are computed once and then reindexed, so a year with
    no rows yields 0 explicitly. The earlier version looked each year up in
    a freshly computed value_counts() and relied on an IndexError for
    missing years, which depends on positional-fallback indexing and does
    not hold on current pandas (a missing label raises KeyError).
    """
    perYear = yearGrouped['Person URI'].nunique()
    return [int(n) for n in perYear.reindex(list(years), fill_value=0)]


cV_pared = _distinctPeoplePerYear(invYearGrouped, domain)
cS_pared = _distinctPeoplePerYear(stuYearGrouped, domain)
cI_pared = _distinctPeoplePerYear(insYearGrouped, domain)

In [76]:
# Ascending-year orderings for the three Chicago series.
cIorder, cSorder, cVorder = [
    argsort(counts.index)
    for counts in (chicagoInstructorByYear,
                   chicagoStudentByYear,
                   chicagoInvestigatorByYear)
]

# {year: row count} lookups for the same three series.
cID = {year: n for year, n in zip(chicagoInstructorByYear.index, chicagoInstructorByYear.values)}
cSD = {year: n for year, n in zip(chicagoStudentByYear.index, chicagoStudentByYear.values)}
cVD = {year: n for year, n in zip(chicagoInvestigatorByYear.index, chicagoInvestigatorByYear.values)}

In [77]:
# Expand the sparse Chicago lookups into lists aligned with ``domain``,
# using 0. for every year that has no entry.
cIDense = [cID.get(year, 0.) for year in domain]
cSDense = [cSD.get(year, 0.) for year in domain]
cVDense = [cVD.get(year, 0.) for year in domain]

In [128]:
plt.figure(figsize=(40, 20))
font = dict(family='georgia', size=32)
matplotlib.rc('font', **font)

# Stacked bars of the per-year distinct-person counts: students at the
# bottom, instructors on top of them, and investigators on top of both.
investigatorBase = array(cS_pared) + array(cI_pared)
plt.bar(domain, cS_pared, color='g', alpha=0.4, label='Students')
plt.bar(domain, cI_pared, bottom=cS_pared, color='b', alpha=0.4, label='Instructors')
plt.bar(domain, cV_pared, bottom=investigatorBase,
        color='#ff9933', alpha=0.6, label='Investigators')

# Missing data.
ax1 = plt.gca()
missingDataBand = plt.Rectangle((1903,0), 5.8, 3000, color='gray', alpha=0.2, lw=2)
ax1.add_patch(missingDataBand)

plt.title('University of Chicago at the Marine Biological Laboratory: 1888-2013')
plt.legend(loc=2, fontsize=32)
plt.xlim(start, end + 1)
plt.ylim(0, 60)
plt.show()



In [446]:
# Chicago attendance rows counted per 'Course URI' within each year.
chicagoAttendanceByYear = chicagoAttendance.groupby('Year')
chicagoCourses = chicagoAttendanceByYear['Course URI'].value_counts()

In [450]:
# Maps each 'Course URI' to its 'Course Group' (row by row; a URI that
# occurs more than once keeps the group from its last row).
cgLookup = {
    courseURI: courseGroup
    for courseURI, courseGroup in zip(coursegroups['Course URI'].values,
                                      coursegroups['Course Group'])
}

In [452]:
# For every course group, build a list aligned with ``domain`` holding the
# Chicago attendance count for that group in each year.
cgValues = {cg: [] for cg in allCourseGroups}
for y in domain:
    # Attendance per course group in year ``y``. Several course URIs can map
    # to the same group, so their counts are added together; building the
    # dict with a comprehension kept only the last URI's count per group.
    V = {}
    if y in chicagoCourses:
        coursesThisYear = chicagoCourses[y]
        for u, v in zip(coursesThisYear.index, coursesThisYear.values):
            group = cgLookup[u]
            V[group] = V.get(group, 0) + v

    # Groups without attendance in this year (or years without any
    # attendance at all) get an explicit 0.
    for k in cgValues:
        cgValues[k].append(V.get(k, 0.))

In [456]:
c = plt.get_cmap('jet')
plt.figure(figsize=(40,20))

x_smooth = np.linspace(np.array(domain).min(), np.array(domain).max(), 1000)

# Course groups with at least 10 attendees in total get their own band.
# ``items()`` is used because ``iteritems()`` exists on Python 2 only.
subset = [k for k, v in cgValues.items() if sum(v) >= 10 and not pd.isnull(k)]

# All remaining groups are pooled into one "Other" band at the bottom. If
# nothing is left over, fall back to a zero series so the result is still
# one value per year rather than a scalar.
otherSeries = [V for k, V in cgValues.items() if k not in subset]
if otherSeries:
    other = np.array(otherSeries).sum(axis=0)
else:
    other = np.zeros(len(domain))

# ``last`` is the unsmoothed running total (kept for inspection, not drawn);
# ``last_smooth`` is the top edge of the bands drawn so far. Interpolated
# values are clamped at zero because the spline can undershoot.
last = np.array(other)
last_smooth = np.maximum(spline(domain, other, x_smooth), 0.)
plt.fill_between(x_smooth, last_smooth, np.zeros_like(x_smooth), color='k', alpha=0.2)

legendArtists = [plt.Rectangle((0, 0), 1, 1, fc='k', alpha=0.2)]
legendLabels = ['Other (<10 attendees)']

for position, key in enumerate(subset):
    Values = np.array(cgValues[key])
    Values_smooth = np.maximum(spline(domain, Values, x_smooth), 0.)
    # Spread the band colours evenly across the colormap.
    color = c(float(position) / len(subset))

    # Draw this group's band on top of everything drawn so far.
    bandTop = Values_smooth + last_smooth
    plt.fill_between(x_smooth, last_smooth, bandTop, alpha=0.5, color=color)
    plt.plot(x_smooth, bandTop, color=color, label=key, lw=1)
    last = Values + last
    last_smooth = bandTop

    legendArtists.append(plt.Rectangle((0, 0), 1, 1, fc=color, alpha=0.5))
    legendLabels.append(key)

plt.xlim(start, end)
plt.ylim(0, 35)
plt.legend(legendArtists, legendLabels, loc=2, fontsize=16)
plt.title('University of Chicago Participation in MBL Courses (1888-2013)')
plt.show()



In [280]:
# Named course groups with more than 5 attendees in total. ``items()`` is
# used because ``iteritems()`` exists on Python 2 only.
[k for k, v in cgValues.items() if sum(v) > 5 and not pd.isnull(k)]


Out[280]:
['Experimental Invertebrate Zoology',
 'Frontiers In Reproduction',
 'Embryology',
 'Invertebrate Zoology',
 'Workshop On Molecular Evolution',
 'Neural Systems & Behavior',
 'Zoology',
 'Microinjection Techniques In Cell Biology',
 'History Of Biology',
 'Pathogenesis Of Neuroimmunologic Diseases',
 'Neurobiology & Development Of The Leech',
 'Fertilization And Gamete Physiology Research Training Program',
 'Botany',
 'Neuroinformatics',
 'Biology Of Parasitism',
 'Physiology',
 'Neural Systems And Behavior',
 'Neurobiology',
 'Microbial Diversity',
 'Marine Ecology',
 'Methods In Computational Neuroscience',
 'Biology Of The Inner Ear',
 'Biomedical Informatics',
 'Comparative Physiology']

In [ ]:
# NOTE(review): unexecuted scratch cell -- spline() is called here with no
# arguments; confirm this leftover can be deleted.
spline()