In [ ]:
# HIDDEN
# This useful nonsense should just go at the top of your notebook.
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
from numpy import mean
plots.style.use('fivethirtyeight')
from ipywidgets import interact
# datascience version number of last run of this notebook
version.__version__
In [ ]:
#raw_fte = Table.read_table("./data/deptFTE.csv")
raw_fte = Table.read_table("https://deculler.github.io/DataScienceTableDemos/data/deptFTE.csv")
raw_fte
In [ ]:
fte = raw_fte.where(raw_fte['FTE'] > 0)
fte
In [ ]:
raw_degrees = Table().read_table("./data/DegreeCounts-1983-2016.csv") # Raw degree data from CalAnswers
raw_degrees
Six thousand rows! Now that's hard to deal with mousing around in a spreadsheet. But it is also hard to deal with the categories. So let's clean it up.
In [ ]:
degree_rows = raw_degrees.where(raw_degrees['Academic Yr'] != '2015-16')
degrees = degree_rows.drop('All Major Acad Hier Level Nm')
degrees.relabel('Academic Yr', 'Year')
degrees.relabel('Academic Department Short Nm', 'Dept')
degrees.relabel('Reporting Clg School Short Nm', 'College')
degrees.relabel('Academic Division Short Nm','Div')
degrees.relabel('Student Headcount', 'Grads')
degrees.relabel('Prorated Student Major Cnt Sum', 'Prorated')
degrees['Year'] = degrees.apply(lambda x:int(x[0:4])+1, 'Year')
degrees.show()
In [ ]:
degrees.num_rows
In [ ]:
Totals = degrees.select(['Year', 'DegLvlDesc', 'Grads']).groups(['Year', 'DegLvlDesc'], collect=sum)
Totals
In [ ]:
# Transpose this data into columns of years, per degree type, summing the number of grads.
Grads_by_year = Totals.pivot('DegLvlDesc', 'Year', 'Grads sum', collect=sum)
Grads_by_year
In [ ]:
# What does this look like?
Grads_by_year.plot('Year')
In [ ]:
# Let's look at the last year of data
degrees.where('Year', 2015)
In [ ]:
graduates = degrees.where('Year', 2015).select(['DegLvlDesc', 'College', 'Dept', 'Grads']).pivot('DegLvlDesc', ['College', 'Dept'], 'Grads', collect=sum)
graduates
In [ ]:
fte.join('Dept',graduates )
In [ ]:
depts = fte.join('Dept',graduates )
depts['BA per FTE'] = depts['Bachelor'] / depts['FTE']
depts.sort('Dept').show()
In [ ]:
depts.where(depts['BA per FTE'] < 50).scatter('FTE', 'BA per FTE')
In [ ]:
#raw_fte.where(raw_fte['FTE'] == 0).show()
In [ ]:
depts = raw_fte.join('Dept',graduates )
depts['BA per FTE'] = depts['Bachelor'] / depts['FTE']
depts.sort('Dept').show()
In [ ]:
depts = raw_fte.join('Dept',graduates.drop(['Doctoral', 'Masters/Professional']).where('Bachelor') )
depts['BA per FTE'] = depts['Bachelor'] / depts['FTE']
depts.sort('Dept').show()
In [ ]:
depts.where('FTE', 0)
In [ ]:
xlat = Table(['Dept', 'toDept']).with_rows([
('L&S Chemistry', 'Chemistry'),
('L&S Computer Science', 'Electrical Eng & Comp Sci'),
('L&S Envir Econ & Policy', 'Env Sci, Policy, & Mgmt'),
('L&S Legal Studies', 'Law'),
('L&S Ops Rsch & Mgmt Sci', 'Industrial Eng & Ops Rsch'),
('L&S Public Health', 'Public Health'),
('L&S Social Welfare', 'Social Welfare')
])
xlat
In [ ]:
def fix_dept(dept):
if dept in xlat['Dept']:
return xlat['toDept'][list(xlat['Dept']).index(dept)]
else:
return dept
In [ ]:
fix_dept('History')
In [ ]:
fix_dept('L&S Legal Studies')
In [ ]:
BAs = graduates.drop(['Doctoral', 'Masters/Professional']).where('Bachelor')
BAs['Dept'] = BAs.apply(fix_dept, 'Dept')
gBAs = BAs.drop('College').group('Dept', collect = sum)
gBAs.show()
In [ ]:
gdepts = raw_fte.join('Dept',gBAs)
gdepts['BA per FTE'] = gdepts['Bachelor sum'] / gdepts['FTE']
gdepts.sort('Dept')
In [ ]:
gdepts.where(gdepts['BA per FTE']<20).scatter('FTE', 'BA per FTE',fit_line=True)
In [ ]:
gdepts.where(gdepts['BA per FTE']<20).sort('BA per FTE', descending=True)
In [ ]:
In [ ]:
fte.groups('Div', collect=sum)
In [ ]:
def sum_collect(s):
try:
return sum(s)
except TypeError:
if np.all(np.array(s) == s[0]):
return s[0]
else:
return s
Divisions = fte.select(['Div', 'Dept', 'FTE']).group('Div', collect=sum_collect).show()
Divisions