In [1]:
# List the contents of the data directory (IPython `ls` alias line, not Python).
ls -lah ../data
In [1]:
import csv
import pandas as pd
In [2]:
from __future__ import print_function
In [3]:
# Parse the tab-separated file into one dict per line. Field names are passed
# explicitly, so csv.DictReader treats the first line as data, not a header.
results = []
with open('../data/gradtimes.txt', 'r') as handle:
    results.extend(
        csv.DictReader(handle, fieldnames=['program', 'grad_time'], delimiter='\t')
    )

# Report how many records were read.
print(len(results))
In [4]:
# Build the frame from the parsed records; the csv reader yields strings,
# so convert every grad_time value with float().
df = pd.DataFrame.from_records(results)
df['grad_time'] = [float(value) for value in df['grad_time']]
In [5]:
# Display the loaded records (columns: program, grad_time).
df
Out[5]:
In [6]:
# Summary statistics over all rows, regardless of program.
df.describe()
Out[6]:
In [7]:
# Summary statistics restricted to rows whose program is 'NEURO'.
df.loc[df['program'] == 'NEURO'].describe()
Out[7]:
In [8]:
# Summary statistics for every row whose program is not 'NEURO'.
# `!=` replaces the Python-2-only `<>` operator, which is a SyntaxError on
# Python 3; the two are equivalent on Python 2.
df[df['program'] != 'NEURO'].describe()
Out[8]:
In [9]:
# Summary statistics restricted to rows whose program is 'MBP-GPP'.
df.loc[df['program'] == 'MBP-GPP'].describe()
Out[9]:
In [10]:
# Distinct program labels present in the data.
set(df['program'])
Out[10]:
In [11]:
from ggplot import *
In [12]:
%matplotlib inline
In [13]:
# Density plot of grad_time over all rows of df.
g = (
    ggplot(aes(x='grad_time'), data=df)
    + geom_density(fill='#586e75', alpha=0.4)
)
# NOTE(review): printing the ggplot object is presumably what renders the
# figure inline -- confirm against the ggplot version in use.
print(g)
In [14]:
import random
In [29]:
# Resampling check: repeatedly draw SAMPLE_SIZE grad_time values (without
# replacement, the pandas default) from the non-NEURO rows and collect every
# sample mean that reaches THRESHOLD.
# NOTE(review): 84 and 5.9327 are presumably the NEURO row count and mean
# grad_time reported by the NEURO describe() cell -- TODO confirm.
N_DRAWS = 100000
SAMPLE_SIZE = 84
THRESHOLD = 5.9327

# `!=` replaces the Python-2-only `<>` operator (a SyntaxError on Python 3).
nndf = df[df['program'] != 'NEURO'].copy()

# Select the column once instead of on every iteration; sampling the Series
# picks rows the same way as sampling the frame and then taking the column.
non_neuro_times = nndf['grad_time']

# NOTE(review): no random_state is passed, so the number of hits varies
# from run to run.
hits = []
for i in range(N_DRAWS):
    mean = non_neuro_times.sample(n=SAMPLE_SIZE).mean()
    if mean >= THRESHOLD:
        hits.append(mean)
In [30]:
# Number of resampled means that reached the threshold in the loop above.
len(hits)
Out[30]:
In [ ]: