In [1]:
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
import statsmodels.formula.api as smf
from IPython.display import display
In [18]:
# Read the dorm dataset from a CSV located relative to the working directory,
# then echo the frame as the cell output.
data = pd.read_csv(filepath_or_buffer='dormdata.csv')
data
Out[18]:
In [4]:
# Spread (sample standard deviation) of the four- and six-year graduation
# columns. The parenthesised single-argument form prints the same text on
# Python 2 and is valid syntax on Python 3, unlike the bare print statement.
print(data["fouryrgrad"].std())
print(data["sixyrgrad"].std())
In [5]:
%matplotlib inline
plt.figure(figsize=(20,10))
plt.scatter(data.URM, data.LGB,s=1000)
plt.ylabel('% LGB',fontsize=40)
plt.xlabel('% URM',fontsize=40)
plt.title('MIT Dorm Demographics',fontsize=40)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.ylim([0,50])
plt.xlim([0,50])
Out[5]:
In [23]:
# Re-import statsmodels.formula.api in the running kernel.
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
# On Python 2.7 importlib has no `reload`, so the ImportError branch keeps the
# builtin in use there.
try:
    from importlib import reload
except ImportError:
    pass
reload(smf)
Out[23]:
In [30]:
# Subset of `data` keeping only the rows that have no missing value in any
# column (the dropna defaults, spelled out). Used to fit the six-year model.
data6yr = data.dropna(axis=0, how='any')
In [31]:
# Ordinary least squares fits of each graduation rate on the same three
# regressors. The four-year model is fit on the full frame, the six-year model
# on the NaN-free subset; missing='drop' discards any remaining incomplete rows.
lm4 = smf.ols(
    'fouryrgrad ~ URM + LGB + lessthaneightyk',
    data=data,
    missing='drop',
).fit()
lm6 = smf.ols(
    'sixyrgrad ~ URM + LGB + lessthaneightyk',
    data=data6yr,
    missing='drop',
).fit()
In [32]:
# Render the regression summary table for the four-year model.
summary4 = lm4.summary()
display(summary4)
In [33]:
# Render the regression summary table for the six-year model.
summary6 = lm6.summary()
display(summary6)
In [34]:
# Six-year model predictions for the rows it was fit on (cell output).
lm6.predict(exog=data6yr)
Out[34]:
In [18]:
# Raw array of the observed four-year graduation column (cell output).
data["fouryrgrad"].values
Out[18]:
In [36]:
# Build one table, keyed by dorm, holding both models' predictions next to the
# observed columns.
#
# The predictions are converted to plain lists before being handed to the
# DataFrame constructor. When `predict` returns a pandas Series (indexed like
# the input frame), passing it together with index=<Dorm names> makes pandas
# reindex the Series onto the dorm labels, which match nothing and produce
# all-NaN columns. Plain values are matched to the dorm names by position,
# which is what this cell intends; array results behave the same as before.
p4yr = pd.DataFrame({"pred4yr": list(lm4.predict(data))}, index=data.Dorm)
p6yr = pd.DataFrame({"pred6yr": list(lm6.predict(data6yr))}, index=data6yr.Dorm)
# Left joins on the Dorm index: dorms absent from data6yr get NaN for pred6yr.
predicted = p4yr.join(p6yr).join(data.set_index("Dorm"))
In [42]:
# Show the combined table: pred4yr and pred6yr joined with the original columns, indexed by Dorm.
predicted
Out[42]:
In [38]:
# Side-by-side predicted-vs-actual scatters, one panel per model. The straight
# line in each panel is y = x, i.e. where a perfect prediction would fall.
fig, (ax4, ax6) = plt.subplots(1, 2, figsize=(20,10))

# Left panel: four-year model over every row of `data`.
ax4.scatter(data["fouryrgrad"], lm4.predict(data), s=500)
ax4.plot(range(1,100), range(1,100))
ax4.set_ylabel('Predicted 4-year \n Graduation Rate', fontsize=30)
ax4.set_xlabel('Actual 4-year Graduation Rate', fontsize=30)
ax4.tick_params(axis='both', which='major', labelsize=20)
ax4.set_ylim([55,95])
ax4.set_xlim([55,95])

# Right panel: six-year model over the NaN-free subset it was fit on.
ax6.scatter(data6yr["sixyrgrad"], lm6.predict(data6yr), s=500)
ax6.plot(range(1,110), range(1,110))
ax6.set_ylabel('Predicted 6-year \n Graduation Rate', fontsize=30)
ax6.set_xlabel('Actual 6-year Graduation Rate', fontsize=30)
ax6.tick_params(axis='both', which='major', labelsize=20)
ax6.set_ylim([75,100])
ax6.set_xlim([75,100])

# Title is placed slightly above the figure (y > 1) before the layout pass.
fig.suptitle('Predicted vs. Actual Dorm Graduation Rates\n\n\n', fontsize=30, y=1.05)
fig.tight_layout()
In [129]:
# Residuals (observed minus predicted) for each row of `data`, one panel per
# model, with the dorm names as x tick labels.
plt.figure(figsize=(20,10))
# One tick position per row of `data`, so positions and labels always have the
# same length instead of relying on the frame having exactly 11 rows.
dorm_ticks = range(len(data))

# Left panel: four-year model residuals.
plt.subplot(121)
plt.plot(data.fouryrgrad - lm4.predict(data),'o',markersize=20)
plt.xticks(dorm_ticks,data.Dorm,rotation='vertical');
plt.ylabel('Residuals',fontsize=30)
plt.tick_params(axis='both', which='major', labelsize=20)

# Right panel: six-year model residuals, computed over the full frame; rows
# whose sixyrgrad is missing yield NaN and are simply not drawn.
plt.subplot(122)
plt.plot(data.sixyrgrad - lm6.predict(data),'o',markersize=20)
plt.xticks(dorm_ticks,data.Dorm,rotation='vertical');
plt.tick_params(axis='both', which='major', labelsize=20)

plt.suptitle('Residuals\n\n\n',fontsize=30,y=1.05)
plt.tight_layout()
In [43]:
# Six-year model predictions for every row of `data`, including rows that were
# not part of the NaN-free frame the model was fit on.
display(lm6.predict(exog=data))
In [ ]: