Import the data from the CSV file



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt



In [2]:

    
import statsmodels.formula.api as smf
from IPython.display import display



In [18]:

    
# Load the dorm table from a CSV expected in the notebook's working directory.
data = pd.read_csv('dormdata.csv')

# Bare trailing expression so the notebook renders the full table below.
data









    Out[18]:






  
    
      
      Dorm
      URM
      LGB
      lessthaneightyk
      fouryrgrad
      sixyrgrad
    
  
  
    
      0
      Next
      20.5
      11.0
      23.2
      86.4
      94.3
    
    
      1
      Baker
      16.0
      4.2
      20.6
      87.8
      95.7
    
    
      2
      Burton
      20.8
      10.2
      18.6
      86.3
      94.0
    
    
      3
      EC
      19.2
      38.0
      24.9
      75.3
      89.4
    
    
      4
      MacGregor
      27.2
      8.3
      28.2
      80.9
      92.2
    
    
      5
      Maseeh
      21.4
      6.4
      23.3
      87.2
      NaN
    
    
      6
      McCormick
      20.2
      8.1
      27.9
      88.8
      95.8
    
    
      7
      New
      43.5
      16.1
      31.4
      80.5
      91.3
    
    
      8
      Random
      17.2
      40.8
      35.5
      71.6
      88.0
    
    
      9
      Senior
      36.7
      40.0
      30.9
      59.7
      78.1
    
    
      10
      Simmons
      30.9
      8.7
      27.4
      83.3
      93.5



In [4]:

    
# Spread of each graduation-rate column across dorms.
# Series.std() skips NaN by default, so a missing six-year value is ignored.
# The call form of print works on both Python 2 (single argument) and Python 3.
print(data["fouryrgrad"].std())
print(data["sixyrgrad"].std())









    



8.84267544869
5.27300251807

Plot the data



In [5]:

    
%matplotlib inline


plt.figure(figsize=(20,10))
plt.scatter(data.URM, data.LGB,s=1000)
plt.ylabel('% LGB',fontsize=40)
plt.xlabel('% URM',fontsize=40)
plt.title('MIT Dorm Demographics',fontsize=40)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.ylim([0,50])
plt.xlim([0,50])









    Out[5]:





(0, 50)

Run a multiple regression



In [23]:

    
# Re-import statsmodels.formula.api in the running kernel.
# NOTE(review): looks like a development leftover; not needed on a fresh run.
try:
    from importlib import reload  # Python 3.4+: reload is no longer a builtin
except ImportError:
    pass  # Python 2: the builtin reload is used
reload(smf)









    Out[23]:





<module 'statsmodels.formula.api' from '/home/mirthbottle/anaconda/lib/python2.7/site-packages/statsmodels/formula/api.pyc'>



In [30]:

    
# Keep only the rows with no missing value in any column (the dropna defaults,
# written out explicitly); this is the frame the 6-year model is fit on.
data6yr = data.dropna(axis=0, how='any')



In [31]:

    
# Fit one ordinary-least-squares model per outcome, using the same three
# predictors for both. The 4-year model is fit on the full table.
lm4 = smf.ols(
    formula='fouryrgrad ~ URM + LGB + lessthaneightyk',
    data=data,
    missing='drop',
).fit()

# The 6-year model is fit on the frame with incomplete rows already removed.
lm6 = smf.ols(
    formula='sixyrgrad ~ URM + LGB + lessthaneightyk',
    data=data6yr,
    missing='drop',
).fit()



In [32]:

    
# Render the regression summary for the 4-year graduation-rate model.
display(lm4.summary())









    





OLS Regression Results

  Dep. Variable:        fouryrgrad       R-squared:             0.880


  Model:                    OLS          Adj. R-squared:        0.829


  Method:              Least Squares     F-statistic:           17.13


  Date:              Tue, 04 Jul 2017    Prob (F-statistic):   0.00132


  Time:                  15:07:49        Log-Likelihood:      -27.393


  No. Observations:           11         AIC:                   62.79


  Df Residuals:                7         BIC:                   64.38


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                     coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept           96.9863      6.788     14.287   0.000     80.934   113.038


  URM                 -0.3188      0.151     -2.107   0.073     -0.677     0.039


  LGB                 -0.5252      0.100     -5.232   0.001     -0.763    -0.288


  lessthaneightyk      0.0305      0.324      0.094   0.928     -0.736     0.797




  Omnibus:         0.264    Durbin-Watson:         2.136


  Prob(Omnibus):   0.877    Jarque-Bera (JB):      0.275


  Skew:           -0.271    Prob(JB):              0.872


  Kurtosis:        2.446    Cond. No.               257.



In [33]:

    
# Render the regression summary for the 6-year graduation-rate model.
display(lm6.summary())









    





OLS Regression Results

  Dep. Variable:         sixyrgrad       R-squared:             0.828


  Model:                    OLS          Adj. R-squared:        0.743


  Method:              Least Squares     F-statistic:           9.653


  Date:              Tue, 04 Jul 2017    Prob (F-statistic):   0.0103 


  Time:                  15:07:50        Log-Likelihood:      -21.477


  No. Observations:           10         AIC:                   50.95


  Df Residuals:                6         BIC:                   52.16


  Df Model:                    3                                     


  Covariance Type:       nonrobust                                   




                     coef      std err       t       P>|t|  [95.0% Conf. Int.] 


  Intercept           99.5284      5.052     19.699   0.000     87.166   111.891


  URM                 -0.2476      0.111     -2.231   0.067     -0.519     0.024


  LGB                 -0.3079      0.075     -4.132   0.006     -0.490    -0.126


  lessthaneightyk      0.1360      0.237      0.573   0.587     -0.445     0.717




  Omnibus:         0.699    Durbin-Watson:         2.373


  Prob(Omnibus):   0.705    Jarque-Bera (JB):      0.152


  Skew:           -0.287    Prob(JB):              0.927


  Kurtosis:        2.813    Cond. No.               255.



In [34]:

    
# In-sample predictions: the 6-year model applied to the same rows it was fit on.
lm6.predict(data6yr)









    Out[34]:





array([ 94.22244227,  97.07675347,  93.76870783,  86.46096478,
        94.07552983,  95.82918733,  88.07375545,  87.53594202,
        82.32915778,  92.92755923])



In [18]:

    
# Observed 4-year graduation rates as a plain array
# (presumably for eyeballing against the model predictions).
data.fouryrgrad.values









    Out[18]:





array([ 86.4,  87.8,  86.3,  75.3,  80.9,  87.2,  88.8,  80.5,  71.6,
        59.7,  83.3])



In [36]:

    
# Wrap each model's fitted values in a one-column frame keyed by dorm name.
# list(...) drops any index the prediction carries, so values are assigned by
# position. Without it, a Series result (returned by newer statsmodels when the
# input is a DataFrame -- verify for the installed version) would be re-aligned
# against the dorm names and come out as all NaN.
p4yr = pd.DataFrame(index=data.Dorm, data=list(lm4.predict(data)), columns=["pred4yr"])
p6yr = pd.DataFrame(index=data6yr.Dorm, data=list(lm6.predict(data6yr)), columns=["pred6yr"])

# Join on the dorm index: predictions first, then the observed columns.
# Dorms absent from data6yr get NaN in pred6yr (default left join).
predicted = p4yr.join(p6yr).join(data.set_index("Dorm"))



In [42]:

    
# Display the combined table of predicted and observed values per dorm.
predicted









    Out[42]:






  
    
      
      pred4yr
      pred6yr
      URM
      LGB
      lessthaneightyk
      fouryrgrad
      sixyrgrad
    
    
      Dorm
      
      
      
      
      
      
      
    
  
  
    
      Next
      85.381794
      94.222442
      20.5
      11.0
      23.2
      86.4
      94.3
    
    
      Baker
      90.308531
      97.076753
      16.0
      4.2
      20.6
      87.8
      95.7
    
    
      Burton
      85.565986
      93.768708
      20.8
      10.2
      18.6
      86.3
      94.0
    
    
      EC
      71.666707
      86.460965
      19.2
      38.0
      24.9
      75.3
      89.4
    
    
      MacGregor
      84.816731
      94.075530
      27.2
      8.3
      28.2
      80.9
      92.2
    
    
      Maseeh
      87.514035
      NaN
      21.4
      6.4
      23.3
      87.2
      NaN
    
    
      McCormick
      87.144025
      95.829187
      20.2
      8.1
      27.9
      88.8
      95.8
    
    
      New
      75.621562
      88.073755
      43.5
      16.1
      31.4
      80.5
      91.3
    
    
      Random
      71.157039
      87.535942
      17.2
      40.8
      35.5
      71.6
      88.0
    
    
      Senior
      65.220819
      82.329158
      36.7
      40.0
      30.9
      59.7
      78.1
    
    
      Simmons
      83.402771
      92.927559
      30.9
      8.7
      27.4
      83.3
      93.5



In [38]:

    
# Two side-by-side panels comparing model predictions with observed rates.
# The diagonal line marks where predicted equals actual.
plt.figure(figsize=(20, 10))

# Left panel: 4-year graduation rate, all dorms.
plt.subplot(1, 2, 1)
plt.scatter(data['fouryrgrad'], lm4.predict(data), s=500)
plt.plot(range(1, 100), range(1, 100))
plt.xlabel('Actual 4-year Graduation Rate', fontsize=30)
plt.ylabel('Predicted 4-year \n Graduation Rate', fontsize=30)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.xlim([55, 95])
plt.ylim([55, 95])

# Right panel: 6-year graduation rate, only the rows the model was fit on.
plt.subplot(1, 2, 2)
plt.scatter(data6yr['sixyrgrad'], lm6.predict(data6yr), s=500)
plt.plot(range(1, 110), range(1, 110))
plt.xlabel('Actual 6-year Graduation Rate', fontsize=30)
plt.ylabel('Predicted 6-year \n Graduation Rate', fontsize=30)
plt.tick_params(axis='both', which='major', labelsize=20)
plt.xlim([75, 100])
plt.ylim([75, 100])

# Figure-level title sits above both panels (y > 1 lifts it clear of them).
plt.suptitle('Predicted vs. Actual Dorm Graduation Rates\n\n\n', fontsize=30, y=1.05)
plt.tight_layout()



In [129]:

    
# Residuals (observed minus predicted) per dorm, one panel per model.
# Tick positions are derived from the number of rows rather than hard-coded,
# so the dorm labels stay aligned if the table gains or loses rows.
plt.figure(figsize=(20,10))

# Left panel: 4-year model residuals.
plt.subplot(121)
plt.plot(data.fouryrgrad - lm4.predict(data),'o',markersize=20)
plt.xticks(range(len(data)),data.Dorm,rotation='vertical');
plt.ylabel('Residuals',fontsize=30)
plt.tick_params(axis='both', which='major', labelsize=20)

# Right panel: 6-year model residuals. Predictions are made for every row of
# data; a row with a missing observed sixyrgrad yields a NaN residual.
plt.subplot(122)
plt.plot(data.sixyrgrad - lm6.predict(data),'o',markersize=20)
plt.xticks(range(len(data)),data.Dorm,rotation='vertical');
plt.tick_params(axis='both', which='major', labelsize=20)

plt.suptitle('Residuals\n\n\n',fontsize=30,y=1.05)
plt.tight_layout()



In [43]:

    
# Apply the 6-year model to every row of data, including any row that was
# dropped from data6yr before fitting.
display(lm6.predict(data))









    





array([ 94.22244227,  97.07675347,  93.76870783,  86.46096478,
        94.07552983,  95.42980641,  95.82918733,  88.07375545,
        87.53594202,  82.32915778,  92.92755923])



In [ ]:

	Dorm	URM	LGB	lessthaneightyk	fouryrgrad	sixyrgrad
0	Next	20.5	11.0	23.2	86.4	94.3
1	Baker	16.0	4.2	20.6	87.8	95.7
2	Burton	20.8	10.2	18.6	86.3	94.0
3	EC	19.2	38.0	24.9	75.3	89.4
4	MacGregor	27.2	8.3	28.2	80.9	92.2
5	Maseeh	21.4	6.4	23.3	87.2	NaN
6	McCormick	20.2	8.1	27.9	88.8	95.8
7	New	43.5	16.1	31.4	80.5	91.3
8	Random	17.2	40.8	35.5	71.6	88.0
9	Senior	36.7	40.0	30.9	59.7	78.1
10	Simmons	30.9	8.7	27.4	83.3	93.5

Dep. Variable:	fouryrgrad	R-squared:	0.880
Model:	OLS	Adj. R-squared:	0.829
Method:	Least Squares	F-statistic:	17.13
Date:	Tue, 04 Jul 2017	Prob (F-statistic):	0.00132
Time:	15:07:49	Log-Likelihood:	-27.393
No. Observations:	11	AIC:	62.79
Df Residuals:	7	BIC:	64.38
Df Model:	3
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[95.0% Conf. Int.]
Intercept	96.9863	6.788	14.287	0.000	80.934 113.038
URM	-0.3188	0.151	-2.107	0.073	-0.677 0.039
LGB	-0.5252	0.100	-5.232	0.001	-0.763 -0.288
lessthaneightyk	0.0305	0.324	0.094	0.928	-0.736 0.797

Omnibus:	0.264	Durbin-Watson:	2.136
Prob(Omnibus):	0.877	Jarque-Bera (JB):	0.275
Skew:	-0.271	Prob(JB):	0.872
Kurtosis:	2.446	Cond. No.	257.

Dep. Variable:	sixyrgrad	R-squared:	0.828
Model:	OLS	Adj. R-squared:	0.743
Method:	Least Squares	F-statistic:	9.653
Date:	Tue, 04 Jul 2017	Prob (F-statistic):	0.0103
Time:	15:07:50	Log-Likelihood:	-21.477
No. Observations:	10	AIC:	50.95
Df Residuals:	6	BIC:	52.16
Df Model:	3
Covariance Type:	nonrobust

Omnibus:	0.699	Durbin-Watson:	2.373
Prob(Omnibus):	0.705	Jarque-Bera (JB):	0.152
Skew:	-0.287	Prob(JB):	0.927
Kurtosis:	2.813	Cond. No.	255.

	pred4yr	pred6yr	URM	LGB	lessthaneightyk	fouryrgrad	sixyrgrad
Dorm
Next	85.381794	94.222442	20.5	11.0	23.2	86.4	94.3
Baker	90.308531	97.076753	16.0	4.2	20.6	87.8	95.7
Burton	85.565986	93.768708	20.8	10.2	18.6	86.3	94.0
EC	71.666707	86.460965	19.2	38.0	24.9	75.3	89.4
MacGregor	84.816731	94.075530	27.2	8.3	28.2	80.9	92.2
Maseeh	87.514035	NaN	21.4	6.4	23.3	87.2	NaN
McCormick	87.144025	95.829187	20.2	8.1	27.9	88.8	95.8
New	75.621562	88.073755	43.5	16.1	31.4	80.5	91.3
Random	71.157039	87.535942	17.2	40.8	35.5	71.6	88.0
Senior	65.220819	82.329158	36.7	40.0	30.9	59.7	78.1
Simmons	83.402771	92.927559	30.9	8.7	27.4	83.3	93.5