In [5]:
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np
In [17]:
# read the data in
df = pd.read_csv("http://www.ats.ucla.edu/stat/data/binary.csv")
# take a look at the dataset
print df.head()
# admit gre gpa rank
# 0 0 380 3.61 3
# 1 1 660 3.67 3
# 2 1 800 4.00 1
# 3 1 640 3.19 4
# 4 0 520 2.93 4
# rename the 'rank' column because there is also a DataFrame method called 'rank'
df.columns = ["admit", "gre", "gpa", "prestige"]
print df.columns
In [18]:
# summarize the data
print df.describe()
# admit gre gpa prestige
# count 400.000000 400.000000 400.000000 400.00000
# mean 0.317500 587.700000 3.389900 2.48500
# std 0.466087 115.516536 0.380567 0.94446
# min 0.000000 220.000000 2.260000 1.00000
# 25% 0.000000 520.000000 3.130000 2.00000
# 50% 0.000000 580.000000 3.395000 2.00000
# 75% 1.000000 660.000000 3.670000 3.00000
# max 1.000000 800.000000 4.000000 4.00000
# take a look at the standard deviation of each column
print df.std()
# admit 0.466087
# gre 115.516536
# gpa 0.380567
# prestige 0.944460
# frequency table cutting presitge and whether or not someone was admitted
print pd.crosstab(df['admit'], df['prestige'], rownames=['admit'])
# prestige 1 2 3 4
# admit
# 0 28 97 93 55
# 1 33 54 28 12
# plot all of the columns
df.hist()
pl.show()
In [8]:
# dummify rank
dummy_ranks = pd.get_dummies(df['prestige'], prefix='prestige')
print dummy_ranks.head()
# prestige_1 prestige_2 prestige_3 prestige_4
# 0 0 0 1 0
# 1 0 0 1 0
# 2 1 0 0 0
# 3 0 0 0 1
# 4 0 0 0 1
# create a clean data frame for the regression
cols_to_keep = ['admit', 'gre', 'gpa']
data = df[cols_to_keep].join(dummy_ranks.ix[:, 'prestige_2':])
print data.head()
# admit gre gpa prestige_2 prestige_3 prestige_4
# 0 0 380 3.61 0 1 0
# 1 1 660 3.67 0 1 0
# 2 1 800 4.00 0 0 0
# 3 1 640 3.19 0 0 1
# 4 0 520 2.93 0 0 1
# manually add the intercept
data['intercept'] = 1.0
In [9]:
train_cols = data.columns[1:]
# Index([gre, gpa, prestige_2, prestige_3, prestige_4], dtype=object)
logit = sm.Logit(data['admit'], data[train_cols])
# fit the model
result = logit.fit()
In [10]:
# cool enough to deserve it's own gist
print result.summary()
In [11]:
# look at the confidence interval of each coeffecient
print result.conf_int()
# 0 1
# gre 0.000120 0.004409
# gpa 0.153684 1.454391
# prestige_2 -1.295751 -0.055135
# prestige_3 -2.016992 -0.663416
# prestige_4 -2.370399 -0.732529
# intercept -6.224242 -1.755716
In [12]:
# odds ratios only
print np.exp(result.params)
# gre 1.002267
# gpa 2.234545
# prestige_2 0.508931
# prestige_3 0.261792
# prestige_4 0.211938
# intercept 0.018500
In [13]:
# odds ratios and 95% CI
params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print np.exp(conf)
# 2.5% 97.5% OR
# gre 1.000120 1.004418 1.002267
# gpa 1.166122 4.281877 2.234545
# prestige_2 0.273692 0.946358 0.508931
# prestige_3 0.133055 0.515089 0.261792
# prestige_4 0.093443 0.480692 0.211938
# intercept 0.001981 0.172783 0.018500
In [15]:
# instead of generating all possible values of GRE and GPA, we're going
# to use an evenly spaced range of 10 values from the min to the max
gres = np.linspace(data['gre'].min(), data['gre'].max(), 10)
print gres
# array([ 220. , 284.44444444, 348.88888889, 413.33333333,
# 477.77777778, 542.22222222, 606.66666667, 671.11111111,
# 735.55555556, 800. ])
gpas = np.linspace(data['gpa'].min(), data['gpa'].max(), 10)
print gpas
# array([ 2.26 , 2.45333333, 2.64666667, 2.84 , 3.03333333,
# 3.22666667, 3.42 , 3.61333333, 3.80666667, 4. ])
# enumerate all possibilities
combos = pd.DataFrame(cartesian([gres, gpas, [1, 2, 3, 4], [1.]]))
# recreate the dummy variables
combos.columns = ['gre', 'gpa', 'prestige', 'intercept']
dummy_ranks = pd.get_dummies(combos['prestige'], prefix='prestige')
dummy_ranks.columns = ['prestige_1', 'prestige_2', 'prestige_3', 'prestige_4']
# keep only what we need for making predictions
cols_to_keep = ['gre', 'gpa', 'prestige', 'intercept']
combos = combos[cols_to_keep].join(dummy_ranks.ix[:, 'prestige_2':])
# make predictions on the enumerated dataset
combos['admit_pred'] = result.predict(combos[train_cols])
print combos.head()
# gre gpa prestige intercept prestige_2 prestige_3 prestige_4 admit_pred
# 0 220 2.260000 1 1 0 0 0 0.157801
# 1 220 2.260000 2 1 1 0 0 0.087056
# 2 220 2.260000 3 1 0 1 0 0.046758
# 3 220 2.260000 4 1 0 0 1 0.038194
# 4 220 2.453333 1 1 0 0 0 0.179574
In [16]:
def isolate_and_plot(variable):
# isolate gre and class rank
grouped = pd.pivot_table(combos, values=['admit_pred'], rows=[variable, 'prestige'],
aggfunc=np.mean)
# in case you're curious as to what this looks like
# print grouped.head()
# admit_pred
# gre prestige
# 220.000000 1 0.282462
# 2 0.169987
# 3 0.096544
# 4 0.079859
# 284.444444 1 0.311718
# make a plot
colors = 'rbgyrbgy'
for col in combos.prestige.unique():
plt_data = grouped.ix[grouped.index.get_level_values(1)==col]
pl.plot(plt_data.index.get_level_values(0), plt_data['admit_pred'],
color=colors[int(col)])
pl.xlabel(variable)
pl.ylabel("P(admit=1)")
pl.legend(['1', '2', '3', '4'], loc='upper left', title='Prestige')
pl.title("Prob(admit=1) isolating " + variable + " and presitge")
pl.show()
isolate_and_plot('gre')
isolate_and_plot('gpa')
In [ ]: