In [128]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import scipy as sp
In [129]:
# Create some example data.
# Seed the RNG so the simulated data — and therefore every fitted value,
# interval, and diagnostic below — is reproducible across kernel restarts.
np.random.seed(42)
cs = [12, 14, 16, 18]  # classes of carbons
ds = [0, 1, 2, 3]  # classes of double bonds
df = pd.DataFrame({'RT': np.random.uniform(low=0.1, high=15, size=1000),
                   'Carbon': np.random.choice(cs, size=1000),
                   'DB': np.random.choice(ds, size=1000)})
df.head()
Out[129]:
In [130]:
# Specify the model with an R-style formula: RT regressed on carbon
# number and double-bond number, both treated as categorical factors.
formula = 'RT ~ C(Carbon) + C(DB)'
# smf.ols builds every design matrix needed for ordinary least squares
# from the formula and the dataframe; .fit() then estimates the model.
model = smf.ols(formula, df)
results = model.fit()
# Display the overall model summary (coefficients, R^2, diagnostics)
results.summary()
Out[130]:
In [131]:
# Peek at the attributes available on the fitted results object.
# Only showing the first 5 to keep things short
dir(results)[:5]
Out[131]:
There are two $R^2$ values that you can get from these results: `rsquared` and `rsquared_adj` (the adjusted version). To get the `rsquared` value you can do:
In [132]:
# Extract the coefficient of determination from the fitted results.
rSquared = results.rsquared
# Use the print() function form: with a single argument it behaves
# identically on Python 2 and Python 3, whereas the bare `print x`
# statement is a SyntaxError on Python 3.
print(rSquared)
In [133]:
from statsmodels.stats.outliers_influence import summary_table
`summary_table` returns three objects: a formatted summary table, the underlying data array, and the list of column headers:
In [134]:
# Use summary_table to pull off the stats that we need.
# It unpacks into: a summary table object (st), the raw per-observation
# data as an ndarray (data), and the column header strings (header).
# alpha=0.05 requests 95% confidence/prediction intervals.
st, data, header = summary_table(results, alpha=0.05)
For our sanity it helps to map each short header name to its column number. Below, I print out the header and build a dictionary relating a short version of each header to its column index. You don't strictly need to do this, but it makes the later indexing a little cleaner.
In [135]:
header
Out[135]:
In [136]:
# Map short, readable names to column positions in the `data` array
# returned by summary_table, so later indexing reads cleanly.
_column_order = ['obs', 'dep', 'pred', 'SEpred',
                 'CIlow', 'CIhi', 'PIlow', 'PIhi',
                 'residual', 'SEres', 'student', 'cooksd']
head2index = {name: idx for idx, name in enumerate(_column_order)}
Notice there is no DFFITS column; I use a different method below to obtain it. Warning: I sort these results, so they show up as smooth lines in the figure — try it without sorting first to see the difference.
In [137]:
# Small helper: pull one named column out of the summary_table data array.
def _col(name):
    return data[:, head2index[name]]

# Fitted values, sorted so they plot as a smooth monotone curve.
fittedvalues = np.sort(_col('pred'))
# Upper and lower prediction intervals (also sorted for plotting).
predHi = np.sort(_col('PIhi'))
predLow = np.sort(_col('PIlow'))
# NOTE(review): these appear to match the output of
#   from statsmodels.sandbox.regression.predstd import wls_prediction_std
#   wls_prediction_std(results)
# but that still needs to be double-checked.
# Upper and lower confidence intervals.
ciHi = np.sort(_col('CIhi'))
ciLow = np.sort(_col('CIlow'))
# Cook's D, left unsorted so it stays paired with row order.
cooks = _col('cooksd')
In [138]:
# Import plotting library
import matplotlib.pyplot as plt
# Ignore this command, it allows inline plotting in ipython
%matplotlib inline
In [139]:
# Get original data values: row positions (the dataframe's integer
# index) serve as x, and the observed RT values as y.
x = df.index.values
y = df['RT'].values
In [140]:
# One large square figure with a single axes.
fig, ax = plt.subplots(1, 1, figsize=(20, 20))

# Raw observations as a scatter plot.
ax.scatter(x, y)

# Overlay the sorted fitted values and interval bands:
#   green solid  = fitted values
#   red dashed   = prediction interval (upper/lower)
#   yellow dashed = confidence interval (upper/lower)
for series, fmt in [(fittedvalues, 'g-'),
                    (predHi, 'r--'),
                    (predLow, 'r--'),
                    (ciHi, 'y--'),
                    (ciLow, 'y--')]:
    ax.plot(x, series, fmt, lw=2)

# Labels, title, and limits so the figure stands alone.
ax.set_xlabel('Row', fontsize=18)
ax.set_ylabel('RT', fontsize=18)
ax.set_title('Regression of Carbon and Bond number on RT', fontsize=24)
ax.set_xlim(0, 1000)
Out[140]:
The plot above shows the raw observations (scatter) with the sorted fitted values (green) and the prediction and confidence bands (red and yellow dashed) overlaid. Remember this is simulated data, so it looks a little strange.
In [141]:
# Influence diagnostics object for the fitted model; exposes the
# per-observation statistics (Cook's D, DFFITS, PRESS residuals) used below.
infl = results.get_influence()
In [142]:
# List every attribute/method available on the influence object.
dir(infl)
Out[142]:
In [143]:
# Cook's distance (same statistic pulled from summary_table above);
# infl.cooks_distance returns a (distances, p-values) pair.
cooksD = infl.cooks_distance
# PRESS (prediction) residuals.
press = infl.resid_press
# DFFITS comes back as a pair: the values and the recommended cutoff.
dffits, dffitsThres = infl.dffits
# Boolean mask of observations whose DFFITS exceeds the cutoff.
flagDffits = dffits > dffitsThres
In [144]:
# Collect the influence diagnostics into one tidy table for inspection.
pretty = pd.DataFrame({
    'cooksD': cooksD[0],  # cooks_distance is (distances, p-values); take distances
    'dffits': dffits,
    # Repeat the scalar cutoff to match the data length instead of
    # hard-coding the row count (was [dffitsThres]*1000), so this cell
    # keeps working if the sample size changes upstream.
    'dffits thresh': [dffitsThres] * len(dffits),
    'flag_dffits': flagDffits.astype(int),
    'press resid': press
})
pretty.head()
Out[144]:
In [ ]: