In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
from sklearn.datasets import load_boston
import statsmodels.api as sm
from statsmodels.formula.api import ols
sns.set()
In [2]:
# scikit-learn dataset
df_dict = load_boston()
print(df_dict["DESCR"])
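Aside (not part of the original run): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the cell above only works on older releases. One possible fallback, sketched under the assumption that the OpenML mirror of the dataset (name "boston", version 1) is still available:
from sklearn.datasets import fetch_openml
boston = fetch_openml(name="boston", version=1, as_frame=True)  # assumed OpenML mirror
df_alt = boston.frame  # hypothetical stand-in for the df built below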
In [3]:
features = pd.DataFrame(data=df_dict.data, columns=df_dict.feature_names)
target = pd.DataFrame(data=df_dict.target, columns=['MEDV'])
df = pd.concat([features, target], axis=1)
df.head()
Out[3]:
In [4]:
fig, ax = plt.subplots(figsize=(12, 12))
scatter_matrix(df, alpha=0.2, diagonal='kde', ax=ax);
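The scatter matrix gets crowded with this many columns; a numeric companion view of the same pairwise relationships (an added sketch, not in the original notebook) is a correlation heatmap:
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)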
In [5]:
for col in df.columns:
    print(col, df[col].nunique())
In [6]:
df['RAD'].unique()
Out[6]:
In [7]:
df['RAD_bool'] = df['RAD'] > 15  # boolean indicator for the high-RAD group
In [8]:
# Target variable
fig, ax = plt.subplots(figsize=(12,8))
sns.distplot(df.MEDV, ax=ax, rug=True, hist=False)
Out[8]:
In [9]:
fig, ax = plt.subplots(figsize=(10,7))
sns.kdeplot(df.LSTAT, df.MEDV, ax=ax)
Out[9]:
In [10]:
mod = ols(formula='''MEDV ~ LSTAT + 1''', data=df).fit()
mod.summary()
Out[10]:
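Aside (an added sketch, not part of the original run): besides the summary() table, the fitted-results object exposes the same quantities programmatically:
print(mod.params)     # Intercept and LSTAT slope
print(mod.rsquared)   # R-squared of the simple fit
grid = pd.DataFrame({'LSTAT': np.linspace(df.LSTAT.min(), df.LSTAT.max(), 50)})
pred = mod.predict(grid)  # predicted MEDV along the observed LSTAT range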
In [11]:
mod = ols(formula='''MEDV ~ LSTAT + I(np.log(LSTAT)) + 1''', data=df).fit()
mod.summary()
Out[11]:
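To check whether the added log term is worth its extra parameter, one option (a sketch added here, not in the original notebook) is a nested-model F-test alongside the information criteria:
from statsmodels.stats.anova import anova_lm
mod_lin = ols('MEDV ~ LSTAT', data=df).fit()
mod_log = ols('MEDV ~ LSTAT + I(np.log(LSTAT))', data=df).fit()
print(anova_lm(mod_lin, mod_log))   # F-test for the restriction that the log term is zero
print(mod_lin.aic, mod_log.aic)     # lower AIC favours that specification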
In [12]:
fig, ax = plt.subplots(figsize=(12,8))
sm.graphics.plot_ccpr(mod, "I(np.log(LSTAT))", ax=ax)
Out[12]:
In [13]:
fig, ax = plt.subplots(figsize=(12,8))
sm.graphics.plot_ccpr(mod, "LSTAT", ax=ax)
Out[13]:
In [15]:
mod = ols(formula='''MEDV ~ RM + C(RAD_bool) + LSTAT + I(np.log(LSTAT)) + 1''', data=df).fit()
mod.summary()
Out[15]:
In [16]:
mod = ols(formula='''MEDV ~ RM + C(RAD) + LSTAT + I(np.log(LSTAT)) + 1''', data=df).fit()
mod.summary()
Out[16]:
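Whether the full set of C(RAD) dummies buys anything over the single RAD_bool indicator can be checked by refitting both specifications and comparing them (an added sketch, not in the original run):
mod_bool = ols('MEDV ~ RM + C(RAD_bool) + LSTAT + I(np.log(LSTAT))', data=df).fit()
mod_full = ols('MEDV ~ RM + C(RAD) + LSTAT + I(np.log(LSTAT))', data=df).fit()
print(mod_bool.aic, mod_full.aic)            # lower AIC favours that specification
print(mod_bool.df_model, mod_full.df_model)  # the dummy coding spends more degrees of freedom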
In [17]:
fig, ax = plt.subplots(figsize=(10,8))
fig = sm.graphics.influence_plot(mod, ax=ax, criterion="cooks")
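The influence measures behind this plot are also available numerically (a hedged sketch using statsmodels' OLSInfluence helper):
influence = mod.get_influence()
cooks_d, _ = influence.cooks_distance                  # Cook's distance per observation
print(pd.Series(cooks_d, index=df.index).nlargest(5))  # the most influential rows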
In [18]:
fig, ax = plt.subplots(figsize=(10,8))
fig = sm.graphics.plot_leverage_resid2(mod, ax=ax)
The slope of the fitted line is that of exog_i in the full multiple regression. The individual points can be used to assess the influence of individual observations on the estimated coefficient. -- http://statsmodels.sourceforge.net/0.5.0/generated/statsmodels.graphics.regressionplots.plot_partregress.html
In [19]:
fig, ax = plt.subplots(figsize=(10,8))
fig = sm.graphics.plot_partregress("MEDV", "LSTAT", ["RAD", "RM"], data=df, ax=ax)
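The quoted claim can be checked numerically with a small Frisch-Waugh-Lovell demonstration (an added sketch, not in the original notebook): regress both MEDV and LSTAT on the other regressors, and the slope between the two sets of residuals matches the LSTAT coefficient in the full model.
full = ols('MEDV ~ LSTAT + RAD + RM', data=df).fit()
ey = ols('MEDV ~ RAD + RM', data=df).fit().resid   # MEDV purged of RAD and RM
ex = ols('LSTAT ~ RAD + RM', data=df).fit().resid  # LSTAT purged of RAD and RM
print(np.polyfit(ex, ey, 1)[0], full.params['LSTAT'])  # the two slopes agree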
In [20]:
fig, ax = plt.subplots(figsize=(10,14))
sm.graphics.plot_partregress_grid(mod, fig=fig)
Out[20]:
In [21]:
fig = plt.figure(figsize=(10,8))
fig = sm.graphics.plot_regress_exog(mod, "LSTAT", fig=fig)
In [22]:
fig = plt.figure(figsize=(10,8))
fig = sm.graphics.plot_regress_exog(mod, "I(np.log(LSTAT))", fig=fig)