Chapter 03
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline
In [2]:
auto_file_path = '../data/Auto'
autos = pd.read_table(auto_file_path, sep=r'\s+')
autos.head()
Out[2]:
In [3]:
# clean the data: treat '?' as missing, drop those rows, and make horsepower numeric
autos = autos.replace('?', np.nan).dropna()
autos['horsepower'] = autos['horsepower'].astype('float')
In [4]:
autos.plot.scatter(x='horsepower', y='mpg')
Out[4]:
In [5]:
g = sns.jointplot(x='horsepower', y='mpg', data=autos, kind='reg', xlim=(25, 225))
In [6]:
from IPython.display import HTML, display
import statsmodels.api as sm
from statsmodels.formula.api import ols
mpg_model = ols("mpg ~ horsepower", autos).fit()
mpg_model_summary=mpg_model.summary()
# render the fitted model summary as an HTML table
HTML(mpg_model_summary.as_html())
Out[6]:
In [56]:
# prediction interval (wls_prediction_std returns prediction-interval bounds, not confidence bounds)
from statsmodels.sandbox.regression.predstd import wls_prediction_std
_, pred_interval_lower, pred_interval_upper = wls_prediction_std(mpg_model)
x = autos['horsepower']
y = autos['mpg']
order = np.argsort(x.values)  # sort by horsepower so the interval curves plot smoothly
fig, ax = plt.subplots(figsize=(10,7))
ax.plot(x.values[order], pred_interval_lower[order], 'g--')
ax.plot(x.values[order], pred_interval_upper[order], 'g--')
ax.scatter(x, y)
Out[56]:
In [71]:
sns.lmplot(x='horsepower', y='mpg',data=autos)
Out[71]:
In [73]:
sns.residplot(x='horsepower', y='mpg',data=autos)
Out[73]:
In [67]:
from pandas.plotting import scatter_matrix
# mpg cylinders displacement horsepower weight acceleration
fig, ax = plt.subplots(figsize=(15, 15))
df_auto = autos[['mpg','cylinders','displacement','horsepower','weight','acceleration','year','origin']]
scatter_matrix(df_auto, alpha=0.5,diagonal='kde', ax=ax);
In [68]:
df_auto.corr()
Out[68]:
In [70]:
mpg_multi_model = ols('mpg ~ cylinders + displacement + horsepower + weight + acceleration + year + origin',
data=df_auto).fit()
mpg_multi_model_summary = mpg_multi_model.summary()
HTML(mpg_multi_model_summary.as_html())
Out[70]:
In [65]:
carseats_file_name = '../data/Carseats.csv'
carseats = pd.read_csv(carseats_file_name, index_col=0)
carseats.head()
Out[65]:
In [66]:
carseats_subset = carseats[['Sales','Price','Urban','US']]
carseats_subset=carseats_subset.replace(['Yes','No'],[1,-1])
sales_multi_model = ols('Sales ~ Price + Urban + US', data=carseats_subset).fit()
sales_multi_model_summary = sales_multi_model.summary()
HTML(sales_multi_model_summary.as_html())
Out[66]:
The null hypotheses for the Intercept, Price, and US coefficients can be rejected, while the null hypothesis for Urban cannot, so Urban is dropped in the next fit.
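As a quick sanity check, we could read the p-values directly off the fitted results object (an extra cell, not part of the original notebook):
In [ ]:
# extra check cell: list the coefficients whose p-values fall below 0.05
sales_multi_model.pvalues[sales_multi_model.pvalues < 0.05]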
In [69]:
sales_multi_model = ols('Sales ~ Price + US', data=carseats_subset).fit()
sales_multi_model_summary = sales_multi_model.summary()
HTML(sales_multi_model_summary.as_html())
Out[69]:
In [71]:
from statsmodels.graphics.regressionplots import plot_leverage_resid2
plot_leverage_resid2(sales_multi_model);
In [75]:
fig, ax = plt.subplots(figsize=(10,10))
fig=sm.graphics.influence_plot(sales_multi_model, ax=ax, criterion="cooks")
In [7]:
np.random.seed(1)
x = np.random.normal(0,1,100)
y = 2*x + np.random.normal(0, 1, 100)
In [8]:
df = pd.DataFrame({'x':x,'y':y})
df_y_x_model = ols('y ~ x + 0', data=df).fit()
df_y_x_model_summary = df_y_x_model.summary()
HTML(df_y_x_model_summary.as_html())
Out[8]:
In [9]:
df_x_y_model = ols('x ~ y + 0', data=df).fit()
df_x_y_model_summary = df_x_y_model.summary()
HTML(df_x_y_model_summary.as_html())
Out[9]:
The model $y=2x+\epsilon$ can be rewritten as $x=0.5(y-\epsilon)$, yet the two no-intercept fits above do not report reciprocal slopes; what they do share is the $t$-statistic.
For a regression through the origin the estimate is exactly $\hat{\beta}=\frac{\sum_{i=1}^n x_iy_i}{\sum_{i=1}^{n}x_i^2}$ (and since $x$ is drawn from a standard normal distribution, $\bar{x}\approx 0$ and $\bar{y}\approx 0$, so this is close to the usual slope $\frac{\sum_{i=1}^n(x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^{n}(x_i-\bar{x})^2}$). Its $t$-statistic is $$ t=\frac{\hat{\beta}}{\text{SE}(\hat{\beta})}=\frac{\sum x_iy_i}{\sum x_i^2}\sqrt{\frac{(n-1)\sum x_i^2}{\sum (y_i-x_i\hat{\beta})^2}}=\frac{\sqrt{n-1}\,\sum x_iy_i}{\sqrt{\sum x_i^2\sum y_i^2-\left(\sum x_iy_i\right)^2}}, $$ which is symmetric in $x$ and $y$; this is why regressing $y$ on $x$ and $x$ on $y$ without an intercept produces the same $t$-statistic.
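As a sanity check, we could evaluate this symmetric expression directly and compare it with the t-statistics reported by statsmodels for both no-intercept fits (an extra cell, not part of the original notebook):
In [ ]:
# manual t-statistic for the no-intercept regressions; the formula is symmetric in x and y,
# so it should match the 'x' t-value of y ~ x + 0 and the 'y' t-value of x ~ y + 0
n = len(x)
t_manual = np.sqrt(n - 1) * np.sum(x*y) / np.sqrt(np.sum(x**2)*np.sum(y**2) - np.sum(x*y)**2)
print(t_manual, df_y_x_model.tvalues['x'], df_x_y_model.tvalues['y'])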
In [10]:
df_y_x_intercept_model = ols('y ~ x', data=df).fit()
df_y_x_intercept_model_summary = df_y_x_intercept_model.summary()
HTML(df_y_x_intercept_model_summary.as_html())
Out[10]:
In [11]:
df_x_y_intercept_model = ols('x ~ y', data=df).fit()
df_x_y_intercept_model_summary = df_x_y_intercept_model.summary()
HTML(df_x_y_intercept_model_summary.as_html())
Out[11]:
In [12]:
np.random.seed(1)
x = np.random.normal(0,1,100)
In [13]:
eps = np.random.normal(0,0.25,100)
In [14]:
y = -1.0 + 0.5*x+eps
df = pd.DataFrame({'x':x,'y':y})
In [17]:
df.plot.scatter(x='x',y='y');
In [18]:
lm_model = ols('y ~ x', data=df).fit()
lm_model_summary = lm_model.summary()
HTML(lm_model_summary.as_html())
Out[18]:
In [20]:
X = np.linspace(-2,2,100)
y_pred = X*0.5239 + (-0.9632)  # estimated coefficients taken from the summary table above
y_actu = X*0.5 + (-1.0)        # true population regression line: y = -1 + 0.5x
plt.plot(X, y_pred, 'r', label='Predicted')
plt.plot(X, y_actu, 'b', label='Actual')
plt.scatter(x,y)
plt.legend()
plt.show()
In [23]:
df['x2'] = x**2
lm_quadratic_model = ols('y ~ x + x2', data= df).fit()
lm_quadratic_model_summary = lm_quadratic_model.summary()
HTML(lm_quadratic_model_summary.as_html())
Out[23]:
The $p$-value for the $x^2$ term is large, so we cannot reject the null hypothesis that its coefficient is zero; there is no evidence of a quadratic relationship.
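As a cross-check, a nested F-test between the linear and quadratic fits tells the same story (an extra cell, not part of the original notebook):
In [ ]:
# compare the nested models with an F-test; a large p-value means the quadratic term adds little
sm.stats.anova_lm(lm_model, lm_quadratic_model)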
In [77]:
np.random.seed(1)
x1 = np.random.uniform(0,1,100)
# true model: y = 2 + 2*x1 + 0.3*x2 + eps, with x1 and x2 correlated by construction
x2 = 0.5*x1 + np.random.normal(0, 1, 100)/10
y = 2 + 2*x1 + 0.3*x2 + np.random.normal(0, 1, 100)
In [78]:
df = pd.DataFrame({'x1':x1,'x2':x2,'y':y})
df[['x1','x2']].corr()
Out[78]:
In [79]:
df[['x1','x2']].plot.scatter(x='x1',y='x2');
In [80]:
lm_y_x1_x2_model = ols('y ~ x1 + x2', data=df).fit()
lm_y_x1_x2_model_summary = lm_y_x1_x2_model.summary()
HTML(lm_y_x1_x2_model_summary.as_html())
Out[80]:
From the table above, the estimates $\hat{\beta}_0$, $\hat{\beta}_1$, $\hat{\beta}_2$ differ considerably from the true values $\beta_0=2$, $\beta_1=2$, $\beta_2=0.3$: because $x_1$ and $x_2$ are highly correlated, the individual coefficients are estimated very imprecisely.
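One way to quantify this collinearity is with variance inflation factors (an extra cell, not part of the original notebook):
In [ ]:
# variance inflation factors: values much larger than 1 signal collinearity between x1 and x2
from statsmodels.stats.outliers_influence import variance_inflation_factor
exog = sm.add_constant(df[['x1', 'x2']])
for i, name in enumerate(exog.columns):
    print(name, variance_inflation_factor(exog.values, i))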
In [81]:
lm_y_x1_model = ols('y ~ x1', data=df).fit()
lm_y_x1_model_summary = lm_y_x1_model.summary()
HTML(lm_y_x1_model_summary.as_html())
Out[81]:
In [82]:
lm_y_x2_model = ols('y ~ x2', data=df).fit()
lm_y_x2_model_summary = lm_y_x2_model.summary()
HTML(lm_y_x2_model_summary.as_html())
Out[82]:
In [83]:
x1 = np.hstack((x1,[0.6]))
x2 = np.hstack([x2,[0.8]])
y = np.hstack([y, 6])
df = pd.DataFrame({'x1':x1,'x2':x2,'y':y})
lm_y_x1_x2_model = ols('y ~ x1 + x2',data=df).fit()
lm_y_x1_x2_model_summary = lm_y_x1_x2_model.summary()
HTML(lm_y_x1_x2_model_summary.as_html())
Out[83]:
In [85]:
from statsmodels.graphics.regressionplots import plot_leverage_resid2
plot_leverage_resid2(lm_y_x1_x2_model);
In [46]:
boston_file_name = '../data/Boston.csv'
bostons = pd.read_csv(boston_file_name, index_col=0)
bostons.head()
Out[46]:
In [47]:
print(ols('crim ~ zn',data=bostons).fit().summary())
In [48]:
print(ols('crim ~ indus',data=bostons).fit().summary())
In [49]:
print(ols('crim ~ chas',data=bostons).fit().summary())
In [50]:
print(ols('crim ~ nox',data=bostons).fit().summary())
In [51]:
print(ols('crim ~ rm',data=bostons).fit().summary())
In [52]:
print(ols('crim ~ age',data=bostons).fit().summary())
In [53]:
print(ols('crim ~ dis',data=bostons).fit().summary())
In [54]:
print(ols('crim ~ rad',data=bostons).fit().summary())
In [55]:
print(ols('crim ~ tax',data=bostons).fit().summary())
In [56]:
print(ols('crim ~ ptratio',data=bostons).fit().summary())
In [57]:
print(ols('crim ~ black',data=bostons).fit().summary())
In [58]:
print(ols('crim ~ lstat',data=bostons).fit().summary())
In [59]:
print(ols('crim ~ medv',data=bostons).fit().summary())
In [63]:
print(ols('crim ~ zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat + medv',
data=bostons).fit().summary())