In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
In [2]:
# Boston housing data; `medv` is the quantity to be predicted.
# The CSV carries an extra 'Unnamed: 0' column, which is dropped right after loading.
boston_df = pd.read_csv('data/boston.csv').drop('Unnamed: 0', axis=1)
In [3]:
# Display the first rows of the loaded frame.
boston_df.head()
Out[3]:
In [18]:
# Display summary statistics for the columns of the frame.
boston_df.describe()
Out[18]:
In [73]:
# Scatter plot of the two variables used below.
#   lstat - % of population with low status
#   medv  - median value of home
fig, ax = plt.subplots()
ax.scatter(boston_df['lstat'], boston_df['medv'])
ax.set_xlabel('lstat')
ax.set_ylabel('medv')
Out[73]:
In [56]:
# Simple regression medv ~ lstat with statsmodels OLS (Ordinary Least Squares).
y = boston_df['medv'].values
# add_constant adds the intercept column to the design matrix.
X = sm.add_constant(boston_df['lstat'].values)
ols = sm.OLS(endog=y, exog=X).fit()
# statsmodels is statistics-oriented: the summary reports the evaluation of the
# parameter estimates.
ols.summary()
Out[56]:
In [67]:
# The same fit with scikit-learn.
# X is a 2-D array (one row per sample, here a single feature column);
# y is a 1-D array of targets.
X = boston_df[['lstat']].values
y = boston_df['medv'].values
reg = LinearRegression().fit(X, y)
(reg.intercept_, reg.coef_)
Out[67]:
In [78]:
# Visualisation: scatter of the data with the fitted regression line on top.
fig, ax = plt.subplots()
ax.scatter(x=boston_df['lstat'], y=boston_df['medv'])
ax.set_xlabel('lstat')
ax.set_ylabel('medv')
# Take the plotting range from the lstat column itself rather than from
# whichever `X` happens to be live in the kernel (after an add_constant cell,
# X[:, 0] is the column of ones and the range would be empty).
lstats = boston_df['lstat'].values
# A float grid covers the full observed range; int() truncation would clip it.
xs = np.linspace(lstats.min(), lstats.max(), 100)
# predict() expects a 2-D (n_samples, n_features) array, so reshape the grid
# into a single column and predict all points in one call.
ys = reg.predict(xs.reshape(-1, 1))
ax.plot(xs, ys, 'r', linewidth=2.5)
Out[78]:
In [12]:
# Linear regression on two predictors: medv ~ lstat + age.
y = boston_df['medv'].values
# add_constant adds the intercept column to the design matrix.
X = sm.add_constant(boston_df[['lstat', 'age']].values)
ols2 = sm.OLS(endog=y, exog=X).fit()
ols2.summary()
Out[12]:
In [15]:
# Fit using all features.
# Exclude the target by name instead of by position: columns[0:-1] only drops
# `medv` while it is the last column, and silently feeds the target in as a
# feature once any derived column has been appended to boston_df.
# NOTE(review): assumes every remaining column is a predictor - confirm.
xcols = boston_df.columns.drop('medv')
X = boston_df[xcols]
# Define the target here so the cell does not depend on a `y` left over from
# an earlier cell.
y = boston_df['medv'].values
reg3 = LinearRegression()
reg3.fit(X, y)
Out[15]:
In [18]:
# Display the fitted intercept and coefficients of reg3.
reg3.intercept_,reg3.coef_
Out[18]:
In [10]:
# Predicted values and prediction errors.
# Residual plot: used to identify non-linearity; in a linear regression model
# the relationship between the variables is often not actually linear.
def residuals_vs_fitted(fitted, residuals, x_label='Fitted', y_label='Residuals'):
    """Scatter residuals against fitted values and overlay a quadratic trend line.

    Parameters
    ----------
    fitted : 1-D array-like
        Model predictions, drawn on the x axis.
    residuals : 1-D array-like
        Residuals matching `fitted` element by element, drawn on the y axis.
    x_label, y_label : str
        Axis labels.

    Draws on the current pyplot figure; nothing is returned.
    """
    plt.subplot(111)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.scatter(fitted, residuals)
    # Degree-2 least-squares fit of the residuals: a smooth curve that shows
    # the trend of the residuals.
    polyline = np.poly1d(np.polyfit(fitted, residuals, 2))
    # Evaluate the curve on a float grid spanning the whole fitted range.
    # An integer range() would stop short of the maximum and would be empty
    # whenever the fitted values span less than one unit.
    xs = np.linspace(np.min(fitted), np.max(fitted), 100)
    plt.plot(xs, polyline(xs), color='r', linewidth=2.5)
def qq_plot(residuals):
    """Draw a Q-Q plot of `residuals` via statsmodels' qqplot with its default options.

    Nothing is returned.
    """
    sm.qqplot(data=residuals)
def standardize(xs):
    """Return `xs` shifted by its mean and divided by its standard deviation.

    The mean and standard deviation are computed with np.mean and np.std using
    their default arguments. `xs` must support element-wise arithmetic
    (e.g. a NumPy array).
    """
    centred = xs - np.mean(xs)
    return centred / np.std(xs)
In [29]:
# Residual diagnostics for the model held in reg3.
fitted = reg3.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted=fitted, residuals=residuals)
# The residual plot is U-shaped, which indicates non-linearity in the data.
In [31]:
# Q-Q plot of the residuals against the normal distribution; line='r' overlays
# a regression line fitted to the plotted points.
fig = sm.qqplot(residuals,dist='norm',line='r')
In [32]:
# Same diagnostic plot, with the standardized residuals on the y axis.
residuals_vs_fitted(fitted,std_residuals,"Fitted","Std.Residual")
In [4]:
# Fit medv ~ lstat * age: add the interaction term as a new column.
# The column label is spelled "lastat*age"; the model cell below selects it by
# that exact name, so the spelling is kept.
boston_df["lastat*age"] = boston_df["lstat"].mul(boston_df['age'])
boston_df.head()
Out[4]:
In [8]:
# Model with the interaction term: features lstat, age and their product.
X = boston_df[["lstat", "age", "lastat*age"]]
y = boston_df["medv"]
reg3 = LinearRegression().fit(X, y)
reg3.intercept_, reg3.coef_
Out[8]:
In [11]:
# Residual diagnostics for the model held in reg3.
fitted = reg3.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted=fitted, residuals=residuals)
In [16]:
# Fit medv ~ lstat + I(lstat^2): add a quadratic term.
boston_df['lstat^2'] = boston_df["lstat"]**2
reg4 = LinearRegression()
X = boston_df[["lstat","lstat^2"]]
y = boston_df["medv"]
reg4.fit(X,y)
lstats = boston_df['lstat'].values
# Float grid over the full observed lstat range; an integer range() would
# truncate both ends.
xs = np.linspace(np.min(lstats), np.max(lstats), 100)
# predict() expects a 2-D (n_samples, n_features) input. Build the grid's
# feature matrix with the same column labels used for fitting and predict all
# points in one call, instead of one 1-D sample per call.
ys6 = reg4.predict(pd.DataFrame(np.column_stack([xs, xs**2]), columns=X.columns))
reg4.intercept_,reg4.coef_
Out[16]:
In [17]:
# Residual diagnostics for the quadratic model.
fitted = reg4.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted=fitted, residuals=residuals)
# Adding the quadratic term improved the fit.
# When the residual plot reveals a non-linear relationship, a simple remedy is
# to introduce non-linear terms such as X^2 or log X.
In [18]:
# Quartic fit: add the fourth and third powers of lstat as feature columns.
boston_df['lstat^4'] = boston_df['lstat'] ** 4
boston_df['lstat^3'] = boston_df['lstat'] ** 3
模型:$y = \beta_0 + \beta_1 X + \beta_2 X^2 + \beta_3 X^3 + \beta_4 X^4$
In [24]:
# Quartic model: features are lstat and its 2nd, 3rd and 4th powers.
X = boston_df[['lstat','lstat^2','lstat^3','lstat^4']]
y = boston_df['medv']
reg5 = LinearRegression()
reg5.fit(X,y)
# `xs` is the plotting grid defined in the quadratic-fit cell.
# predict() expects a 2-D (n_samples, n_features) input: stack the four powers
# of the grid as columns, labelled like the training columns, and predict all
# points in one call instead of one 1-D sample per call.
lstat_grid = np.asarray(xs, dtype=float)
ys5 = reg5.predict(
    pd.DataFrame(
        np.column_stack([lstat_grid, lstat_grid**2, lstat_grid**3, lstat_grid**4]),
        columns=X.columns,
    )
)
reg5.intercept_,reg5.coef_
Out[24]:
In [26]:
# Residual diagnostics for the quartic model.
fitted = reg5.predict(X)
residuals = y - fitted
std_residuals = standardize(residuals)
residuals_vs_fitted(fitted=fitted, residuals=residuals)
In [32]:
# Compare the quadratic and quartic fits against the data.
fig,ax = plt.subplots()
ax.scatter(boston_df['lstat'],boston_df['medv'])
ax.set_xlabel('lstat')
ax.set_ylabel('medv')
# Label both curves so the figure is readable on its own.
ax.plot(xs,ys6,color='r',linewidth=2.5,label='quadratic fit')  # quadratic fit, drawn in red
ax.plot(xs,ys5,color='g',linewidth=2.5,label='quartic fit')  # quartic fit, drawn in green
ax.legend()
Out[32]:
In [ ]: