In [23]:
import math
import matplotlib.pyplot as plt
import random

In [45]:
# Draw 300 distinct integers from [0, 10000).
# Seed the PRNG first so the whole notebook is reproducible
# under Restart Kernel -> Run All.
random.seed(42)
x = random.sample(range(10000), 300)

In [46]:
# Target variable: sqrt of each sample plus uniform integer noise in [0, 100].
y = []
for sample in x:
    y.append(math.sqrt(sample) + random.randint(0, 100))

In [47]:
# Scatter plot of the noisy sqrt relationship between x and y.
plt.scatter(x,y)


Out[47]:
<matplotlib.collections.PathCollection at 0x7faf00fa6c18>

In [48]:
# Render the pending matplotlib figure.
plt.show()



In [49]:
import numpy as np
import pandas as pd
from numpy.linalg import inv
from numpy import dot

In [54]:
# Design matrix for the normal equation: one feature column plus an
# explicit intercept ("constant") column of ones.
Xb = pd.DataFrame()
Xb['data'] = x
# len(x) instead of the magic number 300, so this cell stays correct
# if the sample size in the cell above ever changes.
Xb['constant'] = np.ones(len(x))

The normal equation $\hat{\beta} = (X^T X)^{-1} X^T y$ is the closed-form least-squares solution; it yields the linear-regression parameters (slope and intercept) directly.


In [55]:
# Ordinary-least-squares estimate from the normal equation:
#   beta = (X^T X)^{-1} X^T y
# Solving the linear system with np.linalg.solve is preferred over
# explicitly forming the inverse: it is faster and numerically more stable.
np.linalg.solve(np.dot(Xb.T, Xb), np.dot(Xb.T, y))


Out[55]:
array([  8.66251066e-03,   7.45185183e+01])

In [56]:
import statsmodels.api as sm

In [65]:
# Wrap the raw lists as pandas Series for statsmodels, and prepend an
# intercept column (statsmodels OLS does not add one automatically).
dy = pd.Series(y)
dx = sm.add_constant(pd.Series(x))

In [66]:
# Ordinary-least-squares model: dy regressed on dx (feature + intercept).
linear_regression=sm.OLS(dy,dx)

In [67]:
# Estimate the coefficients; returns a statsmodels results wrapper.
model_fitted=linear_regression.fit()

In [68]:
# Full regression report: coefficients, R-squared, diagnostics.
model_fitted.summary()


Out[68]:
OLS Regression Results
Dep. Variable: y R-squared: 0.369
Model: OLS Adj. R-squared: 0.366
Method: Least Squares F-statistic: 173.9
Date: Tue, 07 Mar 2017 Prob (F-statistic): 1.36e-31
Time: 16:22:59 Log-Likelihood: -1455.7
No. Observations: 300 AIC: 2915.
Df Residuals: 298 BIC: 2923.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 74.5185 3.795 19.636 0.000 67.050 81.987
None 0.0087 0.001 13.187 0.000 0.007 0.010
Omnibus: 172.203 Durbin-Watson: 2.095
Prob(Omnibus): 0.000 Jarque-Bera (JB): 18.846
Skew: -0.107 Prob(JB): 8.08e-05
Kurtosis: 1.791 Cond. No. 1.22e+04

In [69]:
# Predict for a new observation x = 20. The input row must carry a
# leading 1 for the intercept term, matching the design-matrix layout.
xp=20
Xp=np.array([1,xp])
print(model_fitted.predict(Xp))


[ 74.6917685]

In [71]:
# Cross-check the statsmodels prediction by hand: inner product of the
# prediction row with the coefficients [intercept, slope] obtained from
# the normal-equation cell above.
coefficients = np.array([7.45185183e+01, 8.66251066e-03])
Xp @ coefficients


Out[71]:
74.691768513200003

In [ ]: