In [44]:
# This notebook needs to run under Python 2.7 (and an older scikit-learn: load_boston has been removed from recent releases)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.datasets import load_boston
from sklearn import linear_model

pd.set_option('display.precision', 3)
pd.set_option('display.width',160)

In [45]:
boston=load_boston()
dataset = pd.DataFrame(boston.data,columns=boston.feature_names)
dataset['target'] = boston.target

print(dataset.describe())


         CRIM      ZN   INDUS    CHAS     NOX      RM     AGE     DIS     RAD     TAX  PTRATIO       B   LSTAT  target
count  506.00  506.00  506.00  506.00  506.00  506.00  506.00  506.00  506.00  506.00   506.00  506.00  506.00  506.00
mean     3.59   11.36   11.14    0.07    0.55    6.28   68.57    3.80    9.55  408.24    18.46  356.67   12.65   22.53
std      8.60   23.32    6.86    0.25    0.12    0.70   28.15    2.11    8.71  168.54     2.16   91.29    7.14    9.20
min      0.01    0.00    0.46    0.00    0.39    3.56    2.90    1.13    1.00  187.00    12.60    0.32    1.73    5.00
25%      0.08    0.00    5.19    0.00    0.45    5.89   45.02    2.10    4.00  279.00    17.40  375.38    6.95   17.02
50%      0.26    0.00    9.69    0.00    0.54    6.21   77.50    3.21    5.00  330.00    19.05  391.44   11.36   21.20
75%      3.65   12.50   18.10    0.00    0.62    6.62   94.07    5.19   24.00  666.00    20.20  396.23   16.96   25.00
max     88.98  100.00   27.74    1.00    0.87    8.78  100.00   12.13   24.00  711.00    22.00  396.90   37.97   50.00

In [46]:
observations = len(dataset)
variables = dataset.columns[:-1]
X = dataset.iloc[:, :-1]
y = dataset['target'].values

In [47]:
variables


Out[47]:
Index([u'CRIM', u'ZN', u'INDUS', u'CHAS', u'NOX', u'RM', u'AGE', u'DIS', u'RAD', u'TAX', u'PTRATIO', u'B', u'LSTAT'], dtype='object')

In [48]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [49]:
Xc = sm.add_constant(X)
linear_regression = sm.OLS(y,Xc)
fitted_model = linear_regression.fit()
fitted_model.summary()


Out[49]:
                            OLS Regression Results
==============================================================================
Dep. Variable:                      y   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.734
Method:                 Least Squares   F-statistic:                     108.1
Date:                Tue, 08 Nov 2016   Prob (F-statistic):          6.95e-135
Time:                        09:49:05   Log-Likelihood:                -1498.8
No. Observations:                 506   AIC:                             3026.
Df Residuals:                     492   BIC:                             3085.
Df Model:                          13
==============================================================================
                 coef    std err          t      P>|t|     [95.0% Conf. Int.]
------------------------------------------------------------------------------
const         36.4911      5.104      7.149      0.000        26.462    46.520
CRIM          -0.1072      0.033     -3.276      0.001        -0.171    -0.043
ZN             0.0464      0.014      3.380      0.001         0.019     0.073
INDUS          0.0209      0.061      0.339      0.735        -0.100     0.142
CHAS           2.6886      0.862      3.120      0.002         0.996     4.381
NOX          -17.7958      3.821     -4.658      0.000       -25.302   -10.289
RM             3.8048      0.418      9.102      0.000         2.983     4.626
AGE            0.0008      0.013      0.057      0.955        -0.025     0.027
DIS           -1.4758      0.199     -7.398      0.000        -1.868    -1.084
RAD            0.3057      0.066      4.608      0.000         0.175     0.436
TAX           -0.0123      0.004     -3.278      0.001        -0.020    -0.005
PTRATIO       -0.9535      0.131     -7.287      0.000        -1.211    -0.696
B              0.0094      0.003      3.500      0.001         0.004     0.015
LSTAT         -0.5255      0.051    -10.366      0.000        -0.625    -0.426
==============================================================================
Omnibus:                      178.029   Durbin-Watson:                   1.078
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              782.015
Skew:                           1.521   Prob(JB):                    1.54e-170
Kurtosis:                       8.276   Cond. No.                     1.51e+04
==============================================================================
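
The large condition number (1.51e+04) already hints at collinear predictors. As an added sketch (not part of the original run), variance inflation factors from statsmodels give a per-variable view of the same issue:

In [ ]:
# Added sketch, not executed in the original notebook: one VIF per predictor.
# Values far above 10 flag variables that are close to linear combinations
# of the others.
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.Series([variance_inflation_factor(Xc.values, i) for i in range(1, Xc.shape[1])],
                index=Xc.columns[1:])
print(vif)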

In [50]:
X = dataset.iloc[:, :-1]
correlation_matrix = X.corr()
print(correlation_matrix)


         CRIM    ZN  INDUS  CHAS   NOX    RM   AGE   DIS   RAD   TAX  PTRATIO     B  LSTAT
CRIM     1.00 -0.20   0.40 -0.06  0.42 -0.22  0.35 -0.38  0.62  0.58     0.29 -0.38   0.45
ZN      -0.20  1.00  -0.53 -0.04 -0.52  0.31 -0.57  0.66 -0.31 -0.31    -0.39  0.18  -0.41
INDUS    0.40 -0.53   1.00  0.06  0.76 -0.39  0.64 -0.71  0.60  0.72     0.38 -0.36   0.60
CHAS    -0.06 -0.04   0.06  1.00  0.09  0.09  0.09 -0.10 -0.01 -0.04    -0.12  0.05  -0.05
NOX      0.42 -0.52   0.76  0.09  1.00 -0.30  0.73 -0.77  0.61  0.67     0.19 -0.38   0.59
RM      -0.22  0.31  -0.39  0.09 -0.30  1.00 -0.24  0.21 -0.21 -0.29    -0.36  0.13  -0.61
AGE      0.35 -0.57   0.64  0.09  0.73 -0.24  1.00 -0.75  0.46  0.51     0.26 -0.27   0.60
DIS     -0.38  0.66  -0.71 -0.10 -0.77  0.21 -0.75  1.00 -0.49 -0.53    -0.23  0.29  -0.50
RAD      0.62 -0.31   0.60 -0.01  0.61 -0.21  0.46 -0.49  1.00  0.91     0.46 -0.44   0.49
TAX      0.58 -0.31   0.72 -0.04  0.67 -0.29  0.51 -0.53  0.91  1.00     0.46 -0.44   0.54
PTRATIO  0.29 -0.39   0.38 -0.12  0.19 -0.36  0.26 -0.23  0.46  0.46     1.00 -0.18   0.37
B       -0.38  0.18  -0.36  0.05 -0.38  0.13 -0.27  0.29 -0.44 -0.44    -0.18  1.00  -0.37
LSTAT    0.45 -0.41   0.60 -0.05  0.59 -0.61  0.60 -0.50  0.49  0.54     0.37 -0.37   1.00
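
The same pairs can be picked out programmatically; a small added sketch (not in the original run) that lists the pairs whose absolute correlation exceeds the 0.5 hurdle used for the heatmap below:

In [ ]:
# Added sketch: variable pairs with |correlation| above the 0.5 hurdle.
R_abs = X.corr().abs()
pairs = [(variables[i], variables[j], round(R_abs.iloc[i, j], 2))
         for i in range(len(variables))
         for j in range(i + 1, len(variables))
         if R_abs.iloc[i, j] > 0.5]
print(pairs)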

In [51]:
def visualize_correlation_matrix(data, hurdle=0.0):
    # Correlation matrix of the features; entries below the hurdle
    # (in absolute value) are zeroed out so only strong relations remain
    R = np.corrcoef(data, rowvar=False)
    R[np.where(np.abs(R) < hurdle)] = 0.0
    heatmap = plt.pcolor(R, cmap=mpl.cm.coolwarm, alpha=0.8)
    heatmap.axes.set_frame_on(False)
    # Center one tick (and variable label) on each cell of the heatmap
    heatmap.axes.set_yticks(np.arange(R.shape[0]) + 0.5, minor=False)
    heatmap.axes.set_xticks(np.arange(R.shape[1]) + 0.5, minor=False)
    heatmap.axes.set_xticklabels(variables, minor=False)
    heatmap.axes.set_yticklabels(variables, minor=False)
    plt.xticks(rotation=90)
    plt.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False)
    plt.colorbar()
    plt.show()

visualize_correlation_matrix(X,hurdle=0.5)



In [52]:
corr = np.corrcoef(X,rowvar=0)
eigenvalues, eigenvectors = np.linalg.eig(corr)

print (eigenvalues)


[ 6.12265476  1.43206335  1.24116299  0.85779892  0.83456618  0.65965056
  0.53901749  0.39654415  0.06351553  0.27743495  0.16916744  0.18616388
  0.22025981]

In [53]:
print(eigenvectors[:,8])


[-0.04552843  0.08089873  0.25126664 -0.03590431 -0.04389033 -0.04580522
  0.03870705  0.01828389  0.63337285 -0.72024335 -0.02350903  0.00485021
 -0.02477196]

In [54]:
print(variables[2],variables[8],variables[9])


('INDUS', 'RAD', 'TAX')
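
The eigenvalue spread can also be condensed into a single figure; an added sketch computing the condition index of the correlation matrix (based on the standardized variables, so it is on a different scale than the Cond. No. reported in the OLS summary above):

In [ ]:
# Added sketch: condition index of the correlation matrix, i.e. the square
# root of the ratio between its largest and smallest eigenvalue.
print(np.sqrt(eigenvalues.max() / eigenvalues.min()))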

In [59]:
#feature scaling
from sklearn.preprocessing import StandardScaler

observations = len(dataset)
variables = dataset.columns
standardization = StandardScaler()
Xst = standardization.fit_transform(X)
original_means = standardization.mean_
original_stds = standardization.scale_
original_var = standardization.var_
Xst = np.column_stack((Xst,np.ones(observations)))
y = dataset['target'].values

print(original_means)
print(original_stds)
print(original_var)


[  3.59376071e+00   1.13636364e+01   1.11367787e+01   6.91699605e-02
   5.54695059e-01   6.28463439e+00   6.85749012e+01   3.79504269e+00
   9.54940711e+00   4.08237154e+02   1.84555336e+01   3.56674032e+02
   1.26530632e+01]
[  8.58828355e+00   2.32993957e+01   6.85357058e+00   2.53742935e-01
   1.15763115e-01   7.01922514e-01   2.81210326e+01   2.10362836e+00
   8.69865112e+00   1.68370495e+02   2.16280519e+00   9.12046075e+01
   7.13400164e+00]
[  7.37586143e+01   5.42861840e+02   4.69714297e+01   6.43854770e-02
   1.34010989e-02   4.92695216e-01   7.90792473e+02   4.42525226e+00
   7.56665313e+01   2.83486236e+04   4.67772630e+00   8.31828042e+03
   5.08939794e+01]
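
Keeping the means and standard deviations makes the transformation reversible; an added sketch (not in the original run) that checks the round trip:

In [ ]:
# Added sketch: map the standardized features (the last column of Xst is the
# constant term) back to the original scale and verify the round trip.
X_back = Xst[:, :-1] * original_stds + original_means
print(np.allclose(X_back, X.values))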

In [56]:
print(X.head())
print(Xst)


   CRIM  ZN  INDUS  CHAS   NOX    RM   AGE   DIS  RAD  TAX  PTRATIO       B  LSTAT
0  0.01  18   2.31     0  0.54  6.58  65.2  4.09    1  296     15.3  396.90   4.98
1  0.03   0   7.07     0  0.47  6.42  78.9  4.97    2  242     17.8  396.90   9.14
2  0.03   0   7.07     0  0.47  7.18  61.1  4.97    2  242     17.8  392.83   4.03
3  0.03   0   2.18     0  0.46  7.00  45.8  6.06    3  222     18.7  394.63   2.94
4  0.07   0   2.18     0  0.46  7.15  54.2  6.06    3  222     18.7  396.90   5.33
[[-0.41771335  0.28482986 -1.2879095  ...,  0.44105193 -1.0755623   1.        ]
 [-0.41526932 -0.48772236 -0.59338101 ...,  0.44105193 -0.49243937  1.        ]
 [-0.41527165 -0.48772236 -0.59338101 ...,  0.39642699 -1.2087274   1.        ]
 ..., 
 [-0.41137448 -0.48772236  0.11573841 ...,  0.44105193 -0.98304761  1.        ]
 [-0.40568883 -0.48772236  0.11573841 ...,  0.4032249  -0.86530163  1.        ]
 [-0.41292893 -0.48772236  0.11573841 ...,  0.44105193 -0.66905833  1.        ]]
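
Since Xst already carries the constant column, the regression can be refit on the standardized predictors; an added sketch (not executed in the original notebook) whose coefficients share a common scale and can be compared by magnitude:

In [ ]:
# Added sketch: OLS on the standardized design matrix; the last parameter
# corresponds to the constant column appended above.
standardized_regression = sm.OLS(y, Xst).fit()
print(standardized_regression.params)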

In [ ]: