In [127]:
%matplotlib inline
In [8]:
import numpy as np
import pandas as pd
# statistics tools
import statsmodels.api as sm
import statsmodels.tsa.api as tsa
from patsy import dmatrices
# custom spatial statistics tools (own module)
from spatialstat import *
# plotting
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot  # pandas.tools.plotting has been removed in current pandas
# clustering
from sklearn import mixture
In [576]:
df = pd.read_csv('bukken_data.csv')
# keep only listings with rent below 300,000 yen
df = df[df['pay'] < 300000]
df = df.reset_index(drop=True)
In [417]:
df.columns
Out[417]:
Index(['index', 'apart_dummy', 'building_year', 'dk', 'fX', 'fY', 'floor', 'k',
'lk', 'mansyon_dumy', 'new_dummy', 'pay', 'published_date', 'r',
'rc_dummy', 'room_nums', 'sdk', 'sk', 'sldk', 'slk',
'south_direction_dummy', 'square', 'teiki_syakuya_dummy',
'walk_minute_dummy'],
dtype='object')
In [574]:
# Hedonic regression: log rent regressed on the listed property attributes
vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
        'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
eq = fml_build(vars)
y, X = dmatrices(eq, data=df, return_type='dataframe')
logy = np.log(y)
model = sm.OLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.807
Model: OLS Adj. R-squared: 0.805
Method: Least Squares F-statistic: 453.8
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:59:55 Log-Likelihood: 660.63
No. Observations: 1427 AIC: -1293.
Df Residuals: 1413 BIC: -1220.
Df Model: 13
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.6474 0.017 328.289 0.000 5.614 5.681
square 0.0237 0.001 39.271 0.000 0.023 0.025
k -0.0600 0.016 -3.660 0.000 -0.092 -0.028
lk -1.674e-14 6.12e-16 -27.375 0.000 -1.79e-14 -1.55e-14
dk 0.0173 0.016 1.068 0.286 -0.014 0.049
sdk -0.0807 0.154 -0.525 0.600 -0.382 0.221
sldk -0.2233 0.110 -2.039 0.042 -0.438 -0.009
south_direction_dummy -0.0134 0.012 -1.166 0.244 -0.036 0.009
building_year -0.0092 0.000 -23.890 0.000 -0.010 -0.008
new_dummy -0.0173 0.009 -1.966 0.049 -0.034 -4.25e-05
mansyon_dumy 5.6474 0.017 328.289 0.000 5.614 5.681
teiki_syakuya_dummy 0.0140 0.027 0.511 0.610 -0.040 0.068
walk_minute_dummy -0.0023 0.003 -0.681 0.496 -0.009 0.004
r -0.0584 0.016 -3.546 0.000 -0.091 -0.026
rc_dummy 0.0557 0.018 3.015 0.003 0.019 0.092
room_nums -0.0651 0.014 -4.649 0.000 -0.093 -0.038
==============================================================================
Omnibus: 1699.589 Durbin-Watson: 1.411
Prob(Omnibus): 0.000 Jarque-Bera (JB): 553540.298
Skew: -5.637 Prob(JB): 0.00
Kurtosis: 98.826 Cond. No. 2.60e+17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.51e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [575]:
pred = results.predict()
# predictions are of log rent, so exponentiate before taking residuals in yen
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)
Out[575]:
<matplotlib.axes._subplots.AxesSubplot at 0x120172a58>
In [442]:
n = 20
# GMM clustering on rent and (scaled) coordinates; the cluster labels become dummy variables
cluster_array = np.array([df['pay'], df['fX']*100, df['fY']*100])
gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
dum = pd.get_dummies(gmm.predict(cluster_array.T))
dum_nam = ['d%s' % i for i in range(n)]
dum.columns = dum_nam
A function that generates a regression formula from a list of variable names. It lives in spatialstat, so it is available once that module is imported.
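As a rough illustration (not the actual spatialstat implementation), fml_build presumably takes the first name in the list as the response and joins the remaining names with '+' into a patsy formula string; a minimal sketch under that assumption:

def fml_build_sketch(var_names):
    # hypothetical stand-in for spatialstat.fml_build: first element is the response,
    # the rest enter additively on the right-hand side
    return var_names[0] + ' ~ ' + ' + '.join(var_names[1:])

# fml_build_sketch(['pay', 'square', 'k'])  ->  'pay ~ square + k'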
In [443]:
df_with_dummy = pd.concat((df, dum), axis=1)
vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
        'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
# add the cluster dummies, dropping the last one to avoid perfect collinearity with the intercept
vars = vars + dum_nam[:-1]
eq = fml_build(vars)
y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
logy = np.log(y)
model = sm.GLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.949
Model: GLS Adj. R-squared: 0.948
Method: Least Squares F-statistic: 818.1
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:00 Log-Likelihood: 1617.3
No. Observations: 1427 AIC: -3169.
Df Residuals: 1394 BIC: -2995.
Df Model: 32
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.7529 0.010 586.460 0.000 5.734 5.772
square 0.0105 0.000 22.699 0.000 0.010 0.011
k -0.0440 0.009 -4.930 0.000 -0.061 -0.026
lk -3.893e-14 7.6e-15 -5.122 0.000 -5.38e-14 -2.4e-14
dk 0.0306 0.009 3.450 0.001 0.013 0.048
sdk -0.0955 0.080 -1.192 0.234 -0.253 0.062
sldk -0.1534 0.057 -2.696 0.007 -0.265 -0.042
south_direction_dummy 0.0147 0.006 2.396 0.017 0.003 0.027
building_year -0.0040 0.000 -15.544 0.000 -0.004 -0.003
new_dummy -0.0003 0.005 -0.072 0.942 -0.010 0.009
mansyon_dumy 5.7529 0.010 586.460 0.000 5.734 5.772
teiki_syakuya_dummy 0.0151 0.014 1.042 0.298 -0.013 0.043
walk_minute_dummy 0.0004 0.002 0.221 0.825 -0.003 0.004
r -0.0186 0.009 -2.093 0.037 -0.036 -0.001
rc_dummy 0.0091 0.011 0.854 0.394 -0.012 0.030
room_nums -0.0340 0.008 -4.449 0.000 -0.049 -0.019
d0 -0.0062 0.013 -0.461 0.645 -0.032 0.020
d1 0.3414 0.017 20.214 0.000 0.308 0.375
d2 0.4603 0.017 26.358 0.000 0.426 0.495
d3 -0.1048 0.013 -7.847 0.000 -0.131 -0.079
d4 0.1546 0.008 19.227 0.000 0.139 0.170
d5 0.0350 0.015 2.355 0.019 0.006 0.064
d6 -0.3609 0.016 -22.024 0.000 -0.393 -0.329
d7 0.5629 0.019 29.089 0.000 0.525 0.601
d8 0.2173 0.022 9.948 0.000 0.174 0.260
d9 0.3698 0.016 23.303 0.000 0.339 0.401
d10 -0.1216 0.009 -14.199 0.000 -0.138 -0.105
d11 0.1284 0.011 11.399 0.000 0.106 0.151
d12 0.2428 0.016 15.323 0.000 0.212 0.274
d13 -2.3830 0.057 -41.919 0.000 -2.495 -2.272
d14 -0.2154 0.013 -16.418 0.000 -0.241 -0.190
d15 0.0863 0.011 7.509 0.000 0.064 0.109
d16 0.3885 0.026 14.870 0.000 0.337 0.440
d17 -0.5513 0.033 -16.676 0.000 -0.616 -0.486
d18 0.3805 0.024 15.738 0.000 0.333 0.428
==============================================================================
Omnibus: 64.403 Durbin-Watson: 1.315
Prob(Omnibus): 0.000 Jarque-Bera (JB): 154.834
Skew: -0.236 Prob(JB): 2.39e-34
Kurtosis: 4.543 Cond. No. 8.28e+16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.46e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
Predicted values from the OLS fit.
In [445]:
pred = results.predict()
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)
Out[445]:
<matplotlib.axes._subplots.AxesSubplot at 0x11fed35c0>
In [425]:
df['pay']
Out[425]:
0 176000
1 175000
2 171000
3 205200
4 176000
5 160000
6 170000
7 180000
8 130000
9 130000
10 135000
11 170000
12 170000
13 130000
14 160000
15 130000
16 130000
17 135000
18 130000
19 130000
20 135000
21 170000
22 230000
23 168000
24 170000
25 168000
26 260000
27 172000
28 163000
29 170000
...
1397 170000
1398 140000
1399 90000
1400 110000
1401 100000
1402 158000
1403 110000
1404 180000
1405 129000
1406 225000
1407 229000
1408 207000
1409 110000
1410 108000
1411 115000
1412 160000
1413 198000
1414 100000
1415 245000
1416 100000
1417 100000
1418 85000
1419 85000
1420 108000
1421 270000
1422 90000
1423 250000
1424 110000
1425 100000
1426 130000
Name: pay, dtype: int64
In [426]:
print(evl.mean())
print(evl.std())
605.0953661598534
13427.107926219865
In [436]:
n = 100
# GMM clustering on floor area and (scaled) coordinates instead of rent
cluster_array = np.array([df['square'], df['fX']*1000, df['fY']*1000])
gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
dum = pd.get_dummies(gmm.predict(cluster_array.T))
dum_nam = ['d%s' % i for i in range(n)]
dum.columns = dum_nam
In [437]:
df_with_dummy = pd.concat((df, dum), axis=1)
vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
vars = vars + dum_nam[:-1]
eq = fml_build(vars)
y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
logy = np.log(y)
model = sm.GLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.871
Model: GLS Adj. R-squared: 0.860
Method: Least Squares F-statistic: 79.16
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:35:10 Log-Likelihood: 948.54
No. Observations: 1427 AIC: -1671.
Df Residuals: 1314 BIC: -1076.
Df Model: 112
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.4436 0.050 109.133 0.000 5.346 5.541
square 0.0333 0.004 8.037 0.000 0.025 0.041
k -0.0436 0.017 -2.538 0.011 -0.077 -0.010
lk -1.29e-13 9.29e-14 -1.389 0.165 -3.11e-13 5.32e-14
dk -0.0387 0.017 -2.240 0.025 -0.073 -0.005
sdk 0.0639 0.142 0.450 0.653 -0.215 0.342
sldk 0.0069 0.105 0.066 0.947 -0.198 0.212
south_direction_dummy -0.0049 0.011 -0.442 0.658 -0.026 0.017
building_year -0.0086 0.000 -18.747 0.000 -0.009 -0.008
new_dummy -0.0210 0.008 -2.622 0.009 -0.037 -0.005
mansyon_dumy 5.4436 0.050 109.133 0.000 5.346 5.541
teiki_syakuya_dummy 0.0085 0.026 0.324 0.746 -0.043 0.060
walk_minute_dummy 0.0023 0.004 0.563 0.574 -0.006 0.010
r -0.0347 0.017 -2.078 0.038 -0.067 -0.002
rc_dummy 0.0612 0.022 2.799 0.005 0.018 0.104
room_nums 0.0241 0.017 1.411 0.159 -0.009 0.058
d0 -0.0174 0.054 -0.323 0.747 -0.123 0.088
d1 -0.1105 0.137 -0.805 0.421 -0.380 0.159
d2 -0.0300 0.066 -0.451 0.652 -0.160 0.100
d3 -0.1748 0.110 -1.586 0.113 -0.391 0.041
d4 -0.5316 0.202 -2.632 0.009 -0.928 -0.135
d5 -0.4824 0.171 -2.816 0.005 -0.818 -0.146
d6 0.0489 0.058 0.841 0.401 -0.065 0.163
d7 0.0211 0.074 0.287 0.774 -0.123 0.165
d8 -0.1128 0.088 -1.289 0.198 -0.285 0.059
d9 0.0640 0.053 1.206 0.228 -0.040 0.168
d10 -0.0850 0.128 -0.666 0.505 -0.335 0.165
d11 -1.1494 0.273 -4.218 0.000 -1.684 -0.615
d12 0.1333 0.058 2.289 0.022 0.019 0.247
d13 -0.4156 0.166 -2.505 0.012 -0.741 -0.090
d14 0.0268 0.095 0.282 0.778 -0.160 0.213
d15 -0.0540 0.107 -0.506 0.613 -0.263 0.155
d16 -0.3531 0.155 -2.278 0.023 -0.657 -0.049
d17 -0.0154 0.059 -0.261 0.794 -0.131 0.100
d18 -0.2370 0.110 -2.156 0.031 -0.453 -0.021
d19 0.0281 0.076 0.369 0.712 -0.122 0.178
d20 0.0072 0.052 0.137 0.891 -0.096 0.110
d21 0.1294 0.068 1.892 0.059 -0.005 0.264
d22 0.0121 0.103 0.117 0.907 -0.191 0.215
d23 -0.0003 0.049 -0.006 0.995 -0.096 0.096
d24 -0.1533 0.152 -1.007 0.314 -0.452 0.145
d25 -0.0588 0.076 -0.770 0.442 -0.209 0.091
d26 -0.3967 0.179 -2.218 0.027 -0.748 -0.046
d27 0.1425 0.078 1.836 0.067 -0.010 0.295
d28 0.1011 0.048 2.103 0.036 0.007 0.195
d29 -0.0074 0.061 -0.122 0.903 -0.126 0.112
d30 0.1281 0.070 1.835 0.067 -0.009 0.265
d31 -0.7232 0.222 -3.256 0.001 -1.159 -0.287
d32 -0.0813 0.112 -0.725 0.468 -0.301 0.139
d33 -0.2937 0.162 -1.810 0.070 -0.612 0.025
d34 -0.0125 0.078 -0.159 0.874 -0.166 0.141
d35 -0.1941 0.129 -1.506 0.132 -0.447 0.059
d36 -0.4416 0.191 -2.316 0.021 -0.816 -0.068
d37 -0.1862 0.103 -1.805 0.071 -0.389 0.016
d38 -0.0473 0.063 -0.746 0.456 -0.172 0.077
d39 -0.0304 0.096 -0.318 0.751 -0.218 0.157
d40 -0.0138 0.052 -0.266 0.790 -0.116 0.088
d41 0.0466 0.076 0.610 0.542 -0.103 0.197
d42 -0.2470 0.143 -1.728 0.084 -0.527 0.033
d43 -0.2230 0.127 -1.758 0.079 -0.472 0.026
d44 0.0353 0.058 0.610 0.542 -0.078 0.149
d45 -0.0443 0.082 -0.537 0.591 -0.206 0.117
d46 -0.1214 0.063 -1.939 0.053 -0.244 0.001
d47 0.0214 0.099 0.217 0.828 -0.172 0.215
d48 -0.1826 0.125 -1.465 0.143 -0.427 0.062
d49 0.0358 0.057 0.622 0.534 -0.077 0.149
d50 0.0605 0.067 0.903 0.367 -0.071 0.192
d51 0.1106 0.065 1.706 0.088 -0.017 0.238
d52 0.1555 0.074 2.113 0.035 0.011 0.300
d53 -0.3446 0.137 -2.508 0.012 -0.614 -0.075
d54 -0.2822 0.171 -1.650 0.099 -0.618 0.053
d55 -0.0560 0.114 -0.490 0.624 -0.280 0.168
d56 0.1490 0.085 1.745 0.081 -0.019 0.317
d57 -0.0660 0.097 -0.678 0.498 -0.257 0.125
d58 0.0845 0.067 1.266 0.206 -0.046 0.215
d59 -0.3216 0.154 -2.084 0.037 -0.624 -0.019
d60 -0.2573 0.154 -1.675 0.094 -0.559 0.044
d61 0.0471 0.046 1.022 0.307 -0.043 0.137
d62 -0.0047 0.062 -0.076 0.939 -0.126 0.117
d63 0.0360 0.061 0.594 0.552 -0.083 0.155
d64 -0.1265 0.138 -0.916 0.360 -0.397 0.144
d65 -0.3189 0.157 -2.035 0.042 -0.626 -0.012
d66 0.0746 0.053 1.408 0.159 -0.029 0.178
d67 0.0818 0.077 1.062 0.289 -0.069 0.233
d68 -0.0173 0.051 -0.341 0.734 -0.117 0.082
d69 -0.4229 0.182 -2.325 0.020 -0.780 -0.066
d70 -0.1246 0.125 -1.000 0.318 -0.369 0.120
d71 -0.1419 0.103 -1.375 0.169 -0.344 0.061
d72 -0.1796 0.121 -1.479 0.139 -0.418 0.059
d73 -0.0692 0.108 -0.640 0.522 -0.281 0.143
d74 -0.4074 0.209 -1.947 0.052 -0.818 0.003
d75 -0.3698 0.133 -2.770 0.006 -0.632 -0.108
d76 -0.2747 0.156 -1.763 0.078 -0.580 0.031
d77 -0.0344 0.124 -0.278 0.781 -0.277 0.208
d78 0.1157 0.052 2.211 0.027 0.013 0.218
d79 -0.8500 0.283 -3.001 0.003 -1.406 -0.294
d80 -0.0631 0.105 -0.600 0.549 -0.270 0.143
d81 -0.0365 0.088 -0.414 0.679 -0.209 0.136
d82 -0.2350 0.137 -1.717 0.086 -0.503 0.034
d83 -0.3091 0.171 -1.812 0.070 -0.644 0.026
d84 -0.0187 0.054 -0.345 0.730 -0.125 0.088
d85 0.0329 0.087 0.379 0.705 -0.138 0.203
d86 -0.1773 0.101 -1.752 0.080 -0.376 0.021
d87 -0.6244 0.186 -3.361 0.001 -0.989 -0.260
d88 -0.4468 0.146 -3.064 0.002 -0.733 -0.161
d89 0.0456 0.114 0.400 0.689 -0.178 0.269
d90 0.0910 0.070 1.305 0.192 -0.046 0.228
d91 -0.0464 0.122 -0.381 0.703 -0.285 0.192
d92 0.0340 0.141 0.241 0.810 -0.243 0.311
d93 0.0497 0.069 0.722 0.471 -0.085 0.185
d94 0.0291 0.138 0.211 0.833 -0.241 0.299
d95 -0.1891 0.138 -1.372 0.170 -0.460 0.081
d96 -0.1254 0.107 -1.169 0.243 -0.336 0.085
d97 0.0023 0.063 0.036 0.971 -0.121 0.125
d98 0.1404 0.055 2.563 0.010 0.033 0.248
==============================================================================
Omnibus: 2284.358 Durbin-Watson: 1.844
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2179753.688
Skew: -9.703 Prob(JB): 0.00
Kurtosis: 193.483 Cond. No. 2.59e+16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.54e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [438]:
pred = results.predict()
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)
Out[438]:
<matplotlib.axes._subplots.AxesSubplot at 0x11db5cd30>
In [446]:
n = 20
# replace rent with uniform random noise to check whether arbitrary cluster dummies alone raise the fit
ransu = np.array([np.random.uniform() for i in range(len(df))])
cluster_array = np.array([ransu, df['fX'], df['fY']])
gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
dum = pd.get_dummies(gmm.predict(cluster_array.T))
dum_nam = ['d%s' % i for i in range(n)]
dum.columns = dum_nam
In [447]:
df_with_dummy = pd.concat((df, dum), axis=1)
vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
vars = vars + dum_nam[:-1]
eq = fml_build(vars)
y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
logy = np.log(y)
model = sm.GLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.809
Model: GLS Adj. R-squared: 0.805
Method: Least Squares F-statistic: 184.6
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:35 Log-Likelihood: 669.26
No. Observations: 1427 AIC: -1273.
Df Residuals: 1394 BIC: -1099.
Df Model: 32
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.6446 0.019 296.025 0.000 5.607 5.682
square 0.0237 0.001 38.905 0.000 0.022 0.025
k -0.0608 0.017 -3.682 0.000 -0.093 -0.028
lk -9.727e-14 1.06e-14 -9.219 0.000 -1.18e-13 -7.66e-14
dk 0.0151 0.016 0.925 0.355 -0.017 0.047
sdk -0.0735 0.155 -0.474 0.636 -0.378 0.231
sldk -0.2136 0.110 -1.939 0.053 -0.430 0.002
south_direction_dummy -0.0112 0.012 -0.967 0.334 -0.034 0.012
building_year -0.0091 0.000 -23.618 0.000 -0.010 -0.008
new_dummy -0.0160 0.009 -1.809 0.071 -0.033 0.001
mansyon_dumy 5.6446 0.019 296.025 0.000 5.607 5.682
teiki_syakuya_dummy 0.0088 0.028 0.320 0.749 -0.045 0.063
walk_minute_dummy -0.0027 0.003 -0.783 0.434 -0.009 0.004
r -0.0596 0.017 -3.595 0.000 -0.092 -0.027
rc_dummy 0.0539 0.019 2.899 0.004 0.017 0.090
room_nums -0.0660 0.014 -4.661 0.000 -0.094 -0.038
d0 0.0058 0.026 0.219 0.827 -0.046 0.058
d1 0.0031 0.025 0.125 0.901 -0.046 0.052
d2 0.0065 0.026 0.248 0.804 -0.045 0.058
d3 -0.0085 0.025 -0.347 0.728 -0.057 0.040
d4 0.0086 0.026 0.335 0.737 -0.042 0.059
d5 0.0319 0.026 1.212 0.226 -0.020 0.084
d6 0.0377 0.025 1.499 0.134 -0.012 0.087
d7 0.0081 0.025 0.317 0.752 -0.042 0.058
d8 0.0186 0.024 0.788 0.431 -0.028 0.065
d9 0.0067 0.026 0.254 0.800 -0.045 0.058
d10 0.0245 0.025 0.989 0.323 -0.024 0.073
d11 -0.0059 0.025 -0.238 0.812 -0.054 0.043
d12 0.0081 0.026 0.309 0.758 -0.043 0.059
d13 0.0501 0.026 1.926 0.054 -0.001 0.101
d14 0.0174 0.025 0.687 0.492 -0.032 0.067
d15 0.0114 0.027 0.420 0.674 -0.042 0.065
d16 0.0112 0.024 0.466 0.641 -0.036 0.058
d17 -0.0386 0.028 -1.389 0.165 -0.093 0.016
d18 0.0156 0.024 0.637 0.524 -0.032 0.063
==============================================================================
Omnibus: 1689.139 Durbin-Watson: 1.413
Prob(Omnibus): 0.000 Jarque-Bera (JB): 532848.797
Skew: -5.582 Prob(JB): 0.00
Kurtosis: 97.006 Cond. No. 1.92e+17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.45e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
This shows that the improvement in the coefficient of determination did not come simply from adding more explanatory variables.
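To make that comparison explicit, one could keep both fitted result objects and tabulate their fit statistics side by side. A minimal sketch, assuming the spatial-cluster fit and the random-cluster fit had been stored under the hypothetical names results_price and results_random instead of overwriting results:

comparison = pd.DataFrame(
    {'adj_R2': [results_price.rsquared_adj, results_random.rsquared_adj],
     'AIC': [results_price.aic, results_random.aic]},
    index=['pay + coordinates clusters', 'random noise + coordinates clusters'])
print(comparison)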
In [448]:
def cluster_OLS(n):
    """Fit the hedonic regression with n GMM clusters (on rent and coordinates) added as dummies."""
    cluster_array = np.array([df['pay'], df['fX'], df['fY']])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s' % i for i in range(n)]
    dum.columns = dum_nam
    df_with_dummy = pd.concat((df, dum), axis=1)
    vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
            'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
    vars = vars + dum_nam[:-1]
    eq = fml_build(vars)
    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
    logy = np.log(y)
    model = sm.GLS(logy, X, intercept=True)
    results = model.fit()
    print(results.summary())
    return results
In [449]:
n=1
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.807
Model: GLS Adj. R-squared: 0.805
Method: Least Squares F-statistic: 453.8
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:36 Log-Likelihood: 660.63
No. Observations: 1427 AIC: -1293.
Df Residuals: 1413 BIC: -1220.
Df Model: 13
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.6474 0.017 328.289 0.000 5.614 5.681
square 0.0237 0.001 39.271 0.000 0.023 0.025
k -0.0600 0.016 -3.660 0.000 -0.092 -0.028
lk -1.674e-14 6.12e-16 -27.375 0.000 -1.79e-14 -1.55e-14
dk 0.0173 0.016 1.068 0.286 -0.014 0.049
sdk -0.0807 0.154 -0.525 0.600 -0.382 0.221
sldk -0.2233 0.110 -2.039 0.042 -0.438 -0.009
south_direction_dummy -0.0134 0.012 -1.166 0.244 -0.036 0.009
building_year -0.0092 0.000 -23.890 0.000 -0.010 -0.008
new_dummy -0.0173 0.009 -1.966 0.049 -0.034 -4.25e-05
mansyon_dumy 5.6474 0.017 328.289 0.000 5.614 5.681
teiki_syakuya_dummy 0.0140 0.027 0.511 0.610 -0.040 0.068
walk_minute_dummy -0.0023 0.003 -0.681 0.496 -0.009 0.004
r -0.0584 0.016 -3.546 0.000 -0.091 -0.026
rc_dummy 0.0557 0.018 3.015 0.003 0.019 0.092
room_nums -0.0651 0.014 -4.649 0.000 -0.093 -0.038
==============================================================================
Omnibus: 1699.589 Durbin-Watson: 1.411
Prob(Omnibus): 0.000 Jarque-Bera (JB): 553540.298
Skew: -5.637 Prob(JB): 0.00
Kurtosis: 98.826 Cond. No. 2.60e+17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.51e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [450]:
n=2
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.818
Model: GLS Adj. R-squared: 0.817
Method: Least Squares F-statistic: 454.4
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:36 Log-Likelihood: 704.80
No. Observations: 1427 AIC: -1380.
Df Residuals: 1412 BIC: -1301.
Df Model: 14
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.6581 0.017 338.364 0.000 5.625 5.691
square 0.0203 0.001 29.401 0.000 0.019 0.022
k -0.0309 0.016 -1.910 0.056 -0.063 0.001
lk 3.277e-15 3.82e-16 8.585 0.000 2.53e-15 4.03e-15
dk 0.0304 0.016 1.927 0.054 -0.001 0.061
sdk -0.0643 0.149 -0.431 0.666 -0.357 0.228
sldk -0.2002 0.106 -1.885 0.060 -0.409 0.008
south_direction_dummy -0.0168 0.011 -1.506 0.132 -0.039 0.005
building_year -0.0077 0.000 -18.949 0.000 -0.008 -0.007
new_dummy -0.0187 0.009 -2.197 0.028 -0.035 -0.002
mansyon_dumy 5.6581 0.017 338.364 0.000 5.625 5.691
teiki_syakuya_dummy 0.0055 0.027 0.208 0.835 -0.047 0.058
walk_minute_dummy -0.0011 0.003 -0.345 0.730 -0.008 0.005
r -0.0537 0.016 -3.357 0.001 -0.085 -0.022
rc_dummy 0.0689 0.018 3.836 0.000 0.034 0.104
room_nums -0.0708 0.014 -5.212 0.000 -0.097 -0.044
d0 0.1468 0.015 9.495 0.000 0.116 0.177
==============================================================================
Omnibus: 1743.571 Durbin-Watson: 1.415
Prob(Omnibus): 0.000 Jarque-Bera (JB): 582054.200
Skew: -5.914 Prob(JB): 0.00
Kurtosis: 101.231 Cond. No. 1.18e+18
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.69e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [451]:
n=5
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.890
Model: GLS Adj. R-squared: 0.889
Method: Least Squares F-statistic: 673.3
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:36 Log-Likelihood: 1065.2
No. Observations: 1427 AIC: -2094.
Df Residuals: 1409 BIC: -2000.
Df Model: 17
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.5674 0.014 387.918 0.000 5.539 5.596
square 0.0082 0.001 12.255 0.000 0.007 0.010
k 0.0037 0.013 0.291 0.771 -0.022 0.029
lk 2.771e-14 3.98e-16 69.643 0.000 2.69e-14 2.85e-14
dk -0.0002 0.013 -0.018 0.985 -0.025 0.024
sdk 0.0269 0.116 0.232 0.817 -0.201 0.255
sldk -0.0548 0.083 -0.661 0.509 -0.217 0.108
south_direction_dummy -0.0083 0.009 -0.951 0.342 -0.025 0.009
building_year -0.0029 0.000 -8.003 0.000 -0.004 -0.002
new_dummy -0.0114 0.007 -1.714 0.087 -0.024 0.002
mansyon_dumy 5.5674 0.014 387.918 0.000 5.539 5.596
teiki_syakuya_dummy 0.0041 0.021 0.200 0.842 -0.036 0.045
walk_minute_dummy -0.0084 0.003 -3.265 0.001 -0.014 -0.003
r -0.0338 0.013 -2.699 0.007 -0.058 -0.009
rc_dummy 0.0144 0.014 1.021 0.307 -0.013 0.042
room_nums -0.0364 0.011 -3.424 0.001 -0.057 -0.016
d0 0.5380 0.020 26.719 0.000 0.498 0.577
d1 0.9428 0.030 31.342 0.000 0.884 1.002
d2 0.3511 0.017 20.246 0.000 0.317 0.385
d3 0.7239 0.024 30.551 0.000 0.677 0.770
==============================================================================
Omnibus: 1810.800 Durbin-Watson: 1.513
Prob(Omnibus): 0.000 Jarque-Bera (JB): 653791.712
Skew: -6.338 Prob(JB): 0.00
Kurtosis: 107.092 Cond. No. 1.09e+18
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 2e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [452]:
n=10
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.941
Model: GLS Adj. R-squared: 0.941
Method: Least Squares F-statistic: 1026.
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:36 Log-Likelihood: 1512.2
No. Observations: 1427 AIC: -2978.
Df Residuals: 1404 BIC: -2857.
Df Model: 22
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 6.0408 0.012 486.685 0.000 6.016 6.065
square 0.0016 0.001 3.059 0.002 0.001 0.003
k 0.0114 0.010 1.203 0.229 -0.007 0.030
lk 5.717e-14 4.79e-16 119.389 0.000 5.62e-14 5.81e-14
dk -0.0140 0.009 -1.521 0.128 -0.032 0.004
sdk -0.0145 0.086 -0.168 0.866 -0.183 0.154
sldk -0.0551 0.061 -0.904 0.366 -0.175 0.065
south_direction_dummy -0.0160 0.006 -2.489 0.013 -0.029 -0.003
building_year -0.0003 0.000 -1.123 0.262 -0.001 0.000
new_dummy -0.0096 0.005 -1.956 0.051 -0.019 2.59e-05
mansyon_dumy 6.0408 0.012 486.685 0.000 6.016 6.065
teiki_syakuya_dummy 0.0148 0.015 0.968 0.333 -0.015 0.045
walk_minute_dummy 0.0064 0.002 3.332 0.001 0.003 0.010
r -0.0066 0.009 -0.712 0.476 -0.025 0.012
rc_dummy 0.0163 0.011 1.541 0.124 -0.004 0.037
room_nums 0.0094 0.008 1.182 0.237 -0.006 0.025
d0 -0.1256 0.012 -10.479 0.000 -0.149 -0.102
d1 -0.6906 0.016 -43.325 0.000 -0.722 -0.659
d2 0.1203 0.014 8.522 0.000 0.093 0.148
d3 -0.3772 0.013 -28.367 0.000 -0.403 -0.351
d4 0.3482 0.018 19.830 0.000 0.314 0.383
d5 -1.0638 0.022 -48.397 0.000 -1.107 -1.021
d6 -0.5006 0.015 -34.229 0.000 -0.529 -0.472
d7 0.2328 0.014 16.185 0.000 0.205 0.261
d8 -0.2562 0.012 -20.657 0.000 -0.281 -0.232
==============================================================================
Omnibus: 2403.349 Durbin-Watson: 1.791
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2744803.113
Skew: -10.754 Prob(JB): 0.00
Kurtosis: 216.778 Cond. No. 2.00e+17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.93e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [453]:
n=15
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.991
Model: GLS Adj. R-squared: 0.991
Method: Least Squares F-statistic: 5697.
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:37 Log-Likelihood: 2847.6
No. Observations: 1427 AIC: -5639.
Df Residuals: 1399 BIC: -5492.
Df Model: 27
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 4.6593 0.013 370.289 0.000 4.635 4.684
square 0.0020 0.000 8.967 0.000 0.002 0.002
k 0.0009 0.004 0.227 0.820 -0.007 0.008
lk 8.757e-15 5.32e-17 164.471 0.000 8.65e-15 8.86e-15
dk 0.0026 0.004 0.701 0.484 -0.005 0.010
sdk -0.0190 0.034 -0.564 0.573 -0.085 0.047
sldk 0.0099 0.024 0.414 0.679 -0.037 0.057
south_direction_dummy 0.0066 0.003 2.583 0.010 0.002 0.012
building_year -0.0008 0.000 -7.254 0.000 -0.001 -0.001
new_dummy -0.0012 0.002 -0.624 0.533 -0.005 0.003
mansyon_dumy 4.6593 0.013 370.289 0.000 4.635 4.684
teiki_syakuya_dummy 0.0139 0.006 2.294 0.022 0.002 0.026
walk_minute_dummy -0.0039 0.001 -5.179 0.000 -0.005 -0.002
r -0.0001 0.004 -0.036 0.971 -0.007 0.007
rc_dummy 0.0182 0.004 4.342 0.000 0.010 0.026
room_nums -0.0013 0.003 -0.408 0.683 -0.007 0.005
d0 2.5410 0.024 104.853 0.000 2.493 2.589
d1 3.0294 0.025 122.686 0.000 2.981 3.078
d2 2.1322 0.024 88.287 0.000 2.085 2.180
d3 2.8214 0.024 115.927 0.000 2.774 2.869
d4 2.3353 0.024 97.196 0.000 2.288 2.382
d5 1.7317 0.025 68.968 0.000 1.682 1.781
d6 2.7252 0.024 112.858 0.000 2.678 2.773
d7 2.9187 0.024 119.746 0.000 2.871 2.967
d8 2.6249 0.024 109.507 0.000 2.578 2.672
d9 2.4141 0.024 100.540 0.000 2.367 2.461
d10 2.0009 0.024 82.321 0.000 1.953 2.049
d11 3.1198 0.025 125.911 0.000 3.071 3.168
d12 2.2451 0.024 93.363 0.000 2.198 2.292
d13 2.4807 0.024 102.977 0.000 2.433 2.528
==============================================================================
Omnibus: 146.752 Durbin-Watson: 1.474
Prob(Omnibus): 0.000 Jarque-Bera (JB): 415.574
Skew: -0.539 Prob(JB): 5.74e-91
Kurtosis: 5.414 Cond. No. 1.27e+16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.47e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [454]:
n=20
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.993
Model: GLS Adj. R-squared: 0.993
Method: Least Squares F-statistic: 6436.
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:37 Log-Likelihood: 3056.9
No. Observations: 1427 AIC: -6048.
Df Residuals: 1394 BIC: -5874.
Df Model: 32
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 6.2748 0.008 801.367 0.000 6.259 6.290
square 0.0010 0.000 4.893 0.000 0.001 0.001
k -0.0043 0.003 -1.298 0.194 -0.011 0.002
lk 9.37e-14 7.87e-16 119.052 0.000 9.22e-14 9.52e-14
dk 0.0038 0.003 1.193 0.233 -0.002 0.010
sdk -0.0043 0.029 -0.147 0.883 -0.062 0.053
sldk 0.0092 0.021 0.441 0.659 -0.032 0.050
south_direction_dummy -0.0047 0.002 -2.102 0.036 -0.009 -0.000
building_year -0.0005 9.98e-05 -4.773 0.000 -0.001 -0.000
new_dummy 0.0013 0.002 0.790 0.430 -0.002 0.005
mansyon_dumy 6.2748 0.008 801.367 0.000 6.259 6.290
teiki_syakuya_dummy 0.0092 0.005 1.765 0.078 -0.001 0.019
walk_minute_dummy -0.0035 0.001 -5.328 0.000 -0.005 -0.002
r -0.0004 0.003 -0.121 0.904 -0.007 0.006
rc_dummy 0.0135 0.004 3.731 0.000 0.006 0.021
room_nums -0.0032 0.003 -1.181 0.238 -0.009 0.002
d0 -1.0360 0.014 -75.513 0.000 -1.063 -1.009
d1 -0.2280 0.013 -17.762 0.000 -0.253 -0.203
d2 -0.6255 0.013 -48.904 0.000 -0.651 -0.600
d3 -0.8008 0.013 -60.533 0.000 -0.827 -0.775
d4 -0.4353 0.013 -34.097 0.000 -0.460 -0.410
d5 -0.0623 0.013 -4.828 0.000 -0.088 -0.037
d6 -1.4910 0.016 -95.621 0.000 -1.522 -1.460
d7 -1.2386 0.015 -85.251 0.000 -1.267 -1.210
d8 -0.8600 0.013 -64.107 0.000 -0.886 -0.834
d9 -0.7172 0.013 -54.748 0.000 -0.743 -0.691
d10 -0.5385 0.013 -41.657 0.000 -0.564 -0.513
d11 -0.1465 0.013 -11.677 0.000 -0.171 -0.122
d12 -0.3053 0.012 -24.473 0.000 -0.330 -0.281
d13 -0.7607 0.013 -56.959 0.000 -0.787 -0.734
d14 -0.3808 0.013 -29.113 0.000 -0.406 -0.355
d15 -0.9417 0.014 -69.548 0.000 -0.968 -0.915
d16 -3.1894 0.024 -132.548 0.000 -3.237 -3.142
d17 -0.4871 0.013 -37.910 0.000 -0.512 -0.462
d18 -1.1177 0.014 -79.949 0.000 -1.145 -1.090
==============================================================================
Omnibus: 175.882 Durbin-Watson: 1.558
Prob(Omnibus): 0.000 Jarque-Bera (JB): 963.408
Skew: -0.430 Prob(JB): 6.29e-210
Kurtosis: 6.932 Cond. No. 1.46e+16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.12e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [455]:
n=25
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.997
Model: GLS Adj. R-squared: 0.996
Method: Least Squares F-statistic: 1.084e+04
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:37 Log-Likelihood: 3532.7
No. Observations: 1427 AIC: -6989.
Df Residuals: 1389 BIC: -6789.
Df Model: 37
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.6961 0.003 1898.759 0.000 5.690 5.702
square 0.0009 0.000 6.587 0.000 0.001 0.001
k 0.0020 0.002 0.820 0.412 -0.003 0.007
lk -5.768e-14 1.35e-15 -42.823 0.000 -6.03e-14 -5.5e-14
dk 0.0008 0.002 0.345 0.730 -0.004 0.005
sdk -0.0018 0.021 -0.088 0.930 -0.043 0.039
sldk 0.0041 0.015 0.275 0.783 -0.025 0.033
south_direction_dummy 0.0013 0.002 0.816 0.415 -0.002 0.004
building_year -0.0002 7.22e-05 -3.357 0.001 -0.000 -0.000
new_dummy -0.0016 0.001 -1.369 0.171 -0.004 0.001
mansyon_dumy 5.6961 0.003 1898.759 0.000 5.690 5.702
teiki_syakuya_dummy 0.0022 0.004 0.575 0.565 -0.005 0.010
walk_minute_dummy -0.0012 0.000 -2.589 0.010 -0.002 -0.000
r 0.0057 0.002 2.432 0.015 0.001 0.010
rc_dummy -0.0016 0.003 -0.587 0.557 -0.007 0.004
room_nums -0.0068 0.002 -3.386 0.001 -0.011 -0.003
d0 0.7290 0.006 124.126 0.000 0.717 0.741
d1 0.1808 0.004 48.750 0.000 0.173 0.188
d2 1.1082 0.007 160.764 0.000 1.095 1.122
d3 0.4514 0.004 106.664 0.000 0.443 0.460
d4 -0.0448 0.006 -7.356 0.000 -0.057 -0.033
d5 0.8817 0.006 142.247 0.000 0.870 0.894
d6 0.5688 0.005 126.204 0.000 0.560 0.578
d7 0.3049 0.004 78.206 0.000 0.297 0.313
d8 1.0081 0.008 131.545 0.000 0.993 1.023
d9 -0.4757 0.009 -52.546 0.000 -0.493 -0.458
d10 0.0855 0.004 23.124 0.000 0.078 0.093
d11 0.6768 0.005 133.067 0.000 0.667 0.687
d12 0.7888 0.006 132.869 0.000 0.777 0.800
d13 0.5082 0.005 106.677 0.000 0.499 0.518
d14 1.0347 0.007 149.251 0.000 1.021 1.048
d15 0.2440 0.004 62.229 0.000 0.236 0.252
d16 -0.2943 0.005 -57.116 0.000 -0.304 -0.284
d17 0.3756 0.004 95.150 0.000 0.368 0.383
d18 0.6277 0.005 119.071 0.000 0.617 0.638
d19 -2.0194 0.015 -132.180 0.000 -2.049 -1.989
d20 0.9440 0.007 140.037 0.000 0.931 0.957
d21 0.8308 0.007 118.909 0.000 0.817 0.844
d22 -0.1043 0.004 -23.915 0.000 -0.113 -0.096
d23 1.1696 0.010 112.127 0.000 1.149 1.190
==============================================================================
Omnibus: 27.673 Durbin-Watson: 1.722
Prob(Omnibus): 0.000 Jarque-Bera (JB): 50.819
Skew: -0.097 Prob(JB): 9.22e-12
Kurtosis: 3.904 Cond. No. 5.76e+16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.16e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [456]:
n=30
results = cluster_OLS(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.998
Model: GLS Adj. R-squared: 0.998
Method: Least Squares F-statistic: 1.383e+04
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:36:37 Log-Likelihood: 3798.7
No. Observations: 1427 AIC: -7511.
Df Residuals: 1384 BIC: -7285.
Df Model: 42
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.7828 0.002 2529.369 0.000 5.778 5.787
square 0.0005 0.000 4.052 0.000 0.000 0.001
k -0.0010 0.002 -0.469 0.639 -0.005 0.003
lk 9.185e-14 1.17e-15 78.433 0.000 8.96e-14 9.41e-14
dk -0.0008 0.002 -0.402 0.687 -0.005 0.003
sdk 0.0027 0.017 0.153 0.878 -0.032 0.037
sldk -0.0195 0.013 -1.557 0.120 -0.044 0.005
south_direction_dummy -0.0007 0.001 -0.514 0.608 -0.003 0.002
building_year -5.665e-05 6.08e-05 -0.932 0.351 -0.000 6.26e-05
new_dummy 0.0007 0.001 0.685 0.493 -0.001 0.003
mansyon_dumy 5.7828 0.002 2529.369 0.000 5.778 5.787
teiki_syakuya_dummy 0.0010 0.003 0.316 0.752 -0.005 0.007
walk_minute_dummy -0.0010 0.000 -2.400 0.017 -0.002 -0.000
r -5.138e-05 0.002 -0.026 0.979 -0.004 0.004
rc_dummy -0.0036 0.002 -1.579 0.115 -0.008 0.001
room_nums -0.0021 0.002 -1.224 0.221 -0.005 0.001
d0 -0.2593 0.003 -87.380 0.000 -0.265 -0.254
d1 0.5691 0.004 135.186 0.000 0.561 0.577
d2 0.1254 0.003 45.473 0.000 0.120 0.131
d3 0.8456 0.006 153.597 0.000 0.835 0.856
d4 0.3046 0.003 93.788 0.000 0.298 0.311
d5 0.4646 0.004 124.963 0.000 0.457 0.472
d6 -0.0648 0.003 -25.882 0.000 -0.070 -0.060
d7 0.7250 0.004 161.919 0.000 0.716 0.734
d8 0.9523 0.005 186.835 0.000 0.942 0.962
d9 0.1995 0.003 72.824 0.000 0.194 0.205
d10 -0.6511 0.007 -89.039 0.000 -0.665 -0.637
d11 0.0398 0.003 13.950 0.000 0.034 0.045
d12 -0.1594 0.003 -52.349 0.000 -0.165 -0.153
d13 0.6281 0.004 147.907 0.000 0.620 0.636
d14 0.7994 0.006 137.237 0.000 0.788 0.811
d15 0.4039 0.003 136.036 0.000 0.398 0.410
d16 -0.4723 0.004 -113.722 0.000 -0.480 -0.464
d17 1.0161 0.008 122.916 0.000 1.000 1.032
d18 -2.1807 0.013 -174.190 0.000 -2.205 -2.156
d19 0.6706 0.005 127.319 0.000 0.660 0.681
d20 0.5147 0.004 145.148 0.000 0.508 0.522
d21 0.1628 0.003 56.646 0.000 0.157 0.168
d22 0.8783 0.005 171.335 0.000 0.868 0.888
d23 0.2716 0.003 85.324 0.000 0.265 0.278
d24 -0.1058 0.003 -33.245 0.000 -0.112 -0.100
d25 0.3426 0.003 101.425 0.000 0.336 0.349
d26 0.2346 0.003 74.061 0.000 0.228 0.241
d27 0.7625 0.006 125.524 0.000 0.751 0.774
d28 0.0808 0.002 32.846 0.000 0.076 0.086
==============================================================================
Omnibus: 229.693 Durbin-Watson: 1.733
Prob(Omnibus): 0.000 Jarque-Bera (JB): 2486.837
Skew: -0.380 Prob(JB): 0.00
Kurtosis: 9.422 Cond. No. 6.23e+16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.12e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [463]:
def cluster_OLS_onlyprice(n):
    """Fit the hedonic regression with n GMM clusters built from rent alone added as dummies."""
    cluster_array = np.array([df['pay']])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s' % i for i in range(n)]
    dum.columns = dum_nam
    df_with_dummy = pd.concat((df, dum), axis=1)
    vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
            'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
    vars = vars + dum_nam[:-1]
    eq = fml_build(vars)
    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
    logy = np.log(y)
    model = sm.GLS(logy, X, intercept=True)
    results = model.fit()
    print(results.summary())
    return results
In [461]:
n=10
results = cluster_OLS_onlyprice(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.947
Model: GLS Adj. R-squared: 0.946
Method: Least Squares F-statistic: 1143.
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:37:29 Log-Likelihood: 1585.4
No. Observations: 1427 AIC: -3125.
Df Residuals: 1404 BIC: -3004.
Df Model: 22
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.8333 0.010 584.192 0.000 5.814 5.853
square 0.0016 0.001 3.240 0.001 0.001 0.003
k 0.0046 0.009 0.501 0.616 -0.013 0.023
lk 4.104e-14 1.36e-16 300.922 0.000 4.08e-14 4.13e-14
dk -0.0094 0.009 -1.061 0.289 -0.027 0.008
sdk -0.0197 0.082 -0.241 0.809 -0.180 0.140
sldk -0.0583 0.058 -1.005 0.315 -0.172 0.056
south_direction_dummy -0.0163 0.006 -2.674 0.008 -0.028 -0.004
building_year -0.0008 0.000 -3.237 0.001 -0.001 -0.000
new_dummy -0.0099 0.005 -2.117 0.034 -0.019 -0.001
mansyon_dumy 5.8333 0.010 584.192 0.000 5.814 5.853
teiki_syakuya_dummy 0.0019 0.015 0.130 0.896 -0.027 0.030
walk_minute_dummy 0.0044 0.002 2.437 0.015 0.001 0.008
r -0.0071 0.009 -0.801 0.423 -0.024 0.010
rc_dummy 0.0309 0.010 3.142 0.002 0.012 0.050
room_nums 0.0065 0.008 0.859 0.391 -0.008 0.021
d0 0.1025 0.009 11.568 0.000 0.085 0.120
d1 0.6603 0.018 37.193 0.000 0.625 0.695
d2 -0.2835 0.008 -34.573 0.000 -0.300 -0.267
d3 0.2036 0.010 20.885 0.000 0.184 0.223
d4 0.5166 0.014 37.736 0.000 0.490 0.543
d5 -0.1202 0.008 -15.404 0.000 -0.135 -0.105
d6 0.3302 0.011 30.693 0.000 0.309 0.351
d7 -0.6621 0.015 -44.191 0.000 -0.692 -0.633
d8 0.7644 0.020 39.188 0.000 0.726 0.803
==============================================================================
Omnibus: 2583.659 Durbin-Watson: 1.854
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4021412.752
Skew: -12.484 Prob(JB): 0.00
Kurtosis: 261.864 Cond. No. 3.95e+17
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.53e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [557]:
def cluster_OLS_honmei(n):
    """Fit the hedonic regression with low/high rent dummies plus n GMM clusters
    built from floor area and coordinates."""
    dum1 = pd.DataFrame((df['pay'] < 100000) * 1)
    dum1.columns = ['low']
    dum2 = pd.DataFrame((df['pay'] > 150000) * 1)
    dum2.columns = ['high']
    dum = pd.concat((dum1, dum2), axis=1)
    df_with_dummy = pd.concat((df, dum), axis=1)
    cluster_array = np.array([df['square'], df['fX']*1000, df['fY']*1000])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s' % i for i in range(n)]
    dum.columns = dum_nam
    df_with_dummy = pd.concat((df_with_dummy, dum), axis=1)
    vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
            'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy',
            'room_nums', 'low', 'high']
    vars = vars + dum_nam[:-1]
    eq = fml_build(vars)
    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
    logy = np.log(y)
    model = sm.GLS(logy, X, intercept=True)
    results = model.fit()
    print(results.summary())
    return results
In [572]:
n=50
results = cluster_OLS_honmei(n)
GLS Regression Results
==============================================================================
Dep. Variable: pay R-squared: 0.888
Model: GLS Adj. R-squared: 0.882
Method: Least Squares F-statistic: 168.0
Date: Sun, 20 Nov 2016 Prob (F-statistic): 0.00
Time: 22:58:53 Log-Likelihood: 1047.1
No. Observations: 1427 AIC: -1964.
Df Residuals: 1362 BIC: -1622.
Df Model: 64
Covariance Type: nonrobust
=========================================================================================
coef std err t P>|t| [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept 5.6607 0.087 64.831 0.000 5.489 5.832
square 0.0153 0.003 5.436 0.000 0.010 0.021
k -0.0440 0.014 -3.054 0.002 -0.072 -0.016
lk -3.918e-14 4.65e-15 -8.420 0.000 -4.83e-14 -3e-14
dk -0.0370 0.015 -2.480 0.013 -0.066 -0.008
sdk 0.0880 0.122 0.719 0.473 -0.152 0.328
sldk 0.0004 0.094 0.004 0.997 -0.185 0.185
south_direction_dummy -0.0030 0.010 -0.314 0.753 -0.022 0.016
building_year -0.0050 0.000 -12.427 0.000 -0.006 -0.004
new_dummy -0.0224 0.007 -3.148 0.002 -0.036 -0.008
mansyon_dumy 5.6607 0.087 64.831 0.000 5.489 5.832
teiki_syakuya_dummy -0.0007 0.023 -0.030 0.976 -0.045 0.044
walk_minute_dummy 0.0073 0.003 2.150 0.032 0.001 0.014
r -0.0524 0.014 -3.784 0.000 -0.080 -0.025
rc_dummy 0.0313 0.017 1.835 0.067 -0.002 0.065
room_nums -0.0031 0.013 -0.229 0.819 -0.030 0.023
low -0.1832 0.013 -14.041 0.000 -0.209 -0.158
high 0.2205 0.013 16.370 0.000 0.194 0.247
d0 -0.0237 0.052 -0.453 0.651 -0.127 0.079
d1 0.0637 0.112 0.569 0.570 -0.156 0.284
d2 0.0778 0.085 0.920 0.358 -0.088 0.244
d3 -0.1364 0.077 -1.765 0.078 -0.288 0.015
d4 0.0189 0.064 0.296 0.767 -0.106 0.144
d5 6.239e-05 0.044 0.001 0.999 -0.085 0.086
d6 0.0183 0.133 0.138 0.890 -0.242 0.278
d7 0.1680 0.100 1.686 0.092 -0.027 0.363
d8 0.0384 0.092 0.419 0.675 -0.141 0.218
d9 0.0333 0.120 0.278 0.781 -0.202 0.269
d10 0.0409 0.070 0.585 0.559 -0.096 0.178
d11 0.1180 0.110 1.073 0.283 -0.098 0.334
d12 0.0464 0.059 0.793 0.428 -0.068 0.161
d13 0.1574 0.087 1.808 0.071 -0.013 0.328
d14 0.0262 0.042 0.621 0.535 -0.057 0.109
d15 -0.2781 0.091 -3.057 0.002 -0.457 -0.100
d16 0.0191 0.055 0.347 0.729 -0.089 0.127
d17 -0.0147 0.067 -0.220 0.826 -0.145 0.116
d18 0.0666 0.102 0.652 0.514 -0.134 0.267
d19 0.0715 0.106 0.672 0.501 -0.137 0.280
d20 -0.2029 0.050 -4.049 0.000 -0.301 -0.105
d21 -0.1252 0.123 -1.015 0.310 -0.367 0.117
d22 0.0604 0.115 0.528 0.598 -0.164 0.285
d23 0.0779 0.118 0.659 0.510 -0.154 0.310
d24 -0.0147 0.048 -0.309 0.757 -0.108 0.079
d25 0.0716 0.116 0.616 0.538 -0.157 0.300
d26 0.0688 0.103 0.667 0.505 -0.134 0.271
d27 -0.0309 0.098 -0.314 0.754 -0.224 0.162
d28 0.0477 0.111 0.428 0.669 -0.171 0.266
d29 0.0807 0.087 0.930 0.353 -0.090 0.251
d30 0.0453 0.057 0.799 0.425 -0.066 0.156
d31 -0.0578 0.051 -1.137 0.256 -0.157 0.042
d32 0.0693 0.065 1.070 0.285 -0.058 0.196
d33 0.0085 0.056 0.151 0.880 -0.102 0.119
d34 0.0631 0.088 0.719 0.473 -0.109 0.235
d35 0.0631 0.065 0.964 0.335 -0.065 0.191
d36 0.0720 0.082 0.880 0.379 -0.088 0.232
d37 0.0644 0.116 0.553 0.580 -0.164 0.293
d38 0.0270 0.119 0.227 0.820 -0.206 0.260
d39 0.1793 0.095 1.890 0.059 -0.007 0.365
d40 0.0506 0.093 0.544 0.586 -0.132 0.233
d41 -0.0884 0.138 -0.643 0.520 -0.358 0.181
d42 0.1036 0.092 1.125 0.261 -0.077 0.284
d43 -0.0301 0.058 -0.520 0.603 -0.144 0.083
d44 0.0061 0.064 0.096 0.924 -0.119 0.132
d45 0.0898 0.080 1.128 0.260 -0.066 0.246
d46 0.1824 0.055 3.330 0.001 0.075 0.290
d47 -0.0425 0.072 -0.587 0.558 -0.184 0.100
d48 0.0049 0.085 0.057 0.955 -0.162 0.172
==============================================================================
Omnibus: 2111.563 Durbin-Watson: 1.654
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1423155.455
Skew: -8.350 Prob(JB): 0.00
Kurtosis: 156.806 Cond. No. 2.36e+16
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.26e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
In [573]:
pred = results.predict()
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)
Out[573]:
<matplotlib.axes._subplots.AxesSubplot at 0x11fc10080>
In [ ]: