In [127]:
%matplotlib inline

In [8]:
import numpy as np
import pandas as pd

# 統計用ツール
import statsmodels.api as sm
import statsmodels.tsa.api as tsa
from patsy import dmatrices

# 自作の空間統計用ツール
from spatialstat import *

#描画
import matplotlib.pyplot as plt
from pandas.tools.plotting import autocorrelation_plot

#クラスター
from sklearn import mixture

クラスター分けしてOLS

1 CSVをpandasで取り込む。


In [576]:
df = pd.read_csv('bukken_data.csv')
df = df[:][df['pay'] < 300000]
df = df.reset_index(drop=True)

In [417]:
df.columns


Out[417]:
Index(['index', 'apart_dummy', 'building_year', 'dk', 'fX', 'fY', 'floor', 'k',
       'lk', 'mansyon_dumy', 'new_dummy', 'pay', 'published_date', 'r',
       'rc_dummy', 'room_nums', 'sdk', 'sk', 'sldk', 'slk',
       'south_direction_dummy', 'square', 'teiki_syakuya_dummy',
       'walk_minute_dummy'],
      dtype='object')

2 普通のOLS


In [574]:
vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
        'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
eq = fml_build(vars)

y, X = dmatrices(eq, data=df, return_type='dataframe')

logy = np.log(y)

model = sm.OLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.807
Model:                            OLS   Adj. R-squared:                  0.805
Method:                 Least Squares   F-statistic:                     453.8
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:59:55   Log-Likelihood:                 660.63
No. Observations:                1427   AIC:                            -1293.
Df Residuals:                    1413   BIC:                            -1220.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.6474      0.017    328.289      0.000         5.614     5.681
square                    0.0237      0.001     39.271      0.000         0.023     0.025
k                        -0.0600      0.016     -3.660      0.000        -0.092    -0.028
lk                    -1.674e-14   6.12e-16    -27.375      0.000     -1.79e-14 -1.55e-14
dk                        0.0173      0.016      1.068      0.286        -0.014     0.049
sdk                      -0.0807      0.154     -0.525      0.600        -0.382     0.221
sldk                     -0.2233      0.110     -2.039      0.042        -0.438    -0.009
south_direction_dummy    -0.0134      0.012     -1.166      0.244        -0.036     0.009
building_year            -0.0092      0.000    -23.890      0.000        -0.010    -0.008
new_dummy                -0.0173      0.009     -1.966      0.049        -0.034 -4.25e-05
mansyon_dumy              5.6474      0.017    328.289      0.000         5.614     5.681
teiki_syakuya_dummy       0.0140      0.027      0.511      0.610        -0.040     0.068
walk_minute_dummy        -0.0023      0.003     -0.681      0.496        -0.009     0.004
r                        -0.0584      0.016     -3.546      0.000        -0.091    -0.026
rc_dummy                  0.0557      0.018      3.015      0.003         0.019     0.092
room_nums                -0.0651      0.014     -4.649      0.000        -0.093    -0.038
==============================================================================
Omnibus:                     1699.589   Durbin-Watson:                   1.411
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           553540.298
Skew:                          -5.637   Prob(JB):                         0.00
Kurtosis:                      98.826   Cond. No.                     2.60e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.51e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [575]:
pred = results.predict()
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)


Out[575]:
<matplotlib.axes._subplots.AxesSubplot at 0x120172a58>

3 クラスターの作成


In [442]:
n=20

cluster_array = np.array([df['pay'], df['fX']*100, df['fY']*100])
gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
dum = pd.get_dummies(gmm.predict(cluster_array.T))
dum_nam = ['d%s'%i for i in range(n)] 
dum.columns = dum_nam

4 クラスター分類を利用したOLS推定

リストから回帰式を生成する関数。spatialstat内にあるのでimportすれば使える。


In [443]:
df_with_dummy = pd.concat((df, dum), axis=1)

vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
        'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
vars = vars + dum_nam[:-1]

eq = fml_build(vars)

y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

logy = np.log(y)

model = sm.GLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.949
Model:                            GLS   Adj. R-squared:                  0.948
Method:                 Least Squares   F-statistic:                     818.1
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:00   Log-Likelihood:                 1617.3
No. Observations:                1427   AIC:                            -3169.
Df Residuals:                    1394   BIC:                            -2995.
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.7529      0.010    586.460      0.000         5.734     5.772
square                    0.0105      0.000     22.699      0.000         0.010     0.011
k                        -0.0440      0.009     -4.930      0.000        -0.061    -0.026
lk                    -3.893e-14    7.6e-15     -5.122      0.000     -5.38e-14  -2.4e-14
dk                        0.0306      0.009      3.450      0.001         0.013     0.048
sdk                      -0.0955      0.080     -1.192      0.234        -0.253     0.062
sldk                     -0.1534      0.057     -2.696      0.007        -0.265    -0.042
south_direction_dummy     0.0147      0.006      2.396      0.017         0.003     0.027
building_year            -0.0040      0.000    -15.544      0.000        -0.004    -0.003
new_dummy                -0.0003      0.005     -0.072      0.942        -0.010     0.009
mansyon_dumy              5.7529      0.010    586.460      0.000         5.734     5.772
teiki_syakuya_dummy       0.0151      0.014      1.042      0.298        -0.013     0.043
walk_minute_dummy         0.0004      0.002      0.221      0.825        -0.003     0.004
r                        -0.0186      0.009     -2.093      0.037        -0.036    -0.001
rc_dummy                  0.0091      0.011      0.854      0.394        -0.012     0.030
room_nums                -0.0340      0.008     -4.449      0.000        -0.049    -0.019
d0                       -0.0062      0.013     -0.461      0.645        -0.032     0.020
d1                        0.3414      0.017     20.214      0.000         0.308     0.375
d2                        0.4603      0.017     26.358      0.000         0.426     0.495
d3                       -0.1048      0.013     -7.847      0.000        -0.131    -0.079
d4                        0.1546      0.008     19.227      0.000         0.139     0.170
d5                        0.0350      0.015      2.355      0.019         0.006     0.064
d6                       -0.3609      0.016    -22.024      0.000        -0.393    -0.329
d7                        0.5629      0.019     29.089      0.000         0.525     0.601
d8                        0.2173      0.022      9.948      0.000         0.174     0.260
d9                        0.3698      0.016     23.303      0.000         0.339     0.401
d10                      -0.1216      0.009    -14.199      0.000        -0.138    -0.105
d11                       0.1284      0.011     11.399      0.000         0.106     0.151
d12                       0.2428      0.016     15.323      0.000         0.212     0.274
d13                      -2.3830      0.057    -41.919      0.000        -2.495    -2.272
d14                      -0.2154      0.013    -16.418      0.000        -0.241    -0.190
d15                       0.0863      0.011      7.509      0.000         0.064     0.109
d16                       0.3885      0.026     14.870      0.000         0.337     0.440
d17                      -0.5513      0.033    -16.676      0.000        -0.616    -0.486
d18                       0.3805      0.024     15.738      0.000         0.333     0.428
==============================================================================
Omnibus:                       64.403   Durbin-Watson:                   1.315
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              154.834
Skew:                          -0.236   Prob(JB):                     2.39e-34
Kurtosis:                       4.543   Cond. No.                     8.28e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.46e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

OLSによる予測値。


In [445]:
pred = results.predict()
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)


Out[445]:
<matplotlib.axes._subplots.AxesSubplot at 0x11fed35c0>

In [425]:
df['pay']


Out[425]:
0       176000
1       175000
2       171000
3       205200
4       176000
5       160000
6       170000
7       180000
8       130000
9       130000
10      135000
11      170000
12      170000
13      130000
14      160000
15      130000
16      130000
17      135000
18      130000
19      130000
20      135000
21      170000
22      230000
23      168000
24      170000
25      168000
26      260000
27      172000
28      163000
29      170000
         ...  
1397    170000
1398    140000
1399     90000
1400    110000
1401    100000
1402    158000
1403    110000
1404    180000
1405    129000
1406    225000
1407    229000
1408    207000
1409    110000
1410    108000
1411    115000
1412    160000
1413    198000
1414    100000
1415    245000
1416    100000
1417    100000
1418     85000
1419     85000
1420    108000
1421    270000
1422     90000
1423    250000
1424    110000
1425    100000
1426    130000
Name: pay, dtype: int64

In [426]:
print(evl.mean())
print(evl.std())


605.0953661598534
13427.107926219865

5 成約価格ではなく部屋面積を利用したクラスターの作成とOLS


In [436]:
n=100

cluster_array = np.array([df['square'], df['fX']*1000, df['fY']*1000])
gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
dum = pd.get_dummies(gmm.predict(cluster_array.T))
dum_nam = ['d%s'%i for i in range(n)] 
dum.columns = dum_nam

In [437]:
df_with_dummy = pd.concat((df, dum), axis=1)

vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
        'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
vars = vars + dum_nam[:-1]

eq = fml_build(vars)

y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

logy = np.log(y)

model = sm.GLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.871
Model:                            GLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                     79.16
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:35:10   Log-Likelihood:                 948.54
No. Observations:                1427   AIC:                            -1671.
Df Residuals:                    1314   BIC:                            -1076.
Df Model:                         112                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.4436      0.050    109.133      0.000         5.346     5.541
square                    0.0333      0.004      8.037      0.000         0.025     0.041
k                        -0.0436      0.017     -2.538      0.011        -0.077    -0.010
lk                     -1.29e-13   9.29e-14     -1.389      0.165     -3.11e-13  5.32e-14
dk                       -0.0387      0.017     -2.240      0.025        -0.073    -0.005
sdk                       0.0639      0.142      0.450      0.653        -0.215     0.342
sldk                      0.0069      0.105      0.066      0.947        -0.198     0.212
south_direction_dummy    -0.0049      0.011     -0.442      0.658        -0.026     0.017
building_year            -0.0086      0.000    -18.747      0.000        -0.009    -0.008
new_dummy                -0.0210      0.008     -2.622      0.009        -0.037    -0.005
mansyon_dumy              5.4436      0.050    109.133      0.000         5.346     5.541
teiki_syakuya_dummy       0.0085      0.026      0.324      0.746        -0.043     0.060
walk_minute_dummy         0.0023      0.004      0.563      0.574        -0.006     0.010
r                        -0.0347      0.017     -2.078      0.038        -0.067    -0.002
rc_dummy                  0.0612      0.022      2.799      0.005         0.018     0.104
room_nums                 0.0241      0.017      1.411      0.159        -0.009     0.058
d0                       -0.0174      0.054     -0.323      0.747        -0.123     0.088
d1                       -0.1105      0.137     -0.805      0.421        -0.380     0.159
d2                       -0.0300      0.066     -0.451      0.652        -0.160     0.100
d3                       -0.1748      0.110     -1.586      0.113        -0.391     0.041
d4                       -0.5316      0.202     -2.632      0.009        -0.928    -0.135
d5                       -0.4824      0.171     -2.816      0.005        -0.818    -0.146
d6                        0.0489      0.058      0.841      0.401        -0.065     0.163
d7                        0.0211      0.074      0.287      0.774        -0.123     0.165
d8                       -0.1128      0.088     -1.289      0.198        -0.285     0.059
d9                        0.0640      0.053      1.206      0.228        -0.040     0.168
d10                      -0.0850      0.128     -0.666      0.505        -0.335     0.165
d11                      -1.1494      0.273     -4.218      0.000        -1.684    -0.615
d12                       0.1333      0.058      2.289      0.022         0.019     0.247
d13                      -0.4156      0.166     -2.505      0.012        -0.741    -0.090
d14                       0.0268      0.095      0.282      0.778        -0.160     0.213
d15                      -0.0540      0.107     -0.506      0.613        -0.263     0.155
d16                      -0.3531      0.155     -2.278      0.023        -0.657    -0.049
d17                      -0.0154      0.059     -0.261      0.794        -0.131     0.100
d18                      -0.2370      0.110     -2.156      0.031        -0.453    -0.021
d19                       0.0281      0.076      0.369      0.712        -0.122     0.178
d20                       0.0072      0.052      0.137      0.891        -0.096     0.110
d21                       0.1294      0.068      1.892      0.059        -0.005     0.264
d22                       0.0121      0.103      0.117      0.907        -0.191     0.215
d23                      -0.0003      0.049     -0.006      0.995        -0.096     0.096
d24                      -0.1533      0.152     -1.007      0.314        -0.452     0.145
d25                      -0.0588      0.076     -0.770      0.442        -0.209     0.091
d26                      -0.3967      0.179     -2.218      0.027        -0.748    -0.046
d27                       0.1425      0.078      1.836      0.067        -0.010     0.295
d28                       0.1011      0.048      2.103      0.036         0.007     0.195
d29                      -0.0074      0.061     -0.122      0.903        -0.126     0.112
d30                       0.1281      0.070      1.835      0.067        -0.009     0.265
d31                      -0.7232      0.222     -3.256      0.001        -1.159    -0.287
d32                      -0.0813      0.112     -0.725      0.468        -0.301     0.139
d33                      -0.2937      0.162     -1.810      0.070        -0.612     0.025
d34                      -0.0125      0.078     -0.159      0.874        -0.166     0.141
d35                      -0.1941      0.129     -1.506      0.132        -0.447     0.059
d36                      -0.4416      0.191     -2.316      0.021        -0.816    -0.068
d37                      -0.1862      0.103     -1.805      0.071        -0.389     0.016
d38                      -0.0473      0.063     -0.746      0.456        -0.172     0.077
d39                      -0.0304      0.096     -0.318      0.751        -0.218     0.157
d40                      -0.0138      0.052     -0.266      0.790        -0.116     0.088
d41                       0.0466      0.076      0.610      0.542        -0.103     0.197
d42                      -0.2470      0.143     -1.728      0.084        -0.527     0.033
d43                      -0.2230      0.127     -1.758      0.079        -0.472     0.026
d44                       0.0353      0.058      0.610      0.542        -0.078     0.149
d45                      -0.0443      0.082     -0.537      0.591        -0.206     0.117
d46                      -0.1214      0.063     -1.939      0.053        -0.244     0.001
d47                       0.0214      0.099      0.217      0.828        -0.172     0.215
d48                      -0.1826      0.125     -1.465      0.143        -0.427     0.062
d49                       0.0358      0.057      0.622      0.534        -0.077     0.149
d50                       0.0605      0.067      0.903      0.367        -0.071     0.192
d51                       0.1106      0.065      1.706      0.088        -0.017     0.238
d52                       0.1555      0.074      2.113      0.035         0.011     0.300
d53                      -0.3446      0.137     -2.508      0.012        -0.614    -0.075
d54                      -0.2822      0.171     -1.650      0.099        -0.618     0.053
d55                      -0.0560      0.114     -0.490      0.624        -0.280     0.168
d56                       0.1490      0.085      1.745      0.081        -0.019     0.317
d57                      -0.0660      0.097     -0.678      0.498        -0.257     0.125
d58                       0.0845      0.067      1.266      0.206        -0.046     0.215
d59                      -0.3216      0.154     -2.084      0.037        -0.624    -0.019
d60                      -0.2573      0.154     -1.675      0.094        -0.559     0.044
d61                       0.0471      0.046      1.022      0.307        -0.043     0.137
d62                      -0.0047      0.062     -0.076      0.939        -0.126     0.117
d63                       0.0360      0.061      0.594      0.552        -0.083     0.155
d64                      -0.1265      0.138     -0.916      0.360        -0.397     0.144
d65                      -0.3189      0.157     -2.035      0.042        -0.626    -0.012
d66                       0.0746      0.053      1.408      0.159        -0.029     0.178
d67                       0.0818      0.077      1.062      0.289        -0.069     0.233
d68                      -0.0173      0.051     -0.341      0.734        -0.117     0.082
d69                      -0.4229      0.182     -2.325      0.020        -0.780    -0.066
d70                      -0.1246      0.125     -1.000      0.318        -0.369     0.120
d71                      -0.1419      0.103     -1.375      0.169        -0.344     0.061
d72                      -0.1796      0.121     -1.479      0.139        -0.418     0.059
d73                      -0.0692      0.108     -0.640      0.522        -0.281     0.143
d74                      -0.4074      0.209     -1.947      0.052        -0.818     0.003
d75                      -0.3698      0.133     -2.770      0.006        -0.632    -0.108
d76                      -0.2747      0.156     -1.763      0.078        -0.580     0.031
d77                      -0.0344      0.124     -0.278      0.781        -0.277     0.208
d78                       0.1157      0.052      2.211      0.027         0.013     0.218
d79                      -0.8500      0.283     -3.001      0.003        -1.406    -0.294
d80                      -0.0631      0.105     -0.600      0.549        -0.270     0.143
d81                      -0.0365      0.088     -0.414      0.679        -0.209     0.136
d82                      -0.2350      0.137     -1.717      0.086        -0.503     0.034
d83                      -0.3091      0.171     -1.812      0.070        -0.644     0.026
d84                      -0.0187      0.054     -0.345      0.730        -0.125     0.088
d85                       0.0329      0.087      0.379      0.705        -0.138     0.203
d86                      -0.1773      0.101     -1.752      0.080        -0.376     0.021
d87                      -0.6244      0.186     -3.361      0.001        -0.989    -0.260
d88                      -0.4468      0.146     -3.064      0.002        -0.733    -0.161
d89                       0.0456      0.114      0.400      0.689        -0.178     0.269
d90                       0.0910      0.070      1.305      0.192        -0.046     0.228
d91                      -0.0464      0.122     -0.381      0.703        -0.285     0.192
d92                       0.0340      0.141      0.241      0.810        -0.243     0.311
d93                       0.0497      0.069      0.722      0.471        -0.085     0.185
d94                       0.0291      0.138      0.211      0.833        -0.241     0.299
d95                      -0.1891      0.138     -1.372      0.170        -0.460     0.081
d96                      -0.1254      0.107     -1.169      0.243        -0.336     0.085
d97                       0.0023      0.063      0.036      0.971        -0.121     0.125
d98                       0.1404      0.055      2.563      0.010         0.033     0.248
==============================================================================
Omnibus:                     2284.358   Durbin-Watson:                   1.844
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          2179753.688
Skew:                          -9.703   Prob(JB):                         0.00
Kurtosis:                     193.483   Cond. No.                     2.59e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.54e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [438]:
pred = results.predict()
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)


Out[438]:
<matplotlib.axes._subplots.AxesSubplot at 0x11db5cd30>

6 無意味な乱数を利用したクラスターの作成とOLS


In [446]:
n=20

ransu = np.array([np.random.uniform() for i in range(len(df[:]))])
cluster_array = np.array([ransu, df['fX'], df['fY']])
gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
dum = pd.get_dummies(gmm.predict(cluster_array.T))
dum_nam = ['d%s'%i for i in range(n)] 
dum.columns = dum_nam

In [447]:
df_with_dummy = pd.concat((df, dum), axis=1)

vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
        'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
vars = vars + dum_nam[:-1]

eq = fml_build(vars)

y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

logy = np.log(y)

model = sm.GLS(logy, X, intercept=True)
results = model.fit()
print(results.summary())


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.809
Model:                            GLS   Adj. R-squared:                  0.805
Method:                 Least Squares   F-statistic:                     184.6
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:35   Log-Likelihood:                 669.26
No. Observations:                1427   AIC:                            -1273.
Df Residuals:                    1394   BIC:                            -1099.
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.6446      0.019    296.025      0.000         5.607     5.682
square                    0.0237      0.001     38.905      0.000         0.022     0.025
k                        -0.0608      0.017     -3.682      0.000        -0.093    -0.028
lk                    -9.727e-14   1.06e-14     -9.219      0.000     -1.18e-13 -7.66e-14
dk                        0.0151      0.016      0.925      0.355        -0.017     0.047
sdk                      -0.0735      0.155     -0.474      0.636        -0.378     0.231
sldk                     -0.2136      0.110     -1.939      0.053        -0.430     0.002
south_direction_dummy    -0.0112      0.012     -0.967      0.334        -0.034     0.012
building_year            -0.0091      0.000    -23.618      0.000        -0.010    -0.008
new_dummy                -0.0160      0.009     -1.809      0.071        -0.033     0.001
mansyon_dumy              5.6446      0.019    296.025      0.000         5.607     5.682
teiki_syakuya_dummy       0.0088      0.028      0.320      0.749        -0.045     0.063
walk_minute_dummy        -0.0027      0.003     -0.783      0.434        -0.009     0.004
r                        -0.0596      0.017     -3.595      0.000        -0.092    -0.027
rc_dummy                  0.0539      0.019      2.899      0.004         0.017     0.090
room_nums                -0.0660      0.014     -4.661      0.000        -0.094    -0.038
d0                        0.0058      0.026      0.219      0.827        -0.046     0.058
d1                        0.0031      0.025      0.125      0.901        -0.046     0.052
d2                        0.0065      0.026      0.248      0.804        -0.045     0.058
d3                       -0.0085      0.025     -0.347      0.728        -0.057     0.040
d4                        0.0086      0.026      0.335      0.737        -0.042     0.059
d5                        0.0319      0.026      1.212      0.226        -0.020     0.084
d6                        0.0377      0.025      1.499      0.134        -0.012     0.087
d7                        0.0081      0.025      0.317      0.752        -0.042     0.058
d8                        0.0186      0.024      0.788      0.431        -0.028     0.065
d9                        0.0067      0.026      0.254      0.800        -0.045     0.058
d10                       0.0245      0.025      0.989      0.323        -0.024     0.073
d11                      -0.0059      0.025     -0.238      0.812        -0.054     0.043
d12                       0.0081      0.026      0.309      0.758        -0.043     0.059
d13                       0.0501      0.026      1.926      0.054        -0.001     0.101
d14                       0.0174      0.025      0.687      0.492        -0.032     0.067
d15                       0.0114      0.027      0.420      0.674        -0.042     0.065
d16                       0.0112      0.024      0.466      0.641        -0.036     0.058
d17                      -0.0386      0.028     -1.389      0.165        -0.093     0.016
d18                       0.0156      0.024      0.637      0.524        -0.032     0.063
==============================================================================
Omnibus:                     1689.139   Durbin-Watson:                   1.413
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           532848.797
Skew:                          -5.582   Prob(JB):                         0.00
Kurtosis:                      97.006   Cond. No.                     1.92e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.45e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

これより単に変数を増やしたことが決定係数を上げたわけではないことがわかる。

7 クラスターの数


In [448]:
def cluster_OLS(n):
    cluster_array = np.array([df['pay'], df['fX'], df['fY']])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s'%i for i in range(n)] 
    dum.columns = dum_nam

    df_with_dummy = pd.concat((df, dum), axis=1)

    vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
            'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
    vars = vars + dum_nam[:-1]

    eq = fml_build(vars)

    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

    logy = np.log(y)

    model = sm.GLS(logy, X, intercept=True)
    results = model.fit()
    print(results.summary())
    
    return results

In [449]:
n=1

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.807
Model:                            GLS   Adj. R-squared:                  0.805
Method:                 Least Squares   F-statistic:                     453.8
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:36   Log-Likelihood:                 660.63
No. Observations:                1427   AIC:                            -1293.
Df Residuals:                    1413   BIC:                            -1220.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.6474      0.017    328.289      0.000         5.614     5.681
square                    0.0237      0.001     39.271      0.000         0.023     0.025
k                        -0.0600      0.016     -3.660      0.000        -0.092    -0.028
lk                    -1.674e-14   6.12e-16    -27.375      0.000     -1.79e-14 -1.55e-14
dk                        0.0173      0.016      1.068      0.286        -0.014     0.049
sdk                      -0.0807      0.154     -0.525      0.600        -0.382     0.221
sldk                     -0.2233      0.110     -2.039      0.042        -0.438    -0.009
south_direction_dummy    -0.0134      0.012     -1.166      0.244        -0.036     0.009
building_year            -0.0092      0.000    -23.890      0.000        -0.010    -0.008
new_dummy                -0.0173      0.009     -1.966      0.049        -0.034 -4.25e-05
mansyon_dumy              5.6474      0.017    328.289      0.000         5.614     5.681
teiki_syakuya_dummy       0.0140      0.027      0.511      0.610        -0.040     0.068
walk_minute_dummy        -0.0023      0.003     -0.681      0.496        -0.009     0.004
r                        -0.0584      0.016     -3.546      0.000        -0.091    -0.026
rc_dummy                  0.0557      0.018      3.015      0.003         0.019     0.092
room_nums                -0.0651      0.014     -4.649      0.000        -0.093    -0.038
==============================================================================
Omnibus:                     1699.589   Durbin-Watson:                   1.411
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           553540.298
Skew:                          -5.637   Prob(JB):                         0.00
Kurtosis:                      98.826   Cond. No.                     2.60e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 3.51e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [450]:
n=2

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.818
Model:                            GLS   Adj. R-squared:                  0.817
Method:                 Least Squares   F-statistic:                     454.4
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:36   Log-Likelihood:                 704.80
No. Observations:                1427   AIC:                            -1380.
Df Residuals:                    1412   BIC:                            -1301.
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.6581      0.017    338.364      0.000         5.625     5.691
square                    0.0203      0.001     29.401      0.000         0.019     0.022
k                        -0.0309      0.016     -1.910      0.056        -0.063     0.001
lk                     3.277e-15   3.82e-16      8.585      0.000      2.53e-15  4.03e-15
dk                        0.0304      0.016      1.927      0.054        -0.001     0.061
sdk                      -0.0643      0.149     -0.431      0.666        -0.357     0.228
sldk                     -0.2002      0.106     -1.885      0.060        -0.409     0.008
south_direction_dummy    -0.0168      0.011     -1.506      0.132        -0.039     0.005
building_year            -0.0077      0.000    -18.949      0.000        -0.008    -0.007
new_dummy                -0.0187      0.009     -2.197      0.028        -0.035    -0.002
mansyon_dumy              5.6581      0.017    338.364      0.000         5.625     5.691
teiki_syakuya_dummy       0.0055      0.027      0.208      0.835        -0.047     0.058
walk_minute_dummy        -0.0011      0.003     -0.345      0.730        -0.008     0.005
r                        -0.0537      0.016     -3.357      0.001        -0.085    -0.022
rc_dummy                  0.0689      0.018      3.836      0.000         0.034     0.104
room_nums                -0.0708      0.014     -5.212      0.000        -0.097    -0.044
d0                        0.1468      0.015      9.495      0.000         0.116     0.177
==============================================================================
Omnibus:                     1743.571   Durbin-Watson:                   1.415
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           582054.200
Skew:                          -5.914   Prob(JB):                         0.00
Kurtosis:                     101.231   Cond. No.                     1.18e+18
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.69e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [451]:
n=5

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.890
Model:                            GLS   Adj. R-squared:                  0.889
Method:                 Least Squares   F-statistic:                     673.3
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:36   Log-Likelihood:                 1065.2
No. Observations:                1427   AIC:                            -2094.
Df Residuals:                    1409   BIC:                            -2000.
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.5674      0.014    387.918      0.000         5.539     5.596
square                    0.0082      0.001     12.255      0.000         0.007     0.010
k                         0.0037      0.013      0.291      0.771        -0.022     0.029
lk                     2.771e-14   3.98e-16     69.643      0.000      2.69e-14  2.85e-14
dk                       -0.0002      0.013     -0.018      0.985        -0.025     0.024
sdk                       0.0269      0.116      0.232      0.817        -0.201     0.255
sldk                     -0.0548      0.083     -0.661      0.509        -0.217     0.108
south_direction_dummy    -0.0083      0.009     -0.951      0.342        -0.025     0.009
building_year            -0.0029      0.000     -8.003      0.000        -0.004    -0.002
new_dummy                -0.0114      0.007     -1.714      0.087        -0.024     0.002
mansyon_dumy              5.5674      0.014    387.918      0.000         5.539     5.596
teiki_syakuya_dummy       0.0041      0.021      0.200      0.842        -0.036     0.045
walk_minute_dummy        -0.0084      0.003     -3.265      0.001        -0.014    -0.003
r                        -0.0338      0.013     -2.699      0.007        -0.058    -0.009
rc_dummy                  0.0144      0.014      1.021      0.307        -0.013     0.042
room_nums                -0.0364      0.011     -3.424      0.001        -0.057    -0.016
d0                        0.5380      0.020     26.719      0.000         0.498     0.577
d1                        0.9428      0.030     31.342      0.000         0.884     1.002
d2                        0.3511      0.017     20.246      0.000         0.317     0.385
d3                        0.7239      0.024     30.551      0.000         0.677     0.770
==============================================================================
Omnibus:                     1810.800   Durbin-Watson:                   1.513
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           653791.712
Skew:                          -6.338   Prob(JB):                         0.00
Kurtosis:                     107.092   Cond. No.                     1.09e+18
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is  2e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [452]:
n=10

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.941
Model:                            GLS   Adj. R-squared:                  0.941
Method:                 Least Squares   F-statistic:                     1026.
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:36   Log-Likelihood:                 1512.2
No. Observations:                1427   AIC:                            -2978.
Df Residuals:                    1404   BIC:                            -2857.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 6.0408      0.012    486.685      0.000         6.016     6.065
square                    0.0016      0.001      3.059      0.002         0.001     0.003
k                         0.0114      0.010      1.203      0.229        -0.007     0.030
lk                     5.717e-14   4.79e-16    119.389      0.000      5.62e-14  5.81e-14
dk                       -0.0140      0.009     -1.521      0.128        -0.032     0.004
sdk                      -0.0145      0.086     -0.168      0.866        -0.183     0.154
sldk                     -0.0551      0.061     -0.904      0.366        -0.175     0.065
south_direction_dummy    -0.0160      0.006     -2.489      0.013        -0.029    -0.003
building_year            -0.0003      0.000     -1.123      0.262        -0.001     0.000
new_dummy                -0.0096      0.005     -1.956      0.051        -0.019  2.59e-05
mansyon_dumy              6.0408      0.012    486.685      0.000         6.016     6.065
teiki_syakuya_dummy       0.0148      0.015      0.968      0.333        -0.015     0.045
walk_minute_dummy         0.0064      0.002      3.332      0.001         0.003     0.010
r                        -0.0066      0.009     -0.712      0.476        -0.025     0.012
rc_dummy                  0.0163      0.011      1.541      0.124        -0.004     0.037
room_nums                 0.0094      0.008      1.182      0.237        -0.006     0.025
d0                       -0.1256      0.012    -10.479      0.000        -0.149    -0.102
d1                       -0.6906      0.016    -43.325      0.000        -0.722    -0.659
d2                        0.1203      0.014      8.522      0.000         0.093     0.148
d3                       -0.3772      0.013    -28.367      0.000        -0.403    -0.351
d4                        0.3482      0.018     19.830      0.000         0.314     0.383
d5                       -1.0638      0.022    -48.397      0.000        -1.107    -1.021
d6                       -0.5006      0.015    -34.229      0.000        -0.529    -0.472
d7                        0.2328      0.014     16.185      0.000         0.205     0.261
d8                       -0.2562      0.012    -20.657      0.000        -0.281    -0.232
==============================================================================
Omnibus:                     2403.349   Durbin-Watson:                   1.791
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          2744803.113
Skew:                         -10.754   Prob(JB):                         0.00
Kurtosis:                     216.778   Cond. No.                     2.00e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 5.93e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [453]:
n=15

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.991
Model:                            GLS   Adj. R-squared:                  0.991
Method:                 Least Squares   F-statistic:                     5697.
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:37   Log-Likelihood:                 2847.6
No. Observations:                1427   AIC:                            -5639.
Df Residuals:                    1399   BIC:                            -5492.
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 4.6593      0.013    370.289      0.000         4.635     4.684
square                    0.0020      0.000      8.967      0.000         0.002     0.002
k                         0.0009      0.004      0.227      0.820        -0.007     0.008
lk                     8.757e-15   5.32e-17    164.471      0.000      8.65e-15  8.86e-15
dk                        0.0026      0.004      0.701      0.484        -0.005     0.010
sdk                      -0.0190      0.034     -0.564      0.573        -0.085     0.047
sldk                      0.0099      0.024      0.414      0.679        -0.037     0.057
south_direction_dummy     0.0066      0.003      2.583      0.010         0.002     0.012
building_year            -0.0008      0.000     -7.254      0.000        -0.001    -0.001
new_dummy                -0.0012      0.002     -0.624      0.533        -0.005     0.003
mansyon_dumy              4.6593      0.013    370.289      0.000         4.635     4.684
teiki_syakuya_dummy       0.0139      0.006      2.294      0.022         0.002     0.026
walk_minute_dummy        -0.0039      0.001     -5.179      0.000        -0.005    -0.002
r                        -0.0001      0.004     -0.036      0.971        -0.007     0.007
rc_dummy                  0.0182      0.004      4.342      0.000         0.010     0.026
room_nums                -0.0013      0.003     -0.408      0.683        -0.007     0.005
d0                        2.5410      0.024    104.853      0.000         2.493     2.589
d1                        3.0294      0.025    122.686      0.000         2.981     3.078
d2                        2.1322      0.024     88.287      0.000         2.085     2.180
d3                        2.8214      0.024    115.927      0.000         2.774     2.869
d4                        2.3353      0.024     97.196      0.000         2.288     2.382
d5                        1.7317      0.025     68.968      0.000         1.682     1.781
d6                        2.7252      0.024    112.858      0.000         2.678     2.773
d7                        2.9187      0.024    119.746      0.000         2.871     2.967
d8                        2.6249      0.024    109.507      0.000         2.578     2.672
d9                        2.4141      0.024    100.540      0.000         2.367     2.461
d10                       2.0009      0.024     82.321      0.000         1.953     2.049
d11                       3.1198      0.025    125.911      0.000         3.071     3.168
d12                       2.2451      0.024     93.363      0.000         2.198     2.292
d13                       2.4807      0.024    102.977      0.000         2.433     2.528
==============================================================================
Omnibus:                      146.752   Durbin-Watson:                   1.474
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              415.574
Skew:                          -0.539   Prob(JB):                     5.74e-91
Kurtosis:                       5.414   Cond. No.                     1.27e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.47e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [454]:
n=20

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.993
Model:                            GLS   Adj. R-squared:                  0.993
Method:                 Least Squares   F-statistic:                     6436.
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:37   Log-Likelihood:                 3056.9
No. Observations:                1427   AIC:                            -6048.
Df Residuals:                    1394   BIC:                            -5874.
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 6.2748      0.008    801.367      0.000         6.259     6.290
square                    0.0010      0.000      4.893      0.000         0.001     0.001
k                        -0.0043      0.003     -1.298      0.194        -0.011     0.002
lk                      9.37e-14   7.87e-16    119.052      0.000      9.22e-14  9.52e-14
dk                        0.0038      0.003      1.193      0.233        -0.002     0.010
sdk                      -0.0043      0.029     -0.147      0.883        -0.062     0.053
sldk                      0.0092      0.021      0.441      0.659        -0.032     0.050
south_direction_dummy    -0.0047      0.002     -2.102      0.036        -0.009    -0.000
building_year            -0.0005   9.98e-05     -4.773      0.000        -0.001    -0.000
new_dummy                 0.0013      0.002      0.790      0.430        -0.002     0.005
mansyon_dumy              6.2748      0.008    801.367      0.000         6.259     6.290
teiki_syakuya_dummy       0.0092      0.005      1.765      0.078        -0.001     0.019
walk_minute_dummy        -0.0035      0.001     -5.328      0.000        -0.005    -0.002
r                        -0.0004      0.003     -0.121      0.904        -0.007     0.006
rc_dummy                  0.0135      0.004      3.731      0.000         0.006     0.021
room_nums                -0.0032      0.003     -1.181      0.238        -0.009     0.002
d0                       -1.0360      0.014    -75.513      0.000        -1.063    -1.009
d1                       -0.2280      0.013    -17.762      0.000        -0.253    -0.203
d2                       -0.6255      0.013    -48.904      0.000        -0.651    -0.600
d3                       -0.8008      0.013    -60.533      0.000        -0.827    -0.775
d4                       -0.4353      0.013    -34.097      0.000        -0.460    -0.410
d5                       -0.0623      0.013     -4.828      0.000        -0.088    -0.037
d6                       -1.4910      0.016    -95.621      0.000        -1.522    -1.460
d7                       -1.2386      0.015    -85.251      0.000        -1.267    -1.210
d8                       -0.8600      0.013    -64.107      0.000        -0.886    -0.834
d9                       -0.7172      0.013    -54.748      0.000        -0.743    -0.691
d10                      -0.5385      0.013    -41.657      0.000        -0.564    -0.513
d11                      -0.1465      0.013    -11.677      0.000        -0.171    -0.122
d12                      -0.3053      0.012    -24.473      0.000        -0.330    -0.281
d13                      -0.7607      0.013    -56.959      0.000        -0.787    -0.734
d14                      -0.3808      0.013    -29.113      0.000        -0.406    -0.355
d15                      -0.9417      0.014    -69.548      0.000        -0.968    -0.915
d16                      -3.1894      0.024   -132.548      0.000        -3.237    -3.142
d17                      -0.4871      0.013    -37.910      0.000        -0.512    -0.462
d18                      -1.1177      0.014    -79.949      0.000        -1.145    -1.090
==============================================================================
Omnibus:                      175.882   Durbin-Watson:                   1.558
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              963.408
Skew:                          -0.430   Prob(JB):                    6.29e-210
Kurtosis:                       6.932   Cond. No.                     1.46e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.12e-26. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [455]:
n=25

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.997
Model:                            GLS   Adj. R-squared:                  0.996
Method:                 Least Squares   F-statistic:                 1.084e+04
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:37   Log-Likelihood:                 3532.7
No. Observations:                1427   AIC:                            -6989.
Df Residuals:                    1389   BIC:                            -6789.
Df Model:                          37                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.6961      0.003   1898.759      0.000         5.690     5.702
square                    0.0009      0.000      6.587      0.000         0.001     0.001
k                         0.0020      0.002      0.820      0.412        -0.003     0.007
lk                    -5.768e-14   1.35e-15    -42.823      0.000     -6.03e-14  -5.5e-14
dk                        0.0008      0.002      0.345      0.730        -0.004     0.005
sdk                      -0.0018      0.021     -0.088      0.930        -0.043     0.039
sldk                      0.0041      0.015      0.275      0.783        -0.025     0.033
south_direction_dummy     0.0013      0.002      0.816      0.415        -0.002     0.004
building_year            -0.0002   7.22e-05     -3.357      0.001        -0.000    -0.000
new_dummy                -0.0016      0.001     -1.369      0.171        -0.004     0.001
mansyon_dumy              5.6961      0.003   1898.759      0.000         5.690     5.702
teiki_syakuya_dummy       0.0022      0.004      0.575      0.565        -0.005     0.010
walk_minute_dummy        -0.0012      0.000     -2.589      0.010        -0.002    -0.000
r                         0.0057      0.002      2.432      0.015         0.001     0.010
rc_dummy                 -0.0016      0.003     -0.587      0.557        -0.007     0.004
room_nums                -0.0068      0.002     -3.386      0.001        -0.011    -0.003
d0                        0.7290      0.006    124.126      0.000         0.717     0.741
d1                        0.1808      0.004     48.750      0.000         0.173     0.188
d2                        1.1082      0.007    160.764      0.000         1.095     1.122
d3                        0.4514      0.004    106.664      0.000         0.443     0.460
d4                       -0.0448      0.006     -7.356      0.000        -0.057    -0.033
d5                        0.8817      0.006    142.247      0.000         0.870     0.894
d6                        0.5688      0.005    126.204      0.000         0.560     0.578
d7                        0.3049      0.004     78.206      0.000         0.297     0.313
d8                        1.0081      0.008    131.545      0.000         0.993     1.023
d9                       -0.4757      0.009    -52.546      0.000        -0.493    -0.458
d10                       0.0855      0.004     23.124      0.000         0.078     0.093
d11                       0.6768      0.005    133.067      0.000         0.667     0.687
d12                       0.7888      0.006    132.869      0.000         0.777     0.800
d13                       0.5082      0.005    106.677      0.000         0.499     0.518
d14                       1.0347      0.007    149.251      0.000         1.021     1.048
d15                       0.2440      0.004     62.229      0.000         0.236     0.252
d16                      -0.2943      0.005    -57.116      0.000        -0.304    -0.284
d17                       0.3756      0.004     95.150      0.000         0.368     0.383
d18                       0.6277      0.005    119.071      0.000         0.617     0.638
d19                      -2.0194      0.015   -132.180      0.000        -2.049    -1.989
d20                       0.9440      0.007    140.037      0.000         0.931     0.957
d21                       0.8308      0.007    118.909      0.000         0.817     0.844
d22                      -0.1043      0.004    -23.915      0.000        -0.113    -0.096
d23                       1.1696      0.010    112.127      0.000         1.149     1.190
==============================================================================
Omnibus:                       27.673   Durbin-Watson:                   1.722
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               50.819
Skew:                          -0.097   Prob(JB):                     9.22e-12
Kurtosis:                       3.904   Cond. No.                     5.76e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 7.16e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [456]:
n=30

results = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.998
Model:                            GLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                 1.383e+04
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:36:37   Log-Likelihood:                 3798.7
No. Observations:                1427   AIC:                            -7511.
Df Residuals:                    1384   BIC:                            -7285.
Df Model:                          42                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.7828      0.002   2529.369      0.000         5.778     5.787
square                    0.0005      0.000      4.052      0.000         0.000     0.001
k                        -0.0010      0.002     -0.469      0.639        -0.005     0.003
lk                     9.185e-14   1.17e-15     78.433      0.000      8.96e-14  9.41e-14
dk                       -0.0008      0.002     -0.402      0.687        -0.005     0.003
sdk                       0.0027      0.017      0.153      0.878        -0.032     0.037
sldk                     -0.0195      0.013     -1.557      0.120        -0.044     0.005
south_direction_dummy    -0.0007      0.001     -0.514      0.608        -0.003     0.002
building_year         -5.665e-05   6.08e-05     -0.932      0.351        -0.000  6.26e-05
new_dummy                 0.0007      0.001      0.685      0.493        -0.001     0.003
mansyon_dumy              5.7828      0.002   2529.369      0.000         5.778     5.787
teiki_syakuya_dummy       0.0010      0.003      0.316      0.752        -0.005     0.007
walk_minute_dummy        -0.0010      0.000     -2.400      0.017        -0.002    -0.000
r                     -5.138e-05      0.002     -0.026      0.979        -0.004     0.004
rc_dummy                 -0.0036      0.002     -1.579      0.115        -0.008     0.001
room_nums                -0.0021      0.002     -1.224      0.221        -0.005     0.001
d0                       -0.2593      0.003    -87.380      0.000        -0.265    -0.254
d1                        0.5691      0.004    135.186      0.000         0.561     0.577
d2                        0.1254      0.003     45.473      0.000         0.120     0.131
d3                        0.8456      0.006    153.597      0.000         0.835     0.856
d4                        0.3046      0.003     93.788      0.000         0.298     0.311
d5                        0.4646      0.004    124.963      0.000         0.457     0.472
d6                       -0.0648      0.003    -25.882      0.000        -0.070    -0.060
d7                        0.7250      0.004    161.919      0.000         0.716     0.734
d8                        0.9523      0.005    186.835      0.000         0.942     0.962
d9                        0.1995      0.003     72.824      0.000         0.194     0.205
d10                      -0.6511      0.007    -89.039      0.000        -0.665    -0.637
d11                       0.0398      0.003     13.950      0.000         0.034     0.045
d12                      -0.1594      0.003    -52.349      0.000        -0.165    -0.153
d13                       0.6281      0.004    147.907      0.000         0.620     0.636
d14                       0.7994      0.006    137.237      0.000         0.788     0.811
d15                       0.4039      0.003    136.036      0.000         0.398     0.410
d16                      -0.4723      0.004   -113.722      0.000        -0.480    -0.464
d17                       1.0161      0.008    122.916      0.000         1.000     1.032
d18                      -2.1807      0.013   -174.190      0.000        -2.205    -2.156
d19                       0.6706      0.005    127.319      0.000         0.660     0.681
d20                       0.5147      0.004    145.148      0.000         0.508     0.522
d21                       0.1628      0.003     56.646      0.000         0.157     0.168
d22                       0.8783      0.005    171.335      0.000         0.868     0.888
d23                       0.2716      0.003     85.324      0.000         0.265     0.278
d24                      -0.1058      0.003    -33.245      0.000        -0.112    -0.100
d25                       0.3426      0.003    101.425      0.000         0.336     0.349
d26                       0.2346      0.003     74.061      0.000         0.228     0.241
d27                       0.7625      0.006    125.524      0.000         0.751     0.774
d28                       0.0808      0.002     32.846      0.000         0.076     0.086
==============================================================================
Omnibus:                      229.693   Durbin-Watson:                   1.733
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2486.837
Skew:                          -0.380   Prob(JB):                         0.00
Kurtosis:                       9.422   Cond. No.                     6.23e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.12e-28. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

8 成約価格だけ


In [463]:
def cluster_OLS_onlyprice(n):
    cluster_array = np.array([df['pay']])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s'%i for i in range(n)] 
    dum.columns = dum_nam

    df_with_dummy = pd.concat((df, dum), axis=1)

    vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
            'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 'room_nums']
    vars = vars + dum_nam[:-1]

    eq = fml_build(vars)

    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

    logy = np.log(y)

    model = sm.GLS(logy, X, intercept=True)
    results = model.fit()
    print(results.summary())
    
    return results

In [461]:
n=10

results = cluster_OLS_onlyprice(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.947
Model:                            GLS   Adj. R-squared:                  0.946
Method:                 Least Squares   F-statistic:                     1143.
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:37:29   Log-Likelihood:                 1585.4
No. Observations:                1427   AIC:                            -3125.
Df Residuals:                    1404   BIC:                            -3004.
Df Model:                          22                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.8333      0.010    584.192      0.000         5.814     5.853
square                    0.0016      0.001      3.240      0.001         0.001     0.003
k                         0.0046      0.009      0.501      0.616        -0.013     0.023
lk                     4.104e-14   1.36e-16    300.922      0.000      4.08e-14  4.13e-14
dk                       -0.0094      0.009     -1.061      0.289        -0.027     0.008
sdk                      -0.0197      0.082     -0.241      0.809        -0.180     0.140
sldk                     -0.0583      0.058     -1.005      0.315        -0.172     0.056
south_direction_dummy    -0.0163      0.006     -2.674      0.008        -0.028    -0.004
building_year            -0.0008      0.000     -3.237      0.001        -0.001    -0.000
new_dummy                -0.0099      0.005     -2.117      0.034        -0.019    -0.001
mansyon_dumy              5.8333      0.010    584.192      0.000         5.814     5.853
teiki_syakuya_dummy       0.0019      0.015      0.130      0.896        -0.027     0.030
walk_minute_dummy         0.0044      0.002      2.437      0.015         0.001     0.008
r                        -0.0071      0.009     -0.801      0.423        -0.024     0.010
rc_dummy                  0.0309      0.010      3.142      0.002         0.012     0.050
room_nums                 0.0065      0.008      0.859      0.391        -0.008     0.021
d0                        0.1025      0.009     11.568      0.000         0.085     0.120
d1                        0.6603      0.018     37.193      0.000         0.625     0.695
d2                       -0.2835      0.008    -34.573      0.000        -0.300    -0.267
d3                        0.2036      0.010     20.885      0.000         0.184     0.223
d4                        0.5166      0.014     37.736      0.000         0.490     0.543
d5                       -0.1202      0.008    -15.404      0.000        -0.135    -0.105
d6                        0.3302      0.011     30.693      0.000         0.309     0.351
d7                       -0.6621      0.015    -44.191      0.000        -0.692    -0.633
d8                        0.7644      0.020     39.188      0.000         0.726     0.803
==============================================================================
Omnibus:                     2583.659   Durbin-Watson:                   1.854
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          4021412.752
Skew:                         -12.484   Prob(JB):                         0.00
Kurtosis:                     261.864   Cond. No.                     3.95e+17
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.53e-29. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

9 面積と位置でクラスター分類に加え値段による3区分


In [557]:
def cluster_OLS_honmei(n):
    dum1 = pd.DataFrame((df['pay'] < 100000)*1)
    dum1.columns = ['low']
    dum2 = pd.DataFrame((df['pay'] > 150000)*1)
    dum2.columns = ['high']
    dum = pd.concat((dum1, dum2), axis=1)
    
    df_with_dummy = pd.concat((df, dum), axis=1)
    
    cluster_array = np.array([df['square'], df['fX']*1000, df['fY']*1000])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s'%i for i in range(n)] 
    dum.columns = dum_nam

    df_with_dummy = pd.concat((df_with_dummy, dum), axis=1)

    vars = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
            'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 
            'room_nums', 'low', 'high']
    vars = vars + dum_nam[:-1]

    eq = fml_build(vars)

    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

    logy = np.log(y)

    model = sm.GLS(logy, X, intercept=True)
    results = model.fit()
    print(results.summary())
    
    return results

In [572]:
n=50

results = cluster_OLS_honmei(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.888
Model:                            GLS   Adj. R-squared:                  0.882
Method:                 Least Squares   F-statistic:                     168.0
Date:                Sun, 20 Nov 2016   Prob (F-statistic):               0.00
Time:                        22:58:53   Log-Likelihood:                 1047.1
No. Observations:                1427   AIC:                            -1964.
Df Residuals:                    1362   BIC:                            -1622.
Df Model:                          64                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.6607      0.087     64.831      0.000         5.489     5.832
square                    0.0153      0.003      5.436      0.000         0.010     0.021
k                        -0.0440      0.014     -3.054      0.002        -0.072    -0.016
lk                    -3.918e-14   4.65e-15     -8.420      0.000     -4.83e-14    -3e-14
dk                       -0.0370      0.015     -2.480      0.013        -0.066    -0.008
sdk                       0.0880      0.122      0.719      0.473        -0.152     0.328
sldk                      0.0004      0.094      0.004      0.997        -0.185     0.185
south_direction_dummy    -0.0030      0.010     -0.314      0.753        -0.022     0.016
building_year            -0.0050      0.000    -12.427      0.000        -0.006    -0.004
new_dummy                -0.0224      0.007     -3.148      0.002        -0.036    -0.008
mansyon_dumy              5.6607      0.087     64.831      0.000         5.489     5.832
teiki_syakuya_dummy      -0.0007      0.023     -0.030      0.976        -0.045     0.044
walk_minute_dummy         0.0073      0.003      2.150      0.032         0.001     0.014
r                        -0.0524      0.014     -3.784      0.000        -0.080    -0.025
rc_dummy                  0.0313      0.017      1.835      0.067        -0.002     0.065
room_nums                -0.0031      0.013     -0.229      0.819        -0.030     0.023
low                      -0.1832      0.013    -14.041      0.000        -0.209    -0.158
high                      0.2205      0.013     16.370      0.000         0.194     0.247
d0                       -0.0237      0.052     -0.453      0.651        -0.127     0.079
d1                        0.0637      0.112      0.569      0.570        -0.156     0.284
d2                        0.0778      0.085      0.920      0.358        -0.088     0.244
d3                       -0.1364      0.077     -1.765      0.078        -0.288     0.015
d4                        0.0189      0.064      0.296      0.767        -0.106     0.144
d5                     6.239e-05      0.044      0.001      0.999        -0.085     0.086
d6                        0.0183      0.133      0.138      0.890        -0.242     0.278
d7                        0.1680      0.100      1.686      0.092        -0.027     0.363
d8                        0.0384      0.092      0.419      0.675        -0.141     0.218
d9                        0.0333      0.120      0.278      0.781        -0.202     0.269
d10                       0.0409      0.070      0.585      0.559        -0.096     0.178
d11                       0.1180      0.110      1.073      0.283        -0.098     0.334
d12                       0.0464      0.059      0.793      0.428        -0.068     0.161
d13                       0.1574      0.087      1.808      0.071        -0.013     0.328
d14                       0.0262      0.042      0.621      0.535        -0.057     0.109
d15                      -0.2781      0.091     -3.057      0.002        -0.457    -0.100
d16                       0.0191      0.055      0.347      0.729        -0.089     0.127
d17                      -0.0147      0.067     -0.220      0.826        -0.145     0.116
d18                       0.0666      0.102      0.652      0.514        -0.134     0.267
d19                       0.0715      0.106      0.672      0.501        -0.137     0.280
d20                      -0.2029      0.050     -4.049      0.000        -0.301    -0.105
d21                      -0.1252      0.123     -1.015      0.310        -0.367     0.117
d22                       0.0604      0.115      0.528      0.598        -0.164     0.285
d23                       0.0779      0.118      0.659      0.510        -0.154     0.310
d24                      -0.0147      0.048     -0.309      0.757        -0.108     0.079
d25                       0.0716      0.116      0.616      0.538        -0.157     0.300
d26                       0.0688      0.103      0.667      0.505        -0.134     0.271
d27                      -0.0309      0.098     -0.314      0.754        -0.224     0.162
d28                       0.0477      0.111      0.428      0.669        -0.171     0.266
d29                       0.0807      0.087      0.930      0.353        -0.090     0.251
d30                       0.0453      0.057      0.799      0.425        -0.066     0.156
d31                      -0.0578      0.051     -1.137      0.256        -0.157     0.042
d32                       0.0693      0.065      1.070      0.285        -0.058     0.196
d33                       0.0085      0.056      0.151      0.880        -0.102     0.119
d34                       0.0631      0.088      0.719      0.473        -0.109     0.235
d35                       0.0631      0.065      0.964      0.335        -0.065     0.191
d36                       0.0720      0.082      0.880      0.379        -0.088     0.232
d37                       0.0644      0.116      0.553      0.580        -0.164     0.293
d38                       0.0270      0.119      0.227      0.820        -0.206     0.260
d39                       0.1793      0.095      1.890      0.059        -0.007     0.365
d40                       0.0506      0.093      0.544      0.586        -0.132     0.233
d41                      -0.0884      0.138     -0.643      0.520        -0.358     0.181
d42                       0.1036      0.092      1.125      0.261        -0.077     0.284
d43                      -0.0301      0.058     -0.520      0.603        -0.144     0.083
d44                       0.0061      0.064      0.096      0.924        -0.119     0.132
d45                       0.0898      0.080      1.128      0.260        -0.066     0.246
d46                       0.1824      0.055      3.330      0.001         0.075     0.290
d47                      -0.0425      0.072     -0.587      0.558        -0.184     0.100
d48                       0.0049      0.085      0.057      0.955        -0.162     0.172
==============================================================================
Omnibus:                     2111.563   Durbin-Watson:                   1.654
Prob(Omnibus):                  0.000   Jarque-Bera (JB):          1423155.455
Skew:                          -8.350   Prob(JB):                         0.00
Kurtosis:                     156.806   Cond. No.                     2.36e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 4.26e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [573]:
pred = results.predict()
evl = df['pay'] - np.exp(pred)
evl.hist(figsize=(7,5), bins=50)


Out[573]:
<matplotlib.axes._subplots.AxesSubplot at 0x11fc10080>

In [ ]: