In [12]:
%matplotlib inline

In [13]:
import numpy as np
import pandas as pd

# Statistics tools
import statsmodels.api as sm
import statsmodels.tsa.api as tsa
from patsy import dmatrices

# Custom spatial statistics tools (the source of fml_build used below)
from spatialstat import *

# Plotting
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot

# Clustering
from sklearn import mixture

# Deep learning
import chainer
from chainer import cuda, Function, gradient_check, Variable, optimizers, serializers, utils
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L

OLS after clustering

1 Load the CSV with pandas.


In [14]:
df = pd.read_csv('bukken_data.csv')
df = df[df['pay'] < 300000]   # trim outliers: keep listings with rent below 300,000
df = df.reset_index(drop=True)

In [15]:
df.columns


Out[15]:
Index(['apart_dummy', 'building_year', 'dk', 'fX', 'fY', 'floor', 'k', 'lk',
       'mansyon_dumy', 'new_dummy', 'pay', 'published_date', 'r', 'rc_dummy',
       'room_nums', 'sdk', 'sk', 'sldk', 'slk', 'south_direction_dummy',
       'square', 'teiki_syakuya_dummy', 'walk_minute_dummy'],
      dtype='object')

2 OLS after clustering on room area and location


In [59]:
def cluster_OLS(n):
    # Price-band dummies: low rent (< 100,000) and high rent (> 150,000)
    dum1 = pd.DataFrame((df['pay'] < 100000)*1)
    dum1.columns = ['low']
    dum2 = pd.DataFrame((df['pay'] > 150000)*1)
    dum2.columns = ['high']
    dum = pd.concat((dum1, dum2), axis=1)
    
    df_with_dummy = pd.concat((df, dum), axis=1)
    
    # GMM clustering on floor area and (scaled) coordinates, one-hot encoded as d0..d{n-1}
    cluster_array = np.array([df['square'], df['fX']*1000, df['fY']*1000])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s' % i for i in range(n)]
    dum.columns = dum_nam
    
    df_with_dummy = pd.concat((df_with_dummy, dum), axis=1)

    var_names = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
                 'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 
                 'room_nums', 'low', 'high']
    # drop the last cluster dummy to avoid the dummy-variable trap
    var_names = var_names + dum_nam[:-1]

    eq = fml_build(var_names)

    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
    
    # in-sample rows for fitting, the rest held out for prediction
    y_in = y[1:1000]
    X_in = X[1:1000]
    
    y_ex = y[1000:]
    X_ex = X[1000:]

    logy_in = np.log(y_in)

    # dmatrices already adds an Intercept column, so no extra constant is needed
    model = sm.GLS(logy_in, X_in)
    results = model.fit()
    print(results.summary())
    
    return results, np.array(y_ex).reshape(len(y_ex)), np.array(X_ex)
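
Note: fml_build is provided by the custom spatialstat module and its source is not shown in this notebook. A minimal sketch of what it presumably does, assuming the first entry of the list is the dependent variable and the rest become the right-hand side of a patsy formula:

def fml_build(var_names):
    # hypothetical reconstruction: returns e.g. 'pay ~ square + k + ... + d48'
    return var_names[0] + ' ~ ' + ' + '.join(var_names[1:])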

In [60]:
n = 50

results, y_ex, X_ex = cluster_OLS(n)


                            GLS Regression Results                            
==============================================================================
Dep. Variable:                    pay   R-squared:                       0.865
Model:                            GLS   Adj. R-squared:                  0.856
Method:                 Least Squares   F-statistic:                     97.03
Date:                Fri, 25 Nov 2016   Prob (F-statistic):               0.00
Time:                        18:51:40   Log-Likelihood:                 646.31
No. Observations:                 999   AIC:                            -1167.
Df Residuals:                     936   BIC:                            -857.5
Df Model:                          62                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------------
Intercept                 5.8398      0.095     61.381      0.000         5.653     6.026
square                    0.0091      0.003      2.731      0.006         0.003     0.016
k                        -0.0504      0.021     -2.401      0.017        -0.092    -0.009
lk                    -7.499e-14   2.97e-15    -25.256      0.000     -8.08e-14 -6.92e-14
dk                       -0.1023      0.023     -4.480      0.000        -0.147    -0.057
sdk                    1.748e-15   7.15e-16      2.446      0.015      3.45e-16  3.15e-15
sldk                  -5.187e-15   8.71e-16     -5.953      0.000      -6.9e-15 -3.48e-15
south_direction_dummy    -0.0164      0.013     -1.298      0.195        -0.041     0.008
building_year            -0.0049      0.001     -7.744      0.000        -0.006    -0.004
new_dummy                -0.0249      0.009     -2.634      0.009        -0.043    -0.006
mansyon_dumy              5.8398      0.095     61.381      0.000         5.653     6.026
teiki_syakuya_dummy      -0.0279      0.036     -0.785      0.433        -0.098     0.042
walk_minute_dummy         0.0019      0.005      0.369      0.712        -0.008     0.012
r                        -0.0780      0.020     -3.840      0.000        -0.118    -0.038
rc_dummy                  0.0544      0.024      2.259      0.024         0.007     0.102
room_nums                 0.0431      0.020      2.203      0.028         0.005     0.081
low                      -0.2157      0.020    -10.817      0.000        -0.255    -0.177
high                      0.2289      0.019     12.108      0.000         0.192     0.266
d0                       -0.0869      0.062     -1.411      0.159        -0.208     0.034
d1                       -0.1741      0.108     -1.617      0.106        -0.385     0.037
d2                       -0.1127      0.058     -1.942      0.052        -0.227     0.001
d3                       -0.1910      0.123     -1.555      0.120        -0.432     0.050
d4                       -0.1257      0.085     -1.479      0.139        -0.293     0.041
d5                       -0.1645      0.068     -2.415      0.016        -0.298    -0.031
d6                       -0.1667      0.116     -1.431      0.153        -0.395     0.062
d7                       -0.3002      0.115     -2.620      0.009        -0.525    -0.075
d8                       -0.0815      0.096     -0.845      0.398        -0.271     0.108
d9                       -0.0855      0.057     -1.493      0.136        -0.198     0.027
d10                      -0.3596      0.145     -2.477      0.013        -0.645    -0.075
d11                      -0.1105      0.095     -1.162      0.245        -0.297     0.076
d12                      -0.1461      0.066     -2.211      0.027        -0.276    -0.016
d13                      -0.1962      0.123     -1.592      0.112        -0.438     0.046
d14                      -0.0274      0.088     -0.310      0.756        -0.201     0.146
d15                      -0.1418      0.116     -1.226      0.221        -0.369     0.085
d16                      -0.3229      0.129     -2.510      0.012        -0.575    -0.070
d17                      -0.1438      0.086     -1.672      0.095        -0.313     0.025
d18                      -0.1020      0.061     -1.676      0.094        -0.222     0.017
d19                      -0.0168      0.108     -0.155      0.877        -0.229     0.196
d20                      -0.0037      0.054     -0.068      0.946        -0.109     0.101
d21                      -0.0801      0.111     -0.724      0.469        -0.297     0.137
d22                      -0.1173      0.063     -1.858      0.064        -0.241     0.007
d23                      -0.1982      0.116     -1.705      0.089        -0.426     0.030
d24                      -0.1131      0.085     -1.337      0.181        -0.279     0.053
d25                      -0.4661      0.137     -3.393      0.001        -0.736    -0.197
d26                      -0.1257      0.059     -2.114      0.035        -0.242    -0.009
d27                      -0.1242      0.092     -1.346      0.179        -0.305     0.057
d28                      -0.1837      0.107     -1.709      0.088        -0.395     0.027
d29                      -0.2127      0.118     -1.798      0.072        -0.445     0.019
d30                      -0.1834      0.123     -1.488      0.137        -0.425     0.058
d31                      -0.1909      0.138     -1.383      0.167        -0.462     0.080
d32                      -0.2033      0.111     -1.827      0.068        -0.422     0.015
d33                      -0.0237      0.073     -0.326      0.745        -0.166     0.119
d34                      -0.1896      0.060     -3.141      0.002        -0.308    -0.071
d35                      -0.1891      0.119     -1.588      0.113        -0.423     0.045
d36                      -0.1856      0.053     -3.472      0.001        -0.290    -0.081
d37                      -0.0890      0.067     -1.324      0.186        -0.221     0.043
d38                      -0.0088      0.097     -0.090      0.928        -0.199     0.182
d39                      -0.1576      0.093     -1.694      0.091        -0.340     0.025
d40                      -0.2027      0.104     -1.955      0.051        -0.406     0.001
d41                      -0.1905      0.095     -2.003      0.045        -0.377    -0.004
d42                      -0.1266      0.090     -1.400      0.162        -0.304     0.051
d43                      -0.1233      0.117     -1.050      0.294        -0.354     0.107
d44                      -0.1869      0.112     -1.666      0.096        -0.407     0.033
d45                      -0.1006      0.094     -1.073      0.284        -0.285     0.083
d46                      -0.1634      0.088     -1.854      0.064        -0.336     0.010
d47                      -0.0671      0.084     -0.800      0.424        -0.232     0.097
d48                      -0.0277      0.069     -0.399      0.690        -0.164     0.109
==============================================================================
Omnibus:                     1525.247   Durbin-Watson:                   1.695
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           875214.888
Skew:                          -8.758   Prob(JB):                         0.00
Kurtosis:                     146.942   Cond. No.                     1.57e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.5e-27. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

In [61]:
logy_pred = results.params.dot(X_ex.T)   # predicted log rent on the holdout set
y_pred = np.exp(logy_pred)               # back to the original rent scale
error = y_ex - y_pred
print(error[:20])
plt.hist(error)


[ -2058.72822694 -12058.72822694   2025.9571949   -2058.72822694
   1778.59356576  -2974.0428051    9850.00174397   4957.38474505
  10161.81131176  -5646.95695557  -5646.95695557 -12646.95695557
  32134.46686969  59730.46910525   7326.92849608  67955.82667695
  36757.51873117   8461.72022571  49211.38057774   7783.15981049]
Out[61]:
(array([   4.,   19.,   64.,  152.,  117.,   46.,   11.,    5.,    6.,    3.]),
 array([-51644.22437441, -38328.01669036, -25011.80900631, -11695.60132225,
          1620.6063618 ,  14936.81404585,  28253.0217299 ,  41569.22941395,
         54885.437098  ,  68201.64478206,  81517.85246611]),
 <a list of 10 Patch objects>)

In [62]:
print(sum(((error > -5000)*1)*((error < 5000)*1)))
print(sum(((error > -15000)*1)*((error < 15000)*1)))
print(sum(((error > -30000)*1)*((error < 30000)*1)))


130
299
393
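
The histogram counts above sum to 427 holdout rows, so these counts correspond to hit rates of roughly 30%, 70%, and 92%. The same bands can be expressed as fractions directly (a minimal sketch; note |error| < band is equivalent to the two-sided comparison above):

for band in (5000, 15000, 30000):
    hits = np.sum(np.abs(error) < band)
    print('within +/-{0}: {1}/{2} ({3:.1%})'.format(band, hits, len(error), hits / len(error)))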

In [83]:
plt.plot(y_pred)   # predicted rent
plt.plot(y_ex)     # actual rent


Out[83]:
[<matplotlib.lines.Line2D at 0x11bfae080>]

3 Deep learning after clustering on room area and location


In [25]:
class CAR(Chain):
    def __init__(self, unit1, unit2, unit3, col_num):
        self.unit1 = unit1
        self.unit2 = unit2
        self.unit3 = unit3
        super(CAR, self).__init__(
            l1 = L.Linear(col_num, unit1),
            l2 = L.Linear(self.unit1, self.unit1),
            l3 = L.Linear(self.unit1, self.unit2),
            l4 = L.Linear(self.unit2, self.unit3),
            l5 = L.Linear(self.unit3, self.unit3),
            l6 = L.Linear(self.unit3, 1),
        )
    
    def __call__(self, x, y):
        # training loss: mean squared error against log rent
        fv = self.fwd(x)
        loss = F.mean_squared_error(fv, y)
        return loss
    
    def fwd(self, x):
        # six-layer MLP with sigmoid activations; the target is not needed
        # for the forward pass, so fwd takes only x
        h1 = F.sigmoid(self.l1(x))
        h2 = F.sigmoid(self.l2(h1))
        h3 = F.sigmoid(self.l3(h2))
        h4 = F.sigmoid(self.l4(h3))
        h5 = F.sigmoid(self.l5(h4))
        h6 = self.l6(h5)
        return h6
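
A quick standalone shape check of the forward pass, with hypothetical dimensions (batch of 2 rows, 5 input columns); fwd returns one prediction per row:

m = CAR(10, 10, 3, 5)                            # hypothetical unit sizes
x = Variable(np.zeros((2, 5), dtype='float32'))
print(m.fwd(x).data.shape)                       # -> (2, 1)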

In [42]:
def DL(df, n, bs=200):
    # Feature construction mirrors cluster_OLS: price-band dummies plus GMM cluster dummies
    dum1 = pd.DataFrame((df['pay'] < 100000)*1)
    dum1.columns = ['low']
    dum2 = pd.DataFrame((df['pay'] > 150000)*1)
    dum2.columns = ['high']
    dum = pd.concat((dum1, dum2), axis=1)
    
    df_with_dummy = pd.concat((df, dum), axis=1)
    
    cluster_array = np.array([df['square'], df['fX']*1000, df['fY']*1000])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_nam = ['d%s' % i for i in range(n)]
    dum.columns = dum_nam
    
    df_with_dummy = pd.concat((df_with_dummy, dum), axis=1)

    var_names = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year', 
                 'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy', 
                 'room_nums', 'low', 'high']
    var_names = var_names + dum_nam[:-1]

    eq = fml_build(var_names)

    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')
    
    y_in = y[1:1000]
    X_in = X[1:1000]
    
    y_ex = y[1000:]
    X_ex = X[1000:]

    logy_in = np.log(y_in)
    
    # Chainer expects float32 arrays
    logy_in = np.array(logy_in, dtype='float32')
    X_in = np.array(X_in, dtype='float32')
    
    num, col_num = X_in.shape

    model1 = CAR(10, 10, 3, col_num)
    optimizer = optimizers.SGD()
    optimizer.setup(model1)
    
    # minibatch SGD over shuffled indices; numpy slicing clamps at the array
    # end, so no explicit bounds check is needed for the last batch
    for j in range(10000):
        sffindx = np.random.permutation(num)
        for i in range(0, num, bs):
            x = Variable(X_in[sffindx[i:i+bs]])
            y = Variable(logy_in[sffindx[i:i+bs]])
            model1.zerograds()
            loss = model1(x, y)
            loss.backward()
            optimizer.update()
        if j % 1000 == 0:
            loss_val = loss.data   # loss on the last minibatch of the epoch
            print('epoch:', j)
            print('train mean loss={}'.format(loss_val))
            print(' - - - - - - - - - ')
    
    return model1, np.array(y_ex, dtype='float32').reshape(len(y_ex)), np.array(X_ex, dtype='float32')

In [44]:
results, y_ex, X_ex = DL(df, 20)


epoch: 0
train mean loss=105.35887145996094
 - - - - - - - - - 
epoch: 1000
train mean loss=0.10744299739599228
 - - - - - - - - - 
epoch: 2000
train mean loss=0.12936271727085114
 - - - - - - - - - 
epoch: 3000
train mean loss=0.11869735270738602
 - - - - - - - - - 
epoch: 4000
train mean loss=0.11930324882268906
 - - - - - - - - - 
epoch: 5000
train mean loss=0.0849682092666626
 - - - - - - - - - 
epoch: 6000
train mean loss=0.07333460450172424
 - - - - - - - - - 
epoch: 7000
train mean loss=0.04295388236641884
 - - - - - - - - - 
epoch: 8000
train mean loss=0.04604930803179741
 - - - - - - - - - 
epoch: 9000
train mean loss=0.00910791102796793
 - - - - - - - - - 
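
chainer.serializers is imported at the top but never used; the trained network could be persisted and restored like this (a minimal sketch; the filename is hypothetical):

serializers.save_npz('car_model.npz', results)   # results is the trained CAR chain
model2 = CAR(10, 10, 3, X_ex.shape[1])           # same architecture, fresh weights
serializers.load_npz('car_model.npz', model2)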

In [57]:
X_ex = Variable(X_ex)

logy_pred = results.fwd(X_ex).data   # predicted log rent on the holdout set
y_pred = np.exp(logy_pred)           # back to the original rent scale
error = y_ex - y_pred.reshape(len(y_pred),)
print(error[:20])
plt.hist(error)


[    84.2109375  -9915.7890625   7287.765625      84.2109375   3296.4375
   2287.765625   24926.1953125   5728.5         7833.2265625 -31672.1875
 -31672.1875    -38672.1875     18973.8125     39655.375       6056.984375
  48828.03125     6185.84375   -10770.375      47317.0625      6918.078125 ]
Out[57]:
(array([  10.,   27.,   75.,  178.,   80.,   41.,    4.,    4.,    5.,    3.]),
 array([-39911.09375 , -28267.921875, -16624.75    ,  -4981.578125,
          6661.59375 ,  18304.765625,  29947.9375  ,  41591.109375,
         53234.28125 ,  64877.453125,  76520.625   ]),
 <a list of 10 Patch objects>)

In [58]:
print(sum(((error > -5000)*1)*((error < 5000)*1)))
print(sum(((error > -15000)*1)*((error < 15000)*1)))
print(sum(((error > -30000)*1)*((error < 30000)*1)))


154
314
403
