In [12]:
%matplotlib inline
In [13]:
import numpy as np
import pandas as pd
# Statistics tools
import statsmodels.api as sm
import statsmodels.tsa.api as tsa
from patsy import dmatrices
# Home-made spatial statistics tools
# NOTE(review): star import pollutes the namespace (fml_build comes from here);
# prefer explicit imports once the module API is stable.
from spatialstat import *
# Plotting
import matplotlib.pyplot as plt
# FIX: pandas.tools.plotting was deprecated (pandas 0.20) and later removed;
# autocorrelation_plot lives in pandas.plotting.
from pandas.plotting import autocorrelation_plot
# Clustering
from sklearn import mixture
# Deep learning
import chainer
from chainer import cuda, Function, gradient_check, Variable, optimizers, serializers, utils
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
In [14]:
# Load the property-listing data and keep rents under 300,000 (yen/month).
# FIX: the original `df[:][df['pay'] < 300000]` made a pointless full copy
# before masking (chained-indexing pattern); apply the boolean mask directly.
df = pd.read_csv('bukken_data.csv')
df = df[df['pay'] < 300000]
df = df.reset_index(drop=True)
In [15]:
# Inspect the available columns of the listings table.
df.columns
Out[15]:
In [59]:
def cluster_OLS(n):
    """Fit a GLS hedonic rent model with GMM-based cluster dummies.

    Builds price-band dummies (low/high rent), clusters listings on floor
    area and scaled coordinates with an n-component Gaussian mixture, adds
    the cluster labels as dummies, and fits log(rent) on the first 1000 rows.

    Parameters
    ----------
    n : int
        Number of Gaussian-mixture components (cluster dummies).

    Returns
    -------
    (results, y_ex, X_ex)
        Fitted statsmodels results, held-out target as a 1-d ndarray,
        and the held-out design matrix as a 2-d ndarray.

    NOTE(review): reads the notebook-global DataFrame ``df`` and the helper
    ``fml_build`` (from spatialstat) — both must exist in the kernel scope.
    """
    # Price-band dummies (thresholds in yen/month).
    dum_low = pd.DataFrame((df['pay'] < 100000) * 1)
    dum_low.columns = ['low']
    dum_high = pd.DataFrame((df['pay'] > 150000) * 1)
    dum_high.columns = ['high']
    df_with_dummy = pd.concat((df, dum_low, dum_high), axis=1)

    # Cluster on floor area and (scaled) coordinates; the GMM labels become
    # one-hot dummy columns d0..d{n-1}.
    cluster_array = np.array([df['square'], df['fX'] * 1000, df['fY'] * 1000])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    cluster_dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_names = ['d%s' % i for i in range(n)]
    cluster_dum.columns = dum_names
    df_with_dummy = pd.concat((df_with_dummy, cluster_dum), axis=1)

    # Renamed from `vars`, which shadowed the built-in vars().
    var_names = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
                 'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy',
                 'room_nums', 'low', 'high']
    # Drop the last cluster dummy to avoid perfect multicollinearity with
    # the intercept that dmatrices adds.
    var_names = var_names + dum_names[:-1]
    eq = fml_build(var_names)
    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

    # BUG FIX: the original sliced [1:1000], silently dropping row 0 from the
    # training set (999 in-sample rows); [:1000] keeps all 1000.
    y_in, X_in = y[:1000], X[:1000]
    y_ex, X_ex = y[1000:], X[1000:]

    # Model log(rent); dmatrices already adds an Intercept column, so the
    # original's ignored `intercept=True` kwarg to GLS is removed.
    logy_in = np.log(y_in)
    model = sm.GLS(logy_in, X_in)
    results = model.fit()
    print(results.summary())
    return results, np.array(y_ex).reshape(len(y_ex)), np.array(X_ex)
In [60]:
# Fit the GLS model with 50 GMM clusters; keep the held-out data for evaluation.
n=50
results, y_ex, X_ex = cluster_OLS(n)
In [61]:
# Out-of-sample prediction: the model was fit on log(pay), so exponentiate
# the linear predictor before comparing with the observed rents.
logy_pred = X_ex.dot(results.params)
y_pred = np.exp(logy_pred)
error = y_ex - y_pred
# Peek at the first few residuals, then show their distribution.
print(error[:20])
plt.hist(error)
Out[61]:
In [62]:
# Count held-out predictions within +/-5k, +/-15k, +/-30k yen of the true rent.
# The three copy-pasted lines of the original are collapsed into one loop;
# printed values and order are unchanged.
for band in (5000, 15000, 30000):
    print(sum(((error > -band) * 1) * ((error < band) * 1)))
In [83]:
# Overlay predicted rents (first series) against actual rents (second series).
plt.plot(y_pred)
plt.plot(y_ex)
Out[83]:
In [25]:
class CAR(Chain):
    """Six-layer fully connected regression network with sigmoid hidden units.

    Maps a feature vector of ``col_num`` columns to a single scalar
    (log-rent) prediction. Layer widths: col_num -> unit1 -> unit1 ->
    unit2 -> unit3 -> unit3 -> 1.
    """

    def __init__(self, unit1, unit2, unit3, col_num):
        # Hidden-layer widths, kept as plain attributes for reference.
        self.unit1 = unit1
        self.unit2 = unit2
        self.unit3 = unit3
        super(CAR, self).__init__(
            l1=L.Linear(col_num, unit1),
            l2=L.Linear(unit1, unit1),
            l3=L.Linear(unit1, unit2),
            l4=L.Linear(unit2, unit3),
            l5=L.Linear(unit3, unit3),
            l6=L.Linear(unit3, 1),
        )

    def __call__(self, x, y):
        """Return the mean-squared-error loss of the forward pass against y."""
        fv = self.fwd(x, y)
        loss = F.mean_squared_error(fv, y)
        return loss

    def fwd(self, x, y=None):
        """Forward pass through the network; returns the raw linear output.

        FIX: ``y`` was never used in the original body; it is kept as an
        optional parameter for backward compatibility with existing call
        sites that pass the target alongside the features.
        """
        h = F.sigmoid(self.l1(x))
        h = F.sigmoid(self.l2(h))
        h = F.sigmoid(self.l3(h))
        h = F.sigmoid(self.l4(h))
        h = F.sigmoid(self.l5(h))
        return self.l6(h)
In [42]:
def DL(df, n, bs=200):
    """Train the CAR network on log(rent) with GMM cluster dummies.

    Preprocessing is identical to cluster_OLS (price-band dummies + GMM
    cluster dummies), then a CAR(10, 10, 3) network is trained by SGD on the
    first 1000 rows for 1000 epochs of shuffled minibatches.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings table (must contain the columns used below).
    n : int
        Number of Gaussian-mixture components (cluster dummies).
    bs : int, optional
        Minibatch size (default 200).

    Returns
    -------
    (model1, y_ex, X_ex)
        Trained CAR chain, held-out target as a 1-d float32 ndarray, and the
        held-out design matrix as a 2-d float32 ndarray.

    NOTE(review): no random seed is set, so GMM init and the minibatch
    shuffles are not reproducible across runs.
    """
    # Price-band dummies (thresholds in yen/month).
    dum_low = pd.DataFrame((df['pay'] < 100000) * 1)
    dum_low.columns = ['low']
    dum_high = pd.DataFrame((df['pay'] > 150000) * 1)
    dum_high.columns = ['high']
    df_with_dummy = pd.concat((df, dum_low, dum_high), axis=1)

    # GMM clustering on floor area and scaled coordinates -> dummy columns.
    cluster_array = np.array([df['square'], df['fX'] * 1000, df['fY'] * 1000])
    gmm = mixture.GaussianMixture(n_components=n, covariance_type='full').fit(cluster_array.T)
    cluster_dum = pd.get_dummies(gmm.predict(cluster_array.T))
    dum_names = ['d%s' % i for i in range(n)]
    cluster_dum.columns = dum_names
    df_with_dummy = pd.concat((df_with_dummy, cluster_dum), axis=1)

    # Renamed from `vars`, which shadowed the built-in vars().
    var_names = ['pay', 'square', 'k', 'lk', 'dk', 'sdk', 'sldk', 'south_direction_dummy', 'building_year',
                 'new_dummy', 'mansyon_dumy', 'teiki_syakuya_dummy', 'walk_minute_dummy', 'r', 'rc_dummy',
                 'room_nums', 'low', 'high']
    # Drop the last cluster dummy to avoid perfect multicollinearity.
    var_names = var_names + dum_names[:-1]
    eq = fml_build(var_names)
    y, X = dmatrices(eq, data=df_with_dummy, return_type='dataframe')

    # BUG FIX: the original sliced [1:1000], silently dropping row 0 from the
    # training set; [:1000] keeps all 1000 in-sample rows.
    y_in, X_in = y[:1000], X[:1000]
    y_ex, X_ex = y[1000:], X[1000:]

    logy_in = np.array(np.log(y_in), dtype='float32')
    X_in = np.array(X_in, dtype='float32')

    num, col_num = X_in.shape
    model1 = CAR(10, 10, 3, col_num)
    optimizer = optimizers.SGD()
    optimizer.setup(model1)

    for j in range(1000):
        # Fresh shuffle of the training rows each epoch.
        sffindx = np.random.permutation(num)
        for i in range(0, num, bs):
            batch = sffindx[i:min(i + bs, num)]
            x = Variable(X_in[batch])
            y_batch = Variable(logy_in[batch])
            model1.zerograds()
            loss = model1(x, y_batch)
            loss.backward()
            optimizer.update()
        # FIX: the original guard `j % 1000 == 0` only fired at epoch 0 and
        # printed once per minibatch; report once per 100 epochs instead.
        if j % 100 == 0:
            print('epoch:', j)
            print('train mean loss={}'.format(loss.data))
            print(' - - - - - - - - - ')

    return model1, np.array(y_ex, dtype='float32').reshape(len(y_ex)), np.array(X_ex, dtype='float32')
In [44]:
# Train the network with 20 GMM clusters (default minibatch size 200).
results, y_ex, X_ex = DL(df, 20)
In [57]:
# BUG FIX: the original rebound X_ex to a chainer Variable in place, so
# re-running this cell wrapped a Variable in a Variable (hidden-state,
# non-idempotent cell). Fresh names keep the raw arrays intact.
x_ex_var = Variable(X_ex)
logy_ex_var = Variable(np.log(y_ex))
# fwd ignores the target argument; predictions come back in log space.
logy_pred = results.fwd(x_ex_var, logy_ex_var).data
y_pred = np.exp(logy_pred)
error = y_ex - y_pred.reshape(len(y_pred),)
print(error[:20])
plt.hist(error)
Out[57]:
In [58]:
# Count held-out predictions within +/-5k, +/-15k, +/-30k yen of the true rent.
# The three copy-pasted lines of the original are collapsed into one loop;
# printed values and order are unchanged.
for band in (5000, 15000, 30000):
    print(sum(((error > -band) * 1) * ((error < band) * 1)))
In [ ]: