In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
# 統計用ツール
import statsmodels.api as sm
import statsmodels.tsa.api as tsa
from patsy import dmatrices
# 自作の空間統計用ツール
from spatialstat import *
#描画
import matplotlib.pyplot as plt
from pandas.tools.plotting import autocorrelation_plot
import seaborn as sns
sns.set(font=['IPAmincho'])
#深層学習
import chainer
from chainer import cuda, Function, gradient_check, Variable, optimizers, serializers, utils
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
import pyper
In [3]:
data = pd.read_csv("TokyoSingle.csv")
In [4]:
data
Out[4]:
In [5]:
data = data.dropna()
In [6]:
CITY_NAME = data['CITY_CODE'].copy()
In [7]:
CITY_NAME[CITY_NAME == 13101] = '01千代田区'
CITY_NAME[CITY_NAME == 13102] = "02中央区"
CITY_NAME[CITY_NAME == 13103] = "03港区"
CITY_NAME[CITY_NAME == 13104] = "04新宿区"
CITY_NAME[CITY_NAME == 13105] = "05文京区"
CITY_NAME[CITY_NAME == 13106] = "06台東区"
CITY_NAME[CITY_NAME == 13107] = "07墨田区"
CITY_NAME[CITY_NAME == 13108] = "08江東区"
CITY_NAME[CITY_NAME == 13109] = "09品川区"
CITY_NAME[CITY_NAME == 13110] = "10目黒区"
CITY_NAME[CITY_NAME == 13111] = "11大田区"
CITY_NAME[CITY_NAME == 13112] = "12世田谷区"
CITY_NAME[CITY_NAME == 13113] = "13渋谷区"
CITY_NAME[CITY_NAME == 13114] = "14中野区"
CITY_NAME[CITY_NAME == 13115] = "15杉並区"
CITY_NAME[CITY_NAME == 13116] = "16豊島区"
CITY_NAME[CITY_NAME == 13117] = "17北区"
CITY_NAME[CITY_NAME == 13118] = "18荒川区"
CITY_NAME[CITY_NAME == 13119] = "19板橋区"
CITY_NAME[CITY_NAME == 13120] = "20練馬区"
CITY_NAME[CITY_NAME == 13121] = "21足立区"
CITY_NAME[CITY_NAME == 13122] = "22葛飾区"
CITY_NAME[CITY_NAME == 13123] = "23江戸川区"
In [8]:
#Make Japanese Block name
BLOCK = data["CITY_CODE"].copy()
BLOCK[BLOCK == 13101] = "01都心・城南"
BLOCK[BLOCK == 13102] = "01都心・城南"
BLOCK[BLOCK == 13103] = "01都心・城南"
BLOCK[BLOCK == 13104] = "01都心・城南"
BLOCK[BLOCK == 13109] = "01都心・城南"
BLOCK[BLOCK == 13110] = "01都心・城南"
BLOCK[BLOCK == 13111] = "01都心・城南"
BLOCK[BLOCK == 13112] = "01都心・城南"
BLOCK[BLOCK == 13113] = "01都心・城南"
BLOCK[BLOCK == 13114] = "02城西・城北"
BLOCK[BLOCK == 13115] = "02城西・城北"
BLOCK[BLOCK == 13105] = "02城西・城北"
BLOCK[BLOCK == 13106] = "02城西・城北"
BLOCK[BLOCK == 13116] = "02城西・城北"
BLOCK[BLOCK == 13117] = "02城西・城北"
BLOCK[BLOCK == 13119] = "02城西・城北"
BLOCK[BLOCK == 13120] = "02城西・城北"
BLOCK[BLOCK == 13107] = "03城東"
BLOCK[BLOCK == 13108] = "03城東"
BLOCK[BLOCK == 13118] = "03城東"
BLOCK[BLOCK == 13121] = "03城東"
BLOCK[BLOCK == 13122] = "03城東"
BLOCK[BLOCK == 13123] = "03城東"
In [9]:
names = list(data.columns) + ['CITY_NAME', 'BLOCK']
data = pd.concat((data, CITY_NAME, BLOCK), axis = 1)
data.columns = names
CENSUS: 市区町村コード(9桁)
P: 成約価格
S: 専有面積
L: 土地面積
R: 部屋数
RW: 前面道路幅員
CY: 建築年
A: 建築後年数(成約時)
TS: 最寄駅までの距離
TT: 東京駅までの時間
ACC: ターミナル駅までの時間
WOOD: 木造ダミー
SOUTH: 南向きダミー
RSD: 住居系地域ダミー
CMD: 商業系地域ダミー
IDD: 工業系地域ダミー
FAR: 建ぺい率
FLR: 容積率
TDQ: 成約時点(四半期)
X: 緯度
Y: 経度
CITY_CODE: 市区町村コード(5桁)
CITY_NAME: 市区町村名
BLOCK: 地域ブロック名
In [10]:
s_data = data[['P', 'S', 'L', 'R', 'A', 'RW', 'TS', 'TT']]
In [11]:
s_data.describe()
Out[11]:
In [12]:
print(data['CITY_NAME'].value_counts())
In [13]:
print(data.pivot_table(index=['TDQ'], columns=['CITY_NAME']))
In [14]:
print(data.pivot_table(index=['TDQ'], columns=['BLOCK']))
In [15]:
data['P'].hist()
Out[15]:
In [16]:
(np.log(data['P'])).hist()
Out[16]:
In [17]:
data['A'].hist()
Out[17]:
In [18]:
plt.figure(figsize=(20,8))
plt.subplot(4, 2, 1)
data['P'].hist()
plt.title(u"成約価格")
plt.subplot(4, 2, 2)
data['S'].hist()
plt.title("専有面積")
plt.subplot(4, 2, 3)
data['L'].hist()
plt.title("土地面積")
plt.subplot(4, 2, 4)
data['R'].hist()
plt.title("部屋数")
plt.subplot(4, 2, 5)
data['A'].hist()
plt.title("建築後年数")
plt.subplot(4, 2, 6)
data['RW'].hist()
plt.title("前面道路幅員")
plt.subplot(4, 2, 7)
data['TS'].hist()
plt.title("最寄駅までの距離")
plt.subplot(4, 2, 8)
data['TT'].hist()
plt.title(u"東京駅までの時間")
Out[18]:
In [19]:
plt.figure(figsize=(20,8))
data['TDQ'].value_counts().plot(kind='bar')
Out[19]:
In [20]:
plt.figure(figsize=(20,8))
data['CITY_NAME'].value_counts().plot(kind='bar') #市区町村別の件数
Out[20]:
In [21]:
vars = ['P', 'S', 'L', 'R', 'RW', 'A', 'TS', 'TT', 'WOOD', 'SOUTH', 'CMD', 'IDD', 'FAR', 'X', 'Y']
eq = fml_build(vars)
y, X = dmatrices(eq, data=data, return_type='dataframe')
CITY_NAME = pd.get_dummies(data['CITY_NAME'])
TDQ = pd.get_dummies(data['TDQ'])
X = pd.concat((X, CITY_NAME, TDQ), axis=1)
datas = pd.concat((y, X), axis=1)
datas = datas[datas['12世田谷区'] == 1][0:5000]
In [22]:
datas.head()
Out[22]:
In [23]:
vars = ['S', 'L', 'R', 'RW', 'A', 'TS', 'TT', 'WOOD', 'SOUTH', 'CMD', 'IDD', 'FAR']
#vars += vars + list(TDQ.columns)
In [272]:
class CAR(Chain):
def __init__(self, unit1, unit2, unit3, col_num):
self.unit1 = unit1
self.unit2 = unit2
self.unit3 = unit3
super(CAR, self).__init__(
l1 = L.Linear(col_num, unit1),
l2 = L.Linear(self.unit1, self.unit1),
l3 = L.Linear(self.unit1, self.unit2),
l4 = L.Linear(self.unit2, self.unit3),
l5 = L.Linear(self.unit3, self.unit3),
l6 = L.Linear(self.unit3, 1),
)
def __call__(self, x, y):
fv = self.fwd(x, y)
print(fv.data.shape)
print(y.data.shape)
loss = F.mean_squared_error(fv, y)
return loss
def fwd(self, x, y):
h1 = F.sigmoid(self.l1(x))
h2 = F.sigmoid(self.l2(h1))
h3 = F.sigmoid(self.l3(h2))
h4 = F.sigmoid(self.l4(h3))
h5 = F.sigmoid(self.l5(h4))
h6 = self.l6(h5)
return h6
In [273]:
class OLS_DLmodel(object):
def __init__(self, data, vars, bs=200, n=1000):
self.vars = vars
eq = fml_build(vars)
y, X = dmatrices(eq, data=datas, return_type='dataframe')
self.y_in = y[:-n]
self.X_in = X[:-n]
self.y_ex = y[-n:]
self.X_ex = X[-n:]
self.ido_in = data['X'][:-n]
self.keido_in = data['Y'][:-n]
self.ido_ex = data['X'][-n:]
self.keido_ex = data['Y'][-n:]
self.logy_in = np.log(self.y_in)
self.logy_ex = np.log(self.y_ex)
self.bs = bs
def OLS(self):
X_in = self.X_in
model = sm.OLS(self.logy_in, X_in, intercept=False)
self.reg = model.fit()
print(self.reg.summary())
def directDL(self, ite=100, bs=200, add=False):
logy_in = np.array(self.logy_in, dtype='float32')
X_in = np.array(self.X_in, dtype='float32')
y = Variable(logy_in)
x = Variable(X_in)
num, col_num = X_in.shape
if add is False:
self.model1 = CAR(15, 15, 5, col_num)
optimizer = optimizers.SGD()
optimizer.setup(self.model1)
for j in range(ite):
sffindx = np.random.permutation(num)
for i in range(0, num, bs):
x = Variable(X_in[sffindx[i:(i+bs) if (i+bs) < num else num]])
y = Variable(logy_in[sffindx[i:(i+bs) if (i+bs) < num else num]])
self.model1.zerograds()
loss = self.model1(x, y)
loss.backward()
optimizer.update()
if j % 1000 == 0:
loss_val = loss.data
print('epoch:', j)
print('train mean loss={}'.format(loss_val))
print(' - - - - - - - - - ')
y_ex = np.array(self.y_ex, dtype='float32').reshape(len(self.y_ex))
X_ex = np.array(self.X_ex, dtype='float32')
X_ex = Variable(X_ex)
logy_pred = self.model1.fwd(X_ex, X_ex).data
y_pred = np.exp(logy_pred)
error = y_ex - y_pred.reshape(len(y_pred),)
plt.hist(error[:])
def DL(self, ite=100, bs=200, add=False):
y_in = np.array(self.y_in, dtype='float32').reshape(len(self.y_in))
resid = y_in - np.exp(self.reg.predict())
resid = np.array(resid, dtype='float32').reshape(len(resid),1)
X_in = np.array(self.X_in, dtype='float32')
y = Variable(resid)
x = Variable(X_in)
num, col_num = X_in.shape
if add is False:
self.model1 = CAR(10, 10, 3, col_num)
optimizer = optimizers.Adam()
optimizer.setup(self.model1)
for j in range(ite):
sffindx = np.random.permutation(num)
for i in range(0, num, bs):
x = Variable(X_in[sffindx[i:(i+bs) if (i+bs) < num else num]])
y = Variable(resid[sffindx[i:(i+bs) if (i+bs) < num else num]])
self.model1.zerograds()
loss = self.model1(x, y)
loss.backward()
optimizer.update()
if j % 1000 == 0:
loss_val = loss.data
print('epoch:', j)
print('train mean loss={}'.format(loss_val))
print(' - - - - - - - - - ')
resid_pred = self.model1.fwd(X_in, X_in).data
y_in = np.array(self.y_in, dtype='float32').reshape(len(self.y_in))
logy_pred = np.matrix(self.X_in)*np.matrix(self.reg.params).T
self.pred = np.exp(logy_pred) + resid_pred
self.resid2 = np.array(y_in - self.pred.reshape(len(self.pred),))[0]
def predict(self):
y_ex = np.array(self.y_ex, dtype='float32').reshape(len(self.y_ex))
X_ex = np.array(self.X_ex, dtype='float32')
X_ex = Variable(X_ex)
resid_pred = self.model1.fwd(X_ex, X_ex).data
print(resid_pred[:10])
self.logy_pred = np.matrix(self.X_ex)*np.matrix(self.reg.params).T
self.error1 = np.array(y_ex - np.exp(self.logy_pred.reshape(len(self.logy_pred),)))[0]
self.pred = np.exp(self.logy_pred) + resid_pred
self.error2 = np.array(y_ex - self.pred.reshape(len(self.pred),))[0]
def compare(self):
plt.hist(self.error1)
plt.hist(self.error2)
def sar(self):
weight = ((np.array([self.ido_in]).T - np.array([self.ido_in]))**2 +
(np.array([self.keido_in]).T - np.array([self.keido_in]))**2)
for i in range(len(weight)):
N = sum(weight[i])
weight[i] = weight[i]/N
Ws = np.matrix(weight)*np.matrix(self.resid2).T
model2 = sm.OLS(np.matrix(self.resid2).T, Ws)
reg = model2.fit()
rho = reg.params
self.sigmas = []
for i in range(len(self.ido_ex)):
x = np.append(np.array(self.ido_in), np.array(self.ido_ex)[i])
y = np.append(np.array(self.keido_in), np.array(self.keido_ex)[i])
weight = ((np.array([x]).T - np.array([x]))**2 +
(np.array([y]).T - np.array([y]))**2)
N = np.array([list(np.matrix(weight)*np.matrix(np.ones(len(weight), 1)))[0] for k in range(len(weight))])
weight[j] = weight/N
res = np.append(model.resid2, 0)
Ws = np.matrix(weight)*np.matrix(res).T
sigma = rho*Ws.T
sigma = np.array(sigma)[0][-1]
self.sigmas.append(sigma)
In [274]:
vars = ['P', 'S', 'L', 'R', 'RW', 'A', 'TS', 'TT', 'WOOD', 'SOUTH', 'CMD', 'IDD', 'FAR', 'X', 'Y']
#vars += vars + list(TDQ.columns)
In [275]:
model = OLS_DLmodel(datas, vars)
In [276]:
model.OLS()
In [277]:
model.DL(ite=10, bs=200)
In [211]:
model.predict()
In [212]:
model.compare()
In [216]:
model.DL(30000, bs=200, add=True)
In [218]:
serializers.save_npz('model1(1)', model.model1)
In [226]:
model.DL(10000, bs=200, add=True)
In [228]:
serializers.save_npz('model1(2)', model.model1)
In [229]:
model.predict()
青がOLSの誤差、緑がOLSと深層学習を組み合わせた誤差。
In [230]:
model.compare()
In [231]:
fig = plt.figure()
ax = fig.add_subplot(111)
errors = [model.error1, model.error2]
bp = ax.boxplot(errors)
plt.grid()
plt.ylim([-5000,5000])
plt.title('分布の箱ひげ図')
plt.show()
In [232]:
print(np.mean(model.error1))
print(np.mean(model.error2))
In [233]:
print(np.var(model.error1))
print(np.var(model.error2))
In [245]:
model.sar()
Out[245]:
In [263]:
error2 = (model.error2[:741] - np.array(model.sigmas))
In [264]:
fig = plt.figure()
ax = fig.add_subplot(111)
errors = [model.error1, error2]
bp = ax.boxplot(errors)
plt.grid()
plt.ylim([-5000,5000])
plt.title('分布の箱ひげ図')
plt.show()
In [265]:
print(np.mean(model.error1))
print(np.mean(error2))
In [266]:
print(np.mean(np.abs(model.error1)))
print(np.mean(np.abs(error2)))
In [267]:
print(max(np.abs(model.error1)))
print(max(np.abs(error2)))
In [268]:
print(np.var(model.error1))
print(np.var(error2))
In [269]:
X = model.X_ex['X'].values
Y = model.X_ex['Y'].values
In [270]:
import numpy
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.axes3d import Axes3D
fig=plt.figure()
ax=Axes3D(fig)
ax.scatter3D(X[:741], Y[:741], error2)
plt.show()
In [271]:
import numpy
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d.axes3d import Axes3D
fig=plt.figure()
ax=Axes3D(fig)
ax.scatter3D(X, Y, model.error2)
plt.show()
In [265]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.tri as mtri
#============
# First plot
#============
# Plot the surface. The triangles in parameter space determine which x, y, z
# points are connected by an edge.
ax = fig.add_subplot(1, 2, 1, projection='3d')
ax.plot_trisurf(X, Y, e)
ax.set_zlim(-1, 1)
plt.show()
In [ ]: