In [1]:
%matplotlib inline
In [4]:
import numpy as np
import pandas as pd
# statistics tools
import statsmodels.api as sm
import statsmodels.tsa.api as tsa
from patsy import dmatrices
# custom spatial-statistics helpers
from spatialstat import *
# plotting
import matplotlib.pyplot as plt
from pandas.tools.plotting import autocorrelation_plot
import seaborn as sns
sns.set(font=['IPAmincho'])
# deep learning
import chainer
from chainer import cuda, Function, gradient_check, Variable, optimizers, serializers, utils
from chainer import Link, Chain, ChainList
import chainer.functions as F
import chainer.links as L
from MakeData import makedata
In [ ]:
data = pd.read_csv("TokyoSingle.csv")
data = data.dropna()
In [23]:
datas = makedata(data)
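makedata comes from the author's MakeData module, which is not shown here. Judging from the manual preprocessing in the next cell, a plausible reconstruction (an assumption, not the actual module) is:
In [ ]:
# Hypothetical sketch of MakeData.makedata -- inferred from the next cell,
# not the author's actual code.
def makedata(data):
    vars = ['P', 'S', 'L', 'R', 'RW', 'A', 'TS', 'TT',
            'WOOD', 'SOUTH', 'CMD', 'IDD', 'FAR', 'X', 'Y']
    y, X = dmatrices(fml_build(vars), data=data, return_type='dataframe')
    CITY_NAME = pd.get_dummies(data['CITY_NAME'])
    TDQ = pd.get_dummies(data['TDQ'])
    return pd.concat((y, X, CITY_NAME, TDQ), axis=1)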
In [10]:
vars = ['P', 'S', 'L', 'R', 'RW', 'A', 'TS', 'TT', 'WOOD', 'SOUTH', 'CMD', 'IDD', 'FAR', 'X', 'Y']
eq = fml_build(vars)
y, X = dmatrices(eq, data=data, return_type='dataframe')
CITY_NAME = pd.get_dummies(data['CITY_NAME'])
TDQ = pd.get_dummies(data['TDQ'])
X = pd.concat((X, CITY_NAME, TDQ), axis=1)
datas = pd.concat((y, X), axis=1)
datas = datas[datas['12世田谷区'] == 1][0:5000]  # Setagaya ward only, first 5000 rows
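fml_build is imported from the custom spatialstat module above; its definition is not shown. A minimal sketch of what it presumably does (an assumption): join the variable list into a patsy formula, treating the first entry as the response.
In [ ]:
# Hypothetical sketch of spatialstat.fml_build -- an assumption, not the
# author's actual implementation.
def fml_build(vars):
    return '{0} ~ {1}'.format(vars[0], ' + '.join(vars[1:]))

fml_build(['P', 'S', 'L'])  # -> 'P ~ S + L'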
In [295]:
class CAR(Chain):
    """Feed-forward network used to model (residual) prices."""
    def __init__(self, unit1, unit2, unit3, col_num):
        super(CAR, self).__init__(
            l1=L.Linear(col_num, unit1),
            l2=L.Linear(unit1, unit1),
            l3=L.Linear(unit1, unit2),
            l4=L.Linear(unit2, unit3),
            l5=L.Linear(unit3, unit3),
            l6=L.Linear(unit3, 1),
        )
        # plain attributes are set after Chain.__init__ so that the Link
        # machinery is already initialized
        self.unit1 = unit1
        self.unit2 = unit2
        self.unit3 = unit3

    def __call__(self, x, y):
        fv = self.fwd(x, y)
        loss = F.mean_squared_error(fv, y)
        return loss

    def fwd(self, x, y):
        # y is unused in the forward pass; it is kept only so that
        # __call__ and fwd share the same signature
        h1 = F.sigmoid(self.l1(x))
        h2 = F.sigmoid(self.l2(h1))
        h3 = F.sigmoid(self.l3(h2))
        h4 = F.sigmoid(self.l4(h3))
        h5 = F.sigmoid(self.l5(h4))
        return self.l6(h5)
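A quick smoke test of the network (illustrative, not part of the original run): a batch of 4 rows with 20 features should map to a (4, 1) output, and fwd ignores its second argument.
In [ ]:
net = CAR(15, 15, 5, col_num=20)
xb = Variable(np.random.randn(4, 20).astype('float32'))
print(net.fwd(xb, None).data.shape)  # -> (4, 1)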
In [296]:
class OLS_DLmodel(object):
    """Two-stage model: OLS on log price, then a neural net on the residuals."""
    def __init__(self, data, vars, bs=200, n=1000):
        self.vars = vars
        eq = fml_build(vars)
        y, X = dmatrices(eq, data=data, return_type='dataframe')
        # hold out the last n observations for out-of-sample evaluation
        self.y_in = y[:-n]
        self.X_in = X[:-n]
        self.y_ex = y[-n:]
        self.X_ex = X[-n:]
        self.logy_in = np.log(self.y_in)
        self.logy_ex = np.log(self.y_ex)
        self.bs = bs

    def OLS(self):
        # first stage: regress log price on everything except the coordinates;
        # the patsy design matrix already contains an Intercept column
        X_in = self.X_in.drop(['X', 'Y'], axis=1)
        model = sm.OLS(self.logy_in, X_in)
        self.reg = model.fit()
        print(self.reg.summary())
        self.fitted = self.reg.predict()  # in-sample log-scale fit
        # pad the parameter vector with zero coefficients for X and Y so it
        # aligns with the full design matrix used in predict()
        df = (pd.DataFrame(self.reg.params)).T
        df['X'] = 0
        df['Y'] = 0
        self.reg.params = pd.Series((df.T)[0])

    def directDL(self, ite=100, bs=200, add=False):
        # benchmark: fit the net directly on log price
        logy_in = np.array(self.logy_in, dtype='float32')
        X_in = np.array(self.X_in, dtype='float32')
        num, col_num = X_in.shape
        if add is False:
            self.model1 = CAR(15, 15, 5, col_num)
        optimizer = optimizers.SGD()
        optimizer.setup(self.model1)
        for j in range(ite):
            sffindx = np.random.permutation(num)
            for i in range(0, num, bs):
                x = Variable(X_in[sffindx[i:min(i + bs, num)]])
                y = Variable(logy_in[sffindx[i:min(i + bs, num)]])
                self.model1.zerograds()
                loss = self.model1(x, y)
                loss.backward()
                optimizer.update()
            if j % 1000 == 0:
                print('epoch:', j)
                print('train mean loss={}'.format(loss.data))
                print(' - - - - - - - - - ')
        y_ex = np.array(self.y_ex, dtype='float32').reshape(len(self.y_ex))
        X_ex = Variable(np.array(self.X_ex, dtype='float32'))
        logy_pred = self.model1.fwd(X_ex, X_ex).data
        y_pred = np.exp(logy_pred)
        error = y_ex - y_pred.reshape(len(y_pred),)
        plt.hist(error)

    def DL(self, ite=100, bs=200, add=False):
        # second stage: fit the net to the OLS residuals on the price scale;
        # requires OLS() to have been run first (uses self.fitted)
        y_in = np.array(self.y_in, dtype='float32').reshape(len(self.y_in))
        resid = y_in - np.exp(self.fitted)
        resid = np.array(resid, dtype='float32').reshape(len(resid), 1)
        X_in = np.array(self.X_in, dtype='float32')
        num, col_num = X_in.shape
        if add is False:
            self.model1 = CAR(10, 10, 3, col_num)
        optimizer = optimizers.Adam()
        optimizer.setup(self.model1)
        for j in range(ite):
            sffindx = np.random.permutation(num)
            for i in range(0, num, bs):
                x = Variable(X_in[sffindx[i:min(i + bs, num)]])
                y = Variable(resid[sffindx[i:min(i + bs, num)]])
                self.model1.zerograds()
                loss = self.model1(x, y)
                loss.backward()
                optimizer.update()
            if j % 1000 == 0:
                print('epoch:', j)
                print('train mean loss={}'.format(loss.data))
                print(' - - - - - - - - - ')

    def predict(self):
        y_ex = np.array(self.y_ex, dtype='float32').reshape(len(self.y_ex))
        X_ex = Variable(np.array(self.X_ex, dtype='float32'))
        resid_pred = self.model1.fwd(X_ex, X_ex).data
        print(resid_pred[:10])
        # OLS part: the padded zero coefficients make X and Y drop out
        self.logy_pred = np.matrix(self.X_ex) * np.matrix(self.reg.params).T
        self.error1 = np.array(y_ex - np.exp(self.logy_pred.reshape(len(self.logy_pred),)))[0]
        # combined prediction: OLS level plus the net's residual estimate
        self.pred = np.exp(self.logy_pred) + resid_pred
        self.error2 = np.array(y_ex - self.pred.reshape(len(self.pred),))[0]

    def compare(self):
        plt.hist(self.error1)
        plt.hist(self.error2)
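In equation form, the two-stage predictor implemented above combines an OLS fit on log prices (excluding the coordinates) with the CAR network trained on the price-scale residuals:

$$\hat{y}(x) = \exp\!\left(x^{\top}\hat{\beta}_{\mathrm{OLS}}\right) + \hat{g}_{\mathrm{NN}}(x)$$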
In [297]:
vars = ['P', 'S', 'L', 'R', 'RW', 'A', 'TS', 'TT', 'WOOD', 'SOUTH', 'CMD', 'IDD', 'FAR', 'X', 'Y']
#vars += list(TDQ.columns)
In [298]:
model = OLS_DLmodel(datas, vars)
In [299]:
model.OLS()
In [300]:
model.DL(ite=10, bs=200)
In [301]:
model.predict()
In [302]:
model.DL(ite=20000, bs=200, add=True)
In [320]:
model.DL(ite=10000, bs=200, add=True)
In [303]:
model.predict()
Blue: the OLS errors. Green: the errors of the combined OLS + deep-learning model.
In [311]:
model.compare()
In [312]:
print(np.mean(model.error1))
print(np.mean(model.error2))
In [313]:
print(np.mean(np.abs(model.error1)))
print(np.mean(np.abs(model.error2)))
In [314]:
print(max(np.abs(model.error1)))
print(max(np.abs(model.error2)))
In [315]:
print(np.var(model.error1))
print(np.var(model.error2))
In [316]:
fig = plt.figure()
ax = fig.add_subplot(111)
errors = [model.error1, model.error2]
bp = ax.boxplot(errors)
plt.grid()
plt.ylim([-5000,5000])
plt.title('Box plot of the error distributions')
plt.show()
In [415]:
X = model.X_ex['X'].values
Y = model.X_ex['Y'].values
In [318]:
e = model.error2
In [319]:
from mpl_toolkits.mplot3d.axes3d import Axes3D

fig = plt.figure()
ax = Axes3D(fig)
ax.scatter3D(X, Y, e)  # holdout errors over the coordinate plane
plt.show()
In [358]:
plt.hist(Xs)
Out[358]:
In [339]:
from scipy.stats import gaussian_kde
In [564]:
Xs = np.linspace(min(X), max(X), 10)
Ys = np.linspace(min(Y), max(Y), 10)
error = model.error1
Xgrid, Ygrid = np.meshgrid(Xs, Ys)
Z = LL(X, Y, Xs, Ys, error)
fig = plt.figure()
ax = Axes3D(fig)
ax.plot_wireframe(Xgrid, Ygrid, Z)  # <-- plot here
plt.show()
In [505]:
fig = plt.figure()
ax = Axes3D(fig)
ax.set_zlim(-100, 500)
ax.plot_surface(Xgrid, Ygrid, Z)  # <-- plot here
plt.show()
In [541]:
h = 10
# scratch: product biweight kernel evaluated at the point (1, 2)
(0.9375*(1 - ((X - 1)/h)**2)**2) * (0.9375*(1 - ((Y - 2)/h)**2)**2)
Out[541]:
In [562]:
def LL(X, Y, Xs, Ys, error):
    """Local linear smoothing of `error` over the (Xs, Ys) grid.

    A product biweight kernel weights nearby observations; the local
    intercept is the smoothed error surface at each grid point.
    """
    n = len(X)
    h = 0.1  # bandwidth
    # Z[j, i] must correspond to (Xs[i], Ys[j]) to match np.meshgrid
    mean_of_error = np.zeros((len(Ys), len(Xs)))
    for i in range(len(Xs)):
        for j in range(len(Ys)):
            k = (0.9375*(1 - ((X - Xs[i])/h)**2)**2) * \
                (0.9375*(1 - ((Y - Ys[j])/h)**2)**2)
            # the biweight kernel is zero outside its support
            k[(np.abs(X - Xs[i]) > h) | (np.abs(Y - Ys[j]) > h)] = 0
            indep = np.matrix(np.array([np.ones(n), X - Xs[i], Y - Ys[j]]).T)
            dep = np.matrix(np.array([error]).T)
            # kernel-weighted least squares: each observation gets weight k
            wls_model = sm.WLS(dep, indep, weights=k)
            wls_results = wls_model.fit()
            mean_of_error[j, i] = wls_results.params[0]
    return mean_of_error
In [558]:
h = 200
u1 = ((X-30)/h)**2
In [559]:
u1
Out[559]:
In [557]:
u1[u1 < 0] = 0  # no-op safeguard: u1 is a square and cannot be negative
In [467]:
for x in Xs[:2]:
    print(x)
In [499]:
mean_of_error
Out[499]:
In [367]:
plt.plot(gaussian_kde(Y, 0.1)(Ys))
Out[367]:
In [ ]:
# synthetic test data: N groups, each a mixture of two normal clusters
N = 5
means = np.random.randn(N, 2) * 10 + np.array([100, 200])
stdev = np.random.randn(N, 2) * 10 + 30
count = np.int64(np.random.randn(N, 2) * 10000 + 50000)
a = [np.hstack([np.random.randn(count[i, j]) * stdev[i, j] + means[i, j]
                for j in range(2)])
     for i in range(N)]
In [249]:
def loclinearc(points, x, y, h):
    """Local linear regression on a 2-D grid with a product biweight kernel.

    points : (n, 2) grid points at which to estimate the surface
    x      : (m, 2) observed coordinates
    y      : (m,)   observed responses
    h      : kernel bandwidth
    Returns an (n, n) matrix of local intercepts (the smoothed surface).
    """
    n = points.shape[0]
    b1 = np.zeros((n, n))
    for j in range(n):
        for i in range(n):
            # keep only the observations inside the kernel support in both
            # dimensions (K(u) = 0 for |u| > 1)
            mask = (np.abs(x[:, 0] - points[i, 0]) / h < 1) & \
                   (np.abs(x[:, 1] - points[j, 1]) / h < 1)
            if mask.sum() < 3:
                continue  # too few points for a local linear fit
            q1 = x[mask, 0] - points[i, 0]
            q2 = x[mask, 1] - points[j, 1]
            # product biweight kernel weights
            w = (15.0/16)*(1 - (q1/h)**2)**2 * (15.0/16)*(1 - (q2/h)**2)**2
            Z = np.column_stack([np.ones(mask.sum()), q1, q2])
            W = np.diag(w)
            # weighted least squares: (Z'WZ)^{-1} Z'Wy
            beta = np.linalg.solve(Z.T.dot(W).dot(Z), Z.T.dot(W).dot(y[mask]))
            b1[i, j] = beta[0]  # local intercept = estimate at the grid point
    return b1
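An illustrative call (hypothetical names, reusing the grid and holdout arrays defined earlier):
In [ ]:
pts = np.column_stack([Xs, Ys])      # 10x2 grid points
obs = np.column_stack([X, Y])        # holdout coordinates
B = loclinearc(pts, obs, e, h=0.05)  # 10x10 smoothed error surface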
In [ ]:
def nonzmat(x):
    # This function removes the all-zero ROWS of a matrix and returns the
    # matrix with those rows deleted. (With the boolean-mask version of
    # loclinearc above, it is no longer strictly needed.)
    x = np.asarray(x)
    return x[~np.all(x == 0, axis=1)]
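A quick check of nonzmat (illustrative): only rows that are entirely zero are dropped.
In [ ]:
nonzmat(np.array([[1, 2], [0, 0], [3, 0]]))
# -> array([[1, 2],
#           [3, 0]])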
In [265]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
#============
# Trisurf plot of the holdout errors
#============
# The triangles in parameter space determine which x, y, z
# points are connected by an edge.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_trisurf(X, Y, e)
# ax.set_zlim(-1, 1)  # leftover from the matplotlib demo; the errors span thousands
plt.show()
In [ ]: