In [1]:
import cPickle,gzip,os,sys,time
dataset = 'mnist.pkl.gz'
f = gzip.open(dataset, 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()
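In [ ]:
# Note (an addition, not part of the original run): cPickle exists only on Python 2.
# Under Python 3 the same file can be read with the stdlib pickle module;
# encoding='latin1' is needed because the pickle was written by Python 2.
import pickle
with gzip.open(dataset, 'rb') as f3:
    train_set, valid_set, test_set = pickle.load(f3, encoding='latin1')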
In [2]:
train_set[0].shape
Out[2]:
In [3]:
import numpy as np
import numpy.random as npr
import pylab as pl
pl.rcParams['font.family'] = 'serif'
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn import neighbors
In [4]:
def one_nn_baseline(X, Y):
    """Fraction of points whose nearest (non-self) neighbour has the same label."""
    one_nn = neighbors.kneighbors_graph(X, 2)
    inds = np.zeros(len(X), dtype=int)
    for i in range(len(X)):
        # drop the self-edge (if present) and keep the nearest other point
        inds[i] = [ind for ind in one_nn[i].indices if ind != i][0]
    preds = Y[inds]
    return 1.0 * sum(preds == Y) / len(Y)
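In [ ]:
# Hedged alternative (an addition, not part of the original run): the same 1-NN
# label-agreement score via NearestNeighbors, which returns each point plus its
# nearest other neighbour directly and avoids the per-row sparse-matrix lookup.
# Assumes no exact duplicate rows, so column 0 of the returned indices is the
# query point itself.
from sklearn.neighbors import NearestNeighbors
def one_nn_baseline_fast(X, Y):
    nn = NearestNeighbors(n_neighbors=2).fit(X)
    _, idx = nn.kneighbors(X)   # idx[:, 0] is the point itself, idx[:, 1] its nearest neighbour
    return np.mean(Y[idx[:, 1]] == Y)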
In [5]:
one_nn_baseline(train_set[0][:5000],train_set[1][:5000])
Out[5]:
In [10]:
from mpl_toolkits.mplot3d import Axes3D  # needed for the 3D reconstruction plots in report()

sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))

class MLAutoencoder():
    """Multi-layer autoencoder trained by finite-difference gradient descent on an L1 reconstruction loss."""
    def __init__(self, layer_dims=[32,16,8,4,2,4,8,16,32]):
        self.layer_dims = layer_dims
        self.W = []
        for i in range(1, len(layer_dims)):
            self.W.append(npr.randn(layer_dims[i-1], layer_dims[i]))
        self.num_params = sum([np.prod(w.shape) for w in self.W])
        # index of the narrowest layer: everything before it is the encoder
        self.bottleneck = np.argmin(np.array(layer_dims))

    def mats_to_vec(self, W):
        # flatten the list of weight matrices into a single parameter vector
        w_vecs = []
        for w in W:
            w_vecs.append(np.reshape(w, np.prod(w.shape)))
        return np.hstack(w_vecs)

    def vec_to_mats(self, w_vecs):
        # inverse of mats_to_vec: slice the parameter vector back into matrices
        ind = 0
        W = []
        for i in range(len(self.W)):
            size = np.prod(self.W[i].shape)
            W.append(np.reshape(w_vecs[ind:ind+size], self.W[i].shape))
            ind += size
        return W

    def predict(self, X):
        # full forward pass (encoder + decoder), one example at a time
        def predict_one(x):
            L = x
            for w in self.W:
                L = sigmoid(L.dot(w))
            return L
        if len(X.shape) > 1:
            y = np.zeros(X.shape)
            for i, x in enumerate(X):
                y[i] = predict_one(x)
        else:
            y = predict_one(X)
        return y

    def transform(self, X):
        # encoder only: stop at the bottleneck layer
        def transform_one(x):
            L = x
            for i in range(self.bottleneck):
                L = sigmoid(L.dot(self.W[i]))
            return L
        if len(X.shape) > 1:
            y = np.zeros((len(X), self.layer_dims[self.bottleneck]))
            for i, x in enumerate(X):
                y[i] = transform_one(x)
        else:
            y = transform_one(X)
        return y

    def loss(self, y, y_pred):
        # L1 reconstruction error for a single example
        return np.sum(abs(y - y_pred))

    def score(self, X):
        X_pred = self.predict(X)
        assert(X_pred.shape == X.shape)
        return sum([self.loss(pred, truth) for (pred, truth) in zip(X_pred, X)])

    def gradient(self, func, x0, h=0.0001):
        # forward-difference numerical gradient: one extra function evaluation per parameter
        x0 = np.array(x0)
        y = func(x0)
        deriv = np.zeros(len(x0))
        for i in range(len(x0)):
            x = np.array(x0)
            x[i] += h
            deriv[i] = (func(x) - y) / h
        return deriv

    def vec_score(self, w, X):
        self.W = self.vec_to_mats(w)
        return self.score(X)

    def train(self, X_, batch_size=20, epochs=10, learning_rate=0.1, cooling=False):
        def report(counter, epoch):
            # relies on a module-level label array Y for colouring, and on an
            # existing 'ae/' directory for the saved figures
            status = "Epoch {1} loss: {0:.3f}".format(self.score(X_), epoch)
            print(status)
            X_r = self.transform(X_)
            pl.scatter(X_r[:,0], X_r[:,1], c=Y, linewidth=0)
            pl.xlim((0,1))
            pl.ylim((0,1))
            pl.title(status)
            pl.savefig('ae/{0}.jpg'.format(counter))
            pl.close()
            if X_.shape[1] == 3:
                # for 3-D inputs, also plot the reconstruction itself
                X_r = self.predict(X_)
                fig = pl.figure()
                ax = Axes3D(fig)
                #pl.xlim((0,1))
                #pl.ylim((0,1))
                #ax.zlim((0,1))
                ax.scatter(X_r[:,0], X_r[:,1], X_r[:,2], c=Y)
                pl.savefig('ae/reconstruction-{0}.jpg'.format(counter))
                pl.close()
        n = len(X_) // batch_size
        w = self.mats_to_vec(self.W)
        X = X_.copy()
        counter = 0
        report(counter, 0)
        counter += 1
        if cooling:
            start_cooling = 100
            delta = learning_rate / (epochs - start_cooling)
        for t in range(epochs):
            npr.shuffle(X)
            if cooling and t > start_cooling:
                #learning_rate -= delta
                learning_rate *= 0.95
            for i in range(n):
                batch_X = X[i*batch_size:(i+1)*batch_size]
                loss_func = lambda w: self.vec_score(w, batch_X)
                w -= learning_rate * self.gradient(loss_func, w)
                self.W = self.vec_to_mats(w)  # keep the matrices in sync with the updated parameter vector
            report(counter, t+1)
            counter += 1
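In [ ]:
# Hedged usage sketch (this cell is an addition, not part of the original run):
# exercise train() on a tiny synthetic problem so the finite-difference gradient
# (num_params function evaluations per update) stays tractable. report() colours
# points with a global Y and writes figures into an 'ae/' directory.
if not os.path.exists('ae'):
    os.makedirs('ae')
Y = npr.randint(0, 2, 50)           # dummy labels, only used for colouring
toy_X = sigmoid(npr.randn(50, 8))   # 50 points in (0,1)^8
ae_toy = MLAutoencoder([8, 4, 2, 4, 8])
ae_toy.train(toy_X, batch_size=10, epochs=2, learning_rate=0.1)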
In [38]:
input_dim = 784
forward_layers = [input_dim // (2**n) for n in range(10) if input_dim // (2**n) > 1]
if forward_layers[-1] != 2:
    forward_layers.append(2)
forward_layers
Out[38]:
In [39]:
layers = forward_layers + forward_layers[::-1][1:]
layers
Out[39]:
In [40]:
ae = MLAutoencoder(layers)
In [41]:
X_ = ae.transform(train_set[0][:5000])
In [42]:
pl.scatter(X_[:,0],X_[:,1],c=train_set[1][:len(X_)])
Out[42]:
In [43]:
one_nn_baseline(X_,train_set[1][:len(X_)])
Out[43]:
In [44]:
ae_2 = MLAutoencoder([input_dim,2,input_dim])
In [46]:
X_2 = ae_2.transform(train_set[0][:5000])
pl.scatter(X_2[:,0],X_2[:,1],c=train_set[1][:len(X_2)])
print(one_nn_baseline(X_2,train_set[1][:len(X_2)]))
In [6]:
pca = PCA(64)
X_pca = pca.fit_transform(train_set[0][:5000])
In [8]:
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))
In [14]:
ae_3 = MLAutoencoder([64,32,16,8,4,2,4,8,16,32,64])
X_3 = ae_3.transform(X_pca)
pl.scatter(X_3[:,0],X_3[:,1],c=train_set[1][:len(X_3)])
print(one_nn_baseline(X_3,train_set[1][:len(X_3)]))
In [17]:
ae_4 = MLAutoencoder([64,16,4,2,4,16,64])
X_4 = ae_4.transform(X_pca)
pl.scatter(X_4[:,0],X_4[:,1],c=train_set[1][:len(X_4)])
print(one_nn_baseline(X_4,train_set[1][:len(X_4)]))
In [25]:
ae_5 = MLAutoencoder([64,10,64])
X_5 = ae_5.transform(X_pca)
pl.scatter(X_5[:,0],X_5[:,1],c=train_set[1][:len(X_5)])
print(one_nn_baseline(X_5,train_set[1][:len(X_5)]))
In [22]:
from sklearn import datasets
data = datasets.load_digits()
X = data.data
Y = data.target
In [23]:
X.shape
Out[23]:
In [29]:
pca = PCA(32)
X_pca = pca.fit_transform(X)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))
In [34]:
ae_6 = MLAutoencoder([32,2,32])
X_6 = ae_6.transform(X_pca)
pl.scatter(X_6[:,0],X_6[:,1],c=Y[:len(X_6)])
print(one_nn_baseline(X_6,Y[:len(X_6)]))
In [43]:
from time import time
t = time()
n = 1000
data = np.zeros(n)
aes = np.empty(n,dtype=object)
for i in range(n):
    aes[i] = MLAutoencoder([32,2,32])
    data[i] = one_nn_baseline(aes[i].transform(X_pca),Y)
    if i % 10 == 0:
        print(i)
        print(time() - t)
print(time() - t)
In [47]:
pl.hist(data,bins=10);
In [48]:
data_single_layer = data
In [49]:
from time import time
t = time()
n = 1000
data = np.zeros(n)
aes = np.empty(n,dtype=object)
for i in range(n):
    aes[i] = MLAutoencoder([32,16,2,16,32])
    data[i] = one_nn_baseline(aes[i].transform(X_pca),Y)
    if i % 10 == 0:
        print(i)
        print(time() - t)
print(time() - t)
In [51]:
data_two_layer = data
In [54]:
pl.hist(data,bins=50);
In [ ]:
from time import time
t = time()
n = 1000
data = np.zeros(n)
aes = np.empty(n,dtype=object)
for i in range(n):
    aes[i] = MLAutoencoder([32,16,8,4,2,4,8,16,32])
    data[i] = one_nn_baseline(aes[i].transform(X_pca),Y)
    if i % 10 == 0:
        print(i)
        print(time() - t)
print(time() - t)
In [ ]:
data_four_layer = data
In [ ]:
pl.hist(data,bins=50);
In [209]:
import numba
weights = [npr.randn(32,16),npr.randn(16,2)]
relu = lambda x: x*(x>0)
def project(X, weight1, weight2, bias1=0, bias2=0, f=sigmoid):
    # two-layer random projection: f(f(X W1 + b1) W2 + b2)
    return f(f(X.dot(weight1) + bias1).dot(weight2) + bias2)
In [211]:
relu(npr.randn(10))
Out[211]:
In [217]:
X_relu = project(X_pca,npr.randn(32,16),npr.randn(16,2),npr.randn(),npr.rand(),relu)
In [218]:
pl.scatter(X_relu[:,0],X_relu[:,1],c=Y)
Out[218]:
In [225]:
np.sum(X_relu==0)
Out[225]:
In [229]:
relu_proj = [project(X_pca,npr.randn(32,16),npr.randn(16,2),npr.randn(),npr.rand(),relu) for _ in xrange(10000)]
In [230]:
zero = np.array([np.sum(x==0) for x in relu_proj])
In [231]:
pl.hist(zero,bins=50);
In [232]:
np.argmin(zero)
Out[232]:
In [235]:
zero[np.argmin(zero)]
Out[235]:
In [233]:
X_relu = relu_proj[np.argmin(zero)]
In [234]:
pl.scatter(X_relu[:,0],X_relu[:,1],c=Y)
Out[234]:
In [236]:
one_nn_baseline(X_relu,Y)
Out[236]:
In [60]:
X_pca.dot(weights[0]).shape
Out[60]:
In [61]:
%timeit project(X_pca,npr.randn(32,16),npr.randn(16,2))
In [137]:
t = time()
data = np.array([project(X_pca,npr.randn(32,16),npr.randn(16,2),*npr.randn(2)) for _ in range(10000)])
print(time() - t)
In [138]:
raw_data = data
raw_data.shape
Out[138]:
In [139]:
raw_data.shape
Out[139]:
In [128]:
%timeit np.array([j for j in range(raw_data.shape[1]) if npr.rand() < 0.01])
In [130]:
sample = np.array([j for j in range(raw_data.shape[1]) if npr.rand() < 0.1])
sample
Out[130]:
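In [ ]:
# Hedged aside (an addition, not part of the original run): the list-comprehension
# subsample above can be expressed as a single boolean draw, which is shorter and
# faster for raw_data.shape[1] = 1797 columns.
sample = np.where(npr.rand(raw_data.shape[1]) < 0.1)[0]
sample.shape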
In [131]:
%timeit one_nn_baseline(raw_data[i][sample],Y[sample])
In [140]:
t = time()
data = np.zeros(len(raw_data))
for i in range(len(data)):
    sample = np.array([j for j in range(raw_data.shape[1]) if npr.rand() < 0.1])
    data[i] = one_nn_baseline(raw_data[i][sample],Y[sample])
    if i % 100 == 0:
        print(i)
print(time()-t)
In [142]:
pl.hist(data*100,bins=100,normed=True);
In [164]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity()
In [159]:
kde.fit(np.reshape(data*100,(len(data),1)))
Out[159]:
In [160]:
data.shape
Out[160]:
In [165]:
rdata = np.reshape(data*100,(len(data),1))
rdata.shape
Out[165]:
In [173]:
kde = KernelDensity(bandwidth=0.1)
kde.fit(rdata)
Out[173]:
In [174]:
kde.score_samples(rdata)
Out[174]:
In [175]:
smoothed_density = kde.score_samples(np.reshape(np.arange(0,50,0.5),(100,1)))
In [176]:
pl.plot(smoothed_density)
Out[176]:
In [193]:
def density(input_points, output_points, bandwidth=0.1):
    # unnormalised kernel density estimate: np.sqrt(x**2) is just |x|, so each
    # input point contributes an exponential (Laplace) kernel exp(-|d| / (2*bandwidth))
    output = np.zeros(len(output_points))
    for i in range(len(output_points)):
        for j in range(len(input_points)):
            output[i] += np.exp(-np.sqrt((output_points[i] - input_points[j])**2) / (2*bandwidth))
    return output
In [202]:
pl.plot(density(np.arange(10),np.arange(0,20,0.1),bandwidth=0.05))
Out[202]:
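In [ ]:
# Hedged cross-check (an addition, not part of the original run): the hand-rolled
# density above is an unnormalised exponential-kernel sum, so sklearn's
# KernelDensity with kernel='exponential' and bandwidth = 2*0.05 should trace the
# same shape up to a constant factor.
kde_check = KernelDensity(kernel='exponential', bandwidth=0.1)
kde_check.fit(np.arange(10).reshape(-1, 1))
grid = np.arange(0, 20, 0.1).reshape(-1, 1)
pl.plot(np.exp(kde_check.score_samples(grid)));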
In [240]:
kde.fit(np.arange(0, 10, 1).reshape(-1, 1))
kde.score_samples(np.arange(0, 10, 0.1).reshape(-1, 1))
In [150]:
np.arange(0,50,0.5).shape
Out[150]:
In [72]:
from scipy.spatial.distance import pdist
In [79]:
pdist(raw_data[0]).shape
Out[79]:
In [74]:
pdist(X_pca).shape
Out[74]:
In [76]:
pl.scatter(pdist(X_pca),pdist(raw_data[0]))
Out[76]:
In [77]:
from sklearn.linear_model import LinearRegression
In [84]:
lr = LinearRegression()
len_pdist = len(pdist(X_pca))
lr.fit(pdist(raw_data[0]).reshape((len_pdist,1)),pdist(X_pca))
Out[84]:
In [86]:
pred = pdist(raw_data[0]).reshape((len_pdist,1))
the_true_true = pdist(X_pca)
In [85]:
%timeit lr.fit(npr.randn(10).reshape((10,1)),npr.randn(10))
In [89]:
lr.fit(pred.reshape((len_pdist,1)),the_true_true)
lr.score(pred.reshape((len_pdist,1)),the_true_true)
Out[89]:
In [92]:
%%timeit
pred = pdist(raw_data[i]).reshape((len_pdist,1))
lr.fit(pred.reshape((len_pdist,1)),the_true_true)
data[i] = lr.score(pred.reshape((len_pdist,1)),the_true_true)
In [93]:
sample_size = 100
data = np.zeros(sample_size)
for i in range(sample_size):
    pred = pdist(raw_data[i]).reshape((len_pdist,1))
    lr.fit(pred.reshape((len_pdist,1)),the_true_true)
    data[i] = lr.score(pred.reshape((len_pdist,1)),the_true_true)
In [94]:
pl.hist(data,bins=50);
In [95]:
from numba import jit, autojit, double, float64, float32, void, int32
In [96]:
project_jit = jit(double[:,:](double[:,:],double[:,:],double[:,:]))(project)
In [99]:
%timeit project_jit(X_pca,npr.randn(32,16),npr.randn(16,2))
In [109]:
ae_7 = MLAutoencoder([32,16,2,16,32])
In [111]:
%timeit ae_7.transform(X_pca)
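In [ ]:
# Hedged sketch (an addition, not part of the original run): most of the cost of
# MLAutoencoder.transform is the per-row Python loop. Each layer is just a matrix
# product, so the whole batch can be pushed through at once; this should match
# ae_7.transform(X_pca) while being much faster, without needing numba.
def transform_batch(ae, X):
    L = X
    for i in range(ae.bottleneck):
        L = sigmoid(L.dot(ae.W[i]))
    return L

np.allclose(transform_batch(ae_7, X_pca), ae_7.transform(X_pca))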
In [ ]:
# Earlier variant of the MLAutoencoder class above (kept unexecuted): no cooling
# flag and a different figure output path in train().
class MLAutoencoder():
    def __init__(self, layer_dims=[32,16,8,4,2,4,8,16,32]):
        self.layer_dims = layer_dims
        self.W = []
        for i in range(1, len(layer_dims)):
            self.W.append(npr.randn(layer_dims[i-1], layer_dims[i]))
        self.num_params = sum([np.prod(w.shape) for w in self.W])
        self.bottleneck = np.argmin(np.array(layer_dims))

    def mats_to_vec(self, W):
        w_vecs = []
        for w in W:
            w_vecs.append(np.reshape(w, np.prod(w.shape)))
        return np.hstack(w_vecs)

    def vec_to_mats(self, w_vecs):
        ind = 0
        W = []
        for i in range(len(self.W)):
            size = np.prod(self.W[i].shape)
            W.append(np.reshape(w_vecs[ind:ind+size], self.W[i].shape))
            ind += size
        return W

    def predict(self, X):
        def predict_one(x):
            L = x
            for w in self.W:
                L = sigmoid(L.dot(w))
            return L
        if len(X.shape) > 1:
            y = np.zeros(X.shape)
            for i, x in enumerate(X):
                y[i] = predict_one(x)
        else:
            y = predict_one(X)
        return y

    def transform(self, X):
        def transform_one(x):
            L = x
            for i in range(self.bottleneck):
                L = sigmoid(L.dot(self.W[i]))
            return L
        if len(X.shape) > 1:
            y = np.zeros((len(X), self.layer_dims[self.bottleneck]))
            for i, x in enumerate(X):
                y[i] = transform_one(x)
        else:
            y = transform_one(X)
        return y

    def loss(self, y, y_pred):
        return sum(abs(y - y_pred))

    def score(self, X):
        X_pred = self.predict(X)
        assert(X_pred.shape == X.shape)
        return sum([self.loss(pred, truth) for (pred, truth) in zip(X_pred, X)])

    def gradient(self, func, x0, h=0.0001):
        x0 = np.array(x0)
        y = func(x0)
        deriv = np.zeros(len(x0))
        for i in range(len(x0)):
            x = np.array(x0)
            x[i] += h
            deriv[i] = (func(x) - y) / h
        return deriv

    def vec_score(self, w, X):
        self.W = self.vec_to_mats(w)
        return self.score(X)

    def train(self, X_, batch_size=20, epochs=10, learning_rate=0.1):
        def report(counter, epoch):
            # colours points with the module-level label array Y
            status = "Epoch {1} loss: {0:.3f}".format(self.score(X_), epoch)
            print(status)
            X_r = self.transform(X_)
            pl.scatter(X_r[:,0], X_r[:,1], c=Y, linewidth=0)
            pl.xlim((0,1))
            pl.ylim((0,1))
            pl.title(status)
            pl.savefig('Code/autoencoder/take7/{0}.jpg'.format(counter))
            pl.close()
        n = len(X_) // batch_size
        w = self.mats_to_vec(self.W)
        X = X_.copy()
        counter = 0
        report(counter, 0)
        counter += 1
        start_cooling = 100
        delta = learning_rate / (epochs - start_cooling)
        for t in range(epochs):
            npr.shuffle(X)
            if t > start_cooling:
                #learning_rate -= delta
                learning_rate *= 0.95
            for i in range(n):
                batch_X = X[i*batch_size:(i+1)*batch_size]
                loss_func = lambda w: self.vec_score(w, batch_X)
                w -= learning_rate * self.gradient(loss_func, w)
            report(counter, t+1)
            counter += 1