In [1]:
# from https://triangleinequality.wordpress.com/2014/08/12/theano-autoencoders-and-mnist/
import numpy as np
import theano as th
from theano import tensor as T
from numpy import random as rng
class AutoEncoder(object):
    def __init__(self, X, hidden_size, activation_function,
                 output_function):
        #X is the data, an m x n numpy matrix
        #where rows correspond to datapoints
        #and columns correspond to features.
        assert type(X) is np.ndarray
        assert len(X.shape) == 2
        self.X = X
        self.X = th.shared(name='X', value=np.asarray(self.X,
                           dtype=th.config.floatX), borrow=True)
        #The config.floatX and borrow=True stuff is to get this to run
        #fast on the gpu. I recommend just doing this without thinking about
        #it until you understand the code as a whole, then learning more
        #about gpus and theano.
        self.n = X.shape[1]
        self.m = X.shape[0]
        #Hidden_size is the number of neurons in the hidden layer, an int.
        assert type(hidden_size) is int
        assert hidden_size > 0
        self.hidden_size = hidden_size
        initial_W = np.asarray(rng.uniform(
            low=-4 * np.sqrt(6. / (self.hidden_size + self.n)),
            high=4 * np.sqrt(6. / (self.hidden_size + self.n)),
            size=(self.n, self.hidden_size)), dtype=th.config.floatX)
        self.W = th.shared(value=initial_W, name='W', borrow=True)
        self.b1 = th.shared(name='b1', value=np.zeros(shape=(self.hidden_size,),
                            dtype=th.config.floatX), borrow=True)
        self.b2 = th.shared(name='b2', value=np.zeros(shape=(self.n,),
                            dtype=th.config.floatX), borrow=True)
        self.activation_function = activation_function
        self.output_function = output_function

    def train(self, n_epochs=100, mini_batch_size=1, learning_rate=0.1):
        index = T.lscalar()
        x = T.matrix('x')
        params = [self.W, self.b1, self.b2]
        hidden = self.activation_function(T.dot(x, self.W) + self.b1)
        output = T.dot(hidden, T.transpose(self.W)) + self.b2
        output = self.output_function(output)
        #Use cross-entropy loss.
        L = -T.sum(x*T.log(output) + (1 - x)*T.log(1 - output), axis=1)
        cost = L.mean()
        updates = []
        #Return gradient with respect to W, b1, b2.
        gparams = T.grad(cost, params)
        #Create a list of 2 tuples for updates.
        for param, gparam in zip(params, gparams):
            updates.append((param, param - learning_rate*gparam))
        #Train given a mini-batch of the data.
        train = th.function(inputs=[index], outputs=[cost], updates=updates,
                            givens={x: self.X[index:index+mini_batch_size, :]})
        import time
        start_time = time.clock()
        for epoch in xrange(n_epochs):
            print "Epoch:", epoch
            for row in xrange(0, self.m, mini_batch_size):
                train(row)
        end_time = time.clock()
        print "Average time per epoch=", (end_time - start_time)/n_epochs

    def get_hidden(self, data):
        x = T.dmatrix('x')
        hidden = self.activation_function(T.dot(x, self.W) + self.b1)
        transformed_data = th.function(inputs=[x], outputs=[hidden])
        return transformed_data(data)

    def get_weights(self):
        return [self.W.get_value(), self.b1.get_value(), self.b2.get_value()]
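In [ ]:
# Not part of the original notebook: a minimal numpy sketch of the forward pass the
# class above builds symbolically, using an assumed toy matrix X_toy. The decoder
# reuses the transpose of W (tied weights) with its own bias b2.
rng2 = np.random.RandomState(0)
X_toy = rng2.rand(5, 8)                      # 5 datapoints, 8 features, values in [0, 1]
n, hidden_size = X_toy.shape[1], 3
W = rng2.uniform(-0.1, 0.1, size=(n, hidden_size))
b1 = np.zeros(hidden_size)
b2 = np.zeros(n)
sig = lambda z: 1.0 / (1.0 + np.exp(-z))
hidden = sig(X_toy.dot(W) + b1)              # encoder: m x hidden_size
output = sig(hidden.dot(W.T) + b2)           # decoder: m x n, tied weights
# Cross-entropy reconstruction loss averaged over datapoints, matching `cost` above.
loss = -np.sum(X_toy*np.log(output) + (1 - X_toy)*np.log(1 - output), axis=1).mean()
print(loss)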
In [2]:
activation_function = T.nnet.sigmoid
In [4]:
output_function=activation_function
In [2]:
import cPickle, gzip, numpy
# Load the dataset
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()
In [3]:
from sklearn.decomposition import PCA
In [4]:
pca = PCA(32)
In [5]:
X = pca.fit_transform(train_set[0])
In [6]:
sum(pca.explained_variance_ratio_)
Out[6]:
In [7]:
X.shape
Out[7]:
In [18]:
A = AutoEncoder(X, 2, activation_function, output_function)
In [19]:
A.train(20,20)
In [20]:
A.get_hidden(X)
Out[20]:
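In [ ]:
# Not in the original run: a sketch of how the 2-d hidden codes could be inspected,
# colouring points by the MNIST labels in train_set[1]. Note that get_hidden returns
# a one-element list (Theano outputs=[hidden]), hence the [0].
import pylab as pl
codes = A.get_hidden(X)[0]
pl.scatter(codes[:, 0], codes[:, 1], c=train_set[1], linewidths=0, alpha=0.5)
pl.colorbar()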
In [339]:
x = np.zeros(2)
type(x)
Out[339]:
In [537]:
import numpy as np
import numpy.random as npr
import pylab as pl
from mpl_toolkits.mplot3d import Axes3D  #needed for the 3-d reconstruction plot in report()
pl.rcParams['font.family'] = 'serif'
%matplotlib inline
from sklearn.decomposition import PCA
dim = X.shape[1]
sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
relu = lambda x: x*(x > 0)  #also defined further down; repeated here so the default argument below resolves

class MLAutoencoder():
    def __init__(self, layer_dims=[32, 16, 8, 4, 2, 4, 8, 16, 32],
                 activation=relu):
        self.activation = activation
        self.layer_dims = layer_dims
        self.W = []
        self.b = []
        for i in range(1, len(layer_dims)):
            self.W.append(npr.randn(layer_dims[i-1], layer_dims[i]))
            self.b.append(npr.randn())
        self.num_params = sum([np.prod(w.shape) for w in self.W]) + len(self.b)
        self.bottleneck = np.argmin(np.array(layer_dims))

    def mats_to_vec(self, W, b):
        #Flatten the weight matrices and biases into a single parameter vector.
        w_vecs = []
        for w in W:
            w_vecs.append(np.reshape(w, np.prod(w.shape)))
        w_vecs.append(np.array(b))
        return np.hstack(w_vecs)

    def vec_to_mats(self, w_vecs):
        #Inverse of mats_to_vec: unpack a parameter vector into matrices and biases.
        ind = 0
        W = []
        for i in range(len(self.W)):
            size = np.prod(self.W[i].shape)
            W.append(np.reshape(w_vecs[ind:ind+size], self.W[i].shape))
            ind += size
        bias = w_vecs[ind:]
        return W, bias

    def predict(self, X):
        #Full forward pass through all layers (the reconstruction).
        def predict_one(x):
            L = x
            for i, w in enumerate(self.W):
                L = self.activation(L.dot(w) + self.b[i])
            return L
        if len(X.shape) > 1:
            y = np.zeros(X.shape)
            for i, x in enumerate(X):
                y[i] = predict_one(x)
        else:
            y = predict_one(X)
        return y

    def transform(self, X):
        #Forward pass up to the bottleneck layer (the low-dimensional embedding).
        def transform_one(x):
            L = x
            for i in range(self.bottleneck):
                L = self.activation(L.dot(self.W[i]) + self.b[i])
            return L
        if len(X.shape) > 1:
            y = np.zeros((len(X), self.layer_dims[self.bottleneck]))
            for i, x in enumerate(X):
                y[i] = transform_one(x)
        else:
            y = transform_one(X)
        return y

    def loss(self, y, y_pred):
        return np.sum(abs(y - y_pred))

    def score(self, X):
        X_pred = self.predict(X)
        assert(X_pred.shape == X.shape)
        return sum([self.loss(pred, truth) for (pred, truth) in zip(X_pred, X)])

    def gradient(self, func, x0, h=0.001):
        #Forward-difference approximation of the gradient of func at x0.
        x0 = np.array(x0)
        y = func(x0)
        deriv = np.zeros(len(x0))
        for i in range(len(x0)):
            x = np.array(x0)
            x[i] += h
            deriv[i] = (func(x) - y)/h
        return deriv

    def vec_score(self, w, X):
        self.W, self.b = self.vec_to_mats(w)
        return self.score(X)

    def train(self, X_, batch_size=20, epochs=10,
              learning_rate=0.1, cooling=False,
              record=False):
        def report(counter, epoch):
            #Assumes a global label array Y and an existing ae/ output directory.
            status = "Epoch {1} loss: {0:.3f}".format(self.score(X_), epoch)
            print(status)
            X_r = self.transform(X_)
            pl.scatter(X_r[:, 0], X_r[:, 1], c=Y, linewidth=0)
            pl.xlim((X_.min(), X_.max()))
            pl.ylim((X_.min(), X_.max()))
            pl.title(status)
            pl.savefig('ae/{0}.jpg'.format(counter))
            pl.close()
            if X_.shape[1] == 3:
                X_r = self.predict(X_)
                fig = pl.figure()
                ax = Axes3D(fig)
                ax.scatter(X_r[:, 0], X_r[:, 1], X_r[:, 2], c=Y)
                pl.savefig('ae/reconstruction-{0}.jpg'.format(counter))
                pl.close()
        n = len(X_) / batch_size
        w = self.mats_to_vec(self.W, self.b)
        X = X_.copy()
        counter = 0
        report(counter, 0)
        counter += 1
        if cooling:
            start_cooling = 100
            delta = learning_rate / (epochs - start_cooling)
        if record:
            history = np.zeros((epochs*len(X)/(batch_size-1), self.num_params))
            history[0] = w
            recordi = 1
        momentum = np.zeros(len(w))
        for t in range(epochs):
            npr.shuffle(X)
            if cooling and t > start_cooling:
                #learning_rate -= delta
                learning_rate *= 0.95
            for i in range(n):
                #Copy so the momentum term sees the actual change (w -= ... updates w in place).
                w_prev = w.copy()
                batch_X = X[i*batch_size:(i+1)*batch_size]
                loss_func = lambda w: self.vec_score(w, batch_X)
                #w -= learning_rate * self.gradient(loss_func, w)
                #Scale the finite-difference gradient by the accumulated momentum
                #(plus a small constant so the first updates are nonzero).
                w -= learning_rate*(momentum + 0.001)*self.gradient(loss_func, w)
                momentum += w - w_prev
                if record:
                    history[recordi] = w
                    recordi += 1
            report(counter, t+1)
            counter += 1
        if record:
            return history[:recordi]
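In [ ]:
# Not in the original notebook: a quick sanity check of the forward-difference
# gradient() helper above on a function whose gradient is known analytically
# (f(x) = sum(x**2), gradient 2*x). ae_check is a throwaway instance used only
# to call the method.
ae_check = MLAutoencoder(layer_dims=[4, 2, 4], activation=sigmoid)
x0 = npr.randn(5)
approx = ae_check.gradient(lambda v: np.sum(v**2), x0, h=1e-5)
print(np.max(np.abs(approx - 2*x0)))   # should be on the order of h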
In [553]:
from sklearn import datasets
data = datasets.load_digits()
X = data.data
Y = data.target
pca = PCA(4,whiten=True)
X_ = pca.fit_transform(X)
X_-=X_.min()
X_/= X_.max()
#X_-=0.5
#X_*=2
X_.min(),X_.max()
Out[553]:
In [532]:
X.shape
Out[532]:
In [554]:
ae = MLAutoencoder(layer_dims=[4,3,2,3,4],activation=sigmoid)
ae.num_params
Out[554]:
In [546]:
%timeit ae.score(X_)
In [555]:
ae.train(X_,epochs=20)
In [556]:
embedding = ae.transform(X_)
In [557]:
pl.scatter(embedding[:,0],embedding[:,1],c=Y,linewidths=0)
Out[557]:
In [489]:
ae.score(X_)
Out[489]:
In [315]:
ae.b
Out[315]:
In [316]:
ae.W,ae.b = ae.vec_to_mats(npr.randn(ae.num_params))
ae.b
Out[316]:
In [334]:
ae.num_params
Out[334]:
In [342]:
%timeit ae.vec_score(npr.randn(ae.num_params),X_[npr.rand(len(X_))<0.1])
In [350]:
params = npr.randn(ae.num_params)
scores = [ae.vec_score(npr.randn(ae.num_params),X_[npr.rand(len(X_))<0.5]) for _ in range(50)]
In [351]:
pl.hist(scores,bins=50);
In [336]:
ae_loss(npr.randn(ae.num_params))
In [459]:
X_.shape[0]*0.1
Out[459]:
In [490]:
def loss(params, thinning=0.1):
    #Evaluate on a random ~10% subsample of the rows to keep each call cheap.
    return ae.vec_score(params, X_[npr.rand(len(X_)) < thinning])
In [491]:
def min_line_loss(vec, loss=loss, alpha=np.arange(0.0, 1.5, 0.1)):
    #Minimum loss over rescalings a*vec of the parameter vector along a line through the origin.
    return np.min([loss(vec*a) for a in alpha])
In [492]:
[min_line_loss(params) for i in range(10)]
Out[492]:
In [493]:
def adaptive_metropolis_hastings(func, x0, num_steps=1000, step_size=0.1, verbose=False):
    locs = np.zeros((num_steps+1, len(x0)))
    vals = np.zeros(num_steps+1)
    locs[0] = x0
    vals[0] = func(x0)
    num_rejects = 0
    num_accepts = 0
    message = ''
    adaptation_rate = 1.02
    steps = np.zeros(num_steps+1)
    steps[0] = step_size
    for i in range(num_steps):
        new_loc = locs[i] + npr.randn(len(locs[i]))*step_size
        new_val = func(new_loc)
        #Standard Metropolis acceptance would be: npr.rand() < np.exp(-(new_val - vals[i]))
        #Here only non-increasing proposals are accepted, so this is a greedy adaptive search.
        accept = new_val <= vals[i]
        if not accept:
            new_loc = locs[i]
            new_val = vals[i]
            num_rejects += 1
            num_accepts = 0
            if num_rejects > 100:
                if step_size > 1e-3:
                    step_size /= adaptation_rate
                    message = message + '\n reduced step size'
        else:
            num_rejects = 0
            num_accepts += 1
            if num_accepts > 10:
                if step_size < 10.0:
                    step_size *= adaptation_rate
                    message = message + '\n increased step size'
        locs[i+1] = new_loc
        vals[i+1] = new_val
        if i % 100 == 0:
            print("{0}: {1:.3f}".format(i, new_val))
            if verbose:
                print(message)
                message = ''
        steps[i+1] = step_size
    return locs, vals, steps
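In [ ]:
# Aside (not in the original run): with the greedy rule above, worse proposals are
# never accepted. As a sketch, a textbook Metropolis acceptance test, with an assumed
# temperature parameter, would look like this:
def metropolis_accept(new_val, old_val, temperature=1.0):
    # Always accept improvements; accept worse proposals with probability
    # exp(-(new_val - old_val) / temperature).
    return npr.rand() < np.exp(-(new_val - old_val) / temperature)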
In [462]:
locs,vals,steps=adaptive_metropolis_hastings(min_line_loss,npr.randn(ae.num_params))
In [517]:
locs,vals,steps=adaptive_metropolis_hastings(loss,npr.randn(ae.num_params)*0.1,num_steps=1000)
In [518]:
pl.plot(vals)
Out[518]:
In [519]:
pl.plot(steps)
Out[519]:
In [520]:
ae.W,ae.b = ae.vec_to_mats(locs[-1])
In [521]:
ae.score(X_)
Out[521]:
In [522]:
vals[-1]
Out[522]:
In [523]:
embedding = ae.transform(X_)
In [525]:
pl.scatter(embedding[:,0],embedding[:,1],c=Y,linewidths=0)
Out[525]:
In [453]:
alpha=np.arange(0.0,1.5,0.1)
a = alpha[np.argmin([ae.vec_score(locs[-1]*a,X_) for a in alpha])]
a
Out[453]:
In [418]:
ae.W,ae.b = ae.vec_to_mats(locs[-1]*a)
In [419]:
ae.score(X_)
Out[419]:
In [300]:
ae = MLAutoencoder(layer_dims=[8,4,2,4,8])
ae.num_params
Out[300]:
In [210]:
ae.train(X_)
In [12]:
Y = train_set[1]
In [13]:
X.shape
Out[13]:
In [14]:
Y.shape
Out[14]:
In [237]:
pl.scatter(X_[:,0],X_[:,1],c=Y,linewidths=0,alpha=0.5)
Out[237]:
In [238]:
X_r = ae.transform(X_)
In [239]:
pl.scatter(X_r[:,0],X_r[:,1],c=Y,linewidths=0,alpha=0.5)
Out[239]:
In [52]:
#Y = Y[:10000]
In [53]:
#X = X_[:10000]
In [54]:
X_.min()
Out[54]:
In [176]:
relu = lambda x: x*(x>0)
In [185]:
tanh = lambda x:np.tanh(x)
In [243]:
from sklearn import datasets
data = datasets.load_diabetes()
X = data.data
Y = data.target
In [244]:
X.shape
Out[244]:
In [245]:
pca = PCA()
pca.fit(X)
Out[245]:
In [246]:
pl.plot(pca.explained_variance_ratio_)
Out[246]:
In [250]:
pl.scatter(pca.fit_transform(X)[:,0],
pca.fit_transform(X)[:,1],
c=Y,
linewidths=0,
alpha=0.5)
pl.colorbar()
Out[250]:
In [252]:
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression()
pls.fit(X,Y)
pls.score(X,Y)
Out[252]:
In [253]:
pca = PCA(4,whiten=True)
In [254]:
activation = relu
if activation == relu:
    #Scale the PCA projection into [0, 1] for relu.
    X_ = pca.fit_transform(X)
    X_ -= X_.min()
    X_ /= X_.max()
else:
    #Scale into [-1, 1] for tanh-style activations.
    X_ = pca.fit_transform(X)
    X_ -= X_.min()
    X_ /= X_.max()
    X_ -= 0.5
    X_ *= 2
In [284]:
ae = MLAutoencoder(layer_dims=[4,3,2,3,4],activation=activation)
ae.num_params
Out[284]:
In [285]:
record = ae.train(X_,batch_size=10,epochs=10,learning_rate=0.001,record=True)
In [287]:
record[::10].shape
Out[287]:
In [289]:
w = ae.mats_to_vec(ae.W, ae.b)
w = record[0]
gradients = []
for w in record[::10]:
    loc_grad = []
    for i in range(50):
        batch_X = X_[npr.rand(len(X)) < 0.1]
        loss_func = lambda w: ae.vec_score(w, batch_X)
        loc_grad.append(ae.gradient(loss_func, w))
    gradients.append(np.array(loc_grad))
gradients = np.array(gradients)
In [291]:
len(gradients)
Out[291]:
In [282]:
grads2 = np.array(grads2)
In [292]:
for grad in gradients:
    pl.plot(grad.mean(0))
In [297]:
pl.plot(gradients[0].mean(0))
w = record[0]
for i in range(50):
    batch_X = X_[npr.rand(len(X)) < 0.1]
    loss_func = lambda w: ae.vec_score(w, batch_X)
    pl.plot(ae.gradient(loss_func, w))
In [130]:
record.shape
Out[130]:
In [149]:
record = record[:-180]
In [150]:
differences = np.zeros((len(record),len(record)))
In [151]:
for i in range(len(record)):
    for j in range(len(record)):
        differences[i, j] = np.sqrt(sum((record[i] - record[j])**2))
    if i % 100 == 0:
        print(i)
In [124]:
sum(differences==0)
Out[124]:
In [153]:
# for a case with batch_size=10, step_size=0.01
pl.imshow(differences,interpolation='none')
pl.colorbar()
Out[153]:
In [111]:
# for a case with batch_size=20, step_size=1.0
pl.imshow(differences[8:-8,8:-8],interpolation='none')
pl.colorbar()
Out[111]:
In [ ]:
# how much does the "angle" change during training?
In [154]:
def theta(X, Y):
    norm_X = np.sqrt(sum((X)**2))
    norm_Y = np.sqrt(sum((Y)**2))
    return np.arccos(X.dot(Y) / (norm_X * norm_Y))
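In [ ]:
# Not in the original run: theta() returns the angle (in radians) between two
# parameter-update vectors, so orthogonal updates should give roughly pi/2.
print(theta(np.array([1.0, 0.0]), np.array([0.0, 1.0])))   # ~1.5708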
In [155]:
theta(record[1]-record[0],record[2]-record[1])
Out[155]:
In [163]:
differences = record[1:] - record[:-1]
In [164]:
differences.sum(1).shape
Out[164]:
In [165]:
pl.plot(differences.sum(1))
Out[165]:
In [166]:
pl.hist(differences.sum(1),bins=50);
In [167]:
angles = np.zeros(len(differences)-1)
for i in range(1, len(differences)):
    angles[i-1] = theta(differences[i-1], differences[i])
In [171]:
pl.plot(angles)
Out[171]:
In [170]:
pl.hist(angles,bins=50);
In [29]:
ae.num_params
Out[29]:
In [51]:
X_ = X[:2000]
Y = Y[:2000]
In [ ]:
X
In [57]:
ae.train(X_[:,:16])
In [ ]:
ae.n