In [1]:
import cPickle,gzip,os,sys,time

# load the pickled MNIST splits (Python 2 / cPickle format)
dataset = 'mnist.pkl.gz'
f = gzip.open(dataset, 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()

In [2]:
train_set[0].shape


Out[2]:
(50000, 784)

In [3]:
import numpy as np
import numpy.random as npr
import pylab as pl
pl.rcParams['font.family'] = 'serif'
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn import neighbors

In [4]:
def one_nn_baseline(X,Y):
    ''' Leave-one-out 1-nearest-neighbor accuracy of the labels Y given the
    representation X (a cheap proxy for how well an embedding separates classes). '''
    # ask for 2 neighbors so that a true nearest neighbor remains after
    # discarding the point itself
    one_nn = neighbors.kneighbors_graph(X,2)
    inds = np.zeros(len(X),dtype=int)
    for i in range(len(X)):
        inds[i] = [ind for ind in one_nn[i].indices if ind != i][0]
    preds = Y[inds]
    return 1.0*sum(preds==Y) / len(Y)

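Quick sanity check (added; not part of the original run): on two well-separated toy clusters, the leave-one-out 1-NN accuracy should come out to 1.0.

In [ ]:
# Toy check of one_nn_baseline (added sketch): two tight, well-separated
# Gaussian clusters should give a leave-one-out 1-NN accuracy of 1.0.
toy_X = np.vstack([npr.randn(50,2), npr.randn(50,2) + 10])
toy_Y = np.array([0]*50 + [1]*50)
print(one_nn_baseline(toy_X, toy_Y))   # expect 1.0
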
In [5]:
one_nn_baseline(train_set[0][:5000],train_set[1][:5000])


Out[5]:
0.9462

In [10]:
sigmoid = lambda x : 1.0 / (1.0 + np.exp(-x))
from mpl_toolkits.mplot3d import Axes3D  # needed by the 3-D reconstruction plot in train()'s report()

class MLAutoencoder():
    ''' Fully-connected sigmoid autoencoder trained by minibatch gradient descent
    with a finite-difference gradient. '''
    def __init__(self,layer_dims=[32,16,8,4,2,4,8,16,32]):
        self.layer_dims = layer_dims
        self.W = []
        for i in range(1,len(layer_dims)):
            self.W.append(npr.randn(layer_dims[i-1],layer_dims[i]))
        self.num_params = sum([np.prod(w.shape) for w in self.W])
        self.bottleneck = np.argmin(np.array(layer_dims))  # index of the narrowest layer
        
    def mats_to_vec(self,W):
        w_vecs = []
        for w in W:
            w_vecs.append(np.reshape(w,np.prod(w.shape)))
        return np.hstack(w_vecs)
    
    def vec_to_mats(self,w_vecs):
        ind = 0
        W = []
        for i in range(len(self.W)):
            size = np.prod(self.W[i].shape)
            W.append(np.reshape(w_vecs[ind:ind+size],self.W[i].shape))
            ind += size
        return W
    
    def predict(self,X):
        def predict_one(x):
            L = x
            for w in self.W:
                L = sigmoid(L.dot(w))
            return L
        
        if len(X.shape) > 1:
            y = np.zeros(X.shape)
            for i,x in enumerate(X):
                y[i] = predict_one(x)
        else:
            y = predict_one(X)
        return y

    def transform(self,X):
        def transform_one(x):
            L = x
            for i in range(self.bottleneck):
                L = sigmoid(L.dot(self.W[i]))
            return L
        
        if len(X.shape) > 1:
            y = np.zeros((len(X),self.layer_dims[self.bottleneck]))
            for i,x in enumerate(X):
                y[i] = transform_one(x)
        else:
            y = transform_one(X)
        return y
        
    def loss(self,y,y_pred):
        return np.sum(abs(y-y_pred))
    
    def score(self,X):
        X_pred = self.predict(X)
        assert(X_pred.shape == X.shape)
        
        return sum([self.loss(pred,truth) for (pred,truth) in zip(X_pred, X)]) 
    
    def gradient(self,func,x0,h=0.0001):
        # forward-difference numerical gradient: one extra evaluation of func per
        # parameter, so each call costs O(num_params) network evaluations
        x0 = np.array(x0)
        y = func(x0)
        deriv = np.zeros(len(x0))
        for i in range(len(x0)):
            x = np.array(x0)
            x[i] += h
            deriv[i] = (func(x) - y)/h
        return deriv
    
    def vec_score(self,w,X):
        self.W = self.vec_to_mats(w)
        return self.score(X)

    def train(self,X_,batch_size=20,epochs=10,learning_rate=0.1,cooling=False):
        def report(counter,epoch):
            # print the current reconstruction loss and save a scatter of the
            # bottleneck codes (and, for 3-D inputs, the reconstruction) to the
            # ae/ directory, which is assumed to exist
            status = "Epoch {1} loss: {0:.3f}".format(self.score(X_),epoch)
            print(status)
            X_r = self.transform(X_)
            pl.scatter(X_r[:,0],X_r[:,1],c=Y,linewidth=0)  # Y: class labels, assumed to be in the enclosing/global scope
            pl.xlim((0,1))
            pl.ylim((0,1))
            pl.title(status)
            pl.savefig('ae/{0}.jpg'.format(counter))
            pl.close()
            
            if X_.shape[1] == 3:
                X_r = self.predict(X_)
                fig = pl.figure()
                ax = Axes3D(fig)
                
                #pl.xlim((0,1))
                #pl.ylim((0,1))
                #ax.zlim((0,1))
                ax.scatter(X_r[:,0],X_r[:,1],X_r[:,2],c=Y)
                pl.savefig('ae/reconstruction-{0}.jpg'.format(counter))
                pl.close()
        
        n = len(X_) / batch_size  # number of minibatches per epoch (integer division in Python 2)
        w = self.mats_to_vec(self.W)
        
        X = X_.copy()
        
        counter = 0
        report(counter,0)
        counter += 1
        
        if cooling:
            start_cooling = 100
            delta = learning_rate / (epochs - start_cooling)  # linear-decay step (unused; the exponential decay below is applied instead)
        
        for t in range(epochs):
            npr.shuffle(X)
            
            if cooling and t > start_cooling:
                #learning_rate -= delta
                learning_rate *= 0.95
            
            for i in range(n):
                batch_X = X[i*batch_size:(i+1)*batch_size]
                loss_func = lambda w : self.vec_score(w,batch_X)
                w -= learning_rate * self.gradient(loss_func,w)
            report(counter,t+1)
            counter +=1

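A small round-trip check (added as a sketch, not in the original run): vec_to_mats should invert mats_to_vec, which vec_score and the finite-difference gradient rely on.

In [ ]:
# Round-trip check (added sketch): flattening the weight matrices and reshaping
# them back should reproduce the original weights exactly.
ae_check = MLAutoencoder([8,4,2,4,8])
w_flat = ae_check.mats_to_vec(ae_check.W)
W_back = ae_check.vec_to_mats(w_flat)
print(all(np.allclose(a,b) for (a,b) in zip(ae_check.W, W_back)))   # expect True
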
In [38]:
input_dim = 784
forward_layers = [input_dim/(2**n) for n in range(10) if input_dim/(2**n) > 1]
if forward_layers[-1] != 2:
    forward_layers.append(2)
forward_layers


Out[38]:
[784, 392, 196, 98, 49, 24, 12, 6, 3, 2]

In [39]:
layers = forward_layers + forward_layers[::-1][1:]
layers


Out[39]:
[784, 392, 196, 98, 49, 24, 12, 6, 3, 2, 3, 6, 12, 24, 49, 98, 196, 392, 784]

In [40]:
ae = MLAutoencoder(layers)

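A quick look (added; not in the original run) at the size of this architecture, using the attributes set in __init__.

In [ ]:
# Size of the deep symmetric autoencoder (added sketch).
print(ae.num_params)                   # total number of weights
print(ae.layer_dims[ae.bottleneck])    # bottleneck dimensionality (2)
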
In [41]:
X_ = ae.transform(train_set[0][:5000])

In [42]:
pl.scatter(X_[:,0],X_[:,1],c=train_set[1][:len(X_)])


Out[42]:
<matplotlib.collections.PathCollection at 0x11aa47b90>

In [43]:
one_nn_baseline(X_,train_set[1][:len(X_)])


Out[43]:
0.1166

In [44]:
ae_2 = MLAutoencoder([input_dim,2,input_dim])

In [46]:
X_2 = ae_2.transform(train_set[0][:5000])
pl.scatter(X_2[:,0],X_2[:,1],c=train_set[1][:len(X_2)])
print(one_nn_baseline(X_2,train_set[1][:len(X_2)]))


0.1702

In [6]:
pca = PCA(64)
X_pca = pca.fit_transform(train_set[0][:5000])

In [8]:
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))


0.86632352625

In [14]:
ae_3 = MLAutoencoder([64,32,16,8,4,2,4,8,16,32,64])
X_3 = ae_3.transform(X_pca)
pl.scatter(X_3[:,0],X_3[:,1],c=train_set[1][:len(X_3)])
print(one_nn_baseline(X_3,train_set[1][:len(X_3)]))


0.1534

In [17]:
ae_4 = MLAutoencoder([64,16,4,2,4,16,64])
X_4 = ae_4.transform(X_pca)
pl.scatter(X_4[:,0],X_4[:,1],c=train_set[1][:len(X_4)])
print(one_nn_baseline(X_4,train_set[1][:len(X_4)]))


0.1574

In [25]:
ae_5 = MLAutoencoder([64,10,64])
X_5 = ae_5.transform(X_pca)
pl.scatter(X_5[:,0],X_5[:,1],c=train_set[1][:len(X_5)])
print(one_nn_baseline(X_5,train_set[1][:len(X_5)]))


0.5106

In [22]:
from sklearn import datasets
data = datasets.load_digits()
X = data.data
Y = data.target

In [23]:
X.shape


Out[23]:
(1797, 64)

In [29]:
pca = PCA(32)
X_pca = pca.fit_transform(X)
pl.plot(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))


0.966354206963

In [34]:
ae_6 = MLAutoencoder([32,2,32])
X_6 = ae_6.transform(X_pca)
pl.scatter(X_6[:,0],X_6[:,1],c=Y[:len(X_6)])
print(one_nn_baseline(X_6,Y[:len(X_6)]))


0.204785754035

In [43]:
from time import time
t = time()
n = 1000
data = np.zeros(n)
aes = np.empty(n,dtype=object)
for i in range(n):
    aes[i] = MLAutoencoder([32,2,32])
    data[i] = one_nn_baseline(aes[i].transform(X_pca),Y)
    if i % 10 == 0:
        print(i)
        print(time() - t)
print(time() - t)


0
0.194649934769
10
2.03154706955
[... progress printed every 10 iterations (elapsed seconds) ...]
990
183.114211082
185.045624971

In [47]:
pl.hist(data,bins=10);



In [48]:
data_single_layer = data

In [49]:
from time import time
t = time()
n = 1000
data = np.zeros(n)
aes = np.empty(n,dtype=object)
for i in range(n):
    aes[i] = MLAutoencoder([32,16,2,16,32])
    data[i] = one_nn_baseline(aes[i].transform(X_pca),Y)
    if i % 10 == 0:
        print(i)
        print(time() - t)
print(time() - t)


0
0.216962099075
10
2.2189950943
[... progress printed every 10 iterations (elapsed seconds) ...]
990
194.962717056
196.718389988

In [51]:
data_two_layer = data

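For a side-by-side view (added as a sketch, not in the original run), the two random-initialization baselines can be overlaid on one histogram.

In [ ]:
# Overlay the 1-NN accuracies of the untrained [32,2,32] and [32,16,2,16,32]
# projections (added sketch).
pl.hist(data_single_layer, bins=50, alpha=0.5, label='[32,2,32]')
pl.hist(data_two_layer, bins=50, alpha=0.5, label='[32,16,2,16,32]')
pl.legend();
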
In [54]:
pl.hist(data,bins=50);



In [ ]:
from time import time
t = time()
n = 1000
data = np.zeros(n)
aes = np.empty(n,dtype=object)
for i in range(n):
    aes[i] = MLAutoencoder([32,16,8,4,2,4,8,16,32])
    data[i] = one_nn_baseline(aes[i].transform(X_pca),Y)
    if i % 10 == 0:
        print(i)
        print(time() - t)
print(time() - t)

In [ ]:
data_four_layer = data

In [ ]:
pl.hist(data,bins=50);

In [209]:
import numba

weights = [npr.randn(32,16),npr.randn(16,2)]

relu = lambda x: x*(x>0)  # note: returns -0.0 for negative inputs, hence the -0. entries in the output below

def project(X,weight1,weight2,bias1=0,bias2=0,f=sigmoid):
    return f(f(X.dot(weight1)+bias1).dot(weight2)+bias2)

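Consistency check (added as a sketch, not in the original run): with zero biases and the default sigmoid nonlinearity, project should reproduce MLAutoencoder.transform for the same weight matrices.

In [ ]:
# Consistency check (added sketch): project() with zero biases and sigmoid
# should match the untrained autoencoder's transform() for the same weights.
ae_check = MLAutoencoder([32,16,2,16,32])
ae_check.W[0], ae_check.W[1] = weights[0], weights[1]
print(np.allclose(project(X_pca, weights[0], weights[1]), ae_check.transform(X_pca)))   # expect True
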
In [211]:
relu(npr.randn(10))


Out[211]:
array([ 0.20238481,  0.61745683, -0.        ,  1.89809362,  3.02038842,
        0.38837096,  1.99024488, -0.        ,  0.32938833,  1.09238095])

In [217]:
X_relu = project(X_pca,npr.randn(32,16),npr.randn(16,2),npr.randn(),npr.rand(),relu)

In [218]:
pl.scatter(X_relu[:,0],X_relu[:,1],c=Y)


Out[218]:
<matplotlib.collections.PathCollection at 0x11bfca1d0>

In [225]:
np.sum(X_relu==0)


Out[225]:
2075

In [229]:
relu_proj = [project(X_pca,npr.randn(32,16),npr.randn(16,2),npr.randn(),npr.rand(),relu) for _ in xrange(10000)]

In [230]:
zero = np.array([np.sum(x==0) for x in relu_proj])

In [231]:
pl.hist(zero,bins=50);



In [232]:
np.argmin(zero)


Out[232]:
6331

In [235]:
zero[np.argmin(zero)]


Out[235]:
49

In [233]:
X_relu = relu_proj[np.argmin(zero)]

In [234]:
pl.scatter(X_relu[:,0],X_relu[:,1],c=Y)


Out[234]:
<matplotlib.collections.PathCollection at 0x123803210>

In [236]:
one_nn_baseline(X_relu,Y)


Out[236]:
0.20812465219810797

In [60]:
X_pca.dot(weights[0]).shape


Out[60]:
(1797, 16)

In [61]:
%timeit project(X_pca,npr.randn(32,16),npr.randn(16,2))


1000 loops, best of 3: 529 µs per loop

In [137]:
t = time()
data = np.array([project(X_pca,npr.randn(32,16),npr.randn(16,2),*npr.randn(2)) for _ in range(10000)])
print(time() - t)


16.4821131229

In [138]:
raw_data = data
raw_data.shape


Out[138]:
(10000, 1797, 2)

In [128]:
%timeit np.array([j for j in range(raw_data.shape[1]) if npr.rand() < 0.01])


1000 loops, best of 3: 303 µs per loop

In [130]:
sample = np.array([j for j in range(raw_data.shape[1]) if npr.rand() < 0.1])
sample


Out[130]:
array([   4,   45,   76,   79,   85,  109,  111,  124,  126,  134,  163,
        168,  190,  207,  229,  237,  245,  250,  261,  294,  296,  298,
        303,  304,  331,  335,  348,  351,  352,  358,  362,  373,  380,
        381,  382,  385,  388,  428,  435,  437,  442,  473,  475,  476,
        512,  523,  525,  543,  554,  559,  570,  574,  579,  583,  590,
        592,  594,  602,  615,  619,  624,  630,  636,  643,  645,  664,
        670,  675,  700,  701,  734,  735,  737,  738,  756,  765,  770,
        774,  784,  806,  876,  884,  899,  905,  912,  914,  919,  935,
        937,  947,  956,  959,  967,  978,  986,  990,  991, 1009, 1019,
       1023, 1038, 1056, 1082, 1094, 1102, 1106, 1107, 1108, 1113, 1116,
       1119, 1138, 1145, 1148, 1150, 1155, 1161, 1176, 1191, 1196, 1244,
       1246, 1268, 1286, 1299, 1301, 1321, 1342, 1359, 1371, 1387, 1400,
       1412, 1413, 1415, 1423, 1429, 1437, 1445, 1454, 1467, 1472, 1476,
       1498, 1500, 1513, 1520, 1531, 1552, 1558, 1560, 1564, 1566, 1582,
       1604, 1621, 1623, 1624, 1628, 1634, 1649, 1654, 1656, 1658, 1670,
       1673, 1675, 1693, 1695, 1721, 1723, 1727, 1743, 1753, 1757, 1760,
       1762, 1764, 1772, 1785, 1792, 1793])

In [131]:
%timeit one_nn_baseline(raw_data[i][sample],Y[sample])


100 loops, best of 3: 16.5 ms per loop

In [140]:
t = time()
data = np.zeros(len(raw_data))

for i in range(len(data)):
    # ~10% Bernoulli subsample of the points so each 1-NN evaluation stays cheap
    sample = np.array([j for j in range(raw_data.shape[1]) if npr.rand() < 0.1])
    data[i] = one_nn_baseline(raw_data[i][sample],Y[sample])
    if i % 100 == 0:
        print(i)
print(time()-t)


0
100
[... progress printed every 100 iterations ...]
9900
174.296450138

In [142]:
pl.hist(data*100,bins=100,normed=True);



In [164]:
from sklearn.neighbors import KernelDensity
kde = KernelDensity()

In [159]:
kde.fit(np.reshape(data*100,(len(data),1)))


Out[159]:
KernelDensity(algorithm='auto', atol=0, bandwidth=1.0, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [160]:
data.shape


Out[160]:
(10000,)

In [165]:
rdata = np.reshape(data*100,(len(data),1))
rdata.shape


Out[165]:
(10000, 1)

In [173]:
kde = KernelDensity(0.1)
kde.fit(rdata)


Out[173]:
KernelDensity(algorithm='auto', atol=0, bandwidth=0.1, breadth_first=True,
       kernel='gaussian', leaf_size=40, metric='euclidean',
       metric_params=None, rtol=0)

In [174]:
kde.score_samples(rdata)


Out[174]:
array([-2.66317798, -2.44164569, -2.60862005, ..., -3.92904209,
       -2.65994247, -4.40523765])

In [175]:
smoothed_density = kde.score_samples(np.reshape(np.arange(0,50,0.5),(100,1)))

In [176]:
pl.plot(smoothed_density)


Out[176]:
[<matplotlib.lines.Line2D at 0x11704f090>]

In [193]:
def density(input_points,output_points,bandwidth=0.1):
    ''' Brute-force, unnormalized kernel density estimate: for each output point,
    sum an exponential kernel of the absolute distance to every input point. '''
    output = np.zeros(len(output_points))
    for i in range(len(output_points)):
        for j in range(len(input_points)):
            output[i] += np.exp(-np.sqrt((output_points[i] - input_points[j])**2)/(2*bandwidth))
    return output

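A vectorized equivalent (added as a sketch, not in the original run) computes the same unnormalized kernel sums with broadcasting instead of the double loop.

In [ ]:
# Vectorized version of density() (added sketch): broadcast the pairwise absolute
# differences, apply the same exponential kernel, and sum over the input points.
def density_vec(input_points, output_points, bandwidth=0.1):
    d = np.abs(np.asarray(output_points)[:,None] - np.asarray(input_points)[None,:])
    return np.exp(-d/(2*bandwidth)).sum(axis=1)

print(np.allclose(density_vec(np.arange(10), np.arange(0,20,0.1), 0.05),
                  density(np.arange(10), np.arange(0,20,0.1), 0.05)))   # expect True
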
In [202]:
pl.plot(density(np.arange(10),np.arange(0,20,0.1),bandwidth=0.05))


Out[202]:
[<matplotlib.lines.Line2D at 0x11772d850>]

In [240]:
kde.fit(np.arange(0,10,1))
kde.score_samples(np.arange(0,10,0.1))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-240-b3524308b6c6> in <module>()
      1 kde.fit(np.arange(0,10,1))
----> 2 kde.score_samples(np.arange(0,10,0.1))

/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/sklearn/neighbors/kde.pyc in score_samples(self, X)
    153         log_density = self.tree_.kernel_density(
    154             X, h=self.bandwidth, kernel=self.kernel, atol=atol_N,
--> 155             rtol=self.rtol, breadth_first=self.breadth_first, return_log=True)
    156         log_density -= np.log(N)
    157         return log_density

/Users/joshuafass/Library/Enthought/Canopy_64bit/User/lib/python2.7/site-packages/sklearn/neighbors/kd_tree.so in sklearn.neighbors.kd_tree.BinaryTree.kernel_density (sklearn/neighbors/kd_tree.c:12787)()

ValueError: query data dimension must match training data dimension

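The ValueError above is sklearn's shape requirement: KernelDensity expects 2-D arrays of shape (n_samples, n_features). A corrected version of the same call (added as a sketch) reshapes both inputs.

In [ ]:
# Fix for the ValueError above (added sketch): reshape both the training and
# the query points to (n_samples, 1) before fitting/scoring.
kde.fit(np.arange(0,10,1).reshape(-1,1))
print(kde.score_samples(np.arange(0,10,0.1).reshape(-1,1))[:5])
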
In [150]:
np.arange(0,50,0.5).shape


Out[150]:
(100,)

In [72]:
from scipy.spatial.distance import pdist

In [79]:
pdist(raw_data[0]).shape


Out[79]:
(1613706,)

In [74]:
pdist(X_pca).shape


Out[74]:
(1613706,)

In [76]:
pl.scatter(pdist(X_pca),pdist(raw_data[0]))


Out[76]:
<matplotlib.collections.PathCollection at 0x10d33ce50>

In [77]:
from sklearn.linear_model import LinearRegression

In [84]:
lr = LinearRegression()
len_pdist = len(pdist(X_pca))

# regress the pairwise distances of the PCA representation on the pairwise distances
# of the random 2-D projection; R^2 measures how well distances are preserved
lr.fit(pdist(raw_data[0]).reshape((len_pdist,1)),pdist(X_pca))


Out[84]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [86]:
pred = pdist(raw_data[0]).reshape((len_pdist,1))
the_true_true = pdist(X_pca)

In [85]:
%timeit lr.fit(npr.randn(10).reshape((10,1)),npr.randn(10))


10000 loops, best of 3: 171 µs per loop

In [89]:
lr.fit(pred.reshape((len_pdist,1)),the_true_true)
lr.score(pred.reshape((len_pdist,1)),the_true_true)


Out[89]:
0.080704223334626679

In [92]:
%%timeit
pred = pdist(raw_data[i]).reshape((len_pdist,1))
lr.fit(pred.reshape((len_pdist,1)),the_true_true)
data[i] = lr.score(pred.reshape((len_pdist,1)),the_true_true)


10 loops, best of 3: 113 ms per loop

In [93]:
sample_size = 100
data = np.zeros(sample_size)

# for each random projection, compute the R^2 of a linear fit from its pairwise
# distances to the pairwise distances of the PCA representation
for i in range(sample_size):
    pred = pdist(raw_data[i]).reshape((len_pdist,1))
    lr.fit(pred.reshape((len_pdist,1)),the_true_true)
    data[i] = lr.score(pred.reshape((len_pdist,1)),the_true_true)

In [94]:
pl.hist(data,bins=50);



In [95]:
from numba import jit, autojit, double, float64, float32, void, int32

In [96]:
project_jit = jit(double[:,:](double[:,:],double[:,:],double[:,:]))(project)

In [99]:
%timeit project_jit(X_pca,npr.randn(32,16),npr.randn(16,2))


1000 loops, best of 3: 519 µs per loop

In [109]:
ae_7 = MLAutoencoder([32,16,2,16,32])

In [111]:
%timeit ae_7.transform(X_pca)


10 loops, best of 3: 31 ms per loop

In [ ]:
class MLAutoencoder():
    def __init__(self,layer_dims=[32,16,8,4,2,4,8,16,32]):
        self.layer_dims = layer_dims
        self.W = []
        for i in range(1,len(layer_dims)):
            self.W.append(npr.randn(layer_dims[i-1],layer_dims[i]))
        self.num_params = sum([np.prod(w.shape) for w in self.W])
        self.bottleneck = np.argmin(np.array(layer_dims))
        
    def mats_to_vec(self,W):
        w_vecs = []
        for w in W:
            w_vecs.append(np.reshape(w,np.prod(w.shape)))
        return np.hstack(w_vecs)
    
    def vec_to_mats(self,w_vecs):
        ind = 0
        W = []
        for i in range(len(self.W)):
            size = np.prod(self.W[i].shape)
            W.append(np.reshape(w_vecs[ind:ind+size],self.W[i].shape))
            ind += size
        return W
    
    def predict(self,X):
        def predict_one(x):
            L = x
            for w in self.W:
                L = sigmoid(L.dot(w))
            return L
        
        if len(X.shape) > 1:
            y = np.zeros(X.shape)
            for i,x in enumerate(X):
                y[i] = predict_one(x)
        else:
            y = predict_one(X)
        return y

    def transform(self,X):
        def transform_one(x):
            L = x
            for i in range(self.bottleneck):
                L = sigmoid(L.dot(self.W[i]))
            return L
        
        if len(X.shape) > 1:
            y = np.zeros((len(X),self.layer_dims[self.bottleneck]))
            for i,x in enumerate(X):
                y[i] = transform_one(x)
        else:
            y = transform_one(X)
        return y
        
    def loss(self,y,y_pred):
        return sum(abs(y-y_pred))
    
    def score(self,X):
        X_pred = self.predict(X)
        assert(X_pred.shape == X.shape)
        
        return sum([self.loss(pred,truth) for (pred,truth) in zip(X_pred, X)]) 
    
    def gradient(self,func,x0,h=0.0001):
        x0 = np.array(x0)#,dtype=float)
        y = func(x0)
        deriv = np.zeros(len(x0))
        for i in range(len(x0)):
            x = np.array(x0)
            x[i] += h
            deriv[i] = (func(x) - y)/h
        return deriv
    
    def vec_score(self,w,X):
        self.W = self.vec_to_mats(w)
        return self.score(X)

    def train(self,X_,batch_size=20,epochs=10,learning_rate=0.1):
        def report(counter,epoch):
            status = "Epoch {1} loss: {0:.3f}".format(self.score(X_),epoch)
            print(status)
            X_r = self.transform(X_)
            pl.scatter(X_r[:,0],X_r[:,1],c=Y,linewidth=0)  # Y: class labels from the enclosing scope
            pl.xlim((0,1))
            pl.ylim((0,1))
            pl.title(status)
            pl.savefig('Code/autoencoder/take7/{0}.jpg'.format(counter))
            pl.close()
        
        n = len(X_) /batch_size
        w = self.mats_to_vec(self.W)
        
        X = X_.copy()
        
        counter = 0
        report(counter,0)
        counter += 1
        
        start_cooling = 100
        delta = learning_rate / (epochs - start_cooling)
        
        for t in range(epochs):
            npr.shuffle(X)
            
            if t > start_cooling:
                #learning_rate -= delta
                learning_rate *= 0.95
            
            for i in range(n):
                batch_X = X[i*batch_size:(i+1)*batch_size]
                loss_func = lambda w : self.vec_score(w,batch_X)
                w -= learning_rate * self.gradient(loss_func,w)
            report(counter,t+1)
            counter +=1