In [2]:
# this part imports libs and load data from csv file
import csv
from dateutil import parser
from datetime import timedelta
from sklearn import svm
import numpy as np
import pandas as pd
import pdb
import pickle
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
import sklearn
import scipy.stats as ss
import cPickle
import gzip
import os
import sys
import time

import numpy

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams
from DL_libs import *


/usr/lib/python2.7/dist-packages/nose/util.py:14: DeprecationWarning: The compiler package is deprecated and removed in Python 3.x.
  from compiler.consts import CO_GENERATOR

In [2]:
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
X_train,y_train = train_set

X_valid,y_valid = valid_set
X_total=np.vstack((X_train, X_valid))
X_total = np.array(X_total, dtype= theano.config.floatX)
print(X_total.shape)
y_total = np.concatenate([y_train, y_valid])
print y_total.shape
array_A =[]
array_B =[]
for i in range(100000):
    array_A.append(np.random.random_integers(0, 59999))
    array_B.append(np.random.random_integers(0, 59999))
pos_index = []
neg_index = []
for index in xrange(100000):
    if y_total[array_A[index]] == y_total[array_B[index]]:
        pos_index.append(index)
    else:
        neg_index.append(index)
print len(pos_index)
selected_neg_index= neg_index[ : len(pos_index)]
print len(selected_neg_index)


(60000L, 784L)
(60000L,)
9948
9948

In [26]:
type(X_train)


Out[26]:
numpy.ndarray

In [3]:
import pandas as pd
array_A = np.array(array_A)
array_B = np.array(array_B)
index_for_positive_image_A = array_A[pos_index]
index_for_positive_image_B = array_B[pos_index]
index_for_neg_image_A = array_A[selected_neg_index]
index_for_neg_image_B = array_B[selected_neg_index]

X_pos_A = X_total[index_for_positive_image_A]
X_pos_B = X_total[index_for_positive_image_B]
X_pos_whole = np.hstack((X_pos_A,X_pos_B))
X_neg_A = X_total[index_for_neg_image_A]
X_neg_B = X_total[index_for_neg_image_B]
X_neg_whole = np.hstack((X_neg_A, X_neg_B))
print X_pos_A.shape,  X_pos_B.shape, X_pos_whole.shape
print X_neg_A.shape,  X_neg_B.shape, X_neg_whole.shape

X_whole = np.vstack((X_pos_whole, X_neg_whole))
print X_whole.shape
y_pos = np.ones(X_pos_whole.shape[0])
y_neg = np.zeros(X_neg_whole.shape[0])
y_whole = np.concatenate([y_pos,y_neg])
print y_whole


(9948L, 784L) (9948L, 784L) (9948L, 1568L)
(9948L, 784L) (9948L, 784L) (9948L, 1568L)
(19896L, 1568L)
[ 1.  1.  1. ...,  0.  0.  0.]

In [4]:
x_train_pre_validation_minmax, x_test_minmax, y_train_pre_validation_minmax, y_test_minmax = train_test_split(X_whole,y_whole,\
                                                            test_size=0.2, random_state=211)
x_train_minmax, x_validation_minmax, y_train_minmax, y_validation_minmax = train_test_split(x_train_pre_validation_minmax,
                                                                                            y_train_pre_validation_minmax,\
                                                            test_size=0.2, random_state=21)
print x_train_minmax.shape, y_train_minmax.shape, x_validation_minmax.shape, \
y_validation_minmax.shape, x_test_minmax.shape, y_test_minmax.shape


(12732L, 1568L) (12732L,) (3184L, 1568L) (3184L,) (3980L, 1568L) (3980L,)

In [9]:
# this is a use case.
pretraining_epochs=1
pretrain_lr=0.001
batch_size=30
hidden_layers_sizes =[100, 100]
corruption_levels=[0, 0]
x =x_train_minmax
print "original shape", x.shape
a_MAE = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                        hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
print a_MAE.transform(x).shape


original shape (12732L, 1568L)
... building the model
... getting the pretraining functions
... pre-training the model
Pre-training layer 0, epoch 0, cost  649.800200642
Pre-training layer 0, epoch 1, cost  460.981851636
Pre-training layer 0, epoch 2, cost  424.77356587
Pre-training layer 0, epoch 3, cost  401.89155937
Pre-training layer 0, epoch 4, cost  384.69806117
Pre-training layer 0, epoch 5, cost  371.079913154
Pre-training layer 0, epoch 6, cost  359.965664574
Pre-training layer 0, epoch 7, cost  350.730824108
Pre-training layer 0, epoch 8, cost  342.940302137
Pre-training layer 0, epoch 9, cost  336.193980432
Pre-training layer 0, epoch 10, cost  330.347403174
Pre-training layer 0, epoch 11, cost  325.212812772
Pre-training layer 0, epoch 12, cost  320.631475096
Pre-training layer 0, epoch 13, cost  316.543858634
Pre-training layer 0, epoch 14, cost  312.811695984
Pre-training layer 1, epoch 0, cost  67.8823744112
Pre-training layer 1, epoch 1, cost  57.1802584305
Pre-training layer 1, epoch 2, cost  55.5621241771
Pre-training layer 1, epoch 3, cost  54.1539331595
Pre-training layer 1, epoch 4, cost  52.8968902348
Pre-training layer 1, epoch 5, cost  51.7700517652
Pre-training layer 1, epoch 6, cost  50.6964844004
Pre-training layer 1, epoch 7, cost  49.7465968584
Pre-training layer 1, epoch 8, cost  48.9431584208
Pre-training layer 1, epoch 9, cost  48.2292707932
Pre-training layer 1, epoch 10, cost  47.5119329546
Pre-training layer 1, epoch 11, cost  46.970501863
Pre-training layer 1, epoch 12, cost  46.3474649181
Pre-training layer 1, epoch 13, cost  45.9114873236
Pre-training layer 1, epoch 14, cost  45.4500529929
(12732L, 100L)

In [16]:
new_x_train_minmax = a_MAE.transform(x_train_minmax)
new_x_test_minmax = a_MAE.transform(x_test_minmax)

In [12]:
x_train_minmax_A = x_train_minmax[:, :x_train_minmax.shape[1]/2].shape


Out[12]:
(12732L, 784L)

In [13]:
# get the new representation for A set. first 784-D
pretraining_epochs=15
pretrain_lr=0.001
batch_size=30
hidden_layers_sizes =[100, 100]
corruption_levels=[0, 0]
x = x_train_minmax[:, :x_train_minmax.shape[1]/2]
print "original shape", x.shape
a_MAE_A = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                        hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
new_x_train_minmax_A =  a_MAE_A.transform(x_train_minmax[:, :x_train_minmax.shape[1]/2])


original shape (12732L, 784L)
... building the model
... getting the pretraining functions
... pre-training the model
Pre-training layer 0, epoch 0, cost  337.817380803
Pre-training layer 0, epoch 1, cost  220.31406839
Pre-training layer 0, epoch 2, cost  197.911487485
Pre-training layer 0, epoch 3, cost  184.741428666
Pre-training layer 0, epoch 4, cost  175.186300541
Pre-training layer 0, epoch 5, cost  167.808856757
Pre-training layer 0, epoch 6, cost  161.924048097
Pre-training layer 0, epoch 7, cost  157.114576891
Pre-training layer 0, epoch 8, cost  153.09225115
Pre-training layer 0, epoch 9, cost  149.685612135
Pre-training layer 0, epoch 10, cost  146.735354872
Pre-training layer 0, epoch 11, cost  144.16240492
Pre-training layer 0, epoch 12, cost  141.89319661
Pre-training layer 0, epoch 13, cost  139.859680231
Pre-training layer 0, epoch 14, cost  138.032116363
Pre-training layer 1, epoch 0, cost  67.5070690172
Pre-training layer 1, epoch 1, cost  58.0326097094
Pre-training layer 1, epoch 2, cost  56.1593438841
Pre-training layer 1, epoch 3, cost  54.5597875459
Pre-training layer 1, epoch 4, cost  53.167724821
Pre-training layer 1, epoch 5, cost  52.034442306
Pre-training layer 1, epoch 6, cost  50.9350355549
Pre-training layer 1, epoch 7, cost  50.0320403005
Pre-training layer 1, epoch 8, cost  49.2972887607
Pre-training layer 1, epoch 9, cost  48.6069476679
Pre-training layer 1, epoch 10, cost  47.974396873
Pre-training layer 1, epoch 11, cost  47.4748985238
Pre-training layer 1, epoch 12, cost  46.9994107418
Pre-training layer 1, epoch 13, cost  46.5927932659
Pre-training layer 1, epoch 14, cost  46.236759452

In [ ]:
# get the new representation for A set. first 784-D
pretraining_epochs=15
pretrain_lr=0.001
batch_size=30
hidden_layers_sizes =[100, 100]
corruption_levels=[0, 0]
x = x_train_minmax[:, x_train_minmax.shape[1]/2:]
print "original shape", x.shape
a_MAE_B = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                        hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
new_x_train_minmax_B =  a_MAE_B.transform(x_train_minmax[:, x_train_minmax.shape[1]/2:])

In [39]:
new_x_test_minmax_A = a_MAE_A.transform(x_test_minmax[:, :x_test_minmax.shape[1]/2])
new_x_test_minmax_B = a_MAE_B.transform(x_test_minmax[:, x_test_minmax.shape[1]/2:])
new_x_validation_minmax_A = a_MAE_A.transform(x_validation_minmax[:, :x_validation_minmax.shape[1]/2])
new_x_validation_minmax_B = a_MAE_B.transform(x_validation_minmax[:, x_validation_minmax.shape[1]/2:])
new_x_train_minmax_whole = np.hstack((new_x_train_minmax_A, new_x_train_minmax_B))
new_x_test_minmax_whole = np.hstack((new_x_test_minmax_A, new_x_test_minmax_B))
new_x_validationt_minmax_whole = np.hstack((new_x_validation_minmax_A, new_x_validation_minmax_B))

In [24]:
new_x_train_minmax_whole.shape


Out[24]:
(12732L, 200L)

In [27]:
#### L1-based SVM original dataset
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
x_train_pre_validation_scaled = x_train_minmax
X_test_scaled = x_test_minmax
print 'SVM result on original dataset'
L1_SVC_Selector= LinearSVC(C=1, penalty="l1", dual=False)
L1_SVC_X = L1_SVC_Selector.fit_transform(x_train_pre_validation_scaled, y_train_minmax)
traning_accuracy = L1_SVC_Selector.score(x_train_pre_validation_scaled, y_train_minmax)
print 'training accuracy', traning_accuracy
testing_accuracy = L1_SVC_Selector.score(X_test_scaled, y_test_minmax)
print 'testing accuracy', testing_accuracy
print L1_SVC_X.shape
predicted = L1_SVC_Selector.predict(X_test_scaled)
print 'testing precision', sklearn.metrics.precision_score(y_test_minmax, predicted, pos_label=1)
print 'testing recall', sklearn.metrics.recall_score(y_test_minmax, predicted, pos_label=1)
print sklearn.metrics.recall_score(y_test_minmax, predicted, pos_label=1)


SVM result on original dataset
training accuracy 0.62205466541
testing accuracy 0.506532663317
(12732L, 1123L)
testing precision 0.509073075037
testing recall 0.518740629685
0.518740629685

In [19]:
#### L1-based SVM on trasformed original dataset
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
x_train_pre_validation_scaled = new_x_train_minmax
X_test_scaled = new_x_test_minmax

L1_SVC_Selector= LinearSVC(C=1, penalty="l1", dual=False)
L1_SVC_X = L1_SVC_Selector.fit_transform(x_train_pre_validation_scaled, y_train_minmax)
traning_accuracy = L1_SVC_Selector.score(x_train_pre_validation_scaled, y_train_minmax)
print 'training accuracy', traning_accuracy
testing_accuracy = L1_SVC_Selector.score(X_test_scaled, y_test_minmax)
print 'testing accuracy', testing_accuracy
print L1_SVC_X.shape
predicted = L1_SVC_Selector.predict(X_test_scaled)
print 'testing precision', sklearn.metrics.precision_score(y_test_minmax, predicted, pos_label=1)
print 'testing recall', sklearn.metrics.recall_score(y_test_minmax, predicted, pos_label=1)


0.666116870877
0.664824120603
[ 0.13231347 -0.48519935 -0.22325074  0.34680357  0.04326873 -0.27321924
  0.07862841  0.44932854 -0.34190608  0.38316937 -0.01044972 -0.21721321
  0.50240482 -0.39035884 -0.31312725  0.01151576  0.56378795  0.04428225
  0.33907228  0.         -0.33666173 -0.12056051 -0.46462771 -0.43237551
  0.07352032  0.09038167  0.30261275 -0.50892766 -0.39775445  0.24241258
 -0.42710535  0.12876215 -0.46147814 -0.09238173  0.13124765 -0.26658227
  0.28673445  0.28350923  0.22724499 -0.09118415  0.06103919  0.33645532
  0.11847707 -0.07932715  0.37430961 -0.23113092  0.38557638 -0.46344793
 -0.29615814 -0.04622971 -0.14215857  0.06699989  0.19177327  0.24704688
  0.61381036 -0.0965982   0.3936586   0.13057655  0.12821931 -0.14362696
 -0.11223977 -0.10215103 -0.13170324 -0.36995767  0.          0.25723654
  0.24408299  0.86654127 -0.33664626  0.77894101 -0.91230335  0.22601876
 -0.16450929  0.20720663 -0.1506681  -0.2621404   0.11929355  0.01645169
  0.09848597 -0.201903    0.          0.68642705 -0.24564303 -0.69178349
  0.12749478  0.13469155 -0.29813797 -0.16639811 -0.36221166 -0.25216807
  0.03604714  0.10520982  0.08170771  0.0748805  -0.05571527 -0.14392926
  0.40222628  0.56246282 -0.12742461 -0.26395558]
(12732L, 97L)
0.661033317238
0.684157921039

In [23]:
#### L1-based SVM on seperately trasformed original dataset
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
x_train_pre_validation_scaled = new_x_train_minmax_whole
X_test_scaled = new_x_test_minmax_whole

L1_SVC_Selector= LinearSVC(C=1, penalty="l1", dual=False)
L1_SVC_X = L1_SVC_Selector.fit_transform(x_train_pre_validation_scaled, y_train_minmax)
traning_accuracy = L1_SVC_Selector.score(x_train_pre_validation_scaled, y_train_minmax)
print 'training accuracy', traning_accuracy
testing_accuracy = L1_SVC_Selector.score(X_test_scaled, y_test_minmax)
print 'testing accuracy', testing_accuracy
print L1_SVC_X.shape
predicted = L1_SVC_Selector.predict(X_test_scaled)
print 'testing precision', sklearn.metrics.precision_score(y_test_minmax, predicted, pos_label=1)
print 'testing recall', sklearn.metrics.recall_score(y_test_minmax, predicted, pos_label=1)


training accuracy 0.55670750864
testing accuracy 0.519095477387
(12732L, 179L)
testing precision 0.520963855422
testing recall 0.540229885057

In [25]:
# get the new representation for A set. first 784-D
pretraining_epochs=15
pretrain_lr=0.001
batch_size=30
hidden_layers_sizes =[100, 100]
corruption_levels=[0, 0]
x = new_x_train_minmax_whole
print "original shape", x.shape
a_MAE_new_set = train_a_MultipleAEs(x, pretraining_epochs=pretraining_epochs, pretrain_lr=pretrain_lr, batch_size=batch_size, 
                        hidden_layers_sizes =hidden_layers_sizes, corruption_levels=corruption_levels)
new_x_train_minmax_new_set =  a_MAE_new_set.transform(new_x_train_minmax_whole)
new_x_test_minmax_new_set = a_MAE_new_set.transform(new_x_test_minmax_whole)


original shape (12732L, 200L)
... building the model
... getting the pretraining functions
... pre-training the model
Pre-training layer 0, epoch 0, cost  116.861765345
Pre-training layer 0, epoch 1, cost  98.1881844906
Pre-training layer 0, epoch 2, cost  96.0873925995
Pre-training layer 0, epoch 3, cost  94.4012644952
Pre-training layer 0, epoch 4, cost  92.8243882839
Pre-training layer 0, epoch 5, cost  91.4559687424
Pre-training layer 0, epoch 6, cost  90.189061244
Pre-training layer 0, epoch 7, cost  89.0403325652
Pre-training layer 0, epoch 8, cost  88.0753981276
Pre-training layer 0, epoch 9, cost  87.1464799055
Pre-training layer 0, epoch 10, cost  86.3104897839
Pre-training layer 0, epoch 11, cost  85.566923754
Pre-training layer 0, epoch 12, cost  84.9103751258
Pre-training layer 0, epoch 13, cost  84.267200384
Pre-training layer 0, epoch 14, cost  83.6972667306
Pre-training layer 1, epoch 0, cost  68.3334425992
Pre-training layer 1, epoch 1, cost  57.7690975766
Pre-training layer 1, epoch 2, cost  56.5580884877
Pre-training layer 1, epoch 3, cost  55.5750597851
Pre-training layer 1, epoch 4, cost  54.7659002416
Pre-training layer 1, epoch 5, cost  53.9809858062
Pre-training layer 1, epoch 6, cost  53.3609554609
Pre-training layer 1, epoch 7, cost  52.7131026261
Pre-training layer 1, epoch 8, cost  52.2208843206
Pre-training layer 1, epoch 9, cost  51.742521477
Pre-training layer 1, epoch 10, cost  51.2897486831
Pre-training layer 1, epoch 11, cost  50.910742126
Pre-training layer 1, epoch 12, cost  50.537562316
Pre-training layer 1, epoch 13, cost  50.2275851874
Pre-training layer 1, epoch 14, cost  49.9425176584

In [26]:
#### L1-based SVM on seperately trasformed original dataset
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.utils.extmath import density
from sklearn import metrics
x_train_pre_validation_scaled = new_x_train_minmax_new_set
X_test_scaled = new_x_test_minmax_new_set

L1_SVC_Selector= LinearSVC(C=1, penalty="l1", dual=False)
L1_SVC_X = L1_SVC_Selector.fit_transform(x_train_pre_validation_scaled, y_train_minmax)
traning_accuracy = L1_SVC_Selector.score(x_train_pre_validation_scaled, y_train_minmax)
print 'training accuracy', traning_accuracy
testing_accuracy = L1_SVC_Selector.score(X_test_scaled, y_test_minmax)
print 'testing accuracy', testing_accuracy
print L1_SVC_X.shape
predicted = L1_SVC_Selector.predict(X_test_scaled)
print 'testing precision', sklearn.metrics.precision_score(y_test_minmax, predicted, pos_label=1)
print 'testing recall', sklearn.metrics.recall_score(y_test_minmax, predicted, pos_label=1)


training accuracy 0.728479421929
testing accuracy 0.731658291457
(12732L, 92L)
testing precision 0.725253500724
testing recall 0.750624687656

In [49]:
# feature log selection based on L1
log_clf = sklearn.linear_model.LogisticRegression(C=1, penalty='l1' )
log_clf.fit(x_train_pre_validation_scaled, y_train_pre_validation)
predicted = log_clf.predict(X_test_scaled)
print log_clf.score(x_train_pre_validation_scaled, y_train_pre_validation), log_clf.score(X_test_scaled, y_test)
print 'precision', sklearn.metrics.precision_score(y_test, predicted, pos_label=1)
print 'recall', sklearn.metrics.recall_score(y_test, predicted, pos_label=1)


0.533074656984 0.502279464886
precision 0.500221751343
recall 0.507854712828

In [52]:
# feature log selection based on L2
log_clf_l2 = sklearn.linear_model.LogisticRegression(C=1, penalty='l2' )
log_clf_l2.fit(x_train_pre_validation_scaled, y_train_pre_validation)
predicted = log_clf_l2.predict(X_test_scaled)
print log_clf_l2.score(x_train_pre_validation_scaled, y_train_pre_validation)
print log_clf_l2.score(X_test_scaled, y_test)
print 'precision', sklearn.metrics.precision_score(y_test, predicted, pos_label=1)
print 'recall', sklearn.metrics.recall_score(y_test, predicted, pos_label=1)


0.534401255598
0.503500161929
precision 0.501424641383
recall 0.510656393836

In [ ]:


In [28]:
# feature log selection based on L2 on original dataset
X_test = x_test_minmax
y_test = y_test_minmax
X_train = x_train_minmax
y_train = y_train_minmax
print 'performance for logisitc regression'
log_clf = sklearn.linear_model.LogisticRegression(C=1, penalty='l2' )
log_clf.fit(X_train, y_train)
predicted = log_clf.predict(X_test)
print 'training accuracy', log_clf.score(X_train, y_train)
print 'testing accuracy', log_clf.score(X_test, y_test)
print 'precision', sklearn.metrics.precision_score(y_test, predicted, pos_label=1)
print 'recall', sklearn.metrics.recall_score(y_test, predicted, pos_label=1)


performance for logisitc regression
training accuracy 0.619776939994
testing accuracy 0.507286432161
precision 0.509832841691
recall 0.51824087956

In [29]:
# feature log selection based on L2 on transformed original
X_test = new_x_test_minmax
y_test = y_test_minmax
X_train = new_x_train_minmax
y_train = y_train_minmax
print 'performance for logisitc regression'
log_clf = sklearn.linear_model.LogisticRegression(C=1, penalty='l2' )
log_clf.fit(X_train, y_train)
predicted = log_clf.predict(X_test)
print 'training accuracy', log_clf.score(X_train, y_train)
print 'testing accuracy', log_clf.score(X_test, y_test)
print 'precision', sklearn.metrics.precision_score(y_test, predicted, pos_label=1)
print 'recall', sklearn.metrics.recall_score(y_test, predicted, pos_label=1)


performance for logisitc regression
training accuracy 0.665724159598
testing accuracy 0.665075376884
precision 0.661665053243
recall 0.68315842079

In [30]:
# feature log selection based on L2 on seperately transformed dataset
X_test = new_x_test_minmax_new_set
y_test = y_test_minmax
X_train = new_x_train_minmax_new_set
y_train = y_train_minmax
print 'performance for logisitc regression'
log_clf = sklearn.linear_model.LogisticRegression(C=1, penalty='l2' )
log_clf.fit(X_train, y_train)
predicted = log_clf.predict(X_test)
print 'training accuracy', log_clf.score(X_train, y_train)
print 'testing accuracy', log_clf.score(X_test, y_test)
print 'precision', sklearn.metrics.precision_score(y_test, predicted, pos_label=1)
print 'recall', sklearn.metrics.recall_score(y_test, predicted, pos_label=1)


performance for logisitc regression
training accuracy 0.72808671065
testing accuracy 0.733417085427
precision 0.727492739593
recall 0.751124437781

In [59]:
# build a sda, pretraining plus fine tuning on original data.
# set up paramaters

#finetune_lr=0.1
finetune_lr = 0.1
pretraining_epochs = 30
#pretrain_lr=0.001
pretrain_lr = 0.001
training_epochs = 300
batch_size = 30


hidden_layers_sizes= [100, 100]
corruption_levels = [0, 0]

sda = trainSda(x_train_minmax, y_train_minmax,
             x_validation_minmax, y_validation_minmax , 
             x_test_minmax, y_test_minmax,
             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs, 
             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
             )


... building the model
... getting the pretraining functions
... pre-training the model
Pre-training layer 0, epoch 0, cost  646.848182025
Pre-training layer 0, epoch 1, cost  459.686013772
Pre-training layer 0, epoch 2, cost  423.845966827
Pre-training layer 0, epoch 3, cost  400.959866184
Pre-training layer 0, epoch 4, cost  383.742901325
Pre-training layer 0, epoch 5, cost  370.090932908
Pre-training layer 0, epoch 6, cost  358.952404075
Pre-training layer 0, epoch 7, cost  349.677116671
Pre-training layer 0, epoch 8, cost  341.823522869
Pre-training layer 0, epoch 9, cost  335.075950016
Pre-training layer 0, epoch 10, cost  329.202395559
Pre-training layer 0, epoch 11, cost  324.029726303
Pre-training layer 0, epoch 12, cost  319.427254903
Pre-training layer 0, epoch 13, cost  315.295150212
Pre-training layer 0, epoch 14, cost  311.556113825
Pre-training layer 0, epoch 15, cost  308.149415362
Pre-training layer 0, epoch 16, cost  305.02663802
Pre-training layer 0, epoch 17, cost  302.148644249
Pre-training layer 0, epoch 18, cost  299.483398763
Pre-training layer 0, epoch 19, cost  297.004389282
Pre-training layer 0, epoch 20, cost  294.689463927
Pre-training layer 0, epoch 21, cost  292.519960848
Pre-training layer 0, epoch 22, cost  290.480045028
Pre-training layer 0, epoch 23, cost  288.556194175
Pre-training layer 0, epoch 24, cost  286.736793896
Pre-training layer 0, epoch 25, cost  285.011814694
Pre-training layer 0, epoch 26, cost  283.372551552
Pre-training layer 0, epoch 27, cost  281.811412231
Pre-training layer 0, epoch 28, cost  280.321744023
Pre-training layer 0, epoch 29, cost  278.897691136
Pre-training layer 1, epoch 0, cost  57.9600125005
Pre-training layer 1, epoch 1, cost  47.3333566411
Pre-training layer 1, epoch 2, cost  45.565589731
Pre-training layer 1, epoch 3, cost  44.0646515035
Pre-training layer 1, epoch 4, cost  42.7346051649
Pre-training layer 1, epoch 5, cost  41.5628752534
Pre-training layer 1, epoch 6, cost  40.5318409255
Pre-training layer 1, epoch 7, cost  39.6228004042
Pre-training layer 1, epoch 8, cost  38.8185163237
Pre-training layer 1, epoch 9, cost  38.1039411937
Pre-training layer 1, epoch 10, cost  37.4662302937
Pre-training layer 1, epoch 11, cost  36.8945311591
Pre-training layer 1, epoch 12, cost  36.3797210656
Pre-training layer 1, epoch 13, cost  35.9141467886
Pre-training layer 1, epoch 14, cost  35.4913842767
Pre-training layer 1, epoch 15, cost  35.1060255286
Pre-training layer 1, epoch 16, cost  34.7534963204
Pre-training layer 1, epoch 17, cost  34.4299056559
Pre-training layer 1, epoch 18, cost  34.1319248424
Pre-training layer 1, epoch 19, cost  33.8566916862
Pre-training layer 1, epoch 20, cost  33.6017343804
Pre-training layer 1, epoch 21, cost  33.364910191
Pre-training layer 1, epoch 22, cost  33.1443552664
Pre-training layer 1, epoch 23, cost  32.9384430532
Pre-training layer 1, epoch 24, cost  32.7457495918
Pre-training layer 1, epoch 25, cost  32.5650244364
Pre-training layer 1, epoch 26, cost  32.3951662139
Pre-training layer 1, epoch 27, cost  32.2352020149
Pre-training layer 1, epoch 28, cost  32.0842699522
Pre-training layer 1, epoch 29, cost  31.9416043361
... getting the finetuning functions
... finetunning the model
epoch 1, minibatch 424/424, validation error 39.465409 %
 epoch 1, minibatch 424/424, test error of best model 38.409091 %
epoch 2, minibatch 424/424, validation error 34.182390 %
 epoch 2, minibatch 424/424, test error of best model 31.464646 %
epoch 3, minibatch 424/424, validation error 30.754717 %
 epoch 3, minibatch 424/424, test error of best model 29.090909 %
epoch 4, minibatch 424/424, validation error 28.270440 %
 epoch 4, minibatch 424/424, test error of best model 26.767677 %
epoch 5, minibatch 424/424, validation error 26.037736 %
 epoch 5, minibatch 424/424, test error of best model 25.328283 %
epoch 6, minibatch 424/424, validation error 24.276730 %
 epoch 6, minibatch 424/424, test error of best model 23.636364 %
epoch 7, minibatch 424/424, validation error 23.176101 %
 epoch 7, minibatch 424/424, test error of best model 22.676768 %
epoch 8, minibatch 424/424, validation error 21.981132 %
 epoch 8, minibatch 424/424, test error of best model 21.843434 %
epoch 9, minibatch 424/424, validation error 21.194969 %
 epoch 9, minibatch 424/424, test error of best model 20.984848 %
epoch 10, minibatch 424/424, validation error 20.251572 %
 epoch 10, minibatch 424/424, test error of best model 20.176768 %
epoch 11, minibatch 424/424, validation error 19.371069 %
 epoch 11, minibatch 424/424, test error of best model 19.368687 %
epoch 12, minibatch 424/424, validation error 18.710692 %
 epoch 12, minibatch 424/424, test error of best model 18.989899 %
epoch 13, minibatch 424/424, validation error 18.238994 %
 epoch 13, minibatch 424/424, test error of best model 18.459596 %
epoch 14, minibatch 424/424, validation error 17.893082 %
 epoch 14, minibatch 424/424, test error of best model 18.106061 %
epoch 15, minibatch 424/424, validation error 17.704403 %
 epoch 15, minibatch 424/424, test error of best model 17.752525 %
epoch 16, minibatch 424/424, validation error 17.075472 %
 epoch 16, minibatch 424/424, test error of best model 17.651515 %
epoch 17, minibatch 424/424, validation error 16.886792 %
 epoch 17, minibatch 424/424, test error of best model 17.601010 %
epoch 18, minibatch 424/424, validation error 16.603774 %
 epoch 18, minibatch 424/424, test error of best model 17.297980 %
epoch 19, minibatch 424/424, validation error 16.477987 %
 epoch 19, minibatch 424/424, test error of best model 17.095960 %
epoch 20, minibatch 424/424, validation error 16.352201 %
 epoch 20, minibatch 424/424, test error of best model 16.893939 %
epoch 21, minibatch 424/424, validation error 16.352201 %
epoch 22, minibatch 424/424, validation error 16.320755 %
 epoch 22, minibatch 424/424, test error of best model 16.338384 %
epoch 23, minibatch 424/424, validation error 16.257862 %
 epoch 23, minibatch 424/424, test error of best model 16.212121 %
epoch 24, minibatch 424/424, validation error 16.352201 %
epoch 25, minibatch 424/424, validation error 16.352201 %
epoch 26, minibatch 424/424, validation error 16.289308 %
epoch 27, minibatch 424/424, validation error 16.100629 %
 epoch 27, minibatch 424/424, test error of best model 16.237374 %
epoch 28, minibatch 424/424, validation error 16.226415 %
epoch 29, minibatch 424/424, validation error 16.257862 %
epoch 30, minibatch 424/424, validation error 16.257862 %
epoch 31, minibatch 424/424, validation error 16.226415 %
epoch 32, minibatch 424/424, validation error 16.289308 %
epoch 33, minibatch 424/424, validation error 16.257862 %
epoch 34, minibatch 424/424, validation error 16.257862 %
epoch 35, minibatch 424/424, validation error 16.289308 %
epoch 36, minibatch 424/424, validation error 16.289308 %
epoch 37, minibatch 424/424, validation error 16.257862 %
epoch 38, minibatch 424/424, validation error 16.289308 %
epoch 39, minibatch 424/424, validation error 16.352201 %
epoch 40, minibatch 424/424, validation error 16.352201 %
epoch 41, minibatch 424/424, validation error 16.383648 %
epoch 42, minibatch 424/424, validation error 16.320755 %
epoch 43, minibatch 424/424, validation error 16.257862 %
epoch 44, minibatch 424/424, validation error 16.226415 %
epoch 45, minibatch 424/424, validation error 16.289308 %
epoch 46, minibatch 424/424, validation error 16.415094 %
epoch 47, minibatch 424/424, validation error 16.446541 %
epoch 48, minibatch 424/424, validation error 16.477987 %
epoch 49, minibatch 424/424, validation error 16.415094 %
epoch 50, minibatch 424/424, validation error 16.415094 %
epoch 51, minibatch 424/424, validation error 16.352201 %
epoch 52, minibatch 424/424, validation error 16.320755 %
epoch 53, minibatch 424/424, validation error 16.257862 %
The pretraining code ran for 9.64m

In [60]:
prob = sda.predict_p(x_train_minmax)
y_p = prob[:, 1]
y_p


Out[60]:
array([ 0.9999969 ,  0.99358194,  0.03238713, ...,  0.89236379,
        0.99985128,  0.99693639])

In [61]:
print 'hidden_layers_sizes:', hidden_layers_sizes
print 'corruption_levels:', corruption_levels

training_predicted = sda.predict(x_train_minmax)
y_train = y_train_minmax
print 'train accuracy: ', '{percent:.1%}'.format(percent=sklearn.metrics.accuracy_score(y_train, training_predicted)) 
print 'precision: ', '{percent:.1%}'.format(percent=sklearn.metrics.precision_score(y_train, training_predicted, pos_label=1)) 
print 'recall: ', '{percent:.1%}'.format( percent= sklearn.metrics.recall_score(y_train, training_predicted, pos_label=1))


test_predicted = sda.predict(x_test_minmax)
y_test = y_test_minmax
print 'testing accuracy: ', '{percent:.1%}'.format(percent=sklearn.metrics.accuracy_score(y_test, test_predicted)) 
print 'precision: ', '{percent:.1%}'.format(percent=sklearn.metrics.precision_score(y_test, test_predicted, pos_label=1)) 
print 'recall: ', '{percent:.1%}'.format( percent= sklearn.metrics.recall_score(y_test, test_predicted, pos_label=1))


hidden_layers_sizes: [100, 100]
corruption_levels: [0, 0]
train accuracy:  100.0%
precision:  99.9%
recall:  100.0%
testing accuracy:  84.2%
precision:  83.8%
recall:  85.0%

In [49]:
# build a sda, pretraining plus fine tuning on seperately transformed data
# set up paramaters

#finetune_lr=0.1
finetune_lr = 0.1
pretraining_epochs = 30
#pretrain_lr=0.001
pretrain_lr = 0.001
training_epochs = 300
batch_size = 30


hidden_layers_sizes= [100, 100]
corruption_levels = [0, 0]

sda_transformed = trainSda(new_x_train_minmax_whole, y_train_minmax,
             new_x_validationt_minmax_whole, y_validation_minmax , 
             new_x_test_minmax_whole, y_test_minmax,
             hidden_layers_sizes = hidden_layers_sizes, corruption_levels = corruption_levels, batch_size = batch_size , \
             training_epochs = training_epochs, pretraining_epochs = pretraining_epochs, 
             pretrain_lr = pretrain_lr, finetune_lr=finetune_lr
             )


... building the model
... getting the pretraining functions
... pre-training the model
Pre-training layer 0, epoch 0, cost  112.836086527
Pre-training layer 0, epoch 1, cost  94.2030775816
Pre-training layer 0, epoch 2, cost  92.0378017635
Pre-training layer 0, epoch 3, cost  90.3206247155
Pre-training layer 0, epoch 4, cost  88.8000412028
Pre-training layer 0, epoch 5, cost  87.4479972472
Pre-training layer 0, epoch 6, cost  86.2472003717
Pre-training layer 0, epoch 7, cost  85.1805217915
Pre-training layer 0, epoch 8, cost  84.231475801
Pre-training layer 0, epoch 9, cost  83.3851253563
Pre-training layer 0, epoch 10, cost  82.6284283976
Pre-training layer 0, epoch 11, cost  81.9501774645
Pre-training layer 0, epoch 12, cost  81.3407649341
Pre-training layer 0, epoch 13, cost  80.7919185943
Pre-training layer 0, epoch 14, cost  80.2964720712
Pre-training layer 0, epoch 15, cost  79.8481858139
Pre-training layer 0, epoch 16, cost  79.4416115015
Pre-training layer 0, epoch 17, cost  79.0719861122
Pre-training layer 0, epoch 18, cost  78.7351434229
Pre-training layer 0, epoch 19, cost  78.4274355216
Pre-training layer 0, epoch 20, cost  78.145662191
Pre-training layer 0, epoch 21, cost  77.8870094221
Pre-training layer 0, epoch 22, cost  77.6489984648
Pre-training layer 0, epoch 23, cost  77.4294446574
Pre-training layer 0, epoch 24, cost  77.2264233525
Pre-training layer 0, epoch 25, cost  77.0382401188
Pre-training layer 0, epoch 26, cost  76.8634035598
Pre-training layer 0, epoch 27, cost  76.7006002834
Pre-training layer 0, epoch 28, cost  76.5486721544
Pre-training layer 0, epoch 29, cost  76.4065960228
Pre-training layer 1, epoch 0, cost  64.1499001144
Pre-training layer 1, epoch 1, cost  55.1246129871
Pre-training layer 1, epoch 2, cost  53.962952688
Pre-training layer 1, epoch 3, cost  53.0232695193
Pre-training layer 1, epoch 4, cost  52.1812964872
Pre-training layer 1, epoch 5, cost  51.4281284243
Pre-training layer 1, epoch 6, cost  50.7573489463
Pre-training layer 1, epoch 7, cost  50.1620071689
Pre-training layer 1, epoch 8, cost  49.6349026572
Pre-training layer 1, epoch 9, cost  49.1688995619
Pre-training layer 1, epoch 10, cost  48.7571510817
Pre-training layer 1, epoch 11, cost  48.3932494675
Pre-training layer 1, epoch 12, cost  48.0713174331
Pre-training layer 1, epoch 13, cost  47.786051882
Pre-training layer 1, epoch 14, cost  47.5327297965
Pre-training layer 1, epoch 15, cost  47.3071869306
Pre-training layer 1, epoch 16, cost  47.1057798629
Pre-training layer 1, epoch 17, cost  46.9253400274
Pre-training layer 1, epoch 18, cost  46.7631253598
Pre-training layer 1, epoch 19, cost  46.6167725239
Pre-training layer 1, epoch 20, cost  46.4842510996
Pre-training layer 1, epoch 21, cost  46.3638205059
Pre-training layer 1, epoch 22, cost  46.2539902691
Pre-training layer 1, epoch 23, cost  46.1534841165
Pre-training layer 1, epoch 24, cost  46.0612081593
Pre-training layer 1, epoch 25, cost  45.9762231815
Pre-training layer 1, epoch 26, cost  45.8977208312
Pre-training layer 1, epoch 27, cost  45.8250033755
Pre-training layer 1, epoch 28, cost  45.757466606
Pre-training layer 1, epoch 29, cost  45.694585476
... getting the finetuning functions
... finetunning the model
epoch 1, minibatch 424/424, validation error 35.817610 %
 epoch 1, minibatch 424/424, test error of best model 33.535354 %
epoch 2, minibatch 424/424, validation error 28.962264 %
 epoch 2, minibatch 424/424, test error of best model 27.196970 %
epoch 3, minibatch 424/424, validation error 26.729560 %
 epoch 3, minibatch 424/424, test error of best model 24.722222 %
epoch 4, minibatch 424/424, validation error 25.188679 %
 epoch 4, minibatch 424/424, test error of best model 23.459596 %
epoch 5, minibatch 424/424, validation error 23.616352 %
 epoch 5, minibatch 424/424, test error of best model 22.777778 %
epoch 6, minibatch 424/424, validation error 22.547170 %
 epoch 6, minibatch 424/424, test error of best model 21.944444 %
epoch 7, minibatch 424/424, validation error 21.761006 %
 epoch 7, minibatch 424/424, test error of best model 21.111111 %
epoch 8, minibatch 424/424, validation error 21.446541 %
 epoch 8, minibatch 424/424, test error of best model 20.429293 %
epoch 9, minibatch 424/424, validation error 20.471698 %
 epoch 9, minibatch 424/424, test error of best model 19.848485 %
epoch 10, minibatch 424/424, validation error 19.874214 %
 epoch 10, minibatch 424/424, test error of best model 19.444444 %
epoch 11, minibatch 424/424, validation error 19.465409 %
 epoch 11, minibatch 424/424, test error of best model 19.166667 %
epoch 12, minibatch 424/424, validation error 18.930818 %
 epoch 12, minibatch 424/424, test error of best model 18.787879 %
epoch 13, minibatch 424/424, validation error 18.616352 %
 epoch 13, minibatch 424/424, test error of best model 18.510101 %
epoch 14, minibatch 424/424, validation error 18.301887 %
 epoch 14, minibatch 424/424, test error of best model 18.232323 %
epoch 15, minibatch 424/424, validation error 18.081761 %
 epoch 15, minibatch 424/424, test error of best model 18.030303 %
epoch 16, minibatch 424/424, validation error 17.830189 %
 epoch 16, minibatch 424/424, test error of best model 17.752525 %
epoch 17, minibatch 424/424, validation error 17.641509 %
 epoch 17, minibatch 424/424, test error of best model 17.500000 %
epoch 18, minibatch 424/424, validation error 17.358491 %
 epoch 18, minibatch 424/424, test error of best model 17.196970 %
epoch 19, minibatch 424/424, validation error 17.327044 %
 epoch 19, minibatch 424/424, test error of best model 16.944444 %
epoch 20, minibatch 424/424, validation error 16.981132 %
 epoch 20, minibatch 424/424, test error of best model 16.666667 %
epoch 21, minibatch 424/424, validation error 16.761006 %
 epoch 21, minibatch 424/424, test error of best model 16.388889 %
epoch 22, minibatch 424/424, validation error 16.698113 %
 epoch 22, minibatch 424/424, test error of best model 16.212121 %
epoch 23, minibatch 424/424, validation error 16.509434 %
 epoch 23, minibatch 424/424, test error of best model 16.010101 %
epoch 24, minibatch 424/424, validation error 16.415094 %
 epoch 24, minibatch 424/424, test error of best model 15.883838 %
epoch 25, minibatch 424/424, validation error 16.069182 %
 epoch 25, minibatch 424/424, test error of best model 15.833333 %
epoch 26, minibatch 424/424, validation error 15.974843 %
 epoch 26, minibatch 424/424, test error of best model 15.782828 %
epoch 27, minibatch 424/424, validation error 15.817610 %
 epoch 27, minibatch 424/424, test error of best model 15.656566 %
epoch 28, minibatch 424/424, validation error 15.471698 %
 epoch 28, minibatch 424/424, test error of best model 15.303030 %
epoch 29, minibatch 424/424, validation error 15.314465 %
 epoch 29, minibatch 424/424, test error of best model 15.227273 %
epoch 30, minibatch 424/424, validation error 15.094340 %
 epoch 30, minibatch 424/424, test error of best model 15.227273 %
epoch 31, minibatch 424/424, validation error 15.000000 %
 epoch 31, minibatch 424/424, test error of best model 15.075758 %
epoch 32, minibatch 424/424, validation error 14.937107 %
 epoch 32, minibatch 424/424, test error of best model 14.974747 %
epoch 33, minibatch 424/424, validation error 14.874214 %
 epoch 33, minibatch 424/424, test error of best model 14.848485 %
epoch 34, minibatch 424/424, validation error 14.654088 %
 epoch 34, minibatch 424/424, test error of best model 14.646465 %
epoch 35, minibatch 424/424, validation error 14.559748 %
 epoch 35, minibatch 424/424, test error of best model 14.520202 %
epoch 36, minibatch 424/424, validation error 14.559748 %
epoch 37, minibatch 424/424, validation error 14.528302 %
 epoch 37, minibatch 424/424, test error of best model 14.469697 %
epoch 38, minibatch 424/424, validation error 14.622642 %
epoch 39, minibatch 424/424, validation error 14.622642 %
epoch 40, minibatch 424/424, validation error 14.622642 %
epoch 41, minibatch 424/424, validation error 14.528302 %
epoch 42, minibatch 424/424, validation error 14.559748 %
epoch 43, minibatch 424/424, validation error 14.559748 %
epoch 44, minibatch 424/424, validation error 14.528302 %
epoch 45, minibatch 424/424, validation error 14.465409 %
 epoch 45, minibatch 424/424, test error of best model 13.939394 %
epoch 46, minibatch 424/424, validation error 14.308176 %
 epoch 46, minibatch 424/424, test error of best model 13.863636 %
epoch 47, minibatch 424/424, validation error 14.245283 %
 epoch 47, minibatch 424/424, test error of best model 13.838384 %
epoch 48, minibatch 424/424, validation error 14.056604 %
 epoch 48, minibatch 424/424, test error of best model 13.737374 %
epoch 49, minibatch 424/424, validation error 13.899371 %
 epoch 49, minibatch 424/424, test error of best model 13.712121 %
epoch 50, minibatch 424/424, validation error 13.773585 %
 epoch 50, minibatch 424/424, test error of best model 13.611111 %
epoch 51, minibatch 424/424, validation error 13.710692 %
 epoch 51, minibatch 424/424, test error of best model 13.510101 %
epoch 52, minibatch 424/424, validation error 13.679245 %
 epoch 52, minibatch 424/424, test error of best model 13.560606 %
epoch 53, minibatch 424/424, validation error 13.710692 %
epoch 54, minibatch 424/424, validation error 13.710692 %
epoch 55, minibatch 424/424, validation error 13.616352 %
 epoch 55, minibatch 424/424, test error of best model 13.560606 %
epoch 56, minibatch 424/424, validation error 13.553459 %
 epoch 56, minibatch 424/424, test error of best model 13.560606 %
epoch 57, minibatch 424/424, validation error 13.396226 %
 epoch 57, minibatch 424/424, test error of best model 13.560606 %
epoch 58, minibatch 424/424, validation error 13.333333 %
 epoch 58, minibatch 424/424, test error of best model 13.585859 %
epoch 59, minibatch 424/424, validation error 13.238994 %
 epoch 59, minibatch 424/424, test error of best model 13.560606 %
epoch 60, minibatch 424/424, validation error 13.113208 %
 epoch 60, minibatch 424/424, test error of best model 13.535354 %
epoch 61, minibatch 424/424, validation error 13.144654 %
epoch 62, minibatch 424/424, validation error 13.113208 %
epoch 63, minibatch 424/424, validation error 13.144654 %
epoch 64, minibatch 424/424, validation error 13.081761 %
 epoch 64, minibatch 424/424, test error of best model 13.636364 %
epoch 65, minibatch 424/424, validation error 13.113208 %
epoch 66, minibatch 424/424, validation error 13.113208 %
epoch 67, minibatch 424/424, validation error 13.176101 %
epoch 68, minibatch 424/424, validation error 13.238994 %
epoch 69, minibatch 424/424, validation error 13.270440 %
epoch 70, minibatch 424/424, validation error 13.301887 %
epoch 71, minibatch 424/424, validation error 13.333333 %
epoch 72, minibatch 424/424, validation error 13.270440 %
epoch 73, minibatch 424/424, validation error 13.238994 %
epoch 74, minibatch 424/424, validation error 13.301887 %
epoch 75, minibatch 424/424, validation error 13.364780 %
epoch 76, minibatch 424/424, validation error 13.396226 %
epoch 77, minibatch 424/424, validation error 13.364780 %
epoch 78, minibatch 424/424, validation error 13.364780 %
epoch 79, minibatch 424/424, validation error 13.396226 %
epoch 80, minibatch 424/424, validation error 13.427673 %
epoch 81, minibatch 424/424, validation error 13.396226 %
epoch 82, minibatch 424/424, validation error 13.396226 %
epoch 83, minibatch 424/424, validation error 13.396226 %
epoch 84, minibatch 424/424, validation error 13.427673 %
epoch 85, minibatch 424/424, validation error 13.459119 %
epoch 86, minibatch 424/424, validation error 13.459119 %
epoch 87, minibatch 424/424, validation error 13.490566 %
epoch 88, minibatch 424/424, validation error 13.490566 %
epoch 89, minibatch 424/424, validation error 13.522013 %
epoch 90, minibatch 424/424, validation error 13.553459 %
epoch 91, minibatch 424/424, validation error 13.616352 %
epoch 92, minibatch 424/424, validation error 13.710692 %
epoch 93, minibatch 424/424, validation error 13.742138 %
epoch 94, minibatch 424/424, validation error 13.742138 %
epoch 95, minibatch 424/424, validation error 13.836478 %
epoch 96, minibatch 424/424, validation error 13.899371 %
epoch 97, minibatch 424/424, validation error 13.836478 %
epoch 98, minibatch 424/424, validation error 13.773585 %
epoch 99, minibatch 424/424, validation error 13.773585 %
epoch 100, minibatch 424/424, validation error 13.710692 %
epoch 101, minibatch 424/424, validation error 13.679245 %
epoch 102, minibatch 424/424, validation error 13.742138 %
epoch 103, minibatch 424/424, validation error 13.773585 %
epoch 104, minibatch 424/424, validation error 13.805031 %
epoch 105, minibatch 424/424, validation error 13.647799 %
epoch 106, minibatch 424/424, validation error 13.742138 %
epoch 107, minibatch 424/424, validation error 13.679245 %
epoch 108, minibatch 424/424, validation error 13.679245 %
epoch 109, minibatch 424/424, validation error 13.647799 %
epoch 110, minibatch 424/424, validation error 13.647799 %
epoch 111, minibatch 424/424, validation error 13.679245 %
epoch 112, minibatch 424/424, validation error 13.742138 %
epoch 113, minibatch 424/424, validation error 13.710692 %
epoch 114, minibatch 424/424, validation error 13.773585 %
epoch 115, minibatch 424/424, validation error 13.805031 %
epoch 116, minibatch 424/424, validation error 13.773585 %
epoch 117, minibatch 424/424, validation error 13.836478 %
epoch 118, minibatch 424/424, validation error 13.805031 %
epoch 119, minibatch 424/424, validation error 13.710692 %
The pretraining code ran for 2.63m

In [ ]:
sda_transformed.p(new_x_test_minmax_whole)

In [47]:
import pickle
with open('array_A.pickle', 'wb') as handle:
  pickle.dump(array_A, handle)
with open('array_B.pickle', 'wb') as handle:
  pickle.dump(array_B, handle)
with open('pos_index.pickle', 'wb') as handle:
  pickle.dump(pos_index, handle)
with open('neg_index.pickle', 'wb') as handle:
  pickle.dump(neg_index, handle)

In [58]:
import cPickle
import gzip
import os
import sys
import time

import numpy

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

from logistic_sgd import LogisticRegression, load_data
from mlp import HiddenLayer
from dA import dA
import warnings
warnings.filterwarnings("ignore")

def shared_dataset(data_xy, borrow=True):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on GPU).
    Since copying data into the GPU is slow, copying a minibatch everytime
    is needed (the default behaviour if the data is not in a shared
    variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    # When storing data on the GPU it has to be stored as floats
    # therefore we will store the labels as ``floatX`` as well
    # (``shared_y`` does exactly that). But during our computations
    # we need them as ints (we use labels as index, and if they are
    # floats it doesn't make sense) therefore instead of returning
    # ``shared_y`` we will have to cast it to int. This little hack
    # lets ous get around this issue
    return shared_x, T.cast(shared_y, 'int32')
def shared_dataset_X(data_x, borrow=True):
    shared_x = theano.shared(numpy.asarray(data_x,
                                           dtype=theano.config.floatX),
                             borrow=borrow)
    return shared_x
class MultipleAEs(object):
    """Stacked denoising auto-encoder class that extract hi level features.
    get the last hidden layer activation

    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type n_layers_sizes: list of ints
        :param n_layers_sizes: intermediate layers size, must contain
        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoders
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer
        # During pretraining we will train these autoencoders (which will
        # lead to chainging the weights of the MLP as well)
        # During finetunining we will finish training the SdA by doing
        # stochastich gradient descent on the MLP

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # its arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the dA are parameters of those
            # dA, but not the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shared weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)
        '''
        we don't need this layer since is AE
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                         input=self.sigmoid_layers[-1].output,
                         n_in=hidden_layers_sizes[-1], n_out=n_outs)
        
        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetunining
        
        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
        '''
    def transform(self,data_x): # get the last layaer activations to transform data.
        last_layer_activations = self.sigmoid_layers[-1].output
        theano_fn = theano.function(inputs=[],
                                 outputs=last_layer_activations,

                                 givens={self.x: data_x})
        newFeatures=theano_fn()
        return newFeatures
    def pretraining_functions(self, train_set_x, batch_size):
        ''' Generates a list of functions, each of them implementing one
        step in trainnig the dA corresponding to the layer with same index.
        The function will require as input the minibatch index, and to train
        a dA you just need to iterate, calling the corresponding function on
        all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared variable that contains all datapoints used
                            for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :type learning_rate: float
        :param learning_rate: learning rate used during training for any of
                              the dA layers
        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # begining of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:
            # get the cost and the updates list
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)
            # compile the theano function
            fn = theano.function(inputs=[index,
                              theano.Param(corruption_level, default=0.2),
                              theano.Param(learning_rate, default=0.1)],
                                 outputs=cost,
                                 updates=updates,
                                 givens={self.x: train_set_x[batch_begin:
                                                             batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on
        a batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contain all the datasets;
                         the has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(inputs=[index],
              outputs=self.finetune_cost,
              updates=updates,
              givens={
                self.x: train_set_x[index * batch_size:
                                    (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:
                                    (index + 1) * batch_size]},
              name='train')

        test_score_i = theano.function([index], self.errors,
                 givens={
                   self.x: test_set_x[index * batch_size:
                                      (index + 1) * batch_size],
                   self.y: test_set_y[index * batch_size:
                                      (index + 1) * batch_size]},
                      name='test')

        valid_score_i = theano.function([index], self.errors,
              givens={
                 self.x: valid_set_x[index * batch_size:
                                     (index + 1) * batch_size],
                 self.y: valid_set_y[index * batch_size:
                                     (index + 1) * batch_size]},
                      name='valid')

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
def train_a_MultipleAEs(X, pretraining_epochs=10, pretrain_lr=0.001, batch_size=30,
                        hidden_layers_sizes=[100, 100], corruption_levels=[0, 0]):
    
    # get a shared copy of X
    train_set_x = shared_dataset_X(X)
    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(89677)
    print '... building the model'
    # construct the stacked denoising autoencoder class
    sda = MultipleAEs(numpy_rng=numpy_rng, n_ins=train_set_x.get_value(borrow=True).shape[1],
              hidden_layers_sizes=hidden_layers_sizes, corruption_levels = corruption_levels)

    #########################
    # PRETRAINING THE MODEL #
    #########################
    print '... getting the pretraining functions'
    pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size)

    print '... pre-training the model'
    start_time = time.clock()
    ## Pre-train layer-wise
    corruption_levels = [.1, .2, .3]
    for i in xrange(sda.n_layers):
        # go through pretraining epochs
        for epoch in xrange(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(pretraining_fns[i](index=batch_index,
                         corruption=corruption_levels[i],
                         lr=pretrain_lr))
            print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
            print numpy.mean(c)

    end_time = time.clock()
    return sda
class SdA(object):
    """Stacked denoising auto-encoder class (SdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of
    the dA at layer `i+1`. The first layer dA gets as input the input of
    the SdA, and the hidden layer of the last dA represents the output.
    Note that after pretraining, the SdA is dealt with as a normal MLP,
    the dAs are only used to initialize the weights.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type n_layers_sizes: list of ints
        :param n_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoders
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer
        # During pretraining we will train these autoencoders (which will
        # lead to chainging the weights of the MLP as well)
        # During finetunining we will finish training the SdA by doing
        # stochastich gradient descent on the MLP

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # its arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the dA are parameters of those
            # dA, but not the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shared weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                         input=self.sigmoid_layers[-1].output,
                         n_in=hidden_layers_sizes[-1], n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetunining

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        ''' Generates a list of functions, each of them implementing one
        step in trainnig the dA corresponding to the layer with same index.
        The function will require as input the minibatch index, and to train
        a dA you just need to iterate, calling the corresponding function on
        all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared variable that contains all datapoints used
                            for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :type learning_rate: float
        :param learning_rate: learning rate used during training for any of
                              the dA layers
        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')  # % of corruption to use
        learning_rate = T.scalar('lr')  # learning rate to use
        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # begining of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:
            # get the cost and the updates list
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)
            # compile the theano function
            fn = theano.function(inputs=[index,
                              theano.Param(corruption_level, default=0.2),
                              theano.Param(learning_rate, default=0.1)],
                                 outputs=cost,
                                 updates=updates,
                                 givens={self.x: train_set_x[batch_begin:
                                                             batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
    def predict(self, x_dataset):
        predict_fn = theano.function([], sda.logLayer.y_pred,
                   givens={sda.x: x_dataset})
        predicted = predict_fn()
        return predicted
    def predict_p(self, x_dataset):
        predict_p_fn = theano.function([], sda.logLayer.p_y_given_x,
                   givens={sda.x: x_dataset})
        predicted_p = predict_p_fn()
        return predicted_p
    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on
        a batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contain all the datasets;
                         the has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - gparam * learning_rate))

        train_fn = theano.function(inputs=[index],
              outputs=self.finetune_cost,
              updates=updates,
              givens={
                self.x: train_set_x[index * batch_size:
                                    (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size:
                                    (index + 1) * batch_size]},
              name='train')

        test_score_i = theano.function([index], self.errors,
                 givens={
                   self.x: test_set_x[index * batch_size:
                                      (index + 1) * batch_size],
                   self.y: test_set_y[index * batch_size:
                                      (index + 1) * batch_size]},
                      name='test')

        valid_score_i = theano.function([index], self.errors,
              givens={
                 self.x: valid_set_x[index * batch_size:
                                     (index + 1) * batch_size],
                 self.y: valid_set_y[index * batch_size:
                                     (index + 1) * batch_size]},
                      name='valid')

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score

##### create a function to train an Sda and return it.
def trainSda(X_train_minmax, y_train,
             X_validation_minmax, y_validation , 
             X_test_minmax, y_test,
             hidden_layers_sizes = [100, 100, 100], corruption_levels = [0, 0, 0], batch_size = 30 , \
             training_epochs = 100, pretraining_epochs = 100, pretrain_lr = 0.001, finetune_lr=0.1
             ):
    n_visible = X_train_minmax.shape[1]
    # compute number of minibatches for training, validation and testing

    train_set_x, train_set_y = shared_dataset( (X_train_minmax,  y_train), borrow=True)
    valid_set_x, valid_set_y = shared_dataset( (X_validation_minmax,  y_validation), borrow=True)
    test_set_x, test_set_y = shared_dataset( (X_test_minmax,  y_test), borrow=True)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    # numpy random generator
    numpy_rng = numpy.random.RandomState(89677)
    print '... building the model'
    # construct the stacked denoising autoencoder class
    sda = SdA(numpy_rng=numpy_rng, n_ins=n_visible,
              hidden_layers_sizes= hidden_layers_sizes,
              n_outs=2)
    #########################
    # PRETRAINING THE MODEL #
    #########################
    print '... getting the pretraining functions'
    pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size)

    print '... pre-training the model'
    start_time = time.clock()
    ## Pre-train layer-wise

    for i in xrange(sda.n_layers):
        # go through pretraining epochs
        for epoch in xrange(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in xrange(n_train_batches):
                c.append(pretraining_fns[i](index=batch_index,
                         corruption=corruption_levels[i],
                         lr=pretrain_lr))
            print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
            print numpy.mean(c)

    end_time = time.clock()

    print >> sys.stderr, ('The pretraining code ran for %.2fm' % ((end_time - start_time) / 60.))

    ########################
    # FINETUNING THE MODEL #
    ########################

    # get the training, validation and testing function for the model
    print '... getting the finetuning functions'
    datasets = [(train_set_x, train_set_y) , (valid_set_x, valid_set_y), (test_set_x, test_set_y)]
    train_fn, validate_model, test_model = sda.build_finetune_functions(
                datasets=datasets, batch_size=batch_size,
                learning_rate=finetune_lr)

    print '... finetunning the model'
    # early-stopping parameters
    patience = 10 * n_train_batches  # look as this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_fn(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if (this_validation_loss < best_validation_loss *
                        improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break
    return sda
# the following is a split stacked auto encoder

class Split_SdA(object):
    """Split Stacked denoising auto-encoder class (SdA)

    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
                 hidden_layers_sizes_A=[100, 100],
                 hidden_layers_sizes_B=[100, 100],
                 n_outs=2,
                 corruption_levels_A=[0, 0],
                 corruption_levels_B=[0, 0]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type n_layers_sizes: list of ints
        :param n_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoders
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer
        # During pretraining we will train these autoencoders (which will
        # lead to chainging the weights of the MLP as well)
        # During finetunining we will finish training the SdA by doing
        # stochastich gradient descent on the MLP

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # its arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the dA are parameters of those
            # dA, but not the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shared weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                         input=self.sigmoid_layers[-1].output,
                         n_in=hidden_layers_sizes[-1], n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetunining

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)