In [1]:
#import libraries
import os
import cv2
import numpy as np
from scipy.sparse import lil_matrix
from sklearn.decomposition import RandomizedPCA
from sklearn import cross_validation
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from time import time
from sklearn.grid_search import GridSearchCV
from sklearn.neural_network import BernoulliRBM


wd = '/home/mckc/Downloads/Dogs and cats/' #change these paths to match your local setup
dataTrainDir = '/home/mckc/Downloads/Dogs and cats/train/'
dataTestDir = '/home/mckc/Downloads/Dogs and cats/test1/'

os.chdir(wd)

labels = ['cat.', 'dog.']
desiredDimensions = [30, 30]
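
#Quick sanity check of the data layout (a minimal sketch, assuming the Kaggle
#"Dogs vs. Cats" naming: train/cat.0.jpg ... train/dog.12499.jpg and
#test1/1.jpg ... test1/12500.jpg):
for d in (dataTrainDir, dataTestDir):
    print(d, len(os.listdir(d)))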

In [2]:
#define the loading and pre-processing function: grayscale, mean-centre, resize
def preprocessImg(animal, number, dim1, dim2, dataDir):
    imageName = '{0:s}{1:s}{2:d}{3:s}'.format(dataDir, animal, number, '.jpg')
    npImage = cv2.imread(imageName)
    npImage = cv2.cvtColor(npImage, cv2.COLOR_BGR2GRAY)
    #subtract the image's own mean intensity; broadcasting handles the shape
    #(an earlier variant divided by 255 instead of mean-centering)
    npImage = npImage - np.mean(npImage)
    #cv2.resize expects (width, height)
    npImage = cv2.resize(npImage, (dim2, dim1))
    return npImage.reshape(1, dim1 * dim2)
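
#Example usage (assumes train/cat.0.jpg exists): preprocess one image and
#check that it comes back as a 1 x 900 row vector for 30 x 30 dimensions.
sample = preprocessImg('cat.', 0, desiredDimensions[0], desiredDimensions[1], dataTrainDir)
print(sample.shape)  #(1, 900)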

#m = 1000 #small subset of the train set, for quick tests
m = 12500 #full train set: images per class
mTest = 12500 #number of images in the test set


#shuffle the 2*m training rows; keep image indexes and labels aligned
indexesIm = np.random.permutation(m * len(labels))
idxImages = np.tile(range(m), len(labels))
idxImages = idxImages[indexesIm]
testIndexes = range(len(indexesIm), len(indexesIm) + mTest)
y = np.append(np.tile(0, m), np.tile(1, m)) #0 = cat, 1 = dog
y = y[indexesIm]

def animalInput(theNumber):
    if theNumber == 0:
        return 'cat.'
    elif theNumber == 1:
        return 'dog.'
    else:
        return ''
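
#Sanity check of the shuffled bookkeeping: row i of the data matrix will hold
#image idxImages[i] with label y[i], i.e. the file printed below.
i = 0
print('{0:s}{1:s}{2:d}.jpg -> label {3:d}'.format(dataTrainDir, animalInput(y[i]), idxImages[i], y[i]))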

In [5]:
#Build the sparse matrix with the preprocessed image data for both train and test data
bigMatrix = lil_matrix((len(indexesIm) + len(testIndexes), desiredDimensions[0] * desiredDimensions[1]))

for i in range(len(indexesIm)):
    bigMatrix[i, :] = preprocessImg(animalInput(y[i]), idxImages[i], desiredDimensions[0], desiredDimensions[1], dataTrainDir)

#test images are named 1.jpg ... 12500.jpg, with no class prefix
for ii in range(mTest):
    bigMatrix[testIndexes[ii], :] = preprocessImg('', ii + 1, desiredDimensions[0], desiredDimensions[1], dataTestDir)

#Transform to csr matrix and standardize (with_mean=False keeps it sparse)
bigMatrix = bigMatrix.tocsr()
bigMatrix = preprocessing.scale(bigMatrix, with_mean=False)
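
#Why with_mean=False: centering would densify the sparse matrix, so sklearn
#refuses it. A minimal demonstration of the failure mode:
try:
    preprocessing.scale(bigMatrix)  #default with_mean=True
except ValueError as e:
    print(e)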

#extract features with neural nets (Restricted Boltzmann Machine)
#RBM = BernoulliRBM(verbose = True)
#RBM.learning_rate = 0.06
#RBM.n_iter = 20
#RBM.n_components = 100
#min_max_scaler = preprocessing.MinMaxScaler()
#RBM.fit(min_max_scaler.fit_transform(bigMatrix.todense()))

#Reduce features to the main components so that they contain 99% of the variance.
#First fit with a generous component count to measure the variance ratios.
denseMatrix = bigMatrix.toarray() #RandomizedPCA needs a dense array; np.array(sparse) builds a 0-d object array and fails
pca = RandomizedPCA(n_components=250, whiten = True)
pca.fit(denseMatrix)
varianceExplained = pca.explained_variance_ratio_
print(pca.explained_variance_ratio_)

def anonFunOne(vector):
    #number of leading components whose variance ratios sum past 0.99
    variance = 0
    for ii in range(len(vector)):
        variance += vector[ii]
        if variance > 0.99:
            return ii + 1
    return len(vector)

#refit with just enough components: n_components is the right knob, not the y argument
pca = RandomizedPCA(n_components = anonFunOne(varianceExplained), whiten = True)
bigMatrix = pca.fit_transform(denseMatrix)
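
#Equivalent vectorized cutoff (a numpy-only sketch that should match
#anonFunOne): first index where the cumulative variance ratio passes 0.99.
nComponents99 = int(np.searchsorted(np.cumsum(varianceExplained), 0.99) + 1)
print(nComponents99)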

#Divide the train matrix from the test matrix (for which I don't have labels)
trainMatrixReduced = bigMatrix[0:len(indexesIm), :] #max(indexesIm) here would drop the last training row
testMatrixReduced = bigMatrix[len(indexesIm):, :]
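
#Shape check: 25000 labelled rows and 12500 unlabelled rows are expected here.
print(trainMatrixReduced.shape, testMatrixReduced.shape)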


In [ ]:
#Divide the dataset for cross-validation purposes
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    trainMatrixReduced, y, test_size=0.4, random_state=0)

#grid search over hyperparameters
#create a classifier; probability=True is needed for predict_proba below
clf = svm.SVC(probability = True, verbose = True)

# specify the parameter grid to search
params2Test = {'C': [1, 3, 10, 30, 100, 300], 'gamma': [0.001], 'kernel': ['rbf']}

#run the grid search
grid_search = GridSearchCV(clf, param_grid = params2Test)

start = time()
grid_search.fit(trainMatrixReduced, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time() - start, len(grid_search.grid_scores_)))
grid_search.grid_scores_
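
#The fitted search also exposes the winning configuration directly, which can
#seed the final model below:
print(grid_search.best_params_)
print(grid_search.best_score_)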

#Machine Learning part
#Support vector machine model
clf.fit(X_train, y_train)

#prediction
predictionFromDataset = clf.predict(X_test)

correctValues = sum(predictionFromDataset == y_test)
percentage = float(correctValues)/len(y_test)

print(percentage)

#prediction probabilities and ROC analysis
predictionFromDataset2 = clf.predict_proba(X_test)
predictionFromDataset2 = predictionFromDataset2[:, 1] #probability of class 1 (dog)
fpr, tpr, thresholds = metrics.roc_curve(y_test, predictionFromDataset2)
rocAuc = metrics.auc(fpr, tpr) #area under the ROC curve, not a probability
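
#Optional ROC plot (a minimal sketch; assumes matplotlib is installed, which
#is not imported above).
import matplotlib.pyplot as plt
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve, AUC = %.3f' % rocAuc)
plt.show()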

#Predict images from the test set
#Train the model on the full training set with the best grid-search parameters
clf = svm.SVC(C = 10, gamma = 0.001, kernel = 'rbf', verbose = True)
clf.fit(trainMatrixReduced, y)

#Prediction
#predictionFromTest = clf.predict_proba(testMatrixReduced)
predictionFromTest = clf.predict(testMatrixReduced)
#label = predictionFromTest[:, 1]
idVector = range(1, mTest + 1)

#predictionsToCsv = np.column_stack((idVector, label))
predictionsToCsv = np.column_stack((idVector, predictionFromTest))
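
#Write the predictions in Kaggle submission format (a sketch: the file name
#'submission.csv' and the 'id,label' header are my choices, not given above).
np.savetxt('submission.csv', predictionsToCsv, fmt='%d,%d', header='id,label', comments='')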

In [ ]: