You should start the IPython notebook server with the "--pylab inline" flag (i.e. run "ipython notebook --pylab inline"), so that numpy and matplotlib names such as imshow, mean, and identity are available without explicit imports.
In [1]:
import sys
sys.path.append('../src')
import csv
In [2]:
from numpy import loadtxt
from numpy.random import uniform
from sklearn.decomposition import PCA
from sklearn.cross_validation import train_test_split
from matplotlib.cm import binary_r
from FeedforwardNeuNet import sigmoid, NnLayer, FeedforwardNeuNet
from CostFunc import courseraML_CostFunc, courseraML_CostFuncGrad
Check http://www.kaggle.com/c/digit-recognizer/data for the data format:
In [3]:
train_targetsAndInputs=loadtxt('../demoDataSet/kaggle_digit_train.csv',delimiter=',',skiprows=1)
test_inputsOrig=loadtxt('../demoDataSet/kaggle_digit_test.csv',delimiter=',',skiprows=1)
test_rf_benchmark=loadtxt('../demoDataSet/kaggle_digit_rf_benchmark.csv',delimiter=',',skiprows=1)
y = train_targetsAndInputs[:,0][:, None] # make each target a row in 2D array
identityArr = identity(10)
# one-hot encode the targets: row d of identityArr is the encoding of digit d
targets = select([y == d for d in range(10)], [identityArr[d] for d in range(10)])
trainingSetNormalized=0
PCAonTrainSet=0
inputs=train_targetsAndInputs[:,1:]
pcaReducedDimTo=inputs[0].size
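For reference, the same one-hot matrix can also be built by fancy indexing into the identity matrix; this is just an equivalent sketch (targets_alt is my name and is not used elsewhere in the notebook):
# each target digit selects its matching row of the 10x10 identity matrix
targets_alt=identityArr[train_targetsAndInputs[:,0].astype(int)]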
In [4]:
imshow(reshape(train_targetsAndInputs[-1,1:], (28, 28)), binary_r)
print('target: {0}'.format(train_targetsAndInputs[-1,0]))
Normalizing training set and test set:
In [5]:
trainingSetNormalized=1
inputs=(inputs-mean(inputs))/std(inputs) #normalizing inputs
test_inputs=(test_inputsOrig-mean(test_inputsOrig))/std(test_inputsOrig) # normalize test_inputsOrig
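Note that the test set above is normalized with its own mean and std. A variant that reuses the training-set statistics, so both sets get the same shift and scale (a sketch of an alternative design choice, not what this notebook does; mu, sigma, and the *_alt names are mine):
# alternative: compute statistics once on the training inputs and reuse them
mu, sigma=mean(train_targetsAndInputs[:,1:]), std(train_targetsAndInputs[:,1:])
inputs_alt=(train_targetsAndInputs[:,1:]-mu)/sigma
test_inputs_alt=(test_inputsOrig-mu)/sigma # same shift and scale as the training data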
Do NOT use the PCA code here to reduce the input dimension, because it messed up the data and resulted in slow nn training:
In [ ]:
# If I use PCA, then it takes forever to train my nn
PCAonTrainSet=1
pcaReducedDimTo=400
pca = PCA(pcaReducedDimTo)
pca.fit(inputs)
print(pca.explained_variance_ratio_)
pca_inputs=pca.transform(inputs)
pca_test_inputs=pca.transform(test_inputs) # project the test set with the PCA already fitted on the training inputs; refitting on the test set would use a different basis
Split training data into training set and CV set:
In [6]:
nnInputs=pca_inputs if PCAonTrainSet else inputs
nnInputs_train, nnInputs_cv, targets_train, targets_cv = train_test_split(nnInputs, targets, test_size=0.3, random_state=0)
I initialize weights using the "effective strategy for random initialization" described in the Coursera Stanford Machine Learning course (homework assignment 4, ex4.pdf, p. 7): draw each weight uniformly from [-epsilon, epsilon] with epsilon = sqrt(6)/sqrt(L_in + L_out) (the code below also counts the bias unit on each side).
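A minimal sketch of that initialization for a single layer (initLayerWeights, fanIn, and fanOut are my names, not part of the FeedforwardNeuNet API):
def initLayerWeights(fanIn, fanOut):
    # epsilon = sqrt(6)/sqrt(L_in + L_out), counting the bias unit on each side
    initRange=6**0.5/(fanIn+1+fanOut+1)**0.5
    # one row per output unit; one column per input unit, plus a bias column
    return uniform(-initRange, initRange, (fanOut, fanIn+1))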
In [7]:
errRate_train, errRate_cv=[],[]
In [138]:
maxIter=20
start, end=3,6 # inclusive
for numOfLyExcOutputLy in xrange(start, end+1):
    layersExOutputLy=[]
    dimReducedTo=pcaReducedDimTo
    listOfNumOfUnits=range(10, dimReducedTo, dimReducedTo/numOfLyExcOutputLy)
    for n in reversed(listOfNumOfUnits):
        layersExOutputLy.append(NnLayer(sigmoid, dimReducedTo, 1, n)) # input layer and each hidden layer; no need to create an output layer
        initRange=6**0.5/(dimReducedTo+1+n+1)**0.5
        layersExOutputLy[-1].updateForwardWeight(uniform(-initRange, initRange, (n,dimReducedTo+1)))
        dimReducedTo=n
    nn = FeedforwardNeuNet(layersExOutputLy, 1, 0.01, 0)
    nn.train(nnInputs_train, targets_train, courseraML_CostFunc, courseraML_CostFuncGrad, maxIter)
    predictionsOnTrainSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_train)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_train,1)==predictionsOnTrainSet))
    errRate_train.append(numPredictErr*1.0/len(predictionsOnTrainSet))
    predictionsOnCVSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_cv)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_cv,1)==predictionsOnCVSet))
    errRate_cv.append(numPredictErr*1.0/len(predictionsOnCVSet))
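Each error rate above is the fraction of examples whose predicted digit differs from the target digit; an equivalent, more compact computation (just a sketch for reference):
# the mean of a boolean array is the fraction of mismatches
errRate=mean(argmax(targets_train,1)!=predictionsOnTrainSet)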
In [139]:
fig, ax = plt.subplots()
ax.plot(range(start,end+1), errRate_cv, lw=2, label = 'cross-validation error')
ax.plot(range(start,end+1), errRate_train, lw=2, label = 'training error')
ax.legend(loc=0)
ax.set_xlabel('num of layers')
ax.set_ylabel('prediction error rate')
Out[139]:
It seems like using more layers is not necessarily better, so I choose a 3-layer setup.
In [20]:
errRate_train, errRate_cv=[],[]
In [21]:
maxIter=20
start, end=50, 500 # inclusive
for fstHidUnt in xrange(start,end+1,100):
    layersExOutputLy=[]
    listOfNumOfUnits=[10, 40, fstHidUnt]
    dimReducedTo=pcaReducedDimTo
    for n in reversed(listOfNumOfUnits):
        layersExOutputLy.append(NnLayer(sigmoid, dimReducedTo, 1, n)) # input layer and each hidden layer; no need to create an output layer
        initRange=6**0.5/(dimReducedTo+1+n+1)**0.5
        layersExOutputLy[-1].updateForwardWeight(uniform(-initRange, initRange, (n,dimReducedTo+1)))
        dimReducedTo=n
    nn = FeedforwardNeuNet(layersExOutputLy, 1, 0.01, 0)
    nn.train(nnInputs_train, targets_train, courseraML_CostFunc, courseraML_CostFuncGrad, maxIter)
    predictionsOnTrainSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_train)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_train,1)==predictionsOnTrainSet))
    errRate_train.append(numPredictErr*1.0/len(predictionsOnTrainSet))
    predictionsOnCVSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_cv)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_cv,1)==predictionsOnCVSet))
    errRate_cv.append(numPredictErr*1.0/len(predictionsOnCVSet))
In [25]:
fig, ax = plt.subplots()
ax.plot(range(start,end+1,100), errRate_cv, lw=2, label = 'cross-validation error')
ax.plot(range(start,end+1,100), errRate_train, lw=2, label = 'training error')
ax.legend(loc=0)
ax.set_xlabel('num of units in 1st hidden layer')
ax.set_ylabel('prediction error rate')
Out[25]:
In [24]:
(errRate_cv, errRate_train)
Out[24]:
Based on the error rate on the CV set, I set the number of hidden units in the 1st hidden layer to 200.
In [30]:
errRate_train, errRate_cv=[],[]
In [31]:
maxIter=20
start, end=50, 150 # inclusive
for fstHidUnt in xrange(start, end+1, 20):
    layersExOutputLy=[]
    listOfNumOfUnits=[10, fstHidUnt, 200]
    dimReducedTo=pcaReducedDimTo
    for n in reversed(listOfNumOfUnits):
        layersExOutputLy.append(NnLayer(sigmoid, dimReducedTo, 1, n)) # input layer and each hidden layer; no need to create an output layer
        initRange=6**0.5/(dimReducedTo+1+n+1)**0.5
        layersExOutputLy[-1].updateForwardWeight(uniform(-initRange, initRange, (n,dimReducedTo+1)))
        dimReducedTo=n
    nn = FeedforwardNeuNet(layersExOutputLy, 1, 0.01, 0)
    nn.train(nnInputs_train, targets_train, courseraML_CostFunc, courseraML_CostFuncGrad, maxIter)
    predictionsOnTrainSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_train)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_train,1)==predictionsOnTrainSet))
    errRate_train.append(numPredictErr*1.0/len(predictionsOnTrainSet))
    predictionsOnCVSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_cv)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_cv,1)==predictionsOnCVSet))
    errRate_cv.append(numPredictErr*1.0/len(predictionsOnCVSet))
In [35]:
fig, ax = plt.subplots()
ax.plot(range(start,end+1,20), errRate_cv, lw=2, label = 'cross-validation error')
ax.plot(range(start,end+1,20), errRate_train, lw=2, label = 'training error')
ax.legend(loc=0)
ax.set_xlabel('num of units in 2nd hidden layer')
ax.set_ylabel('prediction error rate')
Out[35]:
In [33]:
(errRate_cv, errRate_train)
Out[33]:
Based on the error rate on the CV set, I set the number of hidden units in the 2nd hidden layer to 70.
In [5]:
maxIter=150
layersExOutputLy=[]
listOfNumOfUnits=[10, 70, 200]
dimReducedTo=pcaReducedDimTo
for n in reversed(listOfNumOfUnits):
    # Only the input layer and the hidden layers are created; there is no need to create an output
    # layer because, in my implementation, each layer calls the activation function on its weighted
    # inputs (plus its weighted bias) and passes the result to the next layer. In other words,
    # nn[:] (or nn.outputs, when given multiple inputs) serves as the output layer, and all it does
    # is store the output values of the last hidden layer.
    layersExOutputLy.append(NnLayer(sigmoid, dimReducedTo, 1, n))
    initRange=6**0.5/(dimReducedTo+1+n+1)**0.5
    layersExOutputLy[-1].updateForwardWeight(uniform(-initRange, initRange, (n,dimReducedTo+1)))
    dimReducedTo=n
nn = FeedforwardNeuNet(layersExOutputLy, 1, 0.01, 0)
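A minimal numpy sketch of the forward pass described in the comment above, using the pylab namespace (forwardPass is a hypothetical helper for illustration, not the FeedforwardNeuNet API):
def forwardPass(inputs, weightsPerLayer, activation=sigmoid):
    # inputs: (numExamples, numFeatures); each weight matrix is (fanOut, fanIn+1)
    a=inputs
    for W in weightsPerLayer:
        a=hstack((ones((a.shape[0],1)), a)) # prepend the bias column
        a=activation(a.dot(W.T)) # weighted sum, then activation
    return a # the last layer's outputs serve as the network output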
In [37]:
nn.train(nnInputs_train, targets_train, courseraML_CostFunc, courseraML_CostFuncGrad, maxIter)
In [38]:
predictionsOnTrainSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_train)),1)
numPredictErr=count_nonzero(logical_not(argmax(targets_train,1)==predictionsOnTrainSet))
errRate_train=numPredictErr*1.0/len(predictionsOnTrainSet)
predictionsOnCVSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_cv)),1)
numPredictErr=count_nonzero(logical_not(argmax(targets_cv,1)==predictionsOnCVSet))
errRate_cv=numPredictErr*1.0/len(predictionsOnCVSet)
(errRate_train, errRate_cv)
Out[38]:
Before using cross validation for model selection, I trained my nn with casually picked hyperparameters:
I used the following code snippet to visualize the impact of gradually increasing the max iteration of my nn.train() function:
In [ ]:
errRate_train,errRate_cv=[],[]
start, end=80,150 # inclusive, this segment of code takes pretty long to finish
for maxI in range(start,end+1,10):
    initRange=6**0.5/(dimReducedTo+1+40+1)**0.5 # epsilon from this layer's fan sizes
    nn.layersExOutputLy[0].updateForwardWeight(uniform(-initRange, initRange, (40,dimReducedTo+1))) # treat a layer as a column vector
    initRange=6**0.5/(40+1+20+1)**0.5
    nn.layersExOutputLy[1].updateForwardWeight(uniform(-initRange, initRange, (20,41)))
    initRange=6**0.5/(20+1+10+1)**0.5
    nn.layersExOutputLy[2].updateForwardWeight(uniform(-initRange, initRange, (10,21)))
    nn.train(nnInputs_train, targets_train, courseraML_CostFunc, courseraML_CostFuncGrad, maxI)
    predictionsOnTrainSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_train)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_train,1)==predictionsOnTrainSet))
    errRate_train.append(numPredictErr*1.0/len(predictionsOnTrainSet))
    predictionsOnCVSet=argmax(asarray(nn.forwardPropogateAllInput(nnInputs_cv)),1)
    numPredictErr=count_nonzero(logical_not(argmax(targets_cv,1)==predictionsOnCVSet))
    errRate_cv.append(numPredictErr*1.0/len(predictionsOnCVSet))
In [80]:
fig, ax = plt.subplots()
ax.plot(range(start,end+1,10), errRate_cv, lw=2, label = 'cross-validation error')
ax.plot(range(start,end+1,10), errRate_train, lw=2, label = 'training error')
ax.legend(loc=0)
ax.set_xlabel('max iteration of train()')
ax.set_ylabel('prediction error rate')
Out[80]:
In [81]:
(errRate_train,errRate_cv)
Out[81]:
The value of the cost function given the optimized weights at each max-iteration setting:
It's clear that the cost function converges at ~130 iterations, and this preliminary model suffers from higher error rates on both the training set and the CV set. In fact, the preliminary model's error rates are:
Conclusion: Cross validation is truly an amazing tool for model selection!
In [28]:
with open('Kaggle_digits_recognition_configuration.csv', 'ab') as csvfile:
    csvWriter=csv.writer(csvfile)
    csvWriter.writerow(['numOfLayers','numOfUnitsExcBias','fmin_cg_maxIter','trainingSetNormalized','PCAonTrainSet','reducedDimAfterPCA'])
In [40]:
whichConfig=4
dimReducedTo=len(nn.layersExOutputLy[0])-1
for i,ly in enumerate(nn.layersExOutputLy):
    save('./Kaggle_digits_recognition_optWeight{0}_{1}'.format(whichConfig,i), ly._NnLayer__forwardWeight)
with open('Kaggle_digits_recognition_configuration.csv', 'ab') as csvfile:
    csvWriter = csv.writer(csvfile)
    data=[len(nn.layersExOutputLy)]
    for ly in nn.layersExOutputLy:
        data.append(len(ly)-1)
    data.append(maxIter)
    data.append(trainingSetNormalized)
    data.append(PCAonTrainSet)
    data.append(dimReducedTo)
    csvWriter.writerow(data)
In [53]:
nn_testInputs=pca_test_inputs if PCAonTrainSet else test_inputs
whichExample=109
imshow(reshape(nn_testInputs[whichExample], (28, 28)), binary_r)
print('My neural network\'s prediction: {0}'.format(argmax(nn.forwardPropogateOneInput(nn_testInputs[whichExample]))))
print('Testing example:')
In [43]:
indexAndPredictions=array((arange(1,len(nn_testInputs)+1),argmax(asarray(nn.forwardPropogateAllInput(nn_testInputs)),1))).T
In [49]:
count_nonzero(logical_not(int64(test_rf_benchmark)[:,1]==indexAndPredictions[:,1]))
Out[49]:
In [45]:
logical_not(int64(test_rf_benchmark)[:,1]==indexAndPredictions[:,1]).nonzero()[0] # image index of difference
Out[45]:
In [46]:
savetxt('./Kaggle_digits_recognition_nnPrediction{0}.csv'.format(whichConfig),indexAndPredictions,'%d',delimiter=',',comments='',header='ImageId,Label')
Many thanks to Jake VanderPlas for his awesome introduction to scikit-learn and his work at https://github.com/jakevdp/sklearn_pycon2013.