Handwritten digit recognition

Heavily based on this notebook by amezhenin.

Loading libraries and data


In [ ]:
%pylab inline 

# pandas and numpy
# pandas is used mainly for read_csv, which is more efficient than numpy.loadtxt
import numpy as np
import pandas as pd

# scikit-learn classifiers and cross validation utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV

# scikit-learn dimension reduction
from sklearn.decomposition import PCA

# scikit-learn dataset processing utils
from sklearn.preprocessing import MinMaxScaler

Load the training data and have a quick look


In [ ]:
# Read the training set and cast every column to float64.
# np.float64 is spelled out so this cell does not depend on the bare
# `float64` name that `%pylab inline` injects into the namespace.
df = pd.read_csv('../data/train.csv').astype(np.float64)

Exploring data


In [ ]:
# Dimensions of the training frame as a (rows, columns) tuple.
df.shape

In [ ]:
# Show the docstring of DataFrame.as_matrix.
# The call form of print with a single argument behaves the same on
# Python 2 and Python 3, unlike the print statement.
# NOTE(review): as_matrix was dropped from later pandas releases; on such
# a version this cell needs `.values` / `.to_numpy` instead -- confirm the
# pandas version this notebook targets.
print(df.as_matrix.__doc__)

In [ ]:
# Peek at a small window of the frame.
# `.ix` mixed two addressing modes here: the rows were selected by label
# (integer index, end label included) while the columns, whose labels are
# strings read from the CSV header, were selected by position (end excluded).
# `.loc` / `.iloc` make each mode explicit and replace the deprecated `.ix`.
df.loc[1000:1010].iloc[:, 200:210]

In [ ]:
# Draw four training samples side by side as grayscale images.
# `subplots` is available through `%pylab inline`.
f, (ax1, ax2, ax3, ax4) = subplots(ncols=4)

imsize = (28, 28)
sample_rows = (35053, 1008, 1009, 1012)  # row labels of the digits to show
for ax, row in zip((ax1, ax2, ax3, ax4), sample_rows):
    # Skip column 0 (the column used as the class label when fitting the
    # model); the remaining values are the pixels, reshaped to `imsize`.
    # `.loc` / `.iloc` replace the deprecated `.ix` indexer.
    pixels = df.loc[row].iloc[1:].values
    ax.matshow(np.reshape(pixels, imsize), cmap='gray_r')

f.tight_layout();

Logistic regression

Preparing data


In [ ]:
from sklearn.linear_model import LogisticRegression

# Feature matrix: every column except the first, selected by position.
# `.iloc` replaces the deprecated `.ix` indexer.
X = df.iloc[:, 1:]
# Target vector: the first column as a 1-D NumPy array (np.array copies the
# column and the result is already flat, so no extra ravel is needed).
Y = np.array(df.iloc[:, 0])

Training


In [ ]:
# Fit a logistic regression with the estimator's default settings.
model = LogisticRegression()
# Sanity-check the shapes before the fit. The formatted call form prints the
# same text on Python 2 and Python 3, unlike the print statement.
print("%s %s" % (X.shape, Y.shape))
model.fit(X, Y)

Saving model


In [ ]:
# Persist the fitted model to disk.
# Newer scikit-learn releases no longer ship `sklearn.externals.joblib`, so
# prefer the standalone `joblib` package and fall back to the vendored copy
# on older installations where only that one is available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(model, 'logistic_regression_model.pkl')

Predicting test samples


In [ ]:
# Disabled: would run the fitted model on the training features X.
#prediction = model.predict(X)

In [ ]:
# Read the test set and cast every column to float64 (np.float64 is spelled
# out so the cell does not rely on the `float64` name from `%pylab inline`).
# NOTE(review): the training file is read from '../data/' but this one from
# './' -- confirm the path is intentional.
df2 = pd.read_csv('./test.csv').astype(np.float64)
# Plain ndarray copy of the whole frame; every column is used as a feature.
# np.array(df2) replaces the deprecated `df2.ix[:, :]` full-frame selection.
test = np.array(df2)

In [ ]:
# Predict a class for every row of the test matrix with the fitted model.
test_prediction = model.predict(test)

In [ ]:
# Draw the test samples labelled 0-3 side by side as grayscale images, so
# they can be compared by eye with the predictions printed further down.
# `subplots` is available through `%pylab inline`.
f, (ax1, ax2, ax3, ax4) = subplots(ncols=4)

imsize = (28, 28)
for row, ax in enumerate((ax1, ax2, ax3, ax4)):
    # The whole row is pixel data here; `.loc` replaces the deprecated `.ix`.
    ax.matshow(np.reshape(df2.loc[row].values, imsize), cmap='gray_r')

f.tight_layout();

In [ ]:
# Raw predictions for the first four test samples. The call form of print
# with one argument behaves the same on Python 2 and Python 3.
print(test_prediction[0:4])
# The targets were cast to float64 before fitting, so convert each predicted
# value to a plain int before it is formatted into the results file.
int_test_prediction = [int(i) for i in test_prediction]
print(int_test_prediction[0:4])

In [ ]:
# Write the predictions as CSV: a header row, then one "id,prediction" line
# per test sample, with ids counted from 1.
with open('./resultados.csv', 'w') as out_file:
    out_file.write("ImageId,label\n")
    out_file.writelines(
        "%s,%s\n" % (image_id, digit)
        for image_id, digit in enumerate(int_test_prediction, start=1)
    )