In [14]:
from __future__ import division
import os, time, math, csv
import cPickle as pickle
import matplotlib.pyplot as plt
import numpy as np
from print_imgs import print_imgs # my own function to print a grid of square images
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import SVC
np.random.seed(seed=1009)
%matplotlib inline
In [15]:
#%qtconsole
In [16]:
# Locations of the Kaggle CSV inputs. Later cells build full paths by plain
# string concatenation (file_path + <name>), so file_path must keep its
# trailing slash.
file_path = '../kaggle/data/'
# Deskewed variants of the train / test image matrices.
train_img_deskewed_filename = 'kaggle_trainX_deskewed.csv'
test_img_deskewed_filename = 'kaggle_testX_deskewed.csv'
# Original (non-deskewed) variants of the same matrices.
train_img_original_filename = 'kaggle_trainX.csv'
test_img_original_filename = 'kaggle_testX.csv'
# Labels exist for the training set only.
train_label_filename = 'kaggle_trainY.csv'
test_label_filename = None # test labels are not provided
In [17]:
# Fraction of the rows to keep when loading the training and test sets.
portion = 1.0 # 1.0 keeps every row; a value below 1.0 keeps only that leading fraction
In [18]:
def _read_csv_matrix(path, dtype):
    """Read a comma-delimited file into a C-contiguous array.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    dtype : numpy dtype
        Element type every field is converted to.

    Returns
    -------
    numpy.ndarray
        One array row per CSV row.
    """
    with open(path, 'r') as f:
        rows = list(csv.reader(f, delimiter=','))
    return np.ascontiguousarray(rows, dtype=dtype)


# Stack the original images on top of their deskewed variants. The
# intermediate arrays are unnamed temporaries, so they are released as soon
# as vstack has copied them.
trainX = np.vstack((
    _read_csv_matrix(file_path + train_img_original_filename, np.float64),
    _read_csv_matrix(file_path + train_img_deskewed_filename, np.float64),
))

# Both halves of trainX share one label file, so parse it once and stack it
# twice to keep labels aligned with the stacked images.
labels = _read_csv_matrix(file_path + train_label_filename, np.int8)
trainY = np.vstack((labels, labels)).ravel()
del labels

# Shuffle images and labels together so the two variants are interleaved.
trainX, trainY = shuffle(trainX, trainY, random_state=0)

# Use less data if specified. portion * n_rows is a float, and NumPy rejects
# float slice bounds, so truncate to an integer row count first.
if portion < 1.0:
    n_keep = int(portion * trainX.shape[0])
    trainX = trainX[:n_keep]
    trainY = trainY[:n_keep]

print("trainX shape: {0}".format(trainX.shape))
print("trainY shape: {0}\n".format(trainY.shape))
print(trainX.flags)
In [19]:
# Preview a grid of training images starting at a random offset. The offset
# is drawn so that 36 images (presumably one full 6 x 6 grid -- confirm
# against print_imgs) still fit before the end of the array. The true labels
# are passed in both label slots because no predictions exist yet.
grid_size = 6
first_image = np.random.randint(0, high=trainY.shape[0] - grid_size * grid_size, size=1)[0]
print_imgs(images=trainX,
           actual_labels=trainY,
           predicted_labels=trainY,
           starting_index=first_image,
           size=grid_size)
In [20]:
# Load the deskewed test images into a C-contiguous float64 matrix.
with open(file_path + test_img_deskewed_filename, 'r') as f:
    rows = list(csv.reader(f, delimiter=','))
testX = np.ascontiguousarray(rows, dtype=np.float64)
del rows

# Do NOT shuffle testX: row order defines the ImageId in the submission file.

# Use less data if specified. portion * n_rows is a float, and NumPy rejects
# float slice bounds, so truncate to an integer row count first.
if portion < 1.0:
    testX = testX[:int(portion * testX.shape[0])]

testY = None  # test labels are not provided
data = None

# Keep an untouched copy: after PCA, testX no longer holds displayable images.
testX_original = testX.copy()

print("testX shape: {0}".format(testX.shape))
In [21]:
# Fit a whitening PCA on the training images and project both sets onto it.
# A fractional n_components asks scikit-learn to keep as many components as
# are needed to explain that share of the variance (0.85 here).
# NOTE: trainX and testX are rebound to their projections, so this cell must
# not be re-run without reloading the data first.
t0 = time.time()
pca = PCA(n_components=0.85, whiten=True)
trainX = pca.fit_transform(trainX)
testX = pca.transform(testX)
elapsed_minutes = (time.time() - t0) / 60

print("trainX shape: {0}".format(trainX.shape))
print("trainY shape: {0}\n".format(trainY.shape))
print("testX shape: {0}".format(testX.shape))
print("\ntime in minutes {0:.2f}".format(elapsed_minutes))
In [22]:
# Baseline SVC settings, spelled out in one place for reference.
default_svc_params = {
    'C': 1.0,               # penalty
    'class_weight': None,   # class_weight[i]*C becomes the C of class i;
                            # set to 'auto' for unbalanced classes
    'gamma': 0.0,           # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'kernel': 'rbf',        # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
                            # or a callable; 'sigmoid' is discouraged
    'shrinking': True,      # whether to use the shrinking heuristic
    'probability': False,   # whether to enable probability estimates
    'tol': 0.001,           # tolerance for the stopping criterion
    'cache_size': 200,      # size of the kernel cache (in MB)
    'max_iter': -1,         # solver iteration limit, -1 for no limit
    'verbose': False,
    'degree': 3,            # 'poly' only
    'coef0': 0.0,           # 'poly' and 'sigmoid' only
}

# Settings used for this run: the baseline with C, gamma and the kernel cache
# size overridden. Add verbose=True here to turn on solver output.
svc_params = dict(
    default_svc_params,
    C=2.9470517025518097,
    gamma=0.015998587196060572,
    cache_size=2000,
)

# Build the (still unfitted) classifier.
svc_clf = SVC(**svc_params)
In [23]:
# Train the classifier on the PCA-projected training set and time the run.
t0 = time.time()
svc_clf.fit(trainX, trainY)

# Save the fitted classifier to disk. The with-block closes the file handle
# deterministically; passing a bare open(...) to pickle.dump leaves it open.
with open('../kaggle/SVC_RBF_PCA.pkl', 'wb') as pkl_file:
    pickle.dump(svc_clf, pkl_file)

print("time in minutes {0:.2f}".format((time.time()-t0)/60))
In [24]:
# Predict a label for every row of the projected test set and report how
# long the pass took.
t0 = time.time()
predicted_values = svc_clf.predict(testX)
elapsed_minutes = (time.time() - t0) / 60
print("time in minutes {0:.2f}".format(elapsed_minutes))
In [25]:
# Reference labels for the first 36 test images, laid out six per row.
# NOTE(review): the test set ships without labels, so these were presumably
# entered by hand -- confirm their origin.
testY_first36 = [
    2, 0, 9, 0, 3, 7,
    0, 3, 0, 3, 5, 7,
    4, 0, 4, 3, 3, 1,
    9, 0, 9, 1, 1, 5,
    7, 4, 2, 7, 4, 7,
    7, 5, 4, 2, 6, 2,
]

# Draw the untransformed test images (testX itself now holds PCA scores)
# next to the reference and predicted labels.
print_imgs(
    images=testX_original,
    actual_labels=testY_first36,
    predicted_labels=predicted_values,
    starting_index=0,
    size=6,
)
In [26]:
# Write the submission file: a quoted header line, then one row per
# prediction carrying a 1-based image id and the quoted predicted label.
submission_path = '../kaggle/submissions/submission_SVC_RBF_PCA.csv'
with open(submission_path, 'w') as f_result:
    f_result.write('"ImageId","Label"\n')
    f_result.writelines(
        '{},"{}"\n'.format(image_id, label)
        for image_id, label in enumerate(predicted_values, 1)
    )
In [ ]: