In [1]:
import csv
import numpy as np

Dataset Input


In [2]:
## Train Dataset.
# Every data row of train.csv is parsed as integers: the first value is taken
# as the digit, the remaining values as the image vector.
images = [] # Images as vectors
digits = [] # Digits in the images

with open('../data/train.csv', 'rb') as trainfile:
    trainread = csv.reader(trainfile, delimiter=',', quotechar="'")
    next(trainread) # Discards the header line

    for row in trainread:
        values = list(map(int, row))
        label, pixels = values[0], values[1:]
        digits.append(label)
        images.append(pixels)

In [3]:
## Test Dataset
# Every data row of test.csv is parsed as integers and kept whole as one
# image vector (no value is split off as a label here).
test_images = []

with open('../data/test.csv', 'rb') as testfile:
    testread = csv.reader(testfile, delimiter=',', quotechar="'")
    next(testread) # Discards the header line

    test_images.extend([int(value) for value in row] for row in testread)

Submissions


In [5]:
def submit(prediction,file):
    """Write predictions to '../data/<file>.csv' as a two-column CSV.

    The first row is the header ``ImageId,Label``. Each following row holds
    the 1-based position of a prediction and the prediction itself.

    prediction -- sequence of predicted labels, in test-set order
    file       -- output file name, without directory or '.csv' extension
    """
    header = ['ImageId','Label']
    with open('../data/' + file + '.csv', 'wb') as outfile:
        predwriter = csv.writer(outfile, delimiter=",", quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        predwriter.writerow(header)
        # ImageId starts at 1, hence enumerate(..., 1)
        predwriter.writerows([image_id, label]
                             for image_id, label in enumerate(prediction, 1))

KNN Classifier


In [3]:
## KNN Classifier
# Training
from sklearn import neighbors
from sklearn import cross_validation
from sklearn.cross_validation import StratifiedShuffleSplit

train = []
target = []
sample = StratifiedShuffleSplit(digits, 1, train_size=0.1, random_state=0)

# Subsample the training set to do cross validation later
# Only the train side of the single split is collected; its test side is unused.
for train_index, test_index in sample:
    train.extend(images[row] for row in train_index)
    target.extend(digits[row] for row in train_index)

# Convert the subsample to numpy arrays
train = np.array(train)
target = np.array(target)

max_k = 1000  # candidates are n_neighbors = 1 .. max_k
k = 0         # 0-based index of the best candidate; its neighbour count is k + 1
maximum = 0   # best mean cross-validation score seen so far

# Find the best value for k
for i in range(max_k):
    knn = neighbors.KNeighborsClassifier(n_neighbors=i + 1)
    # Mean score over the 3 cross-validation folds
    fold_scores = cross_validation.cross_val_score(knn, train, target, cv=3, n_jobs=-1)
    score = fold_scores.mean()
    # Strict comparison: on a tie the earlier (smaller) candidate is kept
    if score > maximum:
        k, maximum = i, score

# Refit the selected model on the complete training set
knn = neighbors.KNeighborsClassifier(n_neighbors=k + 1)
knn.fit(images, digits)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-b0291922b4be> in <module>()
      7 train = []
      8 target = []
----> 9 sample = StratifiedShuffleSplit(digits, 1, train_size=0.1, random_state=0)
     10 
     11 # Subsample the training set to do cross validation later

NameError: name 'digits' is not defined

In [ ]:
# Prediction: label the test images, then convert the result to a list
knn_labels = knn.predict(test_images)
knn_prediction = knn_labels.tolist()

In [ ]:
# Submission
# The file name carries the neighbour count of the fitted model, read from
# the estimator itself. `k` is not used for this: it is the 0-based search
# index (the model is built with n_neighbors=k+1) and, being a plain
# notebook global, it can be rebound by any other cell.
submit(knn_prediction,"digits_knn." + str(knn.n_neighbors))

Random Forest


In [6]:
## Random Forests Classifier
# Training
from sklearn.ensemble import RandomForestClassifier

# Number of trees in the forest. It has its own name so that `k`, which
# holds the result of the KNN search, is not rebound by this cell.
n_trees = 100
forest = RandomForestClassifier(n_estimators=n_trees, n_jobs=-1)
forest.fit(images, digits)


Out[6]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0)

In [7]:
# Prediction: label the test images, then convert the result to a list
forest_labels = forest.predict(test_images)
forest_prediction = forest_labels.tolist()

In [8]:
# Submission: written to ../data/forest_digits1.csv
submit(prediction=forest_prediction, file="forest_digits1")

Naive Bayes


In [11]:
## Naive Bayes Classifier
# Training on the complete training set
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X=images, y=digits)


Out[11]:
GaussianNB()

In [12]:
# Prediction: label the test images, then convert the result to a list
nb_labels = nb.predict(test_images)
nb_prediction = nb_labels.tolist()

In [13]:
# Submission: written to ../data/nbayes_digits1.csv
submit(prediction=nb_prediction, file="nbayes_digits1")

In [ ]: