In [29]:
import numpy as np
import pandas as pd

def accuracy(pairs):
    # fraction of (predicted, actual) pairs that agree
    pairs = list(pairs)
    correct = 0
    for pred, actual in pairs:
        if pred == actual:
            correct += 1
    return correct / len(pairs)

# read the training and test data files
fileNameTrain = 'digist123-1.csv'
train_data = pd.read_csv(fileNameTrain, header=None, sep=';').values
fileNameTest = 'digist123-2.csv'
test_data = pd.read_csv(fileNameTest, header=None, sep=';').values

# each row holds the feature values X; the last column is the label Y
X = train_data[:, :-1]
Y = train_data[:, -1]
X_test = test_data[:, :-1]
Y_test = test_data[:, -1]


# Gaussian naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X, Y)
predicted = gnb.predict(X_test)
print(accuracy(zip(predicted, Y_test)))

# logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X, Y)
predicted = lr.predict(X_test)
print(accuracy(zip(predicted, Y_test)))


0.85
0.9625
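
The hand-rolled accuracy helper can be cross-checked against scikit-learn's built-in scoring; a minimal sketch, reusing the fitted gnb and lr and the X_test, Y_test arrays from the cell above (the expected values are the ones printed there):

In [ ]:
# sanity check: accuracy_score and the estimators' own score() method
# should reproduce the numbers printed above
from sklearn.metrics import accuracy_score

print(accuracy_score(Y_test, gnb.predict(X_test)))   # expect 0.85
print(accuracy_score(Y_test, lr.predict(X_test)))    # expect 0.9625
print(gnb.score(X_test, Y_test), lr.score(X_test, Y_test))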

As the test accuracies show, logistic regression (96.25%) classifies the digits considerably better than Gaussian naive Bayes (85%) on this data set.
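
To see where Gaussian naive Bayes loses accuracy relative to logistic regression, a per-digit breakdown is useful; the sketch below assumes the fitted gnb and lr objects from the cell above and uses scikit-learn's confusion_matrix:

In [ ]:
from sklearn.metrics import confusion_matrix

# rows = true digit, columns = predicted digit;
# off-diagonal counts show which digits each model confuses
print(confusion_matrix(Y_test, gnb.predict(X_test)))
print(confusion_matrix(Y_test, lr.predict(X_test)))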

