In [29]:
from __future__ import division
import numpy as np
import pandas as pd
def accuracy(lst):
    # fraction of (predicted, actual) pairs that match
    correct = 0
    for pair in lst:
        if pair[0] == pair[1]:
            correct += 1
    return correct / len(lst)
# read in the training and test files
fileNameTrain = 'digist123-1.csv'
train_data = pd.read_csv(fileNameTrain, header=None, sep=';').values
fileNameTest = 'digist123-2.csv'
test_data = pd.read_csv(fileNameTest, header=None, sep=';').values
# each row holds the features (X); the last column is the label (Y)
X = np.array(train_data[:, :-1])
Y = np.array(train_data[:,-1])
X_test = np.array(test_data[:, :-1])
Y_test = np.array(test_data[:,-1])
# import the Gaussian naive Bayes classifier from scikit-learn
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X, Y)
predicted = gnb.predict(X_test)
print(accuracy(list(zip(predicted, Y_test))))
# import the logistic regression classifier from scikit-learn
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X, Y)
predicted = lr.predict(X_test)
print(accuracy(list(zip(predicted, Y_test))))
As we can see, logistic regression classifies these digits considerably more accurately than Gaussian naive Bayes.
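To see where the two models differ, a per-class breakdown can help. The cell below is a minimal sketch, assuming gnb, lr, X_test and Y_test from the cell above are still in memory; it uses scikit-learn's accuracy_score and classification_report to show precision and recall for each digit.
In [ ]:
# Compare the two fitted classifiers per class.
# Assumes gnb, lr, X_test and Y_test are defined by the previous cell.
from sklearn.metrics import accuracy_score, classification_report
for name, clf in [('Gaussian naive Bayes', gnb), ('Logistic regression', lr)]:
    pred = clf.predict(X_test)
    print(name, 'accuracy:', accuracy_score(Y_test, pred))
    print(classification_report(Y_test, pred))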