In [2]:
import pandas as pd
import numpy as np
In [3]:
data = pd.read_csv('../data/train.csv')
data
Out[3]:
label
pixel0
pixel1
pixel2
pixel3
pixel4
pixel5
pixel6
pixel7
pixel8
...
pixel774
pixel775
pixel776
pixel777
pixel778
pixel779
pixel780
pixel781
pixel782
pixel783
0
1
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
2
1
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
3
4
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
4
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
5
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
6
7
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
7
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
8
5
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
9
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
10
8
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
11
9
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
12
1
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
13
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
14
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
15
1
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
16
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
17
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
18
7
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
19
5
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
20
8
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
21
6
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
22
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
23
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
24
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
25
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
26
6
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
27
9
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
28
9
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
29
7
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
41970
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41971
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41972
4
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41973
4
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41974
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41975
9
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41976
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41977
4
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41978
4
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41979
4
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41980
7
0
0
0
0
0
0
0
0
0
...
27
253
110
0
0
0
0
0
0
0
41981
2
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41982
8
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41983
7
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41984
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41985
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41986
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41987
5
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41988
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41989
5
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41990
3
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41991
1
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41992
9
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41993
6
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41994
4
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41995
0
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41996
1
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41997
7
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41998
6
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
41999
9
0
0
0
0
0
0
0
0
0
...
0
0
0
0
0
0
0
0
0
0
42000 rows × 785 columns
In [11]:
label_sum = {}
label_count = {}
for i_example in xrange(len(data)):
row = data.iloc[i_example]
label_sum[row[0]] = label_sum.get(row[0],0) + np.sum(row[1:])
label_count[row[0]] = label_count.get(row[0],0) + 1
print label_sum
print label_count
{0: 143101108, 1: 71142776, 2: 124771582, 3: 123221138, 4: 98675646, 5: 98047318, 6: 114739353, 7: 100920406, 8: 122638195, 9: 102831105}
{0: 4132, 1: 4684, 2: 4177, 3: 4351, 4: 4072, 5: 3795, 6: 4137, 7: 4401, 8: 4063, 9: 4188}
In [14]:
label_mean = {}
for key in label_sum:
label_mean[key] = label_sum[key] / float(label_count[key])
print label_mean
{0: 34632.407550822849, 1: 15188.466268146884, 2: 29871.099353603066, 3: 28320.188002757986, 4: 24232.722495088408, 5: 25835.920421607378, 6: 27734.917331399563, 7: 22931.244262667577, 8: 30184.148412503077, 9: 24553.75}
In [18]:
def closest_number(value):
global label_mean
best_num, best_dist = -1, np.inf
for key in label_mean:
dist = abs(label_mean[key]-value)
if dist < best_dist:
best_dist = dist
best_num = key
return key
In [19]:
correct = incorrect = 0
for i_example in xrange(len(data)):
row = data.iloc[i_example]
darkness = np.sum(row[1:])
prediction = closest_number(darkness)
if prediction == row[0]:
correct += 1
else:
incorrect += 1
In [21]:
print correct/float(correct+incorrect), incorrect/float(correct+incorrect)
0.0997142857143 0.900285714286
In [27]:
test = pd.read_csv('../data/test.csv')
In [36]:
classification = pd.Series(np.zeros((len(test)+1), dtype=int))
for i in xrange(len(test)):
row = test.iloc[i,1:]
darkness = row.sum()
prediction = closest_number(darkness)
classification[i+1] = int(prediction)
classification.to_csv('prediction.csv')
Content source: garred/kaggle-digit-recognizer
Similar notebooks: