In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('../data/train.csv')
data


Out[3]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10 8 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
14 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20 8 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
21 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
22 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
26 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
27 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
28 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41970 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41971 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41972 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41973 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41974 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41975 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41976 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41977 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41978 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41979 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41980 7 0 0 0 0 0 0 0 0 0 ... 27 253 110 0 0 0 0 0 0 0
41981 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41982 8 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41983 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41984 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41985 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41986 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41987 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41988 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41989 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41990 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41991 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41992 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41993 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41994 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41995 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41996 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41997 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41998 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41999 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

42000 rows × 785 columns


In [11]:
label_sum = {}
label_count = {}
for i_example in xrange(len(data)):
    row = data.iloc[i_example]
    label_sum[row[0]] = label_sum.get(row[0],0) + np.sum(row[1:])
    label_count[row[0]] = label_count.get(row[0],0) + 1
print label_sum
print label_count


{0: 143101108, 1: 71142776, 2: 124771582, 3: 123221138, 4: 98675646, 5: 98047318, 6: 114739353, 7: 100920406, 8: 122638195, 9: 102831105}
{0: 4132, 1: 4684, 2: 4177, 3: 4351, 4: 4072, 5: 3795, 6: 4137, 7: 4401, 8: 4063, 9: 4188}

In [14]:
label_mean = {}
for key in label_sum:
    label_mean[key] = label_sum[key] / float(label_count[key])
print label_mean


{0: 34632.407550822849, 1: 15188.466268146884, 2: 29871.099353603066, 3: 28320.188002757986, 4: 24232.722495088408, 5: 25835.920421607378, 6: 27734.917331399563, 7: 22931.244262667577, 8: 30184.148412503077, 9: 24553.75}

In [18]:
def closest_number(value):
    global label_mean
    best_num, best_dist = -1, np.inf
    for key in label_mean:
        dist = abs(label_mean[key]-value)
        if dist < best_dist:
            best_dist = dist
            best_num = key
    return key

In [19]:
correct = incorrect = 0

for i_example in xrange(len(data)):
    row = data.iloc[i_example]
    
    darkness = np.sum(row[1:])    
    prediction = closest_number(darkness)
    
    if prediction == row[0]:
        correct += 1
    else:
        incorrect += 1

In [21]:
print correct/float(correct+incorrect), incorrect/float(correct+incorrect)


0.0997142857143 0.900285714286

In [27]:
test = pd.read_csv('../data/test.csv')

In [36]:
classification = pd.Series(np.zeros((len(test)+1), dtype=int))

for i in xrange(len(test)):
    row = test.iloc[i,1:]
    
    darkness = row.sum()
    prediction = closest_number(darkness)
    
    classification[i+1] = int(prediction)

classification.to_csv('prediction.csv')