In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the training set into a DataFrame. Per the rendered output below, the
# first column is `label` and the remaining columns are pixel0..pixel783.
data = pd.read_csv(filepath_or_buffer='../data/train.csv')
# Bare last expression: let the notebook render the frame as this cell's output.
data


Out[3]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10 8 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
14 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20 8 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
21 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
22 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
26 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
27 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
28 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
41970 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41971 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41972 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41973 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41974 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41975 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41976 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41977 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41978 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41979 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41980 7 0 0 0 0 0 0 0 0 0 ... 27 253 110 0 0 0 0 0 0 0
41981 2 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41982 8 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41983 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41984 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41985 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41986 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41987 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41988 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41989 5 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41990 3 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41991 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41992 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41993 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41994 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41995 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41996 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41997 7 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41998 6 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
41999 9 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

42000 rows × 785 columns


In [8]:
# Per-label aggregates over the training frame, computed in one vectorized pass.
#
# The previous row-by-row loop could not run: it indexed with an undefined `i`
# instead of its loop variable, added to `dict.get(...)` results that are None
# the first time a label is seen, and accumulated pixel sums into `label_count`
# instead of counting examples.
labels = data.iloc[:, 0]                      # first column holds the class label
pixel_totals = data.iloc[:, 1:].sum(axis=1)   # total pixel intensity of each example

# label -> summed pixel intensity over every example carrying that label
label_sum = pixel_totals.groupby(labels).sum().to_dict()
# label -> number of examples carrying that label
label_count = labels.value_counts().to_dict()


label