In [1]:
import tables
import csv
import time
import numpy as np
import cPickle
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from itertools import izip
import operator
import copy

In [2]:
# Open the HDF5 store read-only; the table handles in the following cells are all taken from it.
file_handler = tables.open_file("click_data.h5", mode = "r")

In [3]:
# Alias for the root group (the cells below still go through file_handler.root directly).
root = file_handler.root

In [4]:
# Node /train/train_raw/X; later cells read it with X.col(), X.iterrows() and X.colnames.
X = file_handler.root.train.train_raw.X

In [5]:
# Node /train/train_raw/y; iterated in lockstep with X, with target[0] added to the click tallies.
y = file_handler.root.train.train_raw.y

In [6]:
# Node /test/test_raw/X_t; used below for the test-set category and count passes.
X_t = file_handler.root.test.test_raw.X_t

In [7]:
# Node /train/train_raw/X_train; used by the "9 days" sections further down.
X_train = file_handler.root.train.train_raw.X_train

In [8]:
# Node /train/train_raw/y_train; iterated together with X_train when tallying clicks.
y_train = file_handler.root.train.train_raw.y_train

In [9]:
# Node /train/train_raw/X_valid; not referenced in the visible part of this notebook.
X_valid = file_handler.root.train.train_raw.X_valid

In [10]:
# Node /train/train_raw/y_valid; not referenced in the visible part of this notebook.
y_valid = file_handler.root.train.train_raw.y_valid

categories in each feature in 10 days


In [15]:
# Column names as stored in the table; the next cell replaces this with a hand-ordered list.
names = X.colnames

In [16]:
# Explicit feature order. The index-building loop below walks `names` in this order,
# so the order decides which contiguous index range each feature receives.
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
         'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 
         'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

In [6]:
# column name -> array of distinct values (filled by the next cell)
cldic = {}

In [7]:
%%time
for col in X.colnames:
    cldic[col] = np.unique(X.col(col))

In [17]:
# feature name -> {str(category value) -> global integer index} (filled by the next cell)
indexdic = {}

In [18]:
%%time
i = 0
for key in names:
    indexdic[key] = {}
    for ele in cldic[key]:
        i+=1
        indexdic[key][str(ele)] = i


CPU times: user 10.4 s, sys: 1.13 s, total: 11.5 s
Wall time: 11.6 s

In [19]:
i


Out[19]:
9449236

In [22]:
indexdic['banner_pos']


Out[22]:
{'0': 32, '1': 33, '2': 34, '3': 35, '4': 36, '5': 37, '7': 38}

In [23]:
# Persist the full index mapping. `with` closes the file even if pickling raises.
with open('indexdicall.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)  # -1 selects the highest available pickle protocol

One-hot encode only a subset of the features


In [2]:
# Reload the full index mapping written earlier. `with` closes the file even if loading raises.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('indexdicall.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [3]:
# Reduced feature list: only these features are kept in the one-hot index built below.
names = ['day', 'hour', 'banner_pos', 'site_category', 'app_category', 
         'device_type', 'device_conn_type', 
         'C1', 'C15', 'C16', 'C18', 'C20']

In [4]:
%%time
key_remove = [akey for akey in indexdic if akey not in names]
for akey in key_remove:
    del indexdic[akey]


CPU times: user 466 ms, sys: 32.2 ms, total: 498 ms
Wall time: 499 ms

In [6]:
%%time
i = 0
for key in names:
    for akey in indexdic[key]:
        i+=1
        indexdic[key][akey] = i


CPU times: user 167 µs, sys: 16 µs, total: 183 µs
Wall time: 115 µs

In [8]:
i


Out[8]:
309

In [9]:
# Persist the reduced one-hot index mapping. `with` closes the file even if pickling raises.
with open('indexdiconehot.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)  # -1 selects the highest available pickle protocol

In [ ]:

Trim categories that do not appear in the test data


In [5]:
# Reload the full index mapping as the starting point for trimming.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('indexdicall.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [6]:
# Full feature list again; the next cell collects the test-set values for each of these.
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
         'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 
         'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

In [7]:
%%time
cldictest = {}
for col in names:
    cldictest[col] = set(X_t.col(col))


CPU times: user 12 s, sys: 2.04 s, total: 14 s
Wall time: 14 s

In [8]:
import copy
# Snapshot of the untrimmed mapping; the "add other" step compares sizes against it
# to find the features that lost categories.
indexdic_cp = copy.deepcopy(indexdic)

In [9]:
# Stringify the test-set values so they can be compared with indexdic's keys,
# which were stored as str(value).
for feature in list(cldictest):
    cldictest[feature] = set(map(str, cldictest[feature]))

In [10]:
cldictest['hour']


Out[10]:
{'0',
 '1',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '2',
 '20',
 '21',
 '22',
 '23',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9'}

In [12]:
indexdic['hour']


Out[12]:
{'0': 8,
 '1': 9,
 '10': 18,
 '11': 19,
 '12': 20,
 '13': 21,
 '14': 22,
 '15': 23,
 '16': 24,
 '17': 25,
 '18': 26,
 '19': 27,
 '2': 10,
 '20': 28,
 '21': 29,
 '22': 30,
 '23': 31,
 '3': 11,
 '4': 12,
 '5': 13,
 '6': 14,
 '7': 15,
 '8': 16,
 '9': 17}

In [11]:
# 'day' is left out of the trimming loop below.
# Guarded so that re-running this cell does not raise ValueError once 'day' is gone.
if 'day' in names:
    names.remove('day')

remove unseen categories


In [13]:
%%time
for key in names:
    key_remove = [akey for akey in indexdic[key]
                  if akey not in cldictest[key]]
    for akey in key_remove:
        del indexdic[key][akey]


CPU times: user 5.25 s, sys: 160 ms, total: 5.41 s
Wall time: 5.28 s

In [14]:
for key in indexdic:
    print key, len(indexdic[key])


site_id 2720
app_domain 180
C19 46
site_domain 3199
device_type 4
C17 201
C16 9
device_ip 468030
C14 998
C15 8
device_conn_type 4
C1 7
app_category 28
site_category 22
C20 162
C21 37
banner_pos 6
app_id 3368
day 7
device_id 82194
hour 24
device_model 5386
C18 4

Add an 'other' entry to every feature that lost categories


In [15]:
%%time
for key in indexdic:
    if len(indexdic[key]) != len(indexdic_cp[key]):
        indexdic[key]['other'] = 0


CPU times: user 37 µs, sys: 4 µs, total: 41 µs
Wall time: 36 µs

one hot encoding


In [16]:
# Restore the full feature list ('day' was removed for the trimming step) before renumbering.
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
         'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 
         'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

In [18]:
%%time
indexdictrim = {}
i = 0
for key in names:
    indexdictrim[key] = {}
    for akey in indexdic[key]:
        i+=1
        indexdictrim[key][str(akey)] = i


CPU times: user 582 ms, sys: 67.3 ms, total: 649 ms
Wall time: 561 ms

In [19]:
i


Out[19]:
566660

In [24]:
indexdictrim['site_category']


Out[24]:
{'0569f928': 5962,
 '28905ebd': 5960,
 '335d28a8': 5966,
 '3e814130': 5981,
 '42a36e14': 5976,
 '50e219e0': 5977,
 '5378d028': 5970,
 '70fb0e29': 5964,
 '72722551': 5965,
 '74073276': 5967,
 '75fa27f6': 5961,
 '76b2941d': 5979,
 '8fd0aea4': 5974,
 '9ccfa2ea': 5978,
 'a72a0145': 5971,
 'a818d37a': 5969,
 'bcf865d9': 5963,
 'c0dd3be3': 5968,
 'dedf689d': 5973,
 'e787de0e': 5982,
 'f028772b': 5980,
 'f66779e6': 5975,
 'other': 5972}

In [20]:
# Persist the trimmed index mapping. `with` closes the file even if pickling raises.
with open('indexdicalltrimed.pkl', 'wb') as f:
    cPickle.dump(indexdictrim, f, -1)  # -1 selects the highest available pickle protocol

category counts and clickrate in each feature in 10 days


In [8]:
# Features for which occurrence and click tallies are collected below.
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
         'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 
         'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

In [7]:
countdic = {}
clickdic = {}
for col in names:
    countdic[col] = defaultdict(int)
    clickdic[col] = defaultdict(int)
%%time
for row, target in izip(X.iterrows(),y.iterrows()):
    for name in names:
        countdic[name][row[name]] += 1
        clickdic[name][row[name]] += target[0]

In [10]:
# Persist the occurrence tallies. `with` closes the file even if pickling raises.
with open('countdicall.pkl', 'wb') as f:
    cPickle.dump(countdic, f, -1)  # -1 selects the highest available pickle protocol

In [11]:
# Persist the click tallies. `with` closes the file even if pickling raises.
with open('clickdicall.pkl', 'wb') as f:
    cPickle.dump(clickdic, f, -1)  # -1 selects the highest available pickle protocol

In [12]:
# Drop the in-memory click tallies; they were pickled to clickdicall.pkl above.
del clickdic

In [ ]:

delete rare values


In [7]:
# Reload the occurrence tallies. `with` closes the file even if loading raises.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('countdicall.pkl', 'rb') as f:
    countdic = cPickle.load(f)

In [ ]:
# Reload the click tallies. `with` closes the file even if loading raises.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('clickdicall.pkl', 'rb') as f:
    clickdic = cPickle.load(f)

In [9]:
# Two independent snapshots of the training tallies: cp1 is trimmed below,
# cp2 stays untouched as the size reference for the 'other' step.
countdic_cp1, countdic_cp2 = copy.deepcopy(countdic), copy.deepcopy(countdic)

In [10]:
# Add the test-set occurrences to countdic. Re-running the cell adds them again.
for row in X_t.iterrows():
    for name in names:
        feature_counts = countdic[name]
        feature_counts[row[name]] += 1

In [11]:
%%time
for key in countdic:
    key_remove = [akey for akey, value in countdic[key].iteritems()
                  if value < 11]
    for akey in key_remove:
        if akey in countdic_cp1[key]:
            del countdic_cp1[key][akey]


CPU times: user 4.52 s, sys: 11.9 ms, total: 4.53 s
Wall time: 4.52 s

In [12]:
for key in names:
    print key, len(countdic_cp1[key])


day 7
hour 24
banner_pos 7
site_id 3134
site_domain 3477
site_category 23
app_id 4038
app_domain 252
app_category 27
device_id 93952
device_ip 503912
device_model 5912
device_type 5
device_conn_type 4
C1 7
C14 2422
C15 8
C16 9
C17 428
C18 4
C19 66
C20 166
C21 60

In [12]:
%%time
for key in countdic_cp1:
    if len(countdic_cp1[key]) != len(countdic_cp2[key]):
        countdic_cp1[key]['other'] = 0


CPU times: user 35 µs, sys: 1 µs, total: 36 µs
Wall time: 31.9 µs

In [13]:
%%time
indexdic = {}
i = 0
for key in names:
    indexdic[key] = {}
    for akey in countdic_cp1[key]:
        i+=1
        indexdic[key][str(akey)] = i


CPU times: user 624 ms, sys: 33.1 ms, total: 657 ms
Wall time: 591 ms

In [14]:
i


Out[14]:
617957

In [19]:
# Persist the rare-value-trimmed index. `with` closes the file even if pickling raises.
with open('indexdicless.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)  # -1 selects the highest available pickle protocol

Delete rare values, but keep any value that appears in the test set


In [10]:
# countdict holds test-set-only tallies; the same pass also adds the test
# occurrences to the combined countdic.
countdict = dict((col, defaultdict(int)) for col in names)

for row in X_t.iterrows():
    for name in names:
        value = row[name]
        countdict[name][value] += 1
        countdic[name][value] += 1

In [11]:
%%time
for key in names:
    key_remove = [akey for akey, value in countdic[key].iteritems()
                  if value < 11]
    for akey in key_remove:
        if akey in countdic_cp1[key]:
            if akey not in countdict[key]:
                del countdic_cp1[key][akey]


CPU times: user 6.99 s, sys: 60.5 ms, total: 7.05 s
Wall time: 6.99 s

In [12]:
for key in names:
    print key, len(countdic_cp1[key])


day 7
hour 24
banner_pos 7
site_id 3348
site_domain 3910
site_category 24
app_id 4666
app_domain 288
app_category 29
device_id 155636
device_ip 770138
device_model 6169
device_type 5
device_conn_type 4
C1 7
C14 2442
C15 8
C16 9
C17 431
C18 4
C19 67
C20 167
C21 60

In [13]:
%%time
for key in countdic_cp1:
    if len(countdic_cp1[key]) != len(countdic_cp2[key]):
        countdic_cp1[key]['other'] = 0


CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 31 µs

In [14]:
%%time
indexdic = {}
i = 0
for key in names:
    indexdic[key] = {}
    for akey in countdic_cp1[key]:
        i+=1
        indexdic[key][str(akey)] = i


CPU times: user 809 ms, sys: 20.4 ms, total: 830 ms
Wall time: 795 ms

In [15]:
i


Out[15]:
947463

In [16]:
# Persist this index variant. `with` closes the file even if pickling raises.
with open('indexdicless2.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)  # -1 selects the highest available pickle protocol

In [ ]:

delete rare values both in training set and test set


In [9]:
# Fold the test-set occurrences into countdic before taking the snapshots,
# so both snapshots cover training and test values. Re-running adds them again.
for row in X_t.iterrows():
    for name in names:
        feature_counts = countdic[name]
        feature_counts[row[name]] += 1

In [10]:
# Two independent snapshots of the combined tallies: cp1 is trimmed below,
# cp2 stays untouched as the size reference for the 'other' step.
countdic_cp1, countdic_cp2 = copy.deepcopy(countdic), copy.deepcopy(countdic)

In [11]:
%%time
for key in countdic:
    key_remove = [akey for akey, value in countdic[key].iteritems()
                  if value < 11]
    for akey in key_remove:
        del countdic_cp1[key][akey]


CPU times: user 3.54 s, sys: 16.1 ms, total: 3.55 s
Wall time: 3.53 s

In [12]:
for key in names:
    print key, len(countdic_cp1[key])


day 7
hour 24
banner_pos 7
site_id 3162
site_domain 3486
site_category 23
app_id 4114
app_domain 257
app_category 27
device_id 96655
device_ip 519585
device_model 5914
device_type 5
device_conn_type 4
C1 7
C14 2633
C15 8
C16 9
C17 465
C18 4
C19 67
C20 166
C21 61

In [13]:
%%time
for key in countdic_cp1:
    if len(countdic_cp1[key]) != len(countdic_cp2[key]):
        countdic_cp1[key]['other'] = 0


CPU times: user 41 µs, sys: 1 µs, total: 42 µs
Wall time: 36 µs

In [14]:
%%time
indexdic = {}
i = 0
for key in names:
    indexdic[key] = {}
    for akey in countdic_cp1[key]:
        i+=1
        indexdic[key][str(akey)] = i


CPU times: user 678 ms, sys: 24.7 ms, total: 702 ms
Wall time: 659 ms

In [15]:
i


Out[15]:
636704

In [16]:
# Persist this index variant. `with` closes the file even if pickling raises.
with open('indexdicless3.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)  # -1 selects the highest available pickle protocol

In [ ]:

category counts and clickrate in each feature in each day

just for testing


In [53]:
# Load the per-day row counts (an array of 10 entries, per the recorded output).
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)

In [54]:
dayrows


Out[54]:
array([4122995, 5337126, 3870752, 3335302, 3363122, 3835892, 3225010,
       5287222, 3832608, 4218938])

In [55]:
# Category values seen on the days processed so far, as a growing list and as a set for lookups.
allkeys = []
allkeysset = set()

In [56]:
# The single feature examined in this per-day experiment.
name = 'app_id'

In [57]:
# day -> {category value -> tally}. This rebinds the names used for the all-days tallies above.
countdic = {}
clickdic = {}

In [58]:
# day -> log tallies of the categories not seen on earlier processed days
countlist = {}
clicklist = {}

In [59]:
# day -> log tallies of the categories already seen on earlier processed days
countlistexist = {}
clicklistexist = {}

In [75]:
# day -> log tallies of that day's categories that are absent on the following day
countlistnonexist = {}
clicklistnonexist = {}

day


In [67]:
# 1-based day to process; the cells of this section are re-run by hand for each day.
# The later cells read countdic[day-1], so the previous day must have been processed first.
day = 2

In [68]:
%%time

countdic[day] = defaultdict(int)
clickdic[day] = defaultdict(int)

for row, target in izip(X.iterrows(start=sum(dayrows[:day-1]),stop=sum(dayrows[:day])),y.iterrows(start=sum(dayrows[:day-1]),stop=sum(dayrows[:day]))):
    countdic[day][row[name]] += 1
    clickdic[day][row[name]] += target[0]


CPU times: user 20.6 s, sys: 161 ms, total: 20.8 s
Wall time: 21.5 s

In [69]:
# Category values present on the current day (a list under Python 2).
keys = countdic[day].keys()

In [76]:
# Category values present on the previous day.
keys1 = countdic[day-1].keys()

In [70]:
# Split today's categories into those not seen on earlier processed days and
# those already seen, then store the log of their occurrence and click tallies.
fresh = [key for key in keys if key not in allkeysset]
known = [key for key in keys if key in allkeysset]
countlist[day] = np.log([countdic[day][key] for key in fresh])
clicklist[day] = np.log([clickdic[day][key] for key in fresh])
countlistexist[day] = np.log([countdic[day][key] for key in known])
clicklistexist[day] = np.log([clickdic[day][key] for key in known])
# Categories without clicks give log(0) = -inf; those entries are set to 0.
for logged in (clicklist[day], clicklistexist[day]):
    logged[np.isinf(logged)] = 0

In [80]:
# Log tallies of the previous day's categories that no longer occur on `day`.
# `keys` is a list under Python 2, so membership tests against it are linear scans;
# a set makes each lookup constant-time without changing the result.
keys_today = set(keys)
vanished = [key for key in keys1 if key not in keys_today]
countlistnonexist[day-1] = np.log([countdic[day-1][key] for key in vanished])
clicklistnonexist[day-1] = np.log([clickdic[day-1][key] for key in vanished])
# Categories without clicks give log(0) = -inf; those entries are set to 0.
clicklistnonexist[day-1][np.isinf(clicklistnonexist[day-1])] = 0

In [71]:
# Remember today's categories for the next day's comparison. Re-running appends duplicates,
# which the set built in the next cell removes.
allkeys += keys

In [72]:
# Set view of all categories seen so far, used for the membership tests above.
allkeysset = set(allkeys)

In [83]:
countlistnonexist


Out[83]:
{1: array([ 0.69314718,  0.        ,  1.60943791, ...,  1.09861229,
         0.        ,  0.69314718])}

In [86]:
######################################
# Scatter of log occurrence count (x) against log click count (y) for two consecutive days.
# The `day` series is shifted by +16 on the x axis so the two days sit side by side.
# The y axis was labelled 'clickrate', but the plotted values are log click counts.
# matplotlib is already imported in the first cell, so the duplicate import was dropped.
fig, axes = plt.subplots()
axes.set_xlabel('log(count)')
axes.set_ylabel('log(clicks)')
axes.grid()
#y = [colcatecount[key] for key in x]

#axes.plot(x, y, 'ro')
# Previous day: its categories not seen on earlier days (blue) and those absent on `day` (red).
axes.plot(countlist[day-1], clicklist[day-1], 'bo', alpha=0.5)
axes.plot(countlistnonexist[day-1], clicklistnonexist[day-1], 'ro', ms=10)

#axes.plot(countlist[day-1]+16, clicklist[day-1], 'ro', ms=10)
#axes.plot(countlistexist[day-1]+16, clicklistexist[day-1], 'bo', alpha=0.5)

# Current day: categories seen before (blue) and new ones (red).
axes.plot(countlistexist[day]+16, clicklistexist[day], 'bo', alpha=0.5)
axes.plot(countlist[day]+16, clicklist[day], 'ro', ms=10)


plt.show()
#fig.savefig("ycount.svg")
#######################################

In [ ]:

gbdt list used in fm


In [2]:
# Reload the occurrence tallies. `with` closes the file even if loading raises.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('countdicall.pkl', 'rb') as f:
    countdic = cPickle.load(f)

In [20]:
# First variant of the feature list scanned for very frequent values.
cnames = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
         'device_id', 'device_model']

In [3]:
# Extended variant of the feature list; it also contains integer-valued columns,
# which is why the matching scan cell formats values with str(key).
cnames = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
         'device_id', 'device_model', 'C1', 'banner_pos', 'device_type', 'device_conn_type', 
         'C15', 'C16']

In [4]:
# Collects '<feature>_<value>' labels of very frequent values (filled by the scan cells below).
gbdtlist = []

In [22]:
%%time
for name in cnames:
    for key,value in countdic[name].items():
        if value > 1000000:
            gbdtlist.append(name+'_'+key)
            
for key,value in countdic['device_ip'].items():
    if value > 100000:
        gbdtlist.append('device_ip_'+key)


CPU times: user 1min 4s, sys: 203 ms, total: 1min 4s
Wall time: 1min 4s

In [5]:
%%time
for name in cnames:
    for key,value in countdic[name].items():
        if value > 1000000:
            gbdtlist.append(name+'_'+str(key))
            
for key,value in countdic['device_ip'].items():
    if value > 100000:
        gbdtlist.append('device_ip_'+key)
for key,value in countdic['C19'].items():
    if value > 10000000:
        gbdtlist.append('C19_'+str(key))
for key,value in countdic['C20'].items():
    if value > 10000000:
        gbdtlist.append('C20_'+str(key))


CPU times: user 22.7 s, sys: 188 ms, total: 22.9 s
Wall time: 22.8 s
Parser   : 650 ms

In [6]:
gbdtlist


Out[6]:
['site_id_1fbe01fe',
 'site_id_85f751fd',
 'site_id_e151e245',
 'site_domain_7687a86e',
 'site_domain_c4e18dd6',
 'site_domain_7e091613',
 'site_domain_f3845767',
 'site_category_28905ebd',
 'site_category_3e814130',
 'site_category_50e219e0',
 'site_category_f028772b',
 'app_id_ecad2386',
 'app_id_92f5800b',
 'app_id_e2fcccd2',
 'app_domain_7801e8d9',
 'app_domain_5c5a694b',
 'app_domain_2347f47a',
 'app_domain_ae637522',
 'app_category_cef3e649',
 'app_category_8ded1f7a',
 'app_category_0f2161f8',
 'app_category_f95efa07',
 'app_category_07d7df22',
 'device_id_a99f214a',
 'device_model_1f0bc64f',
 'device_model_d787e91b',
 'device_model_8a4875bd',
 'C1_1002',
 'C1_1005',
 'banner_pos_0',
 'banner_pos_1',
 'device_type_0',
 'device_type_1',
 'device_conn_type_0',
 'device_conn_type_2',
 'device_conn_type_3',
 'C15_320',
 'C15_300',
 'C16_50',
 'C16_250',
 'device_ip_431b3174',
 'device_ip_6b9769f2',
 'C19_35',
 'C20_-1']

In [24]:
# Persist the first variant of the label list. `with` closes the file even if pickling raises.
with open('gbdtclist.pkl', 'wb') as f:
    cPickle.dump(gbdtlist, f, -1)  # -1 selects the highest available pickle protocol

In [7]:
# Persist the extended variant of the label list. `with` closes the file even if pickling raises.
with open('gbdtclist2.pkl', 'wb') as f:
    cPickle.dump(gbdtlist, f, -1)  # -1 selects the highest available pickle protocol

In [ ]:

likelihood


In [6]:
# Constants of the smoothed click-rate formula below: the weight given to the
# global rate is l/(count/9. + m).
m=2
l=1

In [7]:
# Global click rate: sum of y divided by its row count (0.1698 in the recorded output).
# NOTE(review): under Python 2 this is a true division only if np.sum(y) is a float - confirm y's dtype.
glh = np.sum(y)/y.shape[0]

In [8]:
glh


Out[8]:
0.16980562476404604

In [9]:
glh/2.


Out[9]:
0.084902812382023019

In [10]:
# Reload the occurrence tallies. `with` closes the file even if loading raises.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('countdicall.pkl', 'rb') as f:
    countdic = cPickle.load(f)

In [11]:
# Reload the click tallies. `with` closes the file even if loading raises.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('clickdicall.pkl', 'rb') as f:
    clickdic = cPickle.load(f)

In [12]:
# feature name -> {category value -> smoothed click rate} (filled by the next cell)
clickratedic = {}

In [13]:
%%time
for col in X.colnames:
    clickratedic[col] = defaultdict(float)
    for key in countdic[col]:
        count = float(countdic[col][key])
        click = clickdic[col][key]
        clickratedic[col][key] = (l/(count/9.+m))*glh + (1.-(l/(count/9.+m)))*(click/count)


CPU times: user 33.8 s, sys: 720 ms, total: 34.6 s
Wall time: 34.3 s

In [14]:
# Persist the smoothed click rates. `with` closes the file even if pickling raises.
with open('clickratedicall.pkl', 'wb') as f:
    cPickle.dump(clickratedic, f, -1)  # -1 selects the highest available pickle protocol

In [15]:
clickratedic['C1']


Out[15]:
defaultdict(<type 'float'>, {1001: 0.033522707474318701, 1002: 0.21073131434371142, 1005: 0.169330882798997, 1007: 0.039462179697126364, 1008: 0.12172663538846347, 1010: 0.095216121297243689, 1012: 0.17249256307238373})

In [ ]:

categories in each feature in 9 days


In [7]:
# Load the per-day row counts (an array of 10 entries, per the recorded output).
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
with open('dayrows.pkl', 'rb') as f:
    days = cPickle.load(f)

In [8]:
days


Out[8]:
array([4122995, 5337126, 3870752, 3335302, 3363122, 3835892, 3225010,
       5287222, 3832608, 4218938])

In [9]:
# Total number of rows in the first 9 entries of `days` (36210029 in the recorded output).
numrows = sum(days[:9])

In [21]:
numrows


Out[21]:
36210029

In [7]:
X.colnames


Out[7]:
['day',
 'hour',
 'C1',
 'banner_pos',
 'site_id',
 'site_domain',
 'site_category',
 'app_id',
 'app_domain',
 'app_category',
 'device_id',
 'device_ip',
 'device_model',
 'device_type',
 'device_conn_type',
 'C14',
 'C15',
 'C16',
 'C17',
 'C18',
 'C19',
 'C20',
 'C21']

In [12]:
# column name -> distinct values within the first `numrows` rows (filled by the next cell)
cldic = {}

In [13]:
# Distinct values of each column, restricted to the first `numrows` rows of X.
for column_name in X.colnames:
    leading_rows = X.col(column_name)[:numrows]
    cldic[column_name] = np.unique(leading_rows)

In [5]:
# column name -> distinct values in X_train (filled two cells below)
cldic = {}

In [6]:
# Explicit feature order; the index built below assigns index ranges in this order.
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category', 
         'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type', 
         'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']

In [7]:
%%time
for col in names:
    cldic[col] = np.unique(X_train.col(col))


CPU times: user 1min 46s, sys: 18 s, total: 2min 4s
Wall time: 3min 18s

In [8]:
# feature name -> {str(category value) -> global integer index} (filled by the next cell)
indexdic = {}

In [9]:
%%time
i = 0
for key in names:
    indexdic[key] = {}
    for ele in cldic[key]:
        i+=1
        indexdic[key][str(ele)] = i


CPU times: user 5.94 s, sys: 340 ms, total: 6.28 s
Wall time: 6.3 s

In [10]:
i


Out[10]:
8651478

In [18]:
for key in indexdic.keys():
    print key, len(indexdic[key])


site_id 4642
app_domain 548
C19 66
site_domain 7564
device_type 5
C17 407
C16 9
device_ip 6134351
C14 2470
C15 8
device_conn_type 4
C1 7
app_category 36
site_category 26
C20 172
C21 55
banner_pos 7
app_id 8291
day 7
device_id 2484613
hour 24
device_model 8162
C18 4

In [13]:
indexdic['banner_pos']


Out[13]:
{'0': 32, '1': 33, '2': 34, '3': 35, '4': 36, '5': 37, '7': 38}

In [14]:
# Persist the X_train index mapping. `with` closes the file even if pickling raises.
with open('indexdic.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)  # -1 selects the highest available pickle protocol

Compare lookup time: membership in the dict vs. membership in its .keys() list


In [33]:
# Membership test directly on the dict (microseconds in the recorded output).
%time '0000b3ab' in indexdic['device_ip']


CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 8.82 µs
Out[33]:
True

In [34]:
# Same test against .keys(), which builds a list under Python 2 and scans it
# (hundreds of milliseconds in the recorded output).
%time '0000b3ab' in indexdic['device_ip'].keys()


CPU times: user 337 ms, sys: 12 ms, total: 349 ms
Wall time: 349 ms
Out[34]:
True

category counts and clickrate in each feature in 9 days


In [6]:
# Column names of X_train; both counting methods below iterate over these.
colnames = X_train.colnames

method 1


In [23]:
# column name -> distinct values in X_train (filled by the next cell)
cldic = {}

In [25]:
%%time
for col in colnames:
    cldic[col] = np.unique(X_train.col(col))


CPU times: user 2min 17s, sys: 33.1 s, total: 2min 50s
Wall time: 2min 50s

In [12]:
cldic


Out[12]:
{'C1': defaultdict(<type 'list'>, {}),
 'C14': defaultdict(<type 'list'>, {}),
 'C15': defaultdict(<type 'list'>, {}),
 'C16': defaultdict(<type 'list'>, {}),
 'C17': defaultdict(<type 'list'>, {}),
 'C18': defaultdict(<type 'list'>, {}),
 'C19': defaultdict(<type 'list'>, {}),
 'C20': defaultdict(<type 'list'>, {}),
 'C21': defaultdict(<type 'list'>, {}),
 'app_category': defaultdict(<type 'list'>, {}),
 'app_domain': defaultdict(<type 'list'>, {}),
 'app_id': defaultdict(<type 'list'>, {}),
 'banner_pos': defaultdict(<type 'list'>, {}),
 'day': defaultdict(<type 'list'>, {}),
 'device_conn_type': defaultdict(<type 'list'>, {}),
 'device_id': defaultdict(<type 'list'>, {}),
 'device_ip': defaultdict(<type 'list'>, {}),
 'device_model': defaultdict(<type 'list'>, {}),
 'device_type': defaultdict(<type 'list'>, {}),
 'hour': defaultdict(<type 'list'>, {}),
 'site_category': defaultdict(<type 'list'>, {}),
 'site_domain': defaultdict(<type 'list'>, {}),
 'site_id': defaultdict(<type 'list'>, {})}

In [26]:
# Method 1 tallies: column -> {value -> occurrences} and column -> {value -> clicks},
# pre-populated with zeros by the next cell.
clcountdic = {}
clclickdic = {}

In [27]:
%%time
for key in cldic:
    clcountdic[key] = {}
    clclickdic[key] = {}
    for key1 in cldic[key]:
        clcountdic[key][key1] = 0
        clclickdic[key][key1] = 0


CPU times: user 9.35 s, sys: 828 ms, total: 10.2 s
Wall time: 10.1 s

In [38]:
%%time
i=0
for row, target in izip(X_train.iterrows(),y_train.iterrows()):
    for name in colnames:
        clcountdic[name][row[name]] += 1
        clclickdic[name][row[name]] += target[0]
    i+=1
    if i > 3000:
        break


CPU times: user 344 ms, sys: 4.08 ms, total: 348 ms
Wall time: 347 ms

method 2


In [7]:
# Method 2 tallies: column -> defaultdict(int), created lazily instead of pre-populated.
countdic = {}
clickdic = {}

In [8]:
# One integer counter per column for occurrences and one for clicks.
countdic.update((column_name, defaultdict(int)) for column_name in colnames)
clickdic.update((column_name, defaultdict(int)) for column_name in colnames)

In [9]:
%%time
for row, target in izip(X_train.iterrows(),y_train.iterrows()):
    for name in colnames:
        countdic[name][row[name]] += 1
        clickdic[name][row[name]] += target[0]


CPU times: user 24min 16s, sys: 9.86 s, total: 24min 26s
Wall time: 24min 33s

In [10]:
clickdic['day']


Out[10]:
defaultdict(<type 'int'>, {0: 702156, 1: 585643, 2: 1522840, 3: 1438861, 4: 705211, 5: 582730, 6: 613440})

In [11]:
countdic['day']


Out[11]:
defaultdict(<type 'int'>, {0: 3835892, 1: 3225010, 2: 9410217, 3: 9169734, 4: 3870752, 5: 3335302, 6: 3363122})

In [12]:
# Persist the X_train occurrence tallies. `with` closes the file even if pickling raises.
with open('countdic.pkl', 'wb') as f:
    cPickle.dump(countdic, f, -1)  # -1 selects the highest available pickle protocol

In [ ]:
# Persist the X_train click tallies. `with` closes the file even if pickling raises.
with open('clickdic.pkl', 'wb') as f:
    cPickle.dump(clickdic, f, -1)  # -1 selects the highest available pickle protocol

one hot encoding with trimmed categories


In [10]:
# Reload the X_train occurrence tallies. `with` closes the file even if loading raises.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this notebook.
with open('countdic.pkl', 'rb') as f:
    countdic = cPickle.load(f)

In [3]:
countdic['C14']


Out[3]:
defaultdict(<type 'int'>, {16403: 4771, 16404: 14, 16405: 4595, 16406: 4819, 16407: 4735, 16408: 4867, 8330: 431084, 8334: 3975, 8392: 233, 8395: 146, 16615: 438356, 8434: 180, 8437: 178, 16685: 59912, 16687: 68108, 16688: 152795, 375: 85158, 376: 93, 377: 76487, 380: 72773, 381: 3977, 451: 660, 452: 59590, 454: 60193, 455: 171, 456: 62101, 16841: 16244, 463: 52652, 16858: 66248, 16859: 67116, 15838: 506, 16920: 262886, 16932: 1062, 16934: 1032, 15840: 31405, 15841: 639, 16989: 9526, 15842: 7, 17014: 176380, 17016: 148032, 17017: 138148, 17026: 1403, 17027: 12, 17037: 148513, 17062: 1486, 17081: 88451, 17109: 20137, 13777: 4048, 13778: 3768, 13779: 3874, 13780: 4778, 17151: 84, 17156: 108, 17157: 104, 17163: 46034, 17170: 2048, 787: 13365, 8994: 4676, 8995: 4926, 8996: 4620, 9014: 523, 9015: 320, 9016: 358, 17209: 1183, 17212: 66446, 17234: 1206, 22942: 2376, 17239: 316875, 17262: 187882, 17263: 18805, 17264: 201646, 6990: 13490, 6991: 22048, 6992: 13676, 6993: 13921, 1037: 93220, 1038: 75240, 1039: 83365, 9236: 7950, 9248: 48503, 17444: 641, 9334: 145, 9335: 810, 17566: 62056, 7024: 3420, 17573: 6626, 17583: 6750, 17586: 6686, 17614: 223199, 9436: 499, 9437: 1054, 9438: 12906, 1266: 432, 1267: 410, 1268: 371, 1269: 82, 1270: 733, 9463: 170, 1272: 382, 1275: 910, 1276: 971, 17661: 19525, 1278: 580, 17670: 1550, 17671: 1487, 17706: 3659, 17709: 3715, 17747: 91170, 17753: 224965, 17755: 7003, 17757: 1642, 17758: 1673, 17759: 1727, 17786: 999, 17863: 24355, 17875: 133684, 17877: 133180, 17893: 80889, 17894: 87059, 17914: 45401, 9798: 555, 9803: 534, 9805: 564, 18091: 46790, 18092: 44219, 18093: 46202, 18094: 46848, 18095: 44550, 7695: 292, 18156: 1, 18311: 2993, 18314: 3014, 18316: 2939, 18319: 31, 18321: 37, 18324: 36, 18403: 235, 18404: 76, 18405: 246, 18406: 189, 18419: 65, 18420: 89, 18421: 26, 18422: 29, 18423: 19, 18424: 14, 18425: 30, 18426: 18, 18427: 22, 18428: 30, 18429: 15, 18430: 9, 18431: 26, 18432: 37, 18433: 18, 18434: 2, 18443: 57, 18444: 95, 18445: 
20, 18446: 39, 18447: 2263, 18448: 19, 18449: 4, 18450: 73, 18451: 129, 18452: 26, 18453: 34, 18454: 2277, 18455: 19, 18456: 2, 18457: 58, 18458: 111, 18459: 25, 18460: 43, 18461: 3742, 18462: 15, 18463: 9, 18464: 76, 18465: 89, 18466: 19, 18467: 32, 18468: 3732, 18469: 16, 18470: 3, 18471: 63, 18472: 100, 18473: 27, 18474: 33, 18475: 3552, 18476: 19, 18477: 2, 10289: 1892, 18485: 81, 18486: 103, 18487: 28, 18488: 46, 18489: 5885, 18490: 29, 18491: 3, 18492: 59, 18493: 256, 18494: 24, 18495: 50, 18496: 4566, 18497: 17, 18498: 2, 18499: 72, 18500: 123, 18501: 28, 18502: 43, 18503: 6033, 18504: 23, 18505: 8, 18506: 69, 18507: 99, 18508: 26, 18509: 41, 18510: 2387, 18511: 23, 18512: 3, 18528: 69, 18529: 121, 18530: 23, 18531: 34, 18532: 4132, 18533: 14, 18534: 3, 18535: 55, 18536: 111, 18537: 25, 18538: 37, 18539: 3656, 18540: 17, 18541: 5, 18542: 73, 18543: 113, 18544: 30, 18545: 32, 18546: 2322, 18547: 15, 18548: 5, 18549: 62, 18550: 117, 18551: 30, 18552: 30, 18553: 3717, 18554: 13, 18555: 6, 18556: 68, 18557: 107, 18558: 30, 18559: 34, 18560: 3775, 18561: 18, 18562: 4, 18563: 59, 18564: 109, 18565: 32, 18566: 24, 18567: 2285, 18568: 18, 18569: 4, 18570: 74, 18571: 120, 18572: 26, 18573: 33, 18574: 5897, 18575: 14, 18576: 3, 18577: 52, 18578: 100, 18579: 20, 18580: 43, 18581: 2291, 18582: 26, 18583: 4, 18584: 81, 18585: 97, 18586: 19, 18587: 33, 18588: 5908, 18589: 20, 18590: 4, 18591: 76, 18592: 280, 18593: 31, 18594: 46, 18595: 6135, 18596: 22, 18597: 3, 18598: 60, 18599: 130, 18600: 28, 18601: 30, 18602: 2204, 18603: 23, 18604: 6, 18646: 5, 18648: 59408, 10531: 174, 10532: 172, 10533: 159, 10534: 135, 10535: 146, 10536: 173, 6399: 3809, 18854: 28339, 18856: 33099, 18858: 34601, 18859: 169, 18860: 203, 18861: 3980, 18925: 8655, 10738: 1520, 10739: 1511, 10740: 1437, 10741: 1506, 18934: 8332, 10743: 1538, 18936: 8380, 18945: 1318, 18946: 1342, 18947: 1314, 18948: 94, 18949: 1039, 18950: 158, 18951: 78, 18952: 2049, 18953: 2052, 18954: 2080, 6360: 16194, 18985: 81, 
18986: 45, 18987: 34847, 18993: 32279, 10810: 13, 19015: 120517, 19016: 121394, 6362: 16060, 10895: 99, 10896: 117, 10897: 98, 10898: 100, 6364: 16993, 6401: 3029, 16838: 33762, 19143: 20106, 19146: 19954, 19176: 1895, 19178: 239, 19196: 8005, 19197: 8248, 19215: 1936, 19216: 6648, 19251: 339308, 19276: 3812, 19277: 3717, 19278: 4134, 19279: 3833, 14136: 303, 19284: 412, 14137: 89, 14140: 280, 22333: 356, 15507: 421, 14142: 299, 15508: 411, 15509: 424, 11142: 277, 11143: 258, 11144: 270, 11145: 263, 19368: 132, 19370: 143, 15517: 393, 11188: 145, 11189: 41, 11190: 114, 11191: 134, 11192: 37, 11193: 142, 23715: 337, 19418: 445, 19421: 7329, 19436: 1723, 19454: 13161, 11265: 24, 11266: 5, 11267: 20, 11268: 581, 19493: 54769, 15538: 89, 15539: 110, 11323: 3016, 11325: 3048, 15541: 93, 15549: 104, 15550: 105, 15552: 85, 22379: 4, 19591: 4071, 19592: 3935, 19593: 2083, 19594: 3945, 19595: 3942, 19597: 2155, 19631: 382, 19632: 115, 19633: 105, 19634: 116, 19635: 98, 19637: 93, 19638: 110, 19639: 107, 19641: 118, 19665: 157059, 19666: 5917, 19668: 9694, 15387: 343, 19733: 61393, 19739: 83, 19740: 1256, 19741: 1785, 19742: 1015, 19743: 108526, 19744: 147, 19745: 79, 19746: 3, 19748: 1498, 19749: 1480, 15388: 316, 19771: 673991, 19772: 673290, 19775: 54638, 19776: 55458, 19865: 562, 19870: 10026, 19878: 9955, 19881: 292, 6392: 68700, 19950: 508984, 19951: 9121, 6393: 69073, 19982: 1079, 19983: 1333, 19984: 1230, 19985: 1379, 19986: 1347, 19988: 497, 19998: 28644, 19999: 667, 20000: 899, 20001: 19863, 20002: 295, 20003: 23728, 20004: 5066, 20005: 39482, 20006: 39562, 20007: 39156, 20008: 40046, 20009: 39024, 20010: 5162, 20011: 5169, 20012: 5014, 20013: 5026, 20014: 4530, 20015: 4478, 20016: 5177, 20018: 33363, 20019: 30981, 20048: 3027, 22456: 5, 22458: 5483, 22459: 2856, 20071: 73065, 20072: 80019, 20077: 3390, 22461: 2277, 6397: 3466, 20093: 187349, 20108: 552994, 20122: 19800, 20128: 17291, 20129: 976, 20130: 1008, 20131: 993, 20133: 1088, 20135: 950, 20143: 1248, 20144: 
1868, 20145: 1925, 20146: 1937, 20147: 1969, 20148: 1965, 20149: 1886, 20150: 26, 20151: 5175, 20152: 592, 20153: 37356, 20170: 18407, 20173: 15229, 6400: 2986, 11993: 119, 11994: 161, 11996: 111, 12000: 15585, 12004: 98, 12007: 128, 12008: 125, 12012: 121, 12016: 110, 12021: 130, 20214: 37526, 20215: 38159, 12026: 4569, 12028: 104, 12034: 103, 12041: 184, 6402: 2690, 12043: 229, 20251: 203603, 20262: 978, 20271: 7371, 20272: 7363, 20273: 7450, 20274: 7278, 20277: 156379, 20279: 2860, 20280: 714, 20282: 4272, 9464: 49, 20284: 968, 20285: 2502, 20286: 794, 20287: 1146, 20288: 4471, 20289: 3646, 20290: 733, 20291: 764, 20292: 734, 20294: 792, 20295: 2747, 21680: 7152, 20312: 94378, 20345: 104804, 20346: 108119, 20352: 126808, 20353: 1145, 20355: 32595, 20359: 34490, 12168: 1, 20361: 7492, 20362: 132705, 20363: 34086, 20364: 1190, 20365: 7475, 20366: 133658, 12175: 2, 20382: 15760, 20384: 9866, 20385: 9688, 20386: 9773, 20387: 9419, 20388: 9480, 20389: 10094, 20390: 2269, 20391: 66182, 20392: 66718, 20393: 8470, 20394: 66533, 20395: 65542, 20396: 8327, 20397: 8443, 12206: 39, 12207: 43, 12208: 38, 12209: 26, 12210: 36, 12211: 41, 12212: 32, 12213: 63, 12214: 32, 12215: 43, 12216: 40, 12217: 38, 12218: 41, 12219: 43, 12220: 34, 12222: 35, 12223: 34, 12224: 35, 12225: 32, 12226: 51, 12227: 35, 12228: 37, 12229: 37, 12230: 39, 12233: 38, 12234: 37, 12235: 27, 12236: 38, 12237: 27, 13557: 185, 20473: 1, 20474: 22, 20476: 80875, 20477: 1258, 20478: 524, 20508: 110260, 20509: 174, 20510: 833, 20540: 73, 20541: 91, 20542: 77, 20543: 82, 20560: 10077, 20596: 32709, 20617: 381, 20619: 395, 20626: 286, 20628: 308, 20632: 171218, 20633: 252321, 20634: 219454, 20635: 27866, 20636: 41, 20637: 35, 13781: 1476, 12469: 2529, 12470: 2551, 12471: 2491, 12472: 2529, 12477: 2365, 13782: 4190, 13783: 3728, 20731: 209, 20742: 1, 20743: 3374, 20744: 3523, 20750: 2560, 20751: 44509, 4407: 8198, 4518: 8312, 20966: 14613, 20969: 26636, 20970: 14502, 20980: 35217, 20984: 35303, 20993: 16185, 
20994: 9, 20995: 16298, 20996: 16303, 20997: 16295, 20998: 16067, 20999: 16009, 4687: 912661, 21093: 3511, 17171: 2174, 17653: 263271, 21135: 1044, 21136: 48, 21137: 2622, 21138: 72, 21139: 104, 21153: 151170, 21154: 8254, 21155: 8406, 21156: 8396, 21157: 953, 21158: 828, 21159: 788, 21160: 826, 21161: 803, 21162: 155, 21188: 5, 21189: 764129, 21190: 6, 21191: 763261, 21192: 1993, 21193: 40, 21194: 2080, 21195: 1978, 21196: 1993, 21197: 1973, 21200: 1207, 21201: 1797, 21202: 1774, 21203: 15, 21204: 1862, 21205: 1919, 21206: 5, 21207: 6092, 21229: 22, 21230: 24, 21232: 27, 21233: 107, 21234: 16127, 21235: 4778, 21239: 138, 21240: 326, 21241: 288, 21243: 696, 21244: 617, 21245: 298, 21246: 3817, 21247: 3872, 21248: 5577, 21249: 8685, 21250: 8565, 17654: 265390, 21273: 11396, 21274: 7709, 21275: 3, 21276: 9125, 21277: 25893, 21278: 10975, 6361: 15408, 21294: 10, 21295: 17, 21296: 892, 21298: 1, 21299: 1, 21300: 3350, 21301: 2, 21303: 4, 21304: 1, 21305: 1, 21306: 3, 21307: 6, 21308: 3, 21309: 3910, 21310: 4051, 21311: 4012, 21312: 414, 21313: 406, 21314: 416, 21315: 391, 21316: 399, 21317: 199, 21318: 176, 21319: 219, 21325: 841, 21326: 844, 21327: 862, 21328: 5994, 17208: 1166, 21330: 5714, 21331: 581, 21332: 615, 21333: 610, 21363: 9, 21367: 123, 21369: 4, 21388: 65, 21399: 2750, 21400: 2640, 21401: 2623, 21404: 8046, 21408: 7762, 21409: 8104, 21410: 7755, 21411: 8745, 21412: 44419, 21413: 43215, 21475: 385, 21476: 1464, 21477: 377, 21478: 934, 21329: 5695, 21480: 395, 21481: 375, 21482: 388, 21483: 354, 21484: 390, 21485: 375, 21486: 472, 21555: 1598, 21556: 1916, 21557: 1276, 21558: 1987, 21559: 1851, 21560: 1845, 21561: 1418, 21562: 2733, 21563: 3883, 21564: 1932, 21565: 597, 21566: 1945, 21567: 1669, 21568: 1693, 21569: 1553, 21570: 1713, 21571: 1742, 21589: 14187, 21590: 14322, 21591: 13048, 21592: 14337, 21593: 13638, 21594: 14174, 21595: 14070, 21596: 25, 21597: 30, 21598: 26, 21599: 21, 21600: 32, 21601: 27, 21602: 23, 21603: 20, 21604: 9, 21605: 24, 21606: 
66, 21607: 24, 21608: 126, 21609: 5112, 21610: 5885, 21611: 816604, 21612: 10954, 21613: 6, 21614: 473, 21615: 5, 21616: 2593, 21617: 2682, 21618: 2648, 21619: 2784, 21620: 2689, 21621: 2791, 21622: 473, 21623: 2947, 21624: 353, 21625: 2902, 21626: 2799, 21627: 2873, 21628: 2886, 21629: 2988, 21630: 2794, 21631: 2798, 21632: 2781, 21633: 138, 21634: 310, 21635: 358, 21640: 6670, 21647: 114386, 21652: 5374, 21653: 5467, 21654: 5367, 21655: 506, 21660: 27005, 21661: 3242, 21662: 6713, 21663: 7165, 21664: 85667, 21665: 120786, 21666: 35802, 21667: 35655, 21668: 289, 21669: 527, 21670: 304, 21671: 270, 21672: 318, 21673: 366, 21674: 48422, 21675: 18043, 21676: 18408, 21677: 17935, 21678: 46411, 21679: 49572, 5296: 62436, 21681: 9292, 21682: 9377, 21683: 9276, 21684: 7407, 21685: 9327, 21686: 5723, 21687: 48714, 21688: 7176, 21689: 9252, 21690: 2036, 21691: 17547, 21692: 1219, 21693: 884, 21694: 19003, 21695: 1235, 21696: 882, 21697: 18829, 21698: 812, 21699: 838, 21700: 796, 21701: 822, 21702: 771, 21703: 792, 21704: 810, 21705: 821, 21706: 60106, 21707: 112, 21708: 98, 21715: 71386, 21716: 134, 21717: 52, 21718: 10, 21719: 24, 21720: 68, 21721: 6, 21722: 2, 21723: 2623, 21724: 75965, 21725: 59673, 21726: 62603, 21727: 58570, 21729: 90, 21730: 1108, 21731: 79826, 21732: 57453, 21733: 63, 21734: 1113, 21735: 2, 21737: 1106, 21738: 3222, 21739: 22746, 21740: 2, 21741: 2046, 21742: 59, 21743: 2690, 21744: 1830, 21745: 1591, 21746: 43142, 21747: 43345, 21748: 1200, 21749: 415, 21750: 231, 21751: 49140, 13560: 178, 21753: 57, 21755: 917, 21757: 57422, 21758: 57718, 21759: 56471, 21760: 56558, 21761: 57666, 21762: 74607, 21763: 66375, 21764: 48919, 21765: 2592, 21766: 1858, 21767: 376105, 21768: 373692, 21769: 202166, 21770: 198961, 21771: 5271, 21772: 29429, 21773: 11571, 21774: 1062, 21775: 1049, 21776: 8028, 21777: 1057, 21778: 2507, 21779: 207, 21780: 180, 21781: 214, 21782: 193, 21783: 215, 21784: 1, 21785: 1, 21786: 41, 21787: 28, 21789: 162611, 21790: 280425, 21791: 
21436, 21792: 1096, 21793: 15326, 21794: 1235, 21795: 2031, 21800: 1, 21801: 26426, 21802: 1, 21803: 26900, 6365: 16689, 21812: 1553, 6366: 15292, 21814: 1547, 21816: 35597, 21817: 2826, 21818: 35584, 21819: 2742, 21820: 7336, 21821: 7374, 21822: 5327, 21823: 1, 21824: 4, 21825: 5, 21826: 5, 21827: 7750, 21828: 7676, 21829: 7550, 21830: 7672, 21831: 7731, 21832: 7647, 21833: 7813, 21834: 23293, 21835: 23158, 21836: 23692, 21837: 18861, 21838: 6644, 21839: 32, 21840: 36, 21841: 6646, 21842: 6712, 21843: 33, 21844: 6663, 21845: 1554, 21846: 1541, 21847: 1542, 21848: 1593, 21849: 1562, 21850: 1496, 21851: 1513, 21853: 31695, 21860: 1300, 21862: 20518, 21863: 20352, 21869: 22039, 21872: 137, 21873: 143, 21874: 429, 21875: 73129, 21876: 2160, 21877: 69589, 21878: 55684, 21880: 1326, 21881: 186, 21882: 93749, 21883: 56931, 21884: 1, 21885: 361, 21886: 2298, 21887: 2087, 21888: 1969, 21889: 2104, 21890: 2140, 21891: 2066, 21892: 2149, 21893: 94757, 21894: 57668, 21895: 221, 21896: 212, 21897: 199, 21898: 198, 21899: 210, 21900: 167, 21901: 207, 21902: 176, 21903: 1, 21912: 15055, 21913: 15065, 21914: 1391, 21915: 387, 21916: 14692, 21917: 16074, 21918: 1465, 21919: 381, 21920: 26, 21921: 14896, 21922: 13588, 21923: 14975, 21924: 9226, 21950: 1072, 21951: 1150, 21952: 1050, 21953: 763, 21954: 728, 21955: 760, 21956: 2328, 21957: 14449, 21958: 198, 21959: 231, 21960: 252, 21961: 2863, 21962: 2447, 21965: 8584, 21966: 145, 21967: 133, 21968: 134, 21969: 146, 21970: 137, 21971: 137, 21972: 5025, 21973: 2, 21974: 2, 21975: 5039, 21977: 3, 21978: 3, 21979: 553, 21980: 6012, 21981: 5967, 21982: 5996, 21983: 6007, 21984: 5990, 21985: 5864, 21986: 36, 21987: 5534, 21988: 2655, 21990: 2665, 21991: 2780, 21992: 2812, 21993: 2711, 21994: 2762, 21995: 2691, 21996: 1666, 21997: 1571, 21998: 1711, 21999: 1545, 22000: 1617, 22001: 1631, 22002: 1499, 22003: 1609, 6398: 3841, 22006: 5275, 22007: 5230, 22008: 5410, 22009: 5345, 22010: 5382, 22011: 5397, 22012: 115, 22013: 114, 22014: 116, 
22015: 123, 22016: 122, 22017: 13, 22018: 101, 22019: 130, 22020: 110, 22021: 2114, 22022: 2352, 22023: 2211, 22024: 1393, 22025: 2269, 22026: 2155, 22027: 2257, 22028: 2218, 22029: 2208, 22057: 238, 22064: 235, 22065: 217, 22066: 224, 22067: 218, 22068: 225, 22069: 196, 22074: 2, 22075: 2, 22076: 1, 22077: 32, 22078: 1, 22080: 1, 22081: 2, 22082: 301, 22083: 273, 22084: 264, 22085: 594, 22086: 308, 22087: 32, 22088: 124, 22089: 284, 22090: 4936, 22092: 5032, 22094: 16710, 22095: 172, 22096: 10, 22097: 195, 22102: 40819, 22103: 34051, 22104: 80613, 22105: 33440, 22106: 57210, 22107: 9879, 22108: 7561, 22109: 8838, 22110: 9336, 22111: 8731, 22112: 8382, 22113: 19032, 22114: 19176, 22115: 1248, 22116: 18948, 22117: 1112, 22118: 19030, 22119: 536, 22120: 18601, 22122: 185, 22123: 179, 22126: 233, 22127: 128, 22132: 260, 22133: 16089, 22135: 3, 22136: 240, 22137: 16055, 22139: 6, 22140: 22563, 22141: 51583, 22142: 27937, 22143: 471, 22144: 486, 22145: 15, 22146: 471, 22147: 482, 22148: 512, 22149: 536, 22150: 458, 22151: 493, 22152: 2891, 22153: 2007, 22154: 6247, 22155: 5250, 22156: 5703, 22157: 4113, 22158: 1336, 22159: 5174, 22160: 2538, 22161: 7166, 22162: 2204, 22163: 2349, 22164: 4787, 22165: 2273, 22166: 2197, 22167: 2295, 22168: 2231, 22169: 2286, 22170: 1933, 22171: 114, 22172: 1331, 22173: 1450, 22174: 1379, 22175: 1294, 22176: 1269, 22177: 1399, 22178: 1373, 22179: 1293, 22180: 685, 22181: 712, 22182: 661, 22183: 673, 22184: 706, 22185: 692, 22186: 4074, 22187: 841, 22188: 743, 22189: 5100, 22190: 13503, 22191: 1669, 22192: 3042, 22193: 12845, 22194: 47, 22195: 2744, 22196: 10677, 22197: 3051, 22198: 3266, 22199: 5, 22200: 3233, 22201: 8, 22202: 3256, 22203: 5, 22217: 7, 22218: 863, 22219: 28, 22220: 4410, 22221: 16524, 22222: 1379, 22224: 22643, 22225: 7138, 22227: 28, 22228: 300, 22229: 20, 22230: 278, 22231: 30, 22236: 3339, 22237: 1, 22238: 3321, 22239: 3, 22240: 1116, 22241: 8112, 22242: 2132, 22243: 257, 22244: 5550, 22245: 1094, 22246: 75421, 22247: 
2817, 22252: 5, 22254: 53618, 22255: 48310, 22256: 1351, 22257: 30614, 22258: 53700, 22259: 54120, 22260: 29042, 22261: 53419, 22262: 5314, 22263: 4944, 22264: 2229, 22265: 2196, 22266: 132, 22267: 125378, 22268: 139133, 22269: 5193, 22270: 5752, 22271: 5748, 22272: 11373, 22273: 7117, 22275: 6898, 22277: 557, 22278: 2209, 22279: 590, 22280: 2166, 22281: 544, 22282: 761, 22283: 743, 22284: 773, 22285: 129, 22286: 76, 22287: 75, 22288: 129057, 22289: 13522, 22290: 1275, 22291: 1176, 22292: 4211, 22293: 1340, 22294: 1264, 22295: 1214, 22296: 1272, 22297: 1296, 22298: 4045, 22299: 1242, 22300: 1201, 22301: 1305, 22302: 1180, 22303: 1268, 22304: 3974, 22305: 4247, 22306: 1241, 22307: 1273, 22308: 1288, 22309: 1229, 22310: 1304, 22311: 1320, 22312: 4319, 22313: 1216, 22314: 1234, 22315: 4121, 22316: 1681, 22318: 1658, 22320: 7, 22321: 18, 22322: 19956, 22323: 15, 22324: 19972, 22325: 7103, 22326: 616, 22327: 608, 22328: 636, 22329: 368, 22330: 363, 22331: 354, 22332: 8443, 14141: 311, 22334: 91, 22335: 12784, 22336: 4653, 22338: 2835, 22339: 37, 22340: 11, 22341: 15, 22342: 17, 22343: 2657, 22344: 16, 22345: 1, 22346: 39, 22347: 20, 22348: 13, 22349: 6, 22350: 2595, 22351: 8, 22352: 12, 22353: 20, 22354: 12, 22358: 70, 22359: 6609, 22360: 74, 22361: 81, 22362: 861, 22366: 6284, 22371: 6529, 22372: 208, 22373: 631, 22377: 225, 22378: 9, 21479: 411, 22380: 700, 22384: 189, 22387: 5, 22388: 5, 14198: 116, 22393: 73, 14202: 3465, 22396: 15, 22397: 2, 22400: 71, 22404: 2, 22405: 65, 22406: 2273, 22407: 4055, 22408: 5, 22409: 4, 22410: 9, 22411: 2263, 22412: 238, 22413: 13, 22414: 4021, 22418: 2321, 22420: 8, 22421: 224, 22422: 250, 22444: 2, 22448: 2, 22451: 4, 14264: 1634, 14265: 43282, 14266: 1537, 14267: 148, 14268: 86, 14269: 20, 22462: 128, 22463: 26, 22464: 65, 22465: 67, 22466: 1581, 22467: 1540, 22468: 1648, 22469: 1561, 22470: 276, 22471: 44, 22472: 2, 22473: 296, 22474: 494, 14283: 23, 22476: 4, 22477: 41, 22478: 7, 22479: 295, 22480: 496, 22482: 4, 22484: 173, 
22485: 99, 22486: 22, 22487: 11144, 22489: 2, 22490: 2770, 22491: 2906, 22492: 1822, 22493: 1859, 22494: 610, 22496: 582, 22499: 127, 22501: 141, 22503: 1629, 22504: 2358, 22514: 21420, 22516: 21234, 22519: 87, 22521: 80, 22523: 3617, 22530: 3693, 22532: 6505, 22541: 14723, 22542: 1674, 22543: 1607, 22544: 1774, 22545: 519, 22546: 554, 22547: 537, 22548: 1540, 22549: 525, 22550: 406, 22551: 3104, 22552: 22640, 22553: 23611, 22554: 53427, 22555: 173, 22556: 173, 22557: 157, 22558: 4819, 22559: 5038, 22560: 4788, 22561: 54755, 22562: 4777, 22567: 1, 22570: 2, 22571: 2, 22572: 411, 22573: 229, 22575: 2241, 22576: 1, 22578: 89, 22579: 84, 22580: 63, 22581: 66, 22582: 76, 22584: 102, 22585: 86, 22586: 729, 22587: 10345, 22588: 361, 22590: 2244, 22591: 2119, 22592: 2152, 22593: 595, 22595: 4946, 22598: 17684, 22599: 6054, 22600: 5675, 22601: 5512, 22602: 5870, 22603: 5477, 22604: 5762, 22605: 5726, 22606: 5573, 22607: 4243, 22611: 7944, 22612: 6, 22614: 4265, 22615: 2991, 22616: 29, 22617: 33, 22618: 39, 22619: 2, 22620: 3, 22621: 2, 22622: 31, 22624: 45002, 22628: 1022, 22629: 982, 22630: 948, 22632: 1001, 22633: 9133, 22634: 80, 22635: 82, 22636: 61, 22637: 2093, 22638: 2104, 22639: 2149, 22640: 70, 22641: 2156, 22642: 404, 22643: 435, 22644: 423, 22645: 606, 22646: 418, 22647: 399, 22648: 387, 22649: 378, 22650: 16, 22651: 365, 22652: 394, 22653: 372, 22654: 376, 22655: 349, 22656: 40, 22657: 368, 22658: 402, 22659: 374, 22660: 14, 22661: 3, 22662: 2, 22663: 11, 22664: 23, 22665: 15, 22666: 4, 22667: 12, 22668: 613, 22670: 602, 22672: 58708, 22676: 159137, 22677: 61368, 22680: 172697, 22681: 169090, 22682: 160850, 22683: 170437, 22684: 39790, 22685: 39308, 22686: 24323, 22687: 2519, 22700: 43550, 22701: 41796, 22702: 22837, 22703: 43976, 22704: 285, 22705: 2968, 22706: 304, 22707: 3012, 22708: 297, 22713: 17197, 22715: 3573, 22716: 85, 22724: 1155, 22726: 1122, 22727: 801, 22728: 1214, 22729: 760, 22730: 1168, 22735: 3077, 22736: 2541, 22737: 2058, 22738: 64211, 
22739: 3527, 22740: 713, 22741: 4735, 22742: 3495, 22743: 3719, 22744: 6204, 22745: 3522, 22746: 1277, 22747: 1363, 22748: 1245, 22749: 1230, 22750: 1255, 22751: 1112, 22752: 1124, 22753: 2928, 22754: 2811, 22755: 2833, 22756: 2861, 22757: 3103, 22758: 2909, 22759: 3078, 22760: 1699, 22761: 2061, 22762: 1750, 22763: 1907, 22764: 2007, 22765: 4186, 22766: 1791, 22767: 1856, 22768: 2232, 22769: 1663, 22770: 2440, 22771: 2071, 22772: 2093, 22773: 4315, 22774: 793, 22775: 824, 22776: 712, 22777: 700, 22778: 756, 22779: 760, 22780: 729, 22781: 232, 22782: 426, 22783: 451, 22784: 464, 22785: 589, 22786: 591, 22787: 490, 22788: 917, 22789: 870, 22790: 912, 22791: 1046, 22792: 989, 22793: 909, 22794: 804, 22795: 542, 22796: 2126, 22797: 2192, 22798: 2977, 22799: 2037, 22800: 2991, 22801: 1679, 22802: 2269, 22803: 2313, 22804: 2295, 22805: 2132, 22806: 2070, 22807: 2054, 22808: 2025, 22809: 13329, 22810: 13555, 22811: 14049, 22812: 13757, 22813: 55491, 22814: 38317, 22815: 40274, 22816: 3119, 22817: 2793, 22818: 4027, 22819: 4455, 22820: 2342, 22821: 3719, 22822: 3050, 22823: 2737, 22824: 56, 22825: 66, 22826: 67, 22827: 166, 22828: 150, 22829: 192, 22830: 67, 22831: 179, 22832: 37969, 22833: 157, 22834: 158, 22835: 155, 22836: 526, 22837: 190, 22838: 219, 22839: 159, 22840: 223, 22841: 2009, 22842: 59, 22843: 87, 22844: 71, 22845: 54, 22846: 46, 22847: 47, 22848: 72, 22849: 54, 22850: 6773, 22851: 163, 22852: 151, 22853: 129, 22854: 408, 22855: 393, 22856: 400, 22857: 141, 22858: 387, 22859: 925, 22860: 113, 22861: 95, 22862: 105, 22863: 33, 22864: 26, 22865: 24, 22866: 117, 22867: 41, 22868: 903, 22869: 119, 22870: 98, 22871: 96, 22872: 23, 22873: 34, 22874: 37, 22875: 92, 22876: 28, 22877: 521, 22878: 43, 22879: 71, 22880: 48, 22881: 23, 22882: 26, 22883: 25, 22884: 62, 22885: 50, 22886: 428, 22887: 74, 22888: 63, 22889: 61, 22890: 30, 22891: 34, 22892: 39, 22893: 56, 22894: 40, 22895: 2223, 22896: 179, 22897: 168, 22898: 185, 22899: 54, 22900: 60, 22901: 75, 22902: 35, 
22903: 69, 22904: 1128, 22905: 69, 22906: 81, 22907: 62, 22908: 69, 22909: 55, 22910: 64, 22911: 84, 22912: 60, 22913: 1968, 22914: 24073, 22915: 24610, 22916: 24891, 22917: 304, 22919: 5202, 22920: 60171, 22921: 263, 22922: 6924, 22923: 6874, 22924: 6962, 22925: 6933, 22928: 132, 22934: 1991, 22935: 1410, 22936: 1418, 22937: 1361, 22938: 1983, 22939: 2265, 22940: 310, 22941: 2241, 6558: 30690, 6559: 20322, 6560: 28279, 22945: 304, 22946: 132, 6563: 269920, 22948: 12214, 22949: 8859, 22950: 8819, 22951: 8060, 22952: 14355, 22953: 12537, 22954: 14008, 22955: 8972, 22956: 5055, 22957: 5219, 22958: 2941, 6557: 19880, 22960: 4607, 22961: 4934, 22962: 4624, 22963: 23, 22964: 31, 22965: 35, 22966: 272, 22967: 208, 22968: 197, 22969: 34, 22970: 184, 22943: 1699, 22972: 108, 22973: 93, 22974: 87, 20213: 36924, 22976: 1, 22944: 281, 22978: 1, 22979: 214, 22980: 232, 22981: 217, 22982: 190, 22983: 227, 22984: 205, 22985: 192, 22986: 218, 22987: 18759, 22988: 14508, 22989: 8957, 22990: 7574, 22991: 7331, 22992: 7316, 22993: 10819, 22994: 6872, 22947: 8783, 22996: 143, 22997: 314, 22998: 4258, 22999: 4245, 6616: 81995, 23001: 479, 23002: 4434, 23003: 368, 23005: 170, 23006: 3, 23007: 150, 23008: 1, 23011: 3013, 23012: 5168, 23013: 2457, 23014: 2213, 23015: 4300, 23016: 5931, 23017: 3294, 23018: 4228, 23019: 167, 23020: 744, 23021: 137, 23022: 167, 23023: 141, 23024: 132, 23025: 140, 23026: 199, 23032: 1, 23035: 14, 23036: 17, 23037: 14, 23038: 12, 23039: 67, 23040: 72, 23041: 915, 23042: 979, 23043: 1149, 23044: 1189, 23045: 1138, 23046: 1183, 23047: 1216, 23048: 1180, 23049: 1072, 23050: 48, 23051: 50, 23052: 32, 23053: 23, 23054: 20, 23055: 39, 23056: 19, 23057: 131, 23058: 140, 23060: 116, 23062: 76, 23065: 692, 22959: 4960, 23068: 60, 23087: 2027, 23089: 17, 23090: 41, 23091: 14053, 23093: 3, 23095: 48, 23096: 24, 23098: 28, 23099: 54, 23100: 13951, 23101: 53, 23102: 133, 23103: 27, 23104: 23, 23105: 1033, 23106: 45, 23107: 1075, 23108: 26, 23109: 49, 23110: 10, 23111: 
1119, 23113: 17, 23114: 1, 23115: 19, 23116: 13, 23117: 48, 23118: 1047, 23119: 44, 23120: 128, 23121: 21, 23122: 12, 23123: 1108, 23124: 45, 23125: 1084, 23126: 14, 23127: 51, 23128: 14, 23129: 1109, 23131: 19, 23133: 12, 23134: 10, 23135: 35, 23136: 1135, 23137: 13096, 23138: 12990, 22971: 84, 23141: 13365, 23142: 3002, 23143: 240598, 23144: 236260, 23145: 2597, 23146: 602, 23147: 591, 23148: 617, 23150: 579, 23151: 651, 23152: 617, 23153: 628, 23160: 52122, 23161: 53782, 22975: 46, 23164: 7955, 23165: 7996, 23166: 10124, 23167: 7925, 23168: 11222, 23169: 7806, 23170: 10653, 23171: 7710, 23172: 7755, 23173: 7649, 23174: 10777, 14983: 897, 23176: 287, 14985: 919, 23178: 522, 14988: 873, 23182: 237, 23184: 218, 23185: 4, 23188: 1, 23191: 2907, 23192: 2686, 23193: 2762, 23194: 2797, 23195: 2781, 23196: 2886, 15005: 893, 15006: 877, 23199: 2697, 23200: 2978, 23201: 2743, 15010: 916, 23203: 9041, 23204: 5072, 23205: 8732, 23206: 8719, 23207: 1586, 15016: 892, 15017: 881, 23210: 1564, 23211: 40, 23212: 47, 23213: 44, 23214: 1095, 23216: 2292, 23219: 14, 23220: 110, 23221: 100690, 23222: 100681, 23223: 901, 23224: 97604, 23228: 488, 23229: 3, 23230: 4, 23231: 565, 23232: 2, 23234: 185, 23251: 373, 23254: 5, 23258: 99, 23260: 1, 23261: 7, 23279: 59, 23282: 3, 22995: 131, 23286: 271, 23289: 9, 23300: 87, 23303: 23, 23000: 4633, 23314: 266, 23317: 23, 23321: 60, 23324: 8, 23328: 315, 23331: 8, 23335: 318, 23338: 10, 23339: 1, 23342: 189, 23345: 6, 23349: 56, 23352: 8, 23353: 1, 23356: 415, 23359: 13, 23360: 1, 23363: 65, 23366: 12, 23368: 29445, 23369: 30880, 23370: 43, 23371: 15212, 23372: 8, 23373: 1367, 23374: 5, 23375: 313, 23376: 1773, 23377: 13, 23378: 3, 23379: 3587, 23380: 5, 23384: 5, 23389: 1, 23390: 10261, 23391: 20, 23392: 7, 23393: 20, 23394: 10184, 23395: 929, 23396: 13, 23401: 51952, 23402: 315, 23403: 2441, 23404: 323, 23405: 51932, 23406: 600, 7023: 2476, 23408: 1179, 23409: 5, 23410: 2, 23411: 10408, 23412: 18, 23415: 2344, 23416: 38, 23417: 584, 23418: 
218, 23419: 1520, 23420: 980, 23423: 23, 23424: 3934, 23425: 1, 23427: 77, 23429: 248, 23430: 33958, 23431: 166, 23432: 1026, 23433: 165, 23434: 34126, 23435: 445, 23436: 3, 23437: 4, 23438: 178261, 23439: 2471, 23440: 717, 23441: 118824, 23442: 176, 23443: 125, 23444: 171, 23445: 193, 23446: 204, 23447: 37, 23448: 45, 23449: 19, 23450: 21, 23451: 9, 23452: 15, 23453: 28, 23454: 116652, 23455: 2912, 23456: 4787, 23468: 2277, 23469: 203, 23475: 13, 23477: 15, 23480: 11, 23482: 26, 23485: 19, 23486: 20, 23487: 16, 23488: 13, 23489: 26, 23490: 9, 23492: 22, 10742: 1456, 23495: 16, 23496: 16, 23497: 15, 23498: 16, 23499: 17, 23507: 217, 23508: 14, 23511: 53, 23512: 7, 23515: 47, 23516: 3, 23518: 1, 23519: 58, 23520: 2, 23536: 2525, 23537: 2509, 23538: 39, 23539: 2707, 23540: 2724, 23550: 170, 23551: 15009, 23552: 30963, 23553: 2204, 23554: 1, 23561: 42842, 23562: 43305, 23568: 3, 23570: 2, 23571: 252, 23572: 6, 23573: 1, 23575: 4, 23576: 30, 15385: 326, 15386: 396, 23579: 8, 23580: 28, 23581: 2, 15390: 756, 23584: 1, 23585: 272, 23587: 1, 23588: 225, 23589: 140, 23590: 3, 23592: 132, 23596: 2, 23597: 7, 23598: 1, 23601: 11, 23602: 2, 5299: 68959, 23606: 133, 23609: 6, 23616: 1289, 23618: 675, 23620: 15, 23622: 86, 23623: 29, 23624: 149, 23625: 7319, 23626: 89690, 23627: 244, 23628: 6, 23629: 28, 23630: 106, 23631: 119, 23632: 51, 23633: 119, 23634: 140, 23635: 17625, 23636: 30, 23637: 32, 23638: 99, 23639: 566, 23640: 543, 23641: 525, 23642: 70443, 23643: 521, 23644: 1538, 23645: 61, 23646: 1699, 23647: 1756, 23648: 1652, 23649: 1935, 23650: 3099, 23651: 1838, 23652: 1621, 23653: 1762, 23654: 1897, 23655: 1765, 23656: 1726, 23657: 1858, 23658: 1771, 23659: 607, 23660: 576, 23661: 599, 23662: 613, 23663: 606, 23664: 585, 23665: 631, 23666: 610, 23667: 482, 23668: 596, 23669: 564, 23670: 292, 23672: 221, 23674: 255, 23675: 244, 23683: 234, 23684: 219, 23685: 264, 23686: 218, 23687: 248, 23688: 284, 23689: 193, 23690: 244, 23691: 7628, 23692: 331, 23693: 352, 15503: 470, 
15504: 429, 15505: 375, 15506: 411, 23699: 1202, 23700: 1211, 23701: 1251, 15510: 416, 15511: 413, 15512: 398, 15513: 410, 15514: 447, 15515: 450, 15516: 422, 23709: 5791, 23710: 1124, 23712: 1226, 23714: 1265, 23067: 2023, 23716: 291, 23717: 111, 23718: 353, 23719: 309, 23720: 312, 23721: 8861, 23722: 50304, 23723: 27097, 23724: 3339, 23725: 23426, 23726: 18457, 23727: 26, 23728: 16880, 23729: 18882, 23730: 9358, 23731: 3, 15540: 154, 23733: 4, 15542: 102, 15543: 103, 15544: 94, 15545: 120, 15546: 87, 15547: 73, 15548: 79, 23741: 3847, 23742: 2441, 15551: 104, 23744: 2406, 23750: 23, 23754: 2742, 23755: 2698, 23756: 2648, 23757: 2603, 23758: 2563, 23759: 2699, 23760: 2703, 23761: 2666, 23762: 2647, 23763: 2633, 23764: 1424, 23765: 1354, 15575: 524, 23770: 16, 23777: 17, 23784: 7, 23791: 15, 23798: 8, 23801: 171, 23802: 170, 23803: 164, 23805: 7720, 23806: 2, 23807: 4, 23808: 38, 23809: 807, 23810: 38, 23811: 168, 23812: 156, 23813: 792, 23814: 7760, 23815: 3975, 23816: 4234, 23817: 4117, 23818: 4031, 23819: 3964, 23820: 3885, 23821: 3970, 23822: 4183, 23823: 4063, 23824: 4031, 23825: 4010, 23826: 261, 23827: 363, 23828: 108, 23830: 106, 23831: 99, 23832: 123, 23833: 122, 23834: 119, 23835: 112, 23836: 95, 20360: 1216, 15698: 7810, 15699: 484722, 15700: 7944, 15701: 487389, 15702: 466859, 15703: 483992, 15704: 483641, 15705: 490513, 15706: 459004, 15707: 482781, 15708: 483071, 13554: 171, 13555: 155, 13556: 182, 9461: 17587, 13558: 155, 13559: 149, 21752: 169, 15827: 445, 15829: 1674, 15830: 24, 13561: 163, 15833: 17, 15835: 2, 15837: 31, 7646: 1, 15839: 1171, 7648: 1, 7649: 2, 7650: 2, 7651: 5, 15845: 10, 7654: 2, 15847: 91, 7656: 1, 7657: 2, 7658: 3, 12199: 12, 12200: 10, 12201: 2, 12202: 39, 12203: 31, 7693: 967, 7694: 333, 12205: 40, 7696: 317, 7697: 310, 7698: 647, 7699: 702, 7700: 643, 7701: 693, 7702: 580, 7704: 560, 7707: 589, 7708: 615, 7709: 585, 7710: 731, 7711: 650, 7712: 690, 7713: 669, 7714: 637, 15908: 62664, 9478: 127316, 23139: 12930, 16011: 21834, 
16070: 36, 16071: 4227, 23163: 8043, 16150: 1, 16152: 308, 16153: 88, 16154: 297, 23175: 599, 23177: 538, 16204: 3157, 16208: 634271, 16250: 2679, 16251: 2539, 16252: 2562, 16263: 11325, 15384: 334, 23197: 2801, 23198: 2793, 13643: 3763, 13644: 3519, 23202: 2963, 16339: 665, 16340: 2189, 16341: 672, 16342: 2186, 16343: 637, 16350: 26008, 16352: 26249, 23208: 971, 23209: 1480})

In [11]:
for key in countdic:
    print key, len(countdic[key])


site_id 4642
app_domain 548
C19 66
site_domain 7564
device_type 5
C17 407
C16 9
device_ip 6134351
C14 2470
C15 8
device_conn_type 4
C1 7
app_category 36
site_category 26
C20 172
C21 55
banner_pos 7
app_id 8291
day 7
device_id 2484613
hour 24
device_model 8162
C18 4

Drop feature values that never appear in the test set


In [12]:
%%time
# For every test-set column, collect the set of values found in that column.
cldictest = {col: set(X_t.col(col)) for col in X_t.colnames}


CPU times: user 19.3 s, sys: 5.4 s, total: 24.7 s
Wall time: 27.5 s

In [13]:
import copy
# Keep an untouched snapshot of countdic: later cells delete entries from
# countdic in place and compare its per-feature sizes against this copy.
countdic_cp = copy.deepcopy(countdic)

In [19]:
# Feature names whose values will be filtered against the test set.
keylist = list(countdic)

In [20]:
# Exclude 'day' from the train/test value filtering done in the next cell.
# NOTE(review): presumably because test days never match training days - confirm.
# Re-running this cell raises ValueError once 'day' has already been removed.
keylist.remove('day')

In [24]:
%%time
# Remove from countdic every value that does not occur in the test set.
for key in keylist:
    seen_in_test = cldictest[key]
    train_counts = countdic[key]
    # Collect first, delete afterwards: a dict must not shrink while it is iterated.
    key_remove = [akey for akey in train_counts if akey not in seen_in_test]
    for akey in key_remove:
        del train_counts[akey]


CPU times: user 7.97 s, sys: 76.6 ms, total: 8.04 s
Wall time: 8.01 s

In [25]:
for key in countdic:
    print key, len(countdic[key])


site_id 2694
app_domain 180
C19 45
site_domain 3176
device_type 4
C17 183
C16 9
device_ip 426610
C14 904
C15 8
device_conn_type 4
C1 7
app_category 28
site_category 22
C20 162
C21 34
banner_pos 6
app_id 3307
day 7
device_id 69606
hour 24
device_model 5370
C18 4

Represent the removed values with a shared 'other' category


In [26]:
%%time
# Give every feature whose size differs from the snapshot a zero-count 'other' entry.
for key in countdic:
    if len(countdic[key]) == len(countdic_cp[key]):
        # Nothing was removed from this feature, so no placeholder is needed.
        continue
    countdic[key]['other'] = 0


CPU times: user 54 µs, sys: 9 µs, total: 63 µs
Wall time: 80.1 µs

In [27]:
for key in countdic:
    print key, len(countdic[key])


site_id 2695
app_domain 181
C19 46
site_domain 3177
device_type 5
C17 184
C16 9
device_ip 426611
C14 905
C15 8
device_conn_type 4
C1 7
app_category 29
site_category 23
C20 163
C21 35
banner_pos 7
app_id 3308
day 7
device_id 69607
hour 24
device_model 5371
C18 4

one hot encoding


In [28]:
%%time
# Assign one running integer index (starting at 1) to every (feature, value)
# pair; the counter i keeps increasing across features so indices never repeat.
indexdic = {}
i = 0
for key in countdic:
    value_to_index = {}
    indexdic[key] = value_to_index
    for akey in countdic[key]:
        i += 1
        # Values are stored under their string form.
        value_to_index[str(akey)] = i


CPU times: user 991 ms, sys: 25.6 ms, total: 1.02 s
Wall time: 999 ms

In [29]:
for key in indexdic:
    print key, len(indexdic[key])


site_id 2695
C20 163
C19 46
C21 35
device_type 5
C17 184
C16 9
device_ip 426611
C14 905
C15 8
device_conn_type 4
C1 7
app_category 29
site_category 23
app_domain 181
site_domain 3177
banner_pos 7
app_id 3308
day 7
device_id 69607
hour 24
device_model 5371
C18 4

In [32]:
# Persist the trimmed index mapping. The with-block closes the file even if
# pickling fails part-way, so no half-open handle is left behind.
with open('indexdictrimed.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)  # -1 selects the highest available pickle protocol

In [2]:
# Reload the trimmed index mapping from disk; the with-block guarantees the
# file is closed even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code - only load files this
# notebook produced itself.
with open('indexdictrimed.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [3]:
# Total number of indexed values summed over all features.
sumvalue = sum(len(value_to_index) for value_to_index in indexdic.itervalues())

In [4]:
# Display the total number of indexed values across all features.
sumvalue


Out[4]:
512410

In [ ]:

Delete rare values (values with very low occurrence counts)

TODO: do the rare values really need to be deleted?


In [7]:
%%time
# Remove every value whose stored count is exactly 1.
for key in countdic:
    value_counts = countdic[key]
    # Collect first, delete afterwards: a dict must not shrink while it is iterated.
    key_remove = [akey for akey in value_counts if value_counts[akey] == 1]
    for akey in key_remove:
        del value_counts[akey]

In [8]:
for key in countdic:
    print key, len(countdic[key])


site_id 4098
app_domain 420
C19 66
site_domain 5668
device_type 5
C17 406
C16 9
device_ip 3079616
C14 2426
C15 8
device_conn_type 4
C1 7
app_category 33
site_category 26
C20 171
C21 55
banner_pos 7
app_id 6678
day 7
device_id 915086
hour 24
device_model 7350
C18 4

In [33]:
%%time
# Remove every value whose stored count is below 9.
# NOTE(review): the 'other' placeholder is inserted with a count of 0, so this
# threshold deletes it as well - confirm that is intended.
for key in countdic:
    value_counts = countdic[key]
    # Collect first, delete afterwards: a dict must not shrink while it is iterated.
    key_remove = [akey for akey in value_counts if value_counts[akey] < 9]
    for akey in key_remove:
        del value_counts[akey]


CPU times: user 387 ms, sys: 19.2 ms, total: 406 ms
Wall time: 402 ms

In [34]:
for key in countdic:
    print key, len(countdic[key])


site_id 2448
app_domain 138
C19 44
site_domain 2712
device_type 4
C17 180
C16 9
device_ip 162776
C14 876
C15 8
device_conn_type 4
C1 7
app_category 26
site_category 21
C20 160
C21 34
banner_pos 6
app_id 2570
day 7
device_id 14336
hour 24
device_model 5059
C18 4

In [ ]:

Smoothed click likelihood per feature value


In [11]:
# Smoothing constants for the click-rate formula below: the global rate is
# weighted by l / (count/9. + m).
m, l = 2, 1

In [12]:
# Global click rate: sum of the training labels divided by the number of rows.
# The float() makes this true division under Python 2 even when the labels are
# stored as integers (int / int would otherwise floor to 0).
glh = np.sum(y_train) / float(y_train.shape[0])

In [13]:
# Display the global click rate computed in the previous cell.
glh


Out[13]:
0.1698667791732506

In [14]:
# Load the per-feature value counts from disk; the with-block guarantees the
# file is closed even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open('countdic.pkl', 'rb') as f:
    countdic = cPickle.load(f)

In [15]:
# Load the per-feature click totals from disk; the with-block guarantees the
# file is closed even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open('clickdic.pkl', 'rb') as f:
    clickdic = cPickle.load(f)

In [16]:
# Will map column name -> {value: smoothed click rate}; filled by the next cell.
clickratedic = dict()

In [17]:
%%time
# Smoothed click rate for every value of every training column: a blend of the
# global rate glh and the value's own click/count ratio.
# NOTE(review): the 9. divisor is not explained anywhere in the visible cells.
for col in X_train.colnames:
    rates = defaultdict(float)
    clickratedic[col] = rates
    for key in countdic[col]:
        count = float(countdic[col][key])
        click = clickdic[col][key]
        # Weight of the global rate; it gets smaller as count grows.
        prior_weight = l/(count/9.+m)
        rates[key] = prior_weight*glh + (1.-prior_weight)*(click/count)


CPU times: user 30.6 s, sys: 887 ms, total: 31.5 s
Wall time: 31.2 s

In [18]:
# Spot-check the smoothed click rates of the low-cardinality feature 'C1'.
clickratedic['C1']


Out[18]:
defaultdict(<type 'float'>, {1001: 0.033816014782281821, 1002: 0.20939268583620738, 1005: 0.1696148310451287, 1007: 0.039964964276398458, 1008: 0.1203785092869926, 1010: 0.093813344660555534, 1012: 0.15896939513783689})

In [19]:
# Persist the smoothed click rates. The with-block closes the file even if
# pickling fails part-way, so no half-open handle is left behind.
with open('clickratedic.pkl', 'wb') as f:
    cPickle.dump(clickratedic, f, -1)  # -1 selects the highest available pickle protocol

In [36]:
# Number of entries in colnames.
# NOTE(review): colnames is not defined in the visible cells - presumably the
# table's column names; confirm it exists after a fresh kernel restart.
len(colnames)


Out[36]:
23

In [54]:
# Global-rate term of the smoothing formula in the limiting case count = 0,
# where the weight l/(count/9.+m) reduces to l/m. float() avoids Python 2
# integer division of the two int constants.
(float(l)/m)*glh


Out[54]:
0.084933389586625302

In [55]:
# Sanity check: with l=1 and m=2 the expression in the previous cell equals glh/2.
glh/2


Out[55]:
0.084933389586625302

just for testing


In [39]:
%%time
# Quick trial run: add only the first 3001 training rows to the counters.
i = 0
rows_and_targets = izip(X_train.iterrows(), y_train.iterrows())
for i, (row, target) in enumerate(rows_and_targets, 1):
    for name in colnames:
        value = row[name]
        countdic[name][value] += 1
        clickdic[name][value] += target[0]
    # i counts the rows processed so far; stop after row 3001.
    if i > 3000:
        break


CPU times: user 126 ms, sys: 445 µs, total: 126 ms
Wall time: 126 ms
%%time
i=0
for row, target in izip(X_train.iterrows(),y_train.iterrows()):
    print row, target[0]
    i+=1
    if i > 300:
        break

In [ ]: