In [1]:
import tables
import csv
import time
import numpy as np
import cPickle
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from itertools import izip
import operator
import copy
In [2]:
file_handler = tables.open_file("click_data.h5", mode = "r")
In [3]:
root = file_handler.root
In [4]:
X = file_handler.root.train.train_raw.X
In [5]:
y = file_handler.root.train.train_raw.y
In [6]:
X_t = file_handler.root.test.test_raw.X_t
In [7]:
X_train = file_handler.root.train.train_raw.X_train
In [8]:
y_train = file_handler.root.train.train_raw.y_train
In [9]:
X_valid = file_handler.root.train.train_raw.X_valid
In [10]:
y_valid = file_handler.root.train.train_raw.y_valid
In [15]:
names = X.colnames
In [16]:
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
In [6]:
cldic = {}
In [7]:
%%time
# Collect the distinct values of every column of the full training table.
for col in X.colnames:
    column_values = X.col(col)
    cldic[col] = np.unique(column_values)
In [17]:
indexdic = {}
In [18]:
%%time
# Give every (column, value) pair one running 1-based index shared across
# all columns; `i` ends up holding the last index handed out.
i = 0
for key in names:
    indexdic[key] = {}
    for i, ele in enumerate(cldic[key], i + 1):
        indexdic[key][str(ele)] = i
In [19]:
i
Out[19]:
In [22]:
indexdic['banner_pos']
Out[22]:
In [23]:
# Save the full index mapping; `with` closes the file even if dump raises.
with open('indexdicall.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)
In [2]:
# Reload the full index mapping; `with` closes the file even if load raises.
with open('indexdicall.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [3]:
names = ['day', 'hour', 'banner_pos', 'site_category', 'app_category',
'device_type', 'device_conn_type',
'C1', 'C15', 'C16', 'C18', 'C20']
In [4]:
%%time
# Drop the mapping of every column that is not listed in `names`.
key_remove = [akey for akey in indexdic.keys() if not (akey in names)]
for akey in key_remove:
    indexdic.pop(akey)
In [6]:
%%time
# Renumber the surviving values in place with a fresh running 1-based index.
# Only values of existing keys are rewritten, so iterating the dict is safe.
i = 0
for key in names:
    column_index = indexdic[key]
    for i, akey in enumerate(column_index, i + 1):
        column_index[akey] = i
In [8]:
i
Out[8]:
In [9]:
# Save the reduced mapping; `with` closes the file even if dump raises.
with open('indexdiconehot.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)
In [ ]:
In [5]:
# Reload the full index mapping; `with` closes the file even if load raises.
with open('indexdicall.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [6]:
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
In [7]:
%%time
# Distinct values of each listed column in the test table.
cldictest = {}
for col in names:
    test_column = X_t.col(col)
    cldictest[col] = set(test_column)
In [8]:
import copy
indexdic_cp = copy.deepcopy(indexdic)
In [9]:
# Convert the test-set values to strings so they compare equal to the
# string keys stored in indexdic.
for key in cldictest:
    cldictest[key] = set(map(str, cldictest[key]))
In [10]:
cldictest['hour']
Out[10]:
In [12]:
indexdic['hour']
Out[12]:
In [11]:
names.remove('day')
Remove categories that never appear in the test set (the `day` column is skipped)
In [13]:
%%time
# For every column, delete the values that do not occur in the test set.
for key in names:
    train_index, test_values = indexdic[key], cldictest[key]
    key_remove = [akey for akey in train_index if akey not in test_values]
    for akey in key_remove:
        train_index.pop(akey)
In [14]:
# Report how many values remain per column.
for key, mapping in indexdic.items():
    print('%s %d' % (key, len(mapping)))
Add an `other` entry to every column that lost at least one category
In [15]:
%%time
# A column whose size differs from the untouched copy lost some values;
# give it an 'other' entry to stand in for them.
for key, kept in indexdic.items():
    lost_values = len(kept) != len(indexdic_cp[key])
    if lost_values:
        kept['other'] = 0
One-hot encoding: assign a unique index to every remaining (column, value) pair
In [16]:
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
In [18]:
%%time
# Build the trimmed mapping: one running 1-based index over all columns,
# including any 'other' entries present in indexdic.
indexdictrim = {}
i = 0
for key in names:
    trimmed = {}
    for akey in indexdic[key]:
        i += 1
        trimmed[str(akey)] = i
    indexdictrim[key] = trimmed
In [19]:
i
Out[19]:
In [24]:
indexdictrim['site_category']
Out[24]:
In [20]:
# Save the trimmed mapping; `with` closes the file even if dump raises.
with open('indexdicalltrimed.pkl', 'wb') as f:
    cPickle.dump(indexdictrim, f, -1)
In [8]:
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
In [7]:
# One zero-defaulting counter per column: occurrences and clicks per value.
countdic = dict((col, defaultdict(int)) for col in names)
clickdic = dict((col, defaultdict(int)) for col in names)
%%time
# Walk features and labels in lockstep, tallying occurrences and clicks
# for the value of every listed column.
for row, target in izip(X.iterrows(), y.iterrows()):
    clicked = target[0]
    for name in names:
        value = row[name]
        countdic[name][value] += 1
        clickdic[name][value] += clicked
In [10]:
# Save the per-value counts; `with` closes the file even if dump raises.
with open('countdicall.pkl', 'wb') as f:
    cPickle.dump(countdic, f, -1)
In [11]:
# Save the per-value click totals; `with` closes the file even if dump raises.
with open('clickdicall.pkl', 'wb') as f:
    cPickle.dump(clickdic, f, -1)
In [12]:
del clickdic
In [ ]:
In [7]:
# Reload the per-value counts; `with` closes the file even if load raises.
with open('countdicall.pkl', 'rb') as f:
    countdic = cPickle.load(f)
In [ ]:
# Reload the per-value click totals; `with` closes the file even if load raises.
with open('clickdicall.pkl', 'rb') as f:
    clickdic = cPickle.load(f)
In [9]:
countdic_cp1 = copy.deepcopy(countdic)
countdic_cp2 = copy.deepcopy(countdic)
In [10]:
# Add the test-set occurrences on top of the training counts.
for row in X_t.iterrows():
    for name in names:
        value = row[name]
        countdic[name][value] += 1
In [11]:
%%time
# Values whose combined count is below 11 are dropped from the training-only
# copy (values that exist only in the test set are simply not there).
for key in countdic:
    key_remove = [akey for akey, value in countdic[key].iteritems()
                  if not value >= 11]
    train_counts = countdic_cp1[key]
    for akey in key_remove:
        if akey in train_counts:
            train_counts.pop(akey)
In [12]:
# Report how many values remain per column after pruning.
for key in names:
    remaining = len(countdic_cp1[key])
    print('%s %d' % (key, remaining))
In [12]:
%%time
# Columns that shrank relative to the untouched copy get an 'other' entry.
for key, kept in countdic_cp1.items():
    if len(kept) != len(countdic_cp2[key]):
        kept['other'] = 0
In [13]:
%%time
# Rebuild the index mapping from the pruned counts with a running 1-based index.
indexdic = {}
i = 0
for key in names:
    indexdic[key] = {}
    for i, akey in enumerate(countdic_cp1[key], i + 1):
        indexdic[key][str(akey)] = i
In [14]:
i
Out[14]:
In [19]:
# Save the pruned mapping; `with` closes the file even if dump raises.
with open('indexdicless.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)
In [10]:
# Tally test-set occurrences twice: into a test-only counter (countdict)
# and on top of the existing counts (countdic).
countdict = dict((col, defaultdict(int)) for col in names)
for row in X_t.iterrows():
    for name in names:
        value = row[name]
        countdict[name][value] += 1
        countdic[name][value] += 1
In [11]:
%%time
# Drop values with a combined count below 11 from the training-only copy,
# but keep any value that occurs in the test set.
for key in names:
    key_remove = [akey for akey, value in countdic[key].iteritems()
                  if value < 11]
    for akey in key_remove:
        if akey in countdic_cp1[key] and akey not in countdict[key]:
            del countdic_cp1[key][akey]
In [12]:
# Report how many values remain per column after pruning.
for key in names:
    remaining = len(countdic_cp1[key])
    print('%s %d' % (key, remaining))
In [13]:
%%time
# Columns that shrank relative to the untouched copy get an 'other' entry.
for key, kept in countdic_cp1.items():
    shrunk = len(kept) != len(countdic_cp2[key])
    if shrunk:
        kept['other'] = 0
In [14]:
%%time
# Rebuild the index mapping from the pruned counts with a running 1-based index.
indexdic = {}
i = 0
for key in names:
    column_index = {}
    for akey in countdic_cp1[key]:
        i += 1
        column_index[str(akey)] = i
    indexdic[key] = column_index
In [15]:
i
Out[15]:
In [16]:
# Save the pruned mapping; `with` closes the file even if dump raises.
with open('indexdicless2.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)
In [ ]:
In [9]:
# Add the test-set occurrences on top of the existing counts.
for row in X_t.iterrows():
    for name in names:
        value = row[name]
        countdic[name][value] += 1
In [10]:
countdic_cp1 = copy.deepcopy(countdic)
countdic_cp2 = copy.deepcopy(countdic)
In [11]:
%%time
# Drop every value with a count below 11 from the working copy.
for key, counts in countdic.items():
    key_remove = [akey for akey, value in counts.iteritems() if value < 11]
    pruned = countdic_cp1[key]
    for akey in key_remove:
        del pruned[akey]
In [12]:
# Report how many values remain per column after pruning.
for key in names:
    remaining = len(countdic_cp1[key])
    print('%s %d' % (key, remaining))
In [13]:
%%time
# Columns that shrank relative to the untouched copy get an 'other' entry.
for key, kept in countdic_cp1.items():
    if len(countdic_cp2[key]) != len(kept):
        kept['other'] = 0
In [14]:
%%time
# Rebuild the index mapping from the pruned counts with a running 1-based index.
indexdic = {}
i = 0
for key in names:
    indexdic[key] = {}
    for i, akey in enumerate(countdic_cp1[key], i + 1):
        indexdic[key][str(akey)] = i
In [15]:
i
Out[15]:
In [16]:
# Save the pruned mapping; `with` closes the file even if dump raises.
with open('indexdicless3.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)
In [ ]:
Just for testing: exploratory per-day comparison of value counts and clicks for a single column
In [53]:
# Load the per-day row information; `with` closes the file even if load raises.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)
In [54]:
dayrows
Out[54]:
In [55]:
allkeys = []
allkeysset = set()
In [56]:
name = 'app_id'
In [57]:
countdic = {}
clickdic = {}
In [58]:
countlist = {}
clicklist = {}
In [59]:
countlistexist = {}
clicklistexist = {}
In [75]:
countlistnonexist = {}
clicklistnonexist = {}
Select the day to analyse
In [67]:
day = 2
In [68]:
%%time
# Row range of the selected day: everything after the first `day - 1` entries
# of dayrows up to the first `day` entries
# (assumes dayrows holds per-day row counts in table order — TODO confirm).
day_start = sum(dayrows[:day - 1])
day_stop = sum(dayrows[:day])
day_counts = countdic[day] = defaultdict(int)
day_clicks = clickdic[day] = defaultdict(int)
for row, target in izip(X.iterrows(start=day_start, stop=day_stop),
                        y.iterrows(start=day_start, stop=day_stop)):
    value = row[name]
    day_counts[value] += 1
    day_clicks[value] += target[0]
In [69]:
keys = countdic[day].keys()
In [76]:
keys1 = countdic[day-1].keys()
In [70]:
# Split today's keys into those not yet in allkeysset and those already in it,
# then store log(count) and log(clicks) for each group.
fresh_keys = [key for key in keys if key not in allkeysset]
known_keys = [key for key in keys if key in allkeysset]
countlist[day] = np.log([countdic[day][key] for key in fresh_keys])
clicklist[day] = np.log([clickdic[day][key] for key in fresh_keys])
countlistexist[day] = np.log([countdic[day][key] for key in known_keys])
clicklistexist[day] = np.log([clickdic[day][key] for key in known_keys])
# log(0) is -inf for values with no clicks; those entries are set to 0.
clicklist[day][np.isinf(clicklist[day])] = 0
clicklistexist[day][np.isinf(clicklistexist[day])] = 0
In [80]:
# Keys present on the previous day but absent today. `keys` comes from
# dict.keys(), a list under Python 2, so testing membership against it for
# every previous-day key is quadratic; build a set once instead.
current_keys = set(keys)
vanished_keys = [key for key in keys1 if key not in current_keys]
countlistnonexist[day-1] = np.log([countdic[day-1][key] for key in vanished_keys])
clicklistnonexist[day-1] = np.log([clickdic[day-1][key] for key in vanished_keys])
# log(0) is -inf for values with no clicks; those entries are set to 0.
clicklistnonexist[day-1][np.isinf(clicklistnonexist[day-1])] = 0
In [71]:
allkeys += keys
In [72]:
allkeysset = set(allkeys)
In [83]:
countlistnonexist
Out[83]:
In [86]:
######################################
# Scatter plot comparing the previous day (left) with the current day (right).
# The plotted arrays were produced with np.log, so both axes are on a log scale.
# NOTE(review): the y data are log click totals, not rates, although the axis
# is labelled 'clickrate' — confirm the intended label.
import matplotlib.pyplot as plt
fig, axes = plt.subplots()
axes.set_xlabel('count')
axes.set_ylabel('clickrate')
axes.grid()
#y = [colcatecount[key] for key in x]
#axes.plot(x, y, 'ro')
# Previous day: first-seen keys in blue, keys that vanish on the current day in red.
# Requires the [day-1] entries to have been filled by an earlier run of the cells above.
axes.plot(countlist[day-1], clicklist[day-1], 'bo', alpha=0.5)
axes.plot(countlistnonexist[day-1], clicklistnonexist[day-1], 'ro', ms=10)
#axes.plot(countlist[day-1]+16, clicklist[day-1], 'ro', ms=10)
#axes.plot(countlistexist[day-1]+16, clicklistexist[day-1], 'bo', alpha=0.5)
# Current day, shifted by +16 on the x axis so it sits beside the previous day:
# already-seen keys in blue, newly appearing keys in red.
axes.plot(countlistexist[day]+16, clicklistexist[day], 'bo', alpha=0.5)
axes.plot(countlist[day]+16, clicklist[day], 'ro', ms=10)
plt.show()
#fig.savefig("ycount.svg")
#######################################
In [ ]:
In [2]:
# Reload the per-value counts; `with` closes the file even if load raises.
with open('countdicall.pkl', 'rb') as f:
    countdic = cPickle.load(f)
In [20]:
cnames = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
'device_id', 'device_model']
In [3]:
cnames = ['site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
'device_id', 'device_model', 'C1', 'banner_pos', 'device_type', 'device_conn_type',
'C15', 'C16']
In [4]:
gbdtlist = []
In [22]:
%%time
# Collect '<column>_<value>' labels for very frequent values: more than
# 1,000,000 occurrences for the columns in cnames, more than 100,000 for
# device_ip. Keys go through str() so a non-string key cannot raise a
# TypeError on concatenation (for string keys this is a no-op).
for name in cnames:
    for key, value in countdic[name].items():
        if value > 1000000:
            gbdtlist.append(name + '_' + str(key))
for key, value in countdic['device_ip'].items():
    if value > 100000:
        gbdtlist.append('device_ip_' + str(key))
In [5]:
%%time
# Collect '<column>_<value>' labels for very frequent values, in this order:
# cnames columns (> 1,000,000), device_ip (> 100,000), C19 and C20 (> 10,000,000).
for name in cnames:
    frequent = [key for key, value in countdic[name].items() if value > 1000000]
    gbdtlist.extend(name + '_' + str(key) for key in frequent)
gbdtlist.extend('device_ip_' + key
                for key, value in countdic['device_ip'].items()
                if value > 100000)
for heavy_col in ('C19', 'C20'):
    gbdtlist.extend(heavy_col + '_' + str(key)
                    for key, value in countdic[heavy_col].items()
                    if value > 10000000)
In [6]:
gbdtlist
Out[6]:
In [24]:
# Save the label list; `with` closes the file even if dump raises.
with open('gbdtclist.pkl', 'wb') as f:
    cPickle.dump(gbdtlist, f, -1)
In [7]:
# Save the extended label list; `with` closes the file even if dump raises.
with open('gbdtclist2.pkl', 'wb') as f:
    cPickle.dump(gbdtlist, f, -1)
In [ ]:
In [6]:
m=2
l=1
In [7]:
# Global click rate over all training labels. The sum is cast to float first:
# under Python 2 an integer-typed sum divided by an int would floor to 0.
glh = float(np.sum(y)) / y.shape[0]
In [8]:
glh
Out[8]:
In [9]:
glh/2.
Out[9]:
In [10]:
# Reload the per-value counts; `with` closes the file even if load raises.
with open('countdicall.pkl', 'rb') as f:
    countdic = cPickle.load(f)
In [11]:
# Reload the per-value click totals; `with` closes the file even if load raises.
with open('clickdicall.pkl', 'rb') as f:
    clickdic = cPickle.load(f)
In [12]:
clickratedic = {}
In [13]:
%%time
# Smoothed click rate per value: a blend of the global rate glh and the
# value's own rate click/count. The weight on glh is l / (count/9 + m), so it
# shrinks as the value becomes more frequent.
# NOTE(review): the divisor 9 is presumably the number of days — confirm.
for col in X.colnames:
    rates = clickratedic[col] = defaultdict(float)
    for key in countdic[col]:
        count = float(countdic[col][key])
        click = clickdic[col][key]
        prior_weight = l / (count / 9. + m)
        rates[key] = prior_weight * glh + (1. - prior_weight) * (click / count)
In [14]:
# Save the smoothed click rates; `with` closes the file even if dump raises.
with open('clickratedicall.pkl', 'wb') as f:
    cPickle.dump(clickratedic, f, -1)
In [15]:
clickratedic['C1']
Out[15]:
In [ ]:
In [7]:
# Load the per-day row information; `with` closes the file even if load raises.
with open('dayrows.pkl', 'rb') as f:
    days = cPickle.load(f)
In [8]:
days
Out[8]:
In [9]:
numrows = sum(days[:9])
In [21]:
numrows
Out[21]:
In [7]:
X.colnames
Out[7]:
In [12]:
cldic = {}
In [13]:
# Distinct values of every column, restricted to the first `numrows` rows.
for col in X.colnames:
    head_values = X.col(col)[:numrows]
    cldic[col] = np.unique(head_values)
In [5]:
cldic = {}
In [6]:
names = ['day', 'hour', 'banner_pos', 'site_id', 'site_domain', 'site_category', 'app_id', 'app_domain', 'app_category',
'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
In [7]:
%%time
# Distinct values of every listed column in the training split.
for col in names:
    train_column = X_train.col(col)
    cldic[col] = np.unique(train_column)
In [8]:
indexdic = {}
In [9]:
%%time
# Give every (column, value) pair of the training split one running 1-based index.
i = 0
for key in names:
    column_index = {}
    for ele in cldic[key]:
        i += 1
        column_index[str(ele)] = i
    indexdic[key] = column_index
In [10]:
i
Out[10]:
In [18]:
# Report the number of indexed values per column.
for key in indexdic.keys():
    size = len(indexdic[key])
    print('%s %d' % (key, size))
In [13]:
indexdic['banner_pos']
Out[13]:
In [14]:
# Save the training-split mapping; `with` closes the file even if dump raises.
with open('indexdic.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)
Compare lookup time: membership test on the dict itself versus on its `.keys()` list
In [33]:
%time '0000b3ab' in indexdic['device_ip']
Out[33]:
In [34]:
%time '0000b3ab' in indexdic['device_ip'].keys()
Out[34]:
In [6]:
colnames = X_train.colnames
In [23]:
cldic = {}
In [25]:
%%time
# Distinct values of every column of the training split.
for col in colnames:
    train_column = X_train.col(col)
    cldic[col] = np.unique(train_column)
In [12]:
cldic
Out[12]:
In [26]:
clcountdic = {}
clclickdic = {}
In [27]:
%%time
# Pre-create a zeroed counter entry for every known value of every column.
for key in cldic:
    clcountdic[key] = dict.fromkeys(cldic[key], 0)
    clclickdic[key] = dict.fromkeys(cldic[key], 0)
In [38]:
%%time
# Timing probe: tally occurrences and clicks, stopping once the row counter
# exceeds 3000.
# NOTE(review): nesting reconstructed so the counter advances once per row —
# confirm against the original notebook.
i = 0
for i, (row, target) in enumerate(izip(X_train.iterrows(), y_train.iterrows()), 1):
    for name in colnames:
        clcountdic[name][row[name]] += 1
        clclickdic[name][row[name]] += target[0]
    if i > 3000:
        break
In [7]:
countdic = {}
clickdic = {}
In [8]:
# One zero-defaulting counter per column for occurrences and for clicks.
countdic.update((col, defaultdict(int)) for col in colnames)
clickdic.update((col, defaultdict(int)) for col in colnames)
In [9]:
%%time
# Walk training features and labels in lockstep, tallying occurrences and
# clicks for the value of every column.
for row, target in izip(X_train.iterrows(), y_train.iterrows()):
    clicked = target[0]
    for name in colnames:
        value = row[name]
        countdic[name][value] += 1
        clickdic[name][value] += clicked
In [10]:
clickdic['day']
Out[10]:
In [11]:
countdic['day']
Out[11]:
In [12]:
# Save the training-split counts; `with` closes the file even if dump raises.
with open('countdic.pkl', 'wb') as f:
    cPickle.dump(countdic, f, -1)
In [ ]:
# Save the training-split click totals; `with` closes the file even if dump raises.
with open('clickdic.pkl', 'wb') as f:
    cPickle.dump(clickdic, f, -1)
In [10]:
# Reload the training-split counts; `with` closes the file even if load raises.
with open('countdic.pkl', 'rb') as f:
    countdic = cPickle.load(f)
In [3]:
countdic['C14']
Out[3]:
In [11]:
# Report the number of distinct values per column.
for key, counts in countdic.items():
    print('%s %d' % (key, len(counts)))
In [12]:
%%time
# Distinct values of every column of the test table.
cldictest = {}
for col in X_t.colnames:
    test_column = X_t.col(col)
    cldictest[col] = set(test_column)
In [13]:
import copy
countdic_cp = copy.deepcopy(countdic)
In [19]:
keylist = countdic.keys()
In [20]:
keylist.remove('day')
In [24]:
%%time
# For every column in keylist, delete the values that do not occur in the test set.
for key in keylist:
    train_counts, test_values = countdic[key], cldictest[key]
    key_remove = [akey for akey in train_counts if akey not in test_values]
    for akey in key_remove:
        train_counts.pop(akey)
In [25]:
# Report how many values remain per column.
for key, counts in countdic.items():
    print('%s %d' % (key, len(counts)))
Represent the removed values with a single `other` entry
In [26]:
%%time
# Columns that shrank relative to the untouched copy get an 'other' entry.
for key, kept in countdic.items():
    if len(kept) != len(countdic_cp[key]):
        kept['other'] = 0
In [27]:
# Report the per-column sizes again, now including any 'other' entry.
for key, counts in countdic.items():
    print('%s %d' % (key, len(counts)))
One-hot encoding: assign a unique index to every remaining (column, value) pair
In [28]:
%%time
# Build the index mapping from the trimmed counts with a running 1-based index.
indexdic = {}
i = 0
for key in countdic:
    indexdic[key] = {}
    for i, akey in enumerate(countdic[key], i + 1):
        indexdic[key][str(akey)] = i
In [29]:
# Report the number of indexed values per column.
for key, mapping in indexdic.items():
    print('%s %d' % (key, len(mapping)))
In [32]:
# Save the trimmed mapping; `with` closes the file even if dump raises.
with open('indexdictrimed.pkl', 'wb') as f:
    cPickle.dump(indexdic, f, -1)
In [2]:
# Reload the trimmed mapping; `with` closes the file even if load raises.
with open('indexdictrimed.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [3]:
# Total number of indexed values across all columns.
sumvalue = sum(len(mapping) for mapping in indexdic.values())
In [4]:
sumvalue
Out[4]:
In [ ]:
Open question: do rare categories really need to be deleted?
In [7]:
%%time
# Delete every value that occurs exactly once.
for key, counts in countdic.items():
    key_remove = [akey for akey, value in counts.iteritems() if value == 1]
    for akey in key_remove:
        counts.pop(akey)
In [8]:
# Report how many values remain per column.
for key, counts in countdic.items():
    print('%s %d' % (key, len(counts)))
In [33]:
%%time
# Delete every value that occurs fewer than 9 times.
for key, counts in countdic.items():
    key_remove = [akey for akey, value in counts.iteritems() if value < 9]
    for akey in key_remove:
        counts.pop(akey)
In [34]:
# Report how many values remain per column.
for key, counts in countdic.items():
    print('%s %d' % (key, len(counts)))
In [ ]:
In [11]:
m=2
l=1
In [12]:
# Global click rate over the training-split labels. The sum is cast to float
# first: under Python 2 an integer-typed sum divided by an int would floor to 0.
glh = float(np.sum(y_train)) / y_train.shape[0]
In [13]:
glh
Out[13]:
In [14]:
# Reload the training-split counts; `with` closes the file even if load raises.
with open('countdic.pkl', 'rb') as f:
    countdic = cPickle.load(f)
In [15]:
# Reload the training-split click totals; `with` closes the file even if load raises.
with open('clickdic.pkl', 'rb') as f:
    clickdic = cPickle.load(f)
In [16]:
clickratedic = {}
In [17]:
%%time
# Smoothed click rate per value: a blend of the global rate glh and the
# value's own rate click/count. The weight on glh is l / (count/9 + m), so it
# shrinks as the value becomes more frequent.
# NOTE(review): the divisor 9 is presumably the number of days — confirm.
for col in X_train.colnames:
    rates = clickratedic[col] = defaultdict(float)
    for key in countdic[col]:
        count = float(countdic[col][key])
        click = clickdic[col][key]
        prior_weight = l / (count / 9. + m)
        rates[key] = prior_weight * glh + (1. - prior_weight) * (click / count)
In [18]:
clickratedic['C1']
Out[18]:
In [19]:
# Save the smoothed click rates; `with` closes the file even if dump raises.
with open('clickratedic.pkl', 'wb') as f:
    cPickle.dump(clickratedic, f, -1)
In [36]:
len(colnames)
Out[36]:
In [54]:
(float(l)/m)*glh
Out[54]:
In [55]:
glh/2
Out[55]:
In [39]:
%%time
# Timing probe: tally occurrences and clicks into countdic and clickdic,
# stopping once the row counter exceeds 3000.
# NOTE(review): nesting reconstructed so the counter advances once per row —
# confirm against the original notebook.
i = 0
for i, (row, target) in enumerate(izip(X_train.iterrows(), y_train.iterrows()), 1):
    for name in colnames:
        countdic[name][row[name]] += 1
        clickdic[name][row[name]] += target[0]
    if i > 3000:
        break
In [ ]: