The per-feature category dictionaries were saved in an earlier step, and the CSV data has already been stored in HDF5.
In [ ]:
import tables
import time
import numpy as np
import cPickle
from itertools import izip
import pyhash
import math
In [2]:
# Open the HDF5 file read-only; every table handle below is a node of it.
file_handler = tables.open_file("click_data.h5", mode = "r")
In [3]:
# X / y: nodes under /train/train_raw (presumably features and targets of the
# whole training set -- confirm against the script that built the file).
X = file_handler.root.train.train_raw.X
In [4]:
y = file_handler.root.train.train_raw.y
In [5]:
# X_train / y_train: written out below as 'train9days.*'.
X_train = file_handler.root.train.train_raw.X_train
In [6]:
y_train = file_handler.root.train.train_raw.y_train
In [7]:
# X_valid / y_valid: written out below as 'train10thday.*'.
X_valid = file_handler.root.train.train_raw.X_valid
In [8]:
y_valid = file_handler.root.train.train_raw.y_valid
In [5]:
# X_t: node under /test/test_raw; it has no target node, and the writers
# below emit a constant 0 label for it.
X_t = file_handler.root.test.test_raw.X_t
In [6]:
# Columns to encode, in the order the writer loops iterate over them.
names = (
    ['day', 'hour', 'banner_pos']
    + ['site_id', 'site_domain', 'site_category']
    + ['app_id', 'app_domain', 'app_category']
    + ['device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type']
    + ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
)
In [7]:
# Row count of X; used as the total in the progress messages.
lines = X.shape[0]
In [ ]:
# Load the per-column {value: feature index} dictionaries. The context
# manager closes the file even if unpickling fails.
with open('indexdicless.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [9]:
%%time
# Write X / y to 'train.txt', one row per line, as
# "<label> <index>:1 <index>:1 ..." with the indices in ascending order.
written = 0
with open('train.txt', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        active = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # No index for this value and no catch-all bucket: report
                    # it and stop encoding this row. The partial row is still
                    # written below.
                    print('error')
                    break
                key = 'other'
            active.append(lookup[key])
        active.sort()
        pairs = ["{0}:{1}".format(slot, 1) for slot in active]
        sink.write("{0} {1}\n".format(label[0], " ".join(pairs)))
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [8]:
# Load the second set of per-column {value: feature index} dictionaries.
# The context manager closes the file even if unpickling fails.
with open('indexdicless2.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [9]:
%%time
# Write X / y to 'train1.txt' as "<label> <index>:1 ..." lines, with the
# indices in ascending order, using the currently loaded indexdic.
written = 0
with open('train1.txt', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        active = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Unknown value and no catch-all bucket: report it and
                    # stop encoding this row; what was collected so far is
                    # still written.
                    print('error')
                    break
                key = 'other'
            active.append(lookup[key])
        active.sort()
        pairs = ["{0}:{1}".format(slot, 1) for slot in active]
        sink.write("{0} {1}\n".format(label[0], " ".join(pairs)))
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [8]:
# Load the third set of per-column {value: feature index} dictionaries.
# The context manager closes the file even if unpickling fails.
with open('indexdicless3.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [9]:
%%time
# Write X / y to 'train.txt' (overwriting any earlier file of that name) as
# "<label> <index>:1 ..." lines with the indices in ascending order.
written = 0
with open('train.txt', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        active = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Unknown value and no catch-all bucket: report it and
                    # abandon the rest of this row's columns.
                    print('error')
                    break
                key = 'other'
            active.append(lookup[key])
        active.sort()
        pairs = ["{0}:{1}".format(slot, 1) for slot in active]
        sink.write("{0} {1}\n".format(label[0], " ".join(pairs)))
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [10]:
# Row count of X_train; used as the total in the progress messages.
lines = X_train.shape[0]
In [11]:
%%time
# Write X_train / y_train to 'train9days.txt' as "<label> <index>:1 ..."
# lines with the indices in ascending order. Values that have no entry in
# indexdic are skipped (this writer has no 'other' fallback).
i = 0
with open('train9days.txt', 'wb') as fw:
    for row, target in izip(X_train.iterrows(), y_train.iterrows()):
        indexlist = []
        for name in names:
            lookup = indexdic[name]
            value = str(row[name])
            if value in lookup:
                indexlist.append(lookup[value])
        indexlist.sort()
        # Join all tokens with exactly one space. The previous version joined
        # items that already began with a space, which produced two spaces
        # (an empty token) between consecutive features.
        tokens = ["{0}".format(target[0])]
        tokens.extend("{0}:{1}".format(index, 1) for index in indexlist)
        fw.write(" ".join(tokens) + "\n")
        i += 1
        if (i % 100000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [10]:
# Row count of X_valid; used as the total in the progress messages.
lines = X_valid.shape[0]
In [11]:
%%time
# Write X_valid / y_valid to 'train10thday.txt' as "<label> <index>:1 ..."
# lines with the indices in ascending order. Values that have no entry in
# indexdic are skipped (this writer has no 'other' fallback).
i = 0
with open('train10thday.txt', 'wb') as fw:
    for row, target in izip(X_valid.iterrows(), y_valid.iterrows()):
        indexlist = []
        for name in names:
            lookup = indexdic[name]
            value = str(row[name])
            if value in lookup:
                indexlist.append(lookup[value])
        indexlist.sort()
        # Join all tokens with exactly one space. The previous version joined
        # items that already began with a space, which produced two spaces
        # (an empty token) between consecutive features.
        tokens = ["{0}".format(target[0])]
        tokens.extend("{0}:{1}".format(index, 1) for index in indexlist)
        fw.write(" ".join(tokens) + "\n")
        i += 1
        if (i % 100000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [10]:
# Row count of X_t; used as the total in the progress messages.
lines = X_t.shape[0]
In [12]:
%%time
# Write X_t to 'test.txt' as "0 <index>:1 ..." lines (the label is a
# constant 0), with the indices in ascending order.
written = 0
with open('test.txt', 'wb') as sink:
    for record in X_t.iterrows():
        active = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Neither the value nor a catch-all bucket exists:
                    # the column contributes nothing to this row.
                    continue
                key = 'other'
            active.append(lookup[key])
        active.sort()
        pairs = ["{0}:{1}".format(slot, 1) for slot in active]
        sink.write("0 " + " ".join(pairs) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [12]:
%%time
# Write X_t to 'test1.txt' as "0 <index>:1 ..." lines (constant 0 label),
# with the indices in ascending order.
written = 0
with open('test1.txt', 'wb') as sink:
    for record in X_t.iterrows():
        active = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # No index and no catch-all bucket: skip this column.
                    continue
                key = 'other'
            active.append(lookup[key])
        active.sort()
        pairs = ["{0}:{1}".format(slot, 1) for slot in active]
        sink.write("0 " + " ".join(pairs) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [11]:
%%time
# Write X_t to 'test.txt' (overwriting any earlier file of that name) as
# "0 <index>:1 ..." lines with the indices in ascending order.
written = 0
with open('test.txt', 'wb') as sink:
    for record in X_t.iterrows():
        active = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # No index and no catch-all bucket: skip this column.
                    continue
                key = 'other'
            active.append(lookup[key])
        active.sort()
        pairs = ["{0}:{1}".format(slot, 1) for slot in active]
        sink.write("0 " + " ".join(pairs) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [10]:
# Load the {value: feature index} dictionaries used for the columns listed
# in cnames. The context manager closes the file even if unpickling fails.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [11]:
# Columns encoded through indexdic; every other column is hashed instead.
cnames = {
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
}
In [12]:
hasher = pyhash.murmur3_32()


def hashstr(feat, bins):
    """Hash ``feat`` into one of ``bins`` buckets and return its feature index.

    The bucket number is shifted by ``1 + 309``, so for positive ``bins`` the
    smallest result is 310. Presumably this keeps hashed indices clear of the
    indices stored in the one-hot dictionary -- TODO confirm.
    """
    bucket = hasher(str(feat)) % bins
    return bucket + 1 + 309


def gen_hashed_fm_feats(feats, bins):
    """Return the hashed index of each feature string in ``feats``.

    The result keeps the input order; it is not sorted here.
    """
    hashed = []
    for feat in feats:
        hashed.append(hashstr(feat, bins))
    return hashed
In [13]:
# Number of hash buckets passed to gen_hashed_fm_feats.
bins = 10000000
In [14]:
# Encode every column of X, in table order.
names = X.colnames
In [15]:
# Row count of X; used as the total in the progress messages.
lines = X.shape[0]
In [16]:
%%time
# Write X / y to 'train.txt' as "<label> <index>:1 ..." lines. Columns in
# cnames take their index from indexdic; all other columns are hashed as
# "<column>_<value>". The combined indices are sorted ascending.
written = 0
with open('train.txt', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        onehot = []
        to_hash = []
        for column in names:
            text = str(record[column])
            if column in cnames:
                onehot.append(indexdic[column][text])
            else:
                to_hash.append('{0}_{1}'.format(column, text))
        indices = sorted(gen_hashed_fm_feats(to_hash, bins) + onehot)
        pairs = ["{0}:{1}".format(slot, 1) for slot in indices]
        sink.write("{0} {1}\n".format(label[0], " ".join(pairs)))
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [ ]:
In [17]:
# Encode every column of X_t, in table order.
names = X_t.colnames
In [18]:
# Row count of X_t; used as the total in the progress messages.
lines = X_t.shape[0]
In [19]:
%%time
# Write X_t to 'test.txt' as "0 <index>:1 ..." lines (constant 0 label).
# Columns in cnames take their index from indexdic; all other columns are
# hashed as "<column>_<value>". The combined indices are sorted ascending.
written = 0
with open('test.txt', 'wb') as sink:
    for record in X_t.iterrows():
        onehot = []
        to_hash = []
        for column in names:
            text = str(record[column])
            if column in cnames:
                onehot.append(indexdic[column][text])
            else:
                to_hash.append('{0}_{1}'.format(column, text))
        indices = sorted(gen_hashed_fm_feats(to_hash, bins) + onehot)
        pairs = ["{0}:{1}".format(slot, 1) for slot in indices]
        sink.write("0 " + " ".join(pairs) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [ ]:
GBDT and FM tools from the NTU team: https://github.com/guestwalk/kaggle-2014-criteo.git
In [6]:
# Load the ordered list of categorical feature keys ("<column>_<value>")
# whose positions become the sparse GBDT feature ids. The context manager
# closes the file even if unpickling fails.
with open('gbdtclist2.pkl', 'rb') as f:
    clist = cPickle.load(f)
In [5]:
# First split of the columns into dense and sparse groups.
densenames = (
    ['day', 'hour', 'banner_pos', 'device_type', 'device_conn_type']
    + ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
)
sparsenames = (
    ['site_id', 'site_domain', 'site_category']
    + ['app_id', 'app_domain', 'app_category']
    + ['device_id', 'device_ip', 'device_model']
)
In [7]:
# Alternative split: no dense columns at all; the sparse group holds the
# site/app/device columns plus a handful of the remaining columns.
densenames = list()
sparsenames = (
    ['site_id', 'site_domain', 'site_category']
    + ['app_id', 'app_domain', 'app_category']
    + ['device_id', 'device_ip', 'device_model']
    + ['C1', 'banner_pos', 'device_type', 'device_conn_type']
    + ['C15', 'C16', 'C19', 'C20']
)
In [ ]:
In [8]:
# Row count of X; used as the total in the progress messages.
lines = X.shape[0]
In [9]:
%%time
# Write the GBDT inputs for X / y. The dense file gets only "<label> " per
# row (no dense features are emitted). The sparse file gets the label followed
# by the 1-based positions in clist of the row's "<column>_<value>" keys.
written = 0
with open('train.gbdt.dense', 'wb') as dense_out, open('train.gbdt.sparse', 'wb') as sparse_out:
    for record, label in izip(X.iterrows(), y.iterrows()):
        prefix = "{0} ".format(label[0])
        dense_out.write(prefix + "\n")
        present = set()
        for column in sparsenames:
            present.add(column + '_' + str(record[column]))
        hits = []
        for position, feat in enumerate(clist, start=1):
            if feat in present:
                hits.append("{0}".format(position))
        sparse_out.write(prefix + " ".join(hits) + "\n")
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [10]:
# Row count of X_t; used as the total in the progress messages.
lines = X_t.shape[0]
In [11]:
%%time
# Write the GBDT inputs for X_t with a constant 0 label. The dense file gets
# only "0 " per row (no dense features are emitted). The sparse file gets
# "0 " followed by the 1-based positions in clist of the row's
# "<column>_<value>" keys.
written = 0
with open('test.gbdt.dense', 'wb') as dense_out, open('test.gbdt.sparse', 'wb') as sparse_out:
    for record in X_t.iterrows():
        prefix = "0 "
        dense_out.write(prefix + "\n")
        present = set()
        for column in sparsenames:
            present.add(column + '_' + str(record[column]))
        hits = []
        for position, feat in enumerate(clist, start=1):
            if feat in present:
                hits.append("{0}".format(position))
        sparse_out.write(prefix + " ".join(hits) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [10]:
# Load the {value: feature index} dictionaries used for the columns listed
# in cnames. The context manager closes the file even if unpickling fails.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
# Columns encoded through indexdic; every other column is hashed instead.
cnames = {
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
}
In [11]:
hasher = pyhash.murmur3_32()


def hashstr(feat, bins):
    """Hash ``feat`` into one of ``bins`` buckets and return its feature index.

    The bucket number is shifted by ``1 + 309``, so for positive ``bins`` the
    smallest result is 310. Presumably this keeps hashed indices clear of the
    indices stored in the one-hot dictionary -- TODO confirm.
    """
    bucket = hasher(str(feat)) % bins
    return bucket + 1 + 309


def gen_hashed_fm_feats(feats, bins):
    """Hash ``(field, feature)`` pairs and return the indices as strings.

    Only the feature part of each pair is hashed; the field part is ignored.
    The result keeps the input order; it is not sorted here.
    """
    return ['{0}'.format(hashstr(feat, bins)) for _field, feat in feats]


# Number of hash buckets used by the writers that follow.
bins = 1000000
In [13]:
# Encode every column of X, in table order.
names = X.colnames
In [14]:
# Row count of X; used as the total in the progress messages.
lines = X.shape[0]
In [15]:
%%time
# Write X / y to 'train.fm', one row per line: the label, the indexdic index
# of every column in cnames, then the hashed indices of the remaining columns
# and of the GBDT outputs read in lockstep from 'train.gbdt.out'.
# The GBDT file is opened in the `with` statement so that it is closed when
# the loop ends; it was previously opened inline and never closed.
# izip stops at the shortest input, so a short GBDT file truncates the output
# without any error.
i = 0
with open('train.fm', 'wb') as fw, open('train.gbdt.out') as fgbdt:
    for row, target, line_gbdt in izip(X.iterrows(), y.iterrows(), fgbdt):
        out = "{0}".format(target[0])
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
        # Skip the first token of the GBDT line and keep the next 12 tokens;
        # each becomes a feature named after its 1-based position.
        leaves = line_gbdt.strip().split()[1:13]
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat))
                     for j, feat in enumerate(leaves, start=1)]
        feats = gen_hashed_fm_feats(featsX + featsgbdt, bins)
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        i += 1
        if (i % 1000000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [6]:
# Encode every column of X except 'device_ip'. Work on a copy: colnames is
# the table's own list, so calling remove() on it directly would also drop
# the name from the table object for everything that reads it later.
names = list(X.colnames)
names.remove('device_ip')
In [7]:
# Row count of X; used as the total in the progress messages.
lines = X.shape[0]
In [8]:
# Load the {value: feature index} dictionaries used for the columns listed
# in cnames. The context manager closes the file even if unpickling fails.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
# Columns encoded through indexdic; every other column is hashed instead.
cnames = {
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
}
In [10]:
hasher = pyhash.murmur3_32()


def hashstr(feat, bins):
    """Hash ``feat`` into one of ``bins`` buckets and return its feature index.

    The bucket number is shifted by ``1 + 309``, so for positive ``bins`` the
    smallest result is 310. Presumably this keeps hashed indices clear of the
    indices stored in the one-hot dictionary -- TODO confirm.
    """
    bucket = hasher(str(feat)) % bins
    return bucket + 1 + 309


def gen_hashed_fm_feats(feats, bins):
    """Hash ``(field, feature)`` pairs and return the indices as strings.

    Only the feature part of each pair is hashed; the field part is ignored.
    The result keeps the input order; it is not sorted here.
    """
    hashed = []
    for _field, feat in feats:
        hashed.append(hashstr(feat, bins))
    return ['{0}'.format(idx) for idx in hashed]


# Number of hash buckets used by the writers that follow.
bins = 10000000
In [11]:
%%time
# Write X / y to 'train.fm', one row per line: the label, the indexdic index
# of every column in cnames (in column order), then the hashed indices of the
# remaining columns.
written = 0
with open('train.fm', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        head = ["{0}".format(label[0])]
        to_hash = []
        for column in names:
            text = str(record[column])
            if column in cnames:
                head.append("{0}".format(indexdic[column][text]))
            else:
                to_hash.append((column, '{0}_{1}'.format(column, text)))
        hashed = gen_hashed_fm_feats(to_hash, bins)
        sink.write(' '.join(head) + ' ' + ' '.join(hashed) + '\n')
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [12]:
# Encode every column of X_train, in table order.
names = X_train.colnames
In [13]:
# Row count of X_train; used as the total in the progress messages.
lines = X_train.shape[0]
In [14]:
%%time
# Write X_train / y_train to 'train9days.fm', one row per line: the label,
# the indexdic index of every column in cnames, then the hashed indices of
# the remaining columns and of the GBDT outputs read in lockstep from
# 'train9days.gbdt.out'.
# The GBDT file is opened in the `with` statement so that it is closed when
# the loop ends; it was previously opened inline and never closed.
# izip stops at the shortest input, so a short GBDT file truncates the output
# without any error.
i = 0
with open('train9days.fm', 'wb') as fw, open('train9days.gbdt.out') as fgbdt:
    for row, target, line_gbdt in izip(X_train.iterrows(), y_train.iterrows(), fgbdt):
        out = "{0}".format(target[0])
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
        # Skip the first token of the GBDT line and keep the next 8 tokens;
        # each becomes a feature named after its 1-based position.
        leaves = line_gbdt.strip().split()[1:9]
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat))
                     for j, feat in enumerate(leaves, start=1)]
        feats = gen_hashed_fm_feats(featsX + featsgbdt, bins)
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        i += 1
        if (i % 1000000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [15]:
# Encode every column of X_valid, in table order.
names = X_valid.colnames
In [16]:
# Row count of X_valid; used as the total in the progress messages.
lines = X_valid.shape[0]
In [17]:
%%time
# Write X_valid / y_valid to 'train10thday.fm', one row per line: the label,
# the indexdic index of every column in cnames, then the hashed indices of
# the remaining columns and of the GBDT outputs read in lockstep from
# 'train10thday.gbdt.out'.
# The GBDT file is opened in the `with` statement so that it is closed when
# the loop ends; it was previously opened inline and never closed.
# izip stops at the shortest input, so a short GBDT file truncates the output
# without any error.
i = 0
with open('train10thday.fm', 'wb') as fw, open('train10thday.gbdt.out') as fgbdt:
    for row, target, line_gbdt in izip(X_valid.iterrows(), y_valid.iterrows(), fgbdt):
        out = "{0}".format(target[0])
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
        # Skip the first token of the GBDT line and keep the next 8 tokens;
        # each becomes a feature named after its 1-based position.
        leaves = line_gbdt.strip().split()[1:9]
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat))
                     for j, feat in enumerate(leaves, start=1)]
        feats = gen_hashed_fm_feats(featsX + featsgbdt, bins)
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        i += 1
        if (i % 100000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [ ]:
In [16]:
# Encode every column of X_t, in table order.
names = X_t.colnames
In [17]:
# Row count of X_t; used as the total in the progress messages.
lines = X_t.shape[0]
In [18]:
%%time
# Write X_t to 'test.fm' with a constant 0 label, one row per line: the
# label, the indexdic index of every column in cnames, then the hashed
# indices of the remaining columns and of the GBDT outputs read in lockstep
# from 'test.gbdt.out'.
# The GBDT file is opened in the `with` statement so that it is closed when
# the loop ends; it was previously opened inline and never closed.
# izip stops at the shortest input, so a short GBDT file truncates the output
# without any error.
i = 0
with open('test.fm', 'wb') as fw, open('test.gbdt.out') as fgbdt:
    for row, line_gbdt in izip(X_t.iterrows(), fgbdt):
        out = "0"
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
        # Skip the first token of the GBDT line and keep the next 12 tokens;
        # each becomes a feature named after its 1-based position.
        leaves = line_gbdt.strip().split()[1:13]
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat))
                     for j, feat in enumerate(leaves, start=1)]
        feats = gen_hashed_fm_feats(featsX + featsgbdt, bins)
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        i += 1
        if (i % 100000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [12]:
# Encode every column of X_t except 'device_ip'. Work on a copy: colnames is
# the table's own list, so calling remove() on it directly would also drop
# the name from the table object for everything that reads it later.
names = list(X_t.colnames)
names.remove('device_ip')
In [13]:
# Row count of X_t; used as the total in the progress messages.
lines = X_t.shape[0]
In [14]:
%%time
# Write X_t to 'test.fm' with a constant 0 label, one row per line: the
# label, the indexdic index of every column in cnames (in column order), then
# the hashed indices of the remaining columns.
written = 0
with open('test.fm', 'wb') as sink:
    for record in X_t.iterrows():
        head = ["0"]
        to_hash = []
        for column in names:
            text = str(record[column])
            if column in cnames:
                head.append("{0}".format(indexdic[column][text]))
            else:
                to_hash.append((column, '{0}_{1}'.format(column, text)))
        hashed = gen_hashed_fm_feats(to_hash, bins)
        sink.write(' '.join(head) + ' ' + ' '.join(hashed) + '\n')
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [6]:
# Columns to encode, in the order the writer loops iterate over them.
names = (
    'day hour banner_pos '
    'site_id site_domain site_category '
    'app_id app_domain app_category '
    'device_id device_ip device_model device_type device_conn_type '
    'C1 C14 C15 C16 C17 C18 C19 C20 C21'
).split()
In [7]:
# Row count of X; used as the total in the progress messages.
lines = X.shape[0]
In [6]:
# Load the per-column {value: feature index} dictionaries. The context
# manager closes the file even if unpickling fails.
with open('indexdicless.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [9]:
%%time
# Write X / y to 'train.fm' as "<label> <index> <index> ..." lines, with one
# index per column in column order (not sorted).
written = 0
with open('train.fm', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        slots = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # No index for this value and no catch-all bucket: report
                    # it and stop encoding this row. The partial row is still
                    # written below.
                    print('error')
                    break
                key = 'other'
            slots.append(lookup[key])
        fields = ["{0}".format(slot) for slot in slots]
        sink.write("{0} {1}\n".format(label[0], " ".join(fields)))
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [8]:
# Load the second set of per-column {value: feature index} dictionaries.
# The context manager closes the file even if unpickling fails.
with open('indexdicless2.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [9]:
%%time
# Write X / y to 'train1.fm' as "<label> <index> <index> ..." lines, with one
# index per column in column order (not sorted).
written = 0
with open('train1.fm', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        slots = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Unknown value and no catch-all bucket: report it and
                    # stop encoding this row; what was collected so far is
                    # still written.
                    print('error')
                    break
                key = 'other'
            slots.append(lookup[key])
        fields = ["{0}".format(slot) for slot in slots]
        sink.write("{0} {1}\n".format(label[0], " ".join(fields)))
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [8]:
# Load the third set of per-column {value: feature index} dictionaries.
# The context manager closes the file even if unpickling fails.
with open('indexdicless3.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [9]:
%%time
# Write X / y to 'train.fm' (overwriting any earlier file of that name) as
# "<label> <index> <index> ..." lines, with one index per column in column
# order (not sorted).
written = 0
with open('train.fm', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        slots = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Unknown value and no catch-all bucket: report it and
                    # abandon the rest of this row's columns.
                    print('error')
                    break
                key = 'other'
            slots.append(lookup[key])
        fields = ["{0}".format(slot) for slot in slots]
        sink.write("{0} {1}\n".format(label[0], " ".join(fields)))
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [10]:
# Row count of X_t; used as the total in the progress messages.
lines = X_t.shape[0]
In [9]:
%%time
# Write X_t to 'test.fm' as "0 <index> <index> ..." lines (constant 0 label),
# with one index per column in column order.
written = 0
with open('test.fm', 'wb') as sink:
    for record in X_t.iterrows():
        slots = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Hard-coded index for values with no entry and no
                    # catch-all bucket. NOTE(review): presumably one past the
                    # largest known index -- confirm against indexdic.
                    slots.append(617958)
                    continue
                key = 'other'
            slots.append(lookup[key])
        fields = ["{0}".format(slot) for slot in slots]
        sink.write("0 " + " ".join(fields) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [11]:
%%time
# Write X_t to 'test1.fm' as "0 <index> <index> ..." lines (constant 0
# label), with one index per column in column order.
written = 0
with open('test1.fm', 'wb') as sink:
    for record in X_t.iterrows():
        slots = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Hard-coded index for values with no entry and no
                    # catch-all bucket. NOTE(review): presumably one past the
                    # largest known index -- confirm against indexdic.
                    slots.append(947464)
                    continue
                key = 'other'
            slots.append(lookup[key])
        fields = ["{0}".format(slot) for slot in slots]
        sink.write("0 " + " ".join(fields) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [11]:
%%time
# Write X_t to 'test.fm' (overwriting any earlier file of that name) as
# "0 <index> <index> ..." lines, with one index per column in column order.
written = 0
with open('test.fm', 'wb') as sink:
    for record in X_t.iterrows():
        slots = []
        for column in names:
            lookup = indexdic[column]
            key = str(record[column])
            if key not in lookup:
                if 'other' not in lookup:
                    # Hard-coded index for values with no entry and no
                    # catch-all bucket. NOTE(review): presumably one past the
                    # largest known index -- confirm against indexdic.
                    slots.append(636705)
                    continue
                key = 'other'
            slots.append(lookup[key])
        fields = ["{0}".format(slot) for slot in slots]
        sink.write("0 " + " ".join(fields) + "\n")
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [10]:
# Load the per-column {raw value: score} dictionaries that the .xgb writers
# pass through math.exp. The context manager closes the file even if
# unpickling fails.
with open('clickratedic.pkl', 'rb') as f:
    clickratedic = cPickle.load(f)
In [11]:
# Fallback score used by the .xgb writers for values missing from
# clickratedic. NOTE(review): looks like a global click-rate statistic --
# confirm where this number was computed.
glh = 0.084933389586625302
In [12]:
# Encode every column of X_train; its row count is the progress total.
names = X_train.colnames
lines = X_train.shape[0]
In [13]:
%%time
# Write X_train / y_train to 'train9days.xgb' as
# "<label> <position>:<value> ..." lines. <position> is the 1-based column
# position; <value> is exp() of the column's clickratedic entry for the raw
# cell value, or exp(glh) when that value has no entry.
written = 0
with open('train9days.xgb', 'wb') as sink:
    for record, label in izip(X_train.iterrows(), y_train.iterrows()):
        pairs = []
        for position, column in enumerate(names, start=1):
            rates = clickratedic[column]
            raw = record[column]
            score = rates[raw] if raw in rates else glh
            pairs.append("{0}:{1}".format(position, math.exp(score)))
        sink.write(str(label[0]) + ' ' + ' '.join(pairs) + '\n')
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [14]:
# Encode every column of X_valid; its row count is the progress total.
names = X_valid.colnames
lines = X_valid.shape[0]
In [15]:
%%time
# Write X_valid / y_valid to 'train10thday.xgb' as
# "<label> <position>:<value> ..." lines. <position> is the 1-based column
# position; <value> is exp() of the column's clickratedic entry for the raw
# cell value, or exp(glh) when that value has no entry.
written = 0
with open('train10thday.xgb', 'wb') as sink:
    for record, label in izip(X_valid.iterrows(), y_valid.iterrows()):
        pairs = []
        for position, column in enumerate(names, start=1):
            rates = clickratedic[column]
            raw = record[column]
            score = rates[raw] if raw in rates else glh
            pairs.append("{0}:{1}".format(position, math.exp(score)))
        sink.write(str(label[0]) + ' ' + ' '.join(pairs) + '\n')
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [8]:
# Load the per-column {raw value: score} dictionaries from
# 'clickratedicall.pkl'; the .xgb writers pass these scores through math.exp.
# The context manager closes the file even if unpickling fails.
with open('clickratedicall.pkl', 'rb') as f:
    clickratedic = cPickle.load(f)
In [ ]:
# Fallback score used by the .xgb writers for values missing from
# clickratedic. NOTE(review): looks like a global click-rate statistic --
# confirm where this number was computed.
glh = 0.084902812382023019
In [ ]:
# Encode every column of X; its row count is the progress total.
names = X.colnames
lines = X.shape[0]
In [ ]:
%%time
# Write X / y to 'train.xgb' as "<label> <position>:<value> ..." lines.
# <position> is the 1-based column position; <value> is exp() of the column's
# clickratedic entry for the raw cell value, or exp(glh) when that value has
# no entry.
written = 0
with open('train.xgb', 'wb') as sink:
    for record, label in izip(X.iterrows(), y.iterrows()):
        pairs = []
        for position, column in enumerate(names, start=1):
            rates = clickratedic[column]
            raw = record[column]
            score = rates[raw] if raw in rates else glh
            pairs.append("{0}:{1}".format(position, math.exp(score)))
        sink.write(str(label[0]) + ' ' + ' '.join(pairs) + '\n')
        written += 1
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [16]:
# Encode every column of X_t; its row count is the progress total.
names = X_t.colnames
lines = X_t.shape[0]
In [18]:
%%time
# Write X_t to 'test.xgb' as "0 <position>:<value> ..." lines (constant 0
# label). <position> is the 1-based column position; <value> is exp() of the
# column's clickratedic entry for the raw cell value, or exp(glh) when that
# value has no entry.
written = 0
with open('test.xgb', 'wb') as sink:
    for record in X_t.iterrows():
        pairs = []
        for position, column in enumerate(names, start=1):
            rates = clickratedic[column]
            raw = record[column]
            score = rates[raw] if raw in rates else glh
            pairs.append("{0}:{1}".format(position, math.exp(score)))
        sink.write('0 ' + ' '.join(pairs) + '\n')
        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(
                written, lines, 100*written/lines))
In [ ]: