The CSV data have already been saved in HDF5 format.
In [1]:
import tables
import time
import numpy as np
import cPickle
from itertools import izip
In [2]:
# Open the HDF5 file read-only; every table used below is reached through this handle.
# NOTE(review): no matching close() is visible in this notebook.
file_handler = tables.open_file("click_data.h5", mode = "r")
In [3]:
# Training feature table (node /train/train_raw/X); iterated row by row below.
X = file_handler.root.train.train_raw.X
In [4]:
# Training target node (/train/train_raw/y); iterated in lockstep with X below.
y = file_handler.root.train.train_raw.y
In [5]:
# Test feature table (node /test/test_raw/X_t); it has no accompanying target here.
X_t = file_handler.root.test.test_raw.X_t
In [6]:
# Column names of the training table, used to emit one feature per column.
colnames = X.colnames
In [7]:
# Number of training rows; only used for the progress messages.
lines = X.shape[0]
In [8]:
%%time
# Dump the training set to train.vw, one example per line, with every column
# in a single unnamed namespace: "<label> | <col>_<value> <col>_<value> ...".
written = 0
with open('train.vw', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        # Label is target[0]*2-1 (0 -> -1, 1 -> 1; presumably a 0/1 click flag).
        tokens = ["{0} |".format(target[0]*2-1)]
        tokens.extend("{0}_{1}".format(name, str(row[name])) for name in colnames)
        fw.write(" ".join(tokens) + '\n')
        written += 1
        # Progress report every million examples.
        if written % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(written, lines, 100*written/lines))
In [8]:
# Column groups; the writer cells emit each group under its own
# single-letter namespace (t, b, s, a, d, c).
timelist = 'day hour'.split()
ban = 'banner_pos'.split()
site = 'site_id site_domain site_category'.split()
app = 'app_id app_domain app_category'.split()
device = 'device_id device_ip device_model device_type device_conn_type'.split()
clist = 'C1 C14 C15 C16 C17 C18 C19 C20 C21'.split()
In [9]:
%%time
# Rewrite train.vw with the features split into namespaces:
# "<label> |t ... |b ... |s ... |a ... |d ... |c ...", each feature being
# "<col>_<value>".
i = 0
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('train.vw', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        # Label is target[0]*2-1 (0 -> -1, 1 -> 1).
        pieces = ["{0}".format(target[0]*2-1)]
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                pieces.append("{0}_{1}".format(name, str(row[name])))
        fw.write(" ".join(pieces) + '\n')
        i += 1
        # Progress report every million examples.
        if i % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [9]:
# Load the pickled index dictionary; the writer cells read it as
# indexdic[column][str(value)].
# The `with` block closes the file even if unpickling raises.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [10]:
# Columns whose values are replaced by their index from `indexdic`;
# every other column keeps the textual "<col>_<value>" feature.
names = {
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
}
In [11]:
%%time
# Write train1.vw: columns listed in `names` are emitted as their index from
# `indexdic`, all other columns as "<col>_<value>", grouped by namespace.
i = 0
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('train1.vw', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        # Label is target[0]*2-1 (0 -> -1, 1 -> 1).
        pieces = ["{0}".format(target[0]*2-1)]
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                if name in names:
                    # A value missing from indexdic[name] raises KeyError here.
                    pieces.append("{0}".format(indexdic[name][value]))
                else:
                    pieces.append("{0}_{1}".format(name, value))
        fw.write(" ".join(pieces) + '\n')
        i += 1
        # Progress report every million examples.
        if i % 1000000 == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [9]:
# Load the pickled index dictionary; the writer cell below reads it as
# indexdic[column][str(value)], with an optional 'other' entry per column.
# The `with` block closes the file even if unpickling raises.
with open('indexdicless.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [10]:
%%time
# Write train2.vw: every feature is replaced by its index from `indexdic`,
# grouped into one namespace per column group.
# A value absent from a column's table falls back to that table's 'other'
# entry.  If neither exists the row cannot be encoded; it is reported with
# the offending column/value and skipped.  (Previously the `break` only left
# the innermost loop, so a truncated example was still written to the file.)
i = 0
skipped = 0
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('train2.vw', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        # Label is target[0]*2-1 (0 -> -1, 1 -> 1).
        pieces = ["{0}".format(target[0]*2-1)]
        unencodable = None
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                table = indexdic[name]
                if value in table:
                    pieces.append("{0}".format(table[value]))
                elif 'other' in table:
                    pieces.append("{0}".format(table['other']))
                else:
                    unencodable = (name, value)
                    break
            if unencodable is not None:
                break
        i += 1
        if unencodable is None:
            fw.write(" ".join(pieces) + '\n')
        else:
            skipped += 1
            print("error: row {0} skipped, no index for {1}={2!r} and no 'other' entry".format(
                i, unencodable[0], unencodable[1]))
        # Progress report every million rows processed.
        if (i % 1000000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
if skipped:
    print("{0} rows skipped because of unindexed values".format(skipped))
In [9]:
# Load the pickled index dictionary; the writer cell below reads it as
# indexdic[column][str(value)], with an optional 'other' entry per column.
# The `with` block closes the file even if unpickling raises.
with open('indexdicless2.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [10]:
%%time
# Write train1.vw: every feature is replaced by its index from `indexdic`,
# grouped into one namespace per column group.
# NOTE(review): 'train1.vw' is also the output path of the one-hot cell, so
# running this cell overwrites that file.
# A value absent from a column's table falls back to that table's 'other'
# entry.  If neither exists the row cannot be encoded; it is reported with
# the offending column/value and skipped.  (Previously the `break` only left
# the innermost loop, so a truncated example was still written to the file.)
i = 0
skipped = 0
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('train1.vw', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        # Label is target[0]*2-1 (0 -> -1, 1 -> 1).
        pieces = ["{0}".format(target[0]*2-1)]
        unencodable = None
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                table = indexdic[name]
                if value in table:
                    pieces.append("{0}".format(table[value]))
                elif 'other' in table:
                    pieces.append("{0}".format(table['other']))
                else:
                    unencodable = (name, value)
                    break
            if unencodable is not None:
                break
        i += 1
        if unencodable is None:
            fw.write(" ".join(pieces) + '\n')
        else:
            skipped += 1
            print("error: row {0} skipped, no index for {1}={2!r} and no 'other' entry".format(
                i, unencodable[0], unencodable[1]))
        # Progress report every million rows processed.
        if (i % 1000000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
if skipped:
    print("{0} rows skipped because of unindexed values".format(skipped))
In [9]:
# Load the pickled index dictionary; the writer cell below reads it as
# indexdic[column][str(value)], with an optional 'other' entry per column.
# The `with` block closes the file even if unpickling raises.
with open('indexdicless3.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [10]:
%%time
# Write train1.vw: every feature is replaced by its index from `indexdic`,
# grouped into one namespace per column group.
# NOTE(review): 'train1.vw' is also the output path of earlier cells, so
# running this cell overwrites that file.
# A value absent from a column's table falls back to that table's 'other'
# entry.  If neither exists the row cannot be encoded; it is reported with
# the offending column/value and skipped.  (Previously the `break` only left
# the innermost loop, so a truncated example was still written to the file.)
i = 0
skipped = 0
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('train1.vw', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        # Label is target[0]*2-1 (0 -> -1, 1 -> 1).
        pieces = ["{0}".format(target[0]*2-1)]
        unencodable = None
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                table = indexdic[name]
                if value in table:
                    pieces.append("{0}".format(table[value]))
                elif 'other' in table:
                    pieces.append("{0}".format(table['other']))
                else:
                    unencodable = (name, value)
                    break
            if unencodable is not None:
                break
        i += 1
        if unencodable is None:
            fw.write(" ".join(pieces) + '\n')
        else:
            skipped += 1
            print("error: row {0} skipped, no index for {1}={2!r} and no 'other' entry".format(
                i, unencodable[0], unencodable[1]))
        # Progress report every million rows processed.
        if (i % 1000000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
if skipped:
    print("{0} rows skipped because of unindexed values".format(skipped))
In [11]:
# Rebind `colnames` to the test table's columns (it previously held the training table's).
colnames = X_t.colnames
In [12]:
# Rebind `lines` to the number of test rows, so progress messages below refer to the test set.
lines = X_t.shape[0]
In [11]:
%%time
# Dump the test set to test.vw with every column in a single unnamed
# namespace.  There is no label: each line is " | <col>_<value> ...".
written = 0
with open('test.vw', 'wb') as fw:
    for row in X_t.iterrows():
        tokens = [" |"]
        tokens.extend("{0}_{1}".format(name, str(row[name])) for name in colnames)
        fw.write(" ".join(tokens) + '\n')
        written += 1
        # Progress report every hundred thousand examples.
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(written, lines, 100*written/lines))
In [13]:
# Column groups (same definitions as for the training cells); the writer
# cells emit each group under its own single-letter namespace.
timelist = 'day hour'.split()
ban = 'banner_pos'.split()
site = 'site_id site_domain site_category'.split()
app = 'app_id app_domain app_category'.split()
device = 'device_id device_ip device_model device_type device_conn_type'.split()
clist = 'C1 C14 C15 C16 C17 C18 C19 C20 C21'.split()
In [13]:
%%time
# Rewrite test.vw with the features split into namespaces.  There is no
# label: each line starts with two spaces, then "|t ... |b ... |s ... |a ...
# |d ... |c ...", each feature being "<col>_<value>".
i = 0
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('test.vw', 'wb') as fw:
    for row in X_t.iterrows():
        # The leading " " element reproduces the blank label slot.
        pieces = [" "]
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                pieces.append("{0}_{1}".format(name, str(row[name])))
        fw.write(" ".join(pieces) + '\n')
        i += 1
        # Progress report every hundred thousand examples.
        if i % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [ ]:
# Load the pickled index dictionary; the writer cells read it as
# indexdic[column][str(value)].
# The `with` block closes the file even if unpickling raises.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
In [ ]:
# Columns whose values are replaced by their index from `indexdic`;
# every other column keeps the textual "<col>_<value>" feature.
names = {
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
}
In [ ]:
%%time
# Write test1.vw: columns listed in `names` are emitted as their index from
# `indexdic`, all other columns as "<col>_<value>", grouped by namespace.
# There is no label, so each line starts with two spaces.
i = 0
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('test1.vw', 'wb') as fw:
    for row in X_t.iterrows():
        # The leading " " element reproduces the blank label slot.
        pieces = [" "]
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                if name in names:
                    # A value missing from indexdic[name] raises KeyError here.
                    pieces.append("{0}".format(indexdic[name][value]))
                else:
                    pieces.append("{0}_{1}".format(name, value))
        fw.write(" ".join(pieces) + '\n')
        i += 1
        # Progress report every hundred thousand examples.
        if i % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [ ]:
In [14]:
%%time
# Write test2.vw: every feature is replaced by its index from `indexdic`,
# grouped by namespace.  There is no label, so each line starts with two spaces.
i = 0
# Index emitted when a value is absent and the column has no 'other' entry.
# NOTE(review): hard-coded; presumably tied to the size of the loaded
# dictionary -- confirm it matches the `indexdic` in use.
unseen_index = 617958
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('test2.vw', 'wb') as fw:
    for row in X_t.iterrows():
        # The leading " " element reproduces the blank label slot.
        pieces = [" "]
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                table = indexdic[name]
                # Prefer the exact value, then the 'other' entry, then the constant.
                key = value if value in table else 'other'
                index = table[key] if key in table else unseen_index
                pieces.append("{0}".format(index))
        fw.write(" ".join(pieces) + '\n')
        i += 1
        # Progress report every hundred thousand examples.
        if i % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [14]:
%%time
# Write test1.vw: every feature is replaced by its index from `indexdic`,
# grouped by namespace.  There is no label, so each line starts with two spaces.
# NOTE(review): 'test1.vw' is also the output path of the one-hot test cell.
i = 0
# Index emitted when a value is absent and the column has no 'other' entry.
# NOTE(review): hard-coded; presumably tied to the size of the loaded
# dictionary -- confirm it matches the `indexdic` in use.
unseen_index = 947464
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('test1.vw', 'wb') as fw:
    for row in X_t.iterrows():
        # The leading " " element reproduces the blank label slot.
        pieces = [" "]
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                table = indexdic[name]
                # Prefer the exact value, then the 'other' entry, then the constant.
                key = value if value in table else 'other'
                index = table[key] if key in table else unseen_index
                pieces.append("{0}".format(index))
        fw.write(" ".join(pieces) + '\n')
        i += 1
        # Progress report every hundred thousand examples.
        if i % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [14]:
%%time
# Write test1.vw: every feature is replaced by its index from `indexdic`,
# grouped by namespace.  There is no label, so each line starts with two spaces.
# NOTE(review): 'test1.vw' is also the output path of earlier test cells.
i = 0
# Index emitted when a value is absent and the column has no 'other' entry.
# NOTE(review): hard-coded; presumably tied to the size of the loaded
# dictionary -- confirm it matches the `indexdic` in use.
unseen_index = 636705
namespaces = (('t', timelist), ('b', ban), ('s', site),
              ('a', app), ('d', device), ('c', clist))
with open('test1.vw', 'wb') as fw:
    for row in X_t.iterrows():
        # The leading " " element reproduces the blank label slot.
        pieces = [" "]
        for tag, columns in namespaces:
            pieces.append("|" + tag)
            for name in columns:
                value = str(row[name])
                table = indexdic[name]
                # Prefer the exact value, then the 'other' entry, then the constant.
                key = value if value in table else 'other'
                index = table[key] if key in table else unseen_index
                pieces.append("{0}".format(index))
        fw.write(" ".join(pieces) + '\n')
        i += 1
        # Progress report every hundred thousand examples.
        if i % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
In [ ]: