The CSV data have been saved in HDF5 format.
In [1]:
    
import tables
import time
import numpy as np
import cPickle
from itertools import izip
    
In [2]:
    
# Open the HDF5 file that holds the converted CSV data, read-only.
file_handler = tables.open_file("click_data.h5", mode = "r")
    
In [3]:
    
# Training features node; later cells read its colnames/shape and iterate its rows.
X = file_handler.root.train.train_raw.X
    
In [4]:
    
# Training targets node, iterated in lockstep with X by the writer cells.
# Element 0 of each row becomes the label via target[0]*2-1
# (presumably a 0/1 click flag mapped to -1/+1 -- TODO confirm).
y = file_handler.root.train.train_raw.y
    
In [5]:
    
# Test features node; the test-set writer cells iterate its rows.
X_t = file_handler.root.test.test_raw.X_t
    
In [6]:
    
# Column names of the training table; the flat dump emits one token per name.
colnames = X.colnames
    
In [7]:
    
# Number of training rows; only used in the progress messages.
lines = X.shape[0]
    
In [8]:
    
%%time
# Flat dump of the training set to train.vw: one text line per row made of
# the label, a single unnamed "|" namespace and one "<column>_<value>" token
# for every column (layout looks like Vowpal Wabbit input -- TODO confirm).
i = 0
with open('train.vw', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), 1):
        # Label is target[0]*2-1 (presumably 0/1 -> -1/+1; verify against the data).
        tokens = ["{0} |".format(target[0]*2-1)]
        for name in colnames:
            tokens.append(" {0}_{1}".format(name, str(row[name])))
        tokens.append('\n')
        fw.write("".join(tokens))

        # Progress report every million rows.
        if not i % 1000000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [8]:
    
# Column groups; the writer cells emit each group under its own namespace
# tag (|t, |b, |s, |a, |d, |c respectively).
timelist = ['day','hour']  # |t
ban = ['banner_pos']  # |b
site = ['site_id', 'site_domain', 'site_category']  # |s
app = ['app_id', 'app_domain', 'app_category']  # |a
device = ['device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type']  # |d
clist = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']  # |c
    
In [9]:
    
%%time
# Rewrite train.vw with the columns split into six namespaces. Every line is
# "<label> |t ... |b ... |s ... |a ... |d ... |c ...\n" and each feature is a
# "<column>_<value>" token.
i = 0
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('train.vw', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), 1):
        # Label is target[0]*2-1 (presumably 0/1 -> -1/+1; verify against the data).
        out = "{0}".format(target[0]*2-1)

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                out += " {0}_{1}".format(name, str(row[name]))

        out += '\n'
        fw.write(out)

        # Progress report every million rows.
        if not i % 1000000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [9]:
    
# Load the lookup used by the next writer cell: indexdic[column][str(value)]
# is the token written for that value.
# The context manager closes the file even if unpickling raises.
# NOTE: unpickling executes arbitrary code; only load files this project produced.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
    
In [10]:
    
# Columns whose values are written as their indexdic entry; every other
# column is written as a "<column>_<value>" token.
names = {
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
}
    
In [11]:
    
%%time
# Write the training set to train1.vw in six namespaces. Columns listed in
# `names` are written as indexdic[column][value]; the remaining columns keep
# the "<column>_<value>" token.
i = 0
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('train1.vw', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), 1):
        # Label is target[0]*2-1 (presumably 0/1 -> -1/+1; verify against the data).
        out = "{0}".format(target[0]*2-1)

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                value = str(row[name])
                if name in names:
                    token = indexdic[name][value]
                else:
                    token = "{0}_{1}".format(name, value)
                out += " {0}".format(token)

        out += '\n'
        fw.write(out)

        # Progress report every million rows.
        if not i % 1000000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [9]:
    
# Load the lookup used by the next writer cell: indexdic[column] maps
# str(value) to the token to write, and may hold an 'other' entry used for
# values that are not listed.
# The context manager closes the file even if unpickling raises.
# NOTE: unpickling executes arbitrary code; only load files this project produced.
with open('indexdicless.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
    
In [10]:
    
%%time
# Write the training set to train2.vw in six namespaces, replacing every
# value by indexdic[column][value], or by the column's 'other' entry when the
# value itself is not listed.
i = 0
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('train2.vw', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), 1):
        # Label is target[0]*2-1 (presumably 0/1 -> -1/+1; verify against the data).
        out = "{0}".format(target[0]*2-1)

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                key = str(row[name])
                mapping = indexdic[name]
                if key not in mapping:
                    key = 'other'  # fall back to the column's catch-all entry
                if key not in mapping:
                    # NOTE(review): this only abandons the rest of the current
                    # namespace; the following namespaces are still emitted and
                    # the shortened line is still written. Confirm that is the
                    # intended handling.
                    print('error')
                    break
                out += " {0}".format(mapping[key])

        out += '\n'
        fw.write(out)

        # Progress report every million rows.
        if not i % 1000000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [9]:
    
# Load the lookup used by the next writer cell: indexdic[column] maps
# str(value) to the token to write, and may hold an 'other' entry used for
# values that are not listed.
# The context manager closes the file even if unpickling raises.
# NOTE: unpickling executes arbitrary code; only load files this project produced.
with open('indexdicless2.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
    
In [10]:
    
%%time
# Write the training set to train1.vw in six namespaces, replacing every
# value by indexdic[column][value], or by the column's 'other' entry when the
# value itself is not listed.
i = 0
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('train1.vw', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), 1):
        # Label is target[0]*2-1 (presumably 0/1 -> -1/+1; verify against the data).
        out = "{0}".format(target[0]*2-1)

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                key = str(row[name])
                mapping = indexdic[name]
                if key not in mapping:
                    key = 'other'  # fall back to the column's catch-all entry
                if key not in mapping:
                    # NOTE(review): this only abandons the rest of the current
                    # namespace; the following namespaces are still emitted and
                    # the shortened line is still written. Confirm that is the
                    # intended handling.
                    print('error')
                    break
                out += " {0}".format(mapping[key])

        out += '\n'
        fw.write(out)

        # Progress report every million rows.
        if not i % 1000000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [9]:
    
# Load the lookup used by the next writer cell: indexdic[column] maps
# str(value) to the token to write, and may hold an 'other' entry used for
# values that are not listed.
# The context manager closes the file even if unpickling raises.
# NOTE: unpickling executes arbitrary code; only load files this project produced.
with open('indexdicless3.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
    
In [10]:
    
%%time
# Write the training set to train1.vw in six namespaces, replacing every
# value by indexdic[column][value], or by the column's 'other' entry when the
# value itself is not listed.
i = 0
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('train1.vw', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), 1):
        # Label is target[0]*2-1 (presumably 0/1 -> -1/+1; verify against the data).
        out = "{0}".format(target[0]*2-1)

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                key = str(row[name])
                mapping = indexdic[name]
                if key not in mapping:
                    key = 'other'  # fall back to the column's catch-all entry
                if key not in mapping:
                    # NOTE(review): this only abandons the rest of the current
                    # namespace; the following namespaces are still emitted and
                    # the shortened line is still written. Confirm that is the
                    # intended handling.
                    print('error')
                    break
                out += " {0}".format(mapping[key])

        out += '\n'
        fw.write(out)

        # Progress report every million rows.
        if not i % 1000000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [11]:
    
# Rebind colnames to the test table's columns for the flat test dump.
colnames = X_t.colnames
    
In [12]:
    
# Rebind lines to the number of test rows; only used in the progress messages.
lines = X_t.shape[0]
    
In [11]:
    
%%time
# Flat dump of the test set to test.vw: no label, a single unnamed "|"
# namespace and one "<column>_<value>" token for every column.
i = 0
with open('test.vw', 'wb') as fw:
    for i, row in enumerate(X_t.iterrows(), 1):
        tokens = [" |"]
        for name in colnames:
            tokens.append(" {0}_{1}".format(name, str(row[name])))
        tokens.append('\n')
        fw.write("".join(tokens))

        # Progress report every hundred thousand rows.
        if not i % 100000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [13]:
    
# Column groups; the writer cells emit each group under its own namespace
# tag (|t, |b, |s, |a, |d, |c respectively).
timelist = ['day','hour']  # |t
ban = ['banner_pos']  # |b
site = ['site_id', 'site_domain', 'site_category']  # |s
app = ['app_id', 'app_domain', 'app_category']  # |a
device = ['device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type']  # |d
clist = ['C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']  # |c
    
In [13]:
    
%%time
# Rewrite test.vw with the columns split into six namespaces. Lines carry no
# label: they start with a blank, then "|t ... |b ... |s ... |a ... |d ... |c ...",
# each feature being a "<column>_<value>" token.
i = 0
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('test.vw', 'wb') as fw:
    for i, row in enumerate(X_t.iterrows(), 1):
        out = " "

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                out += " {0}_{1}".format(name, str(row[name]))

        out += '\n'
        fw.write(out)

        # Progress report every hundred thousand rows.
        if not i % 100000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [ ]:
    
# Load the lookup used by the test1.vw writer cell: indexdic[column][str(value)]
# is the token written for that value.
# The context manager closes the file even if unpickling raises.
# NOTE: unpickling executes arbitrary code; only load files this project produced.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)
    
In [ ]:
    
# Columns whose values are written as their indexdic entry; every other
# column is written as a "<column>_<value>" token.
names = {
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
}
    
In [ ]:
    
%%time
# Write the test set to test1.vw in six namespaces, without a label. Columns
# listed in `names` are written as indexdic[column][value]; the remaining
# columns keep the "<column>_<value>" token.
i = 0
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('test1.vw', 'wb') as fw:
    for i, row in enumerate(X_t.iterrows(), 1):
        out = " "

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                value = str(row[name])
                if name in names:
                    token = indexdic[name][value]
                else:
                    token = "{0}_{1}".format(name, value)
                out += " {0}".format(token)

        out += '\n'
        fw.write(out)

        # Progress report every hundred thousand rows.
        if not i % 100000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
In [ ]:
    
    
In [14]:
    
%%time
# Write the test set to test2.vw in six namespaces, without a label. Each
# value is replaced by indexdic[column][value], else by the column's 'other'
# entry, else by a fixed fallback index. Uses whichever indexdic is currently
# loaded in the session.
i = 0
# Written when a value is unknown and the column has no 'other' entry.
# NOTE(review): the origin of this number is not visible here; confirm it
# matches the dictionary in use.
unseen_index = 617958
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('test2.vw', 'wb') as fw:
    for i, row in enumerate(X_t.iterrows(), 1):
        out = " "

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                key = str(row[name])
                mapping = indexdic[name]
                if key not in mapping:
                    key = 'other'  # fall back to the column's catch-all entry
                index = mapping[key] if key in mapping else unseen_index
                out += " {0}".format(index)

        out += '\n'
        fw.write(out)

        # Progress report every hundred thousand rows.
        if not i % 100000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [14]:
    
%%time
# Write the test set to test1.vw in six namespaces, without a label. Each
# value is replaced by indexdic[column][value], else by the column's 'other'
# entry, else by a fixed fallback index. Uses whichever indexdic is currently
# loaded in the session.
i = 0
# Written when a value is unknown and the column has no 'other' entry.
# NOTE(review): the origin of this number is not visible here; confirm it
# matches the dictionary in use.
unseen_index = 947464
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('test1.vw', 'wb') as fw:
    for i, row in enumerate(X_t.iterrows(), 1):
        out = " "

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                key = str(row[name])
                mapping = indexdic[name]
                if key not in mapping:
                    key = 'other'  # fall back to the column's catch-all entry
                index = mapping[key] if key in mapping else unseen_index
                out += " {0}".format(index)

        out += '\n'
        fw.write(out)

        # Progress report every hundred thousand rows.
        if not i % 100000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [14]:
    
%%time
# Write the test set to test1.vw in six namespaces, without a label. Each
# value is replaced by indexdic[column][value], else by the column's 'other'
# entry, else by a fixed fallback index. Uses whichever indexdic is currently
# loaded in the session.
i = 0
# Written when a value is unknown and the column has no 'other' entry.
# NOTE(review): the origin of this number is not visible here; confirm it
# matches the dictionary in use.
unseen_index = 636705
# (namespace tag, columns) pairs in the order they are written.
vw_namespaces = (('t', timelist), ('b', ban), ('s', site),
                 ('a', app), ('d', device), ('c', clist))
with open('test1.vw', 'wb') as fw:
    for i, row in enumerate(X_t.iterrows(), 1):
        out = " "

        for tag, columns in vw_namespaces:
            out += " |" + tag
            for name in columns:
                key = str(row[name])
                mapping = indexdic[name]
                if key not in mapping:
                    key = 'other'  # fall back to the column's catch-all entry
                index = mapping[key] if key in mapping else unseen_index
                out += " {0}".format(index)

        out += '\n'
        fw.write(out)

        # Progress report every hundred thousand rows.
        if not i % 100000:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))
    
    
In [ ]: