libSVM data

The libSVM format of the training and testing data files is:

    <label> <index1>:<value1> <index2>:<value2> ...

The category dictionary of each feature was saved earlier, and the CSV data has already been stored in HDF5.


In [ ]:
import tables
import time
import numpy as np
import cPickle
from itertools import izip
import pyhash
import math

In [2]:
# Open the HDF5 store read-only; the table handles defined in the following cells are looked up on this object.
# NOTE(review): no matching file_handler.close() is visible in this part of the notebook — confirm it is closed later.
file_handler = tables.open_file("click_data.h5", mode = "r")

In [3]:
# Training feature table (node root.train.train_raw.X); iterated row by row when the libSVM training files are written.
X = file_handler.root.train.train_raw.X

In [4]:
# Target table (node root.train.train_raw.y); iterated in lock-step with X, element 0 of each row is written as the label.
y = file_handler.root.train.train_raw.y

In [5]:
# Feature table for the training split (node root.train.train_raw.X_train); source of 'train9days.txt'.
X_train = file_handler.root.train.train_raw.X_train

In [6]:
# Target table paired with X_train (node root.train.train_raw.y_train).
y_train = file_handler.root.train.train_raw.y_train

In [7]:
# Feature table for the validation split (node root.train.train_raw.X_valid); source of 'train10thday.txt'.
X_valid = file_handler.root.train.train_raw.X_valid

In [8]:
# Target table paired with X_valid (node root.train.train_raw.y_valid).
y_valid = file_handler.root.train.train_raw.y_valid

In [5]:
# Test feature table (node root.test.test_raw.X_t); no target table is read for it in this notebook section.
X_t = file_handler.root.test.test_raw.X_t

one hot encoding


In [6]:
# Columns that are one-hot encoded, in the order they are looked up for every row.
names = [
    # time and placement
    'day', 'hour', 'banner_pos',
    # site columns
    'site_id', 'site_domain', 'site_category',
    # app columns
    'app_id', 'app_domain', 'app_category',
    # device columns
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    # C-prefixed columns
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

training data version 1


In [7]:
# Row count of X; used as the denominator in the progress messages printed while writing.
lines = X.shape[0]

In [ ]:
# Load the per-column {value: feature index} dictionaries used by the
# "training data version 1" encoding.
# The with-block closes the file even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files produced by this project.
with open('indexdicless.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [9]:
%%time
i = 0
with open('train.txt', 'wb') as fw:
    for row, target in izip(X.iterrows(),y.iterrows()):
        indexlist = []
        out = "{0}".format(target[0])
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
            elif 'other' in indexdic[name]:
                index = indexdic[name]['other']
                indexlist.append(index)
            else:
                print 'error'
                break
        indexlist.sort()
        out += " " + " ".join(["{0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
            
        i+=1
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 21min 32s, sys: 8.08 s, total: 21min 40s
Wall time: 22min 10s

training data version 2


In [8]:
# Load the per-column {value: feature index} dictionaries used by the
# "training data version 2" encoding (rebinds indexdic).
# The with-block closes the file even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files produced by this project.
with open('indexdicless2.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [9]:
%%time
i = 0
with open('train1.txt', 'wb') as fw:
    for row, target in izip(X.iterrows(),y.iterrows()):
        indexlist = []
        out = "{0}".format(target[0])
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
            elif 'other' in indexdic[name]:
                index = indexdic[name]['other']
                indexlist.append(index)
            else:
                print 'error'
                break
        indexlist.sort()
        out += " " + " ".join(["{0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
        
        i+=1
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 21min 57s, sys: 9.57 s, total: 22min 6s
Wall time: 24min 16s

training data version 3


In [8]:
# Load the per-column {value: feature index} dictionaries used by the
# "training data version 3" encoding (rebinds indexdic).
# The with-block closes the file even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files produced by this project.
with open('indexdicless3.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [9]:
%%time
i = 0
with open('train.txt', 'wb') as fw:
    for row, target in izip(X.iterrows(),y.iterrows()):
        indexlist = []
        out = "{0}".format(target[0])
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
            elif 'other' in indexdic[name]:
                index = indexdic[name]['other']
                indexlist.append(index)
            else:
                print 'error'
                break
        indexlist.sort()
        out += " " + " ".join(["{0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
        
        i+=1
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 23min 42s, sys: 11.4 s, total: 23min 53s
Wall time: 25min 56s

training data from the first 9 days


In [10]:
# Row count of X_train; denominator for the progress messages of the 'train9days.txt' writer.
lines = X_train.shape[0]

In [11]:
%%time
i = 0
with open('train9days.txt', 'wb') as fw:
    for row, target in izip(X_train.iterrows(),y_train.iterrows()):
        indexlist = []
        out = "{0}".format(target[0])
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
        indexlist.sort()
        out += " ".join([" {0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
    
        i+=1
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 36210029 written (0%)
200000 lines of 36210029 written (0%)
300000 lines of 36210029 written (0%)
400000 lines of 36210029 written (1%)
500000 lines of 36210029 written (1%)
600000 lines of 36210029 written (1%)
700000 lines of 36210029 written (1%)
800000 lines of 36210029 written (2%)
900000 lines of 36210029 written (2%)
1000000 lines of 36210029 written (2%)
1100000 lines of 36210029 written (3%)
1200000 lines of 36210029 written (3%)
1300000 lines of 36210029 written (3%)
1400000 lines of 36210029 written (3%)
1500000 lines of 36210029 written (4%)
1600000 lines of 36210029 written (4%)
1700000 lines of 36210029 written (4%)
1800000 lines of 36210029 written (4%)
1900000 lines of 36210029 written (5%)
2000000 lines of 36210029 written (5%)
2100000 lines of 36210029 written (5%)
2200000 lines of 36210029 written (6%)
2300000 lines of 36210029 written (6%)
2400000 lines of 36210029 written (6%)
2500000 lines of 36210029 written (6%)
2600000 lines of 36210029 written (7%)
2700000 lines of 36210029 written (7%)
2800000 lines of 36210029 written (7%)
2900000 lines of 36210029 written (8%)
3000000 lines of 36210029 written (8%)
3100000 lines of 36210029 written (8%)
3200000 lines of 36210029 written (8%)
3300000 lines of 36210029 written (9%)
3400000 lines of 36210029 written (9%)
3500000 lines of 36210029 written (9%)
3600000 lines of 36210029 written (9%)
3700000 lines of 36210029 written (10%)
3800000 lines of 36210029 written (10%)
3900000 lines of 36210029 written (10%)
4000000 lines of 36210029 written (11%)
4100000 lines of 36210029 written (11%)
4200000 lines of 36210029 written (11%)
4300000 lines of 36210029 written (11%)
4400000 lines of 36210029 written (12%)
4500000 lines of 36210029 written (12%)
4600000 lines of 36210029 written (12%)
4700000 lines of 36210029 written (12%)
4800000 lines of 36210029 written (13%)
4900000 lines of 36210029 written (13%)
5000000 lines of 36210029 written (13%)
5100000 lines of 36210029 written (14%)
5200000 lines of 36210029 written (14%)
5300000 lines of 36210029 written (14%)
5400000 lines of 36210029 written (14%)
5500000 lines of 36210029 written (15%)
5600000 lines of 36210029 written (15%)
5700000 lines of 36210029 written (15%)
5800000 lines of 36210029 written (16%)
5900000 lines of 36210029 written (16%)
6000000 lines of 36210029 written (16%)
6100000 lines of 36210029 written (16%)
6200000 lines of 36210029 written (17%)
6300000 lines of 36210029 written (17%)
6400000 lines of 36210029 written (17%)
6500000 lines of 36210029 written (17%)
6600000 lines of 36210029 written (18%)
6700000 lines of 36210029 written (18%)
6800000 lines of 36210029 written (18%)
6900000 lines of 36210029 written (19%)
7000000 lines of 36210029 written (19%)
7100000 lines of 36210029 written (19%)
7200000 lines of 36210029 written (19%)
7300000 lines of 36210029 written (20%)
7400000 lines of 36210029 written (20%)
7500000 lines of 36210029 written (20%)
7600000 lines of 36210029 written (20%)
7700000 lines of 36210029 written (21%)
7800000 lines of 36210029 written (21%)
7900000 lines of 36210029 written (21%)
8000000 lines of 36210029 written (22%)
8100000 lines of 36210029 written (22%)
8200000 lines of 36210029 written (22%)
8300000 lines of 36210029 written (22%)
8400000 lines of 36210029 written (23%)
8500000 lines of 36210029 written (23%)
8600000 lines of 36210029 written (23%)
8700000 lines of 36210029 written (24%)
8800000 lines of 36210029 written (24%)
8900000 lines of 36210029 written (24%)
9000000 lines of 36210029 written (24%)
9100000 lines of 36210029 written (25%)
9200000 lines of 36210029 written (25%)
9300000 lines of 36210029 written (25%)
9400000 lines of 36210029 written (25%)
9500000 lines of 36210029 written (26%)
9600000 lines of 36210029 written (26%)
9700000 lines of 36210029 written (26%)
9800000 lines of 36210029 written (27%)
9900000 lines of 36210029 written (27%)
10000000 lines of 36210029 written (27%)
10100000 lines of 36210029 written (27%)
10200000 lines of 36210029 written (28%)
10300000 lines of 36210029 written (28%)
10400000 lines of 36210029 written (28%)
10500000 lines of 36210029 written (28%)
10600000 lines of 36210029 written (29%)
10700000 lines of 36210029 written (29%)
10800000 lines of 36210029 written (29%)
10900000 lines of 36210029 written (30%)
11000000 lines of 36210029 written (30%)
11100000 lines of 36210029 written (30%)
11200000 lines of 36210029 written (30%)
11300000 lines of 36210029 written (31%)
11400000 lines of 36210029 written (31%)
11500000 lines of 36210029 written (31%)
11600000 lines of 36210029 written (32%)
11700000 lines of 36210029 written (32%)
11800000 lines of 36210029 written (32%)
11900000 lines of 36210029 written (32%)
12000000 lines of 36210029 written (33%)
12100000 lines of 36210029 written (33%)
12200000 lines of 36210029 written (33%)
12300000 lines of 36210029 written (33%)
12400000 lines of 36210029 written (34%)
12500000 lines of 36210029 written (34%)
12600000 lines of 36210029 written (34%)
12700000 lines of 36210029 written (35%)
12800000 lines of 36210029 written (35%)
12900000 lines of 36210029 written (35%)
13000000 lines of 36210029 written (35%)
13100000 lines of 36210029 written (36%)
13200000 lines of 36210029 written (36%)
13300000 lines of 36210029 written (36%)
13400000 lines of 36210029 written (37%)
13500000 lines of 36210029 written (37%)
13600000 lines of 36210029 written (37%)
13700000 lines of 36210029 written (37%)
13800000 lines of 36210029 written (38%)
13900000 lines of 36210029 written (38%)
14000000 lines of 36210029 written (38%)
14100000 lines of 36210029 written (38%)
14200000 lines of 36210029 written (39%)
14300000 lines of 36210029 written (39%)
14400000 lines of 36210029 written (39%)
14500000 lines of 36210029 written (40%)
14600000 lines of 36210029 written (40%)
14700000 lines of 36210029 written (40%)
14800000 lines of 36210029 written (40%)
14900000 lines of 36210029 written (41%)
15000000 lines of 36210029 written (41%)
15100000 lines of 36210029 written (41%)
15200000 lines of 36210029 written (41%)
15300000 lines of 36210029 written (42%)
15400000 lines of 36210029 written (42%)
15500000 lines of 36210029 written (42%)
15600000 lines of 36210029 written (43%)
15700000 lines of 36210029 written (43%)
15800000 lines of 36210029 written (43%)
15900000 lines of 36210029 written (43%)
16000000 lines of 36210029 written (44%)
16100000 lines of 36210029 written (44%)
16200000 lines of 36210029 written (44%)
16300000 lines of 36210029 written (45%)
16400000 lines of 36210029 written (45%)
16500000 lines of 36210029 written (45%)
16600000 lines of 36210029 written (45%)
16700000 lines of 36210029 written (46%)
16800000 lines of 36210029 written (46%)
16900000 lines of 36210029 written (46%)
17000000 lines of 36210029 written (46%)
17100000 lines of 36210029 written (47%)
17200000 lines of 36210029 written (47%)
17300000 lines of 36210029 written (47%)
17400000 lines of 36210029 written (48%)
17500000 lines of 36210029 written (48%)
17600000 lines of 36210029 written (48%)
17700000 lines of 36210029 written (48%)
17800000 lines of 36210029 written (49%)
17900000 lines of 36210029 written (49%)
18000000 lines of 36210029 written (49%)
18100000 lines of 36210029 written (49%)
18200000 lines of 36210029 written (50%)
18300000 lines of 36210029 written (50%)
18400000 lines of 36210029 written (50%)
18500000 lines of 36210029 written (51%)
18600000 lines of 36210029 written (51%)
18700000 lines of 36210029 written (51%)
18800000 lines of 36210029 written (51%)
18900000 lines of 36210029 written (52%)
19000000 lines of 36210029 written (52%)
19100000 lines of 36210029 written (52%)
19200000 lines of 36210029 written (53%)
19300000 lines of 36210029 written (53%)
19400000 lines of 36210029 written (53%)
19500000 lines of 36210029 written (53%)
19600000 lines of 36210029 written (54%)
19700000 lines of 36210029 written (54%)
19800000 lines of 36210029 written (54%)
19900000 lines of 36210029 written (54%)
20000000 lines of 36210029 written (55%)
20100000 lines of 36210029 written (55%)
20200000 lines of 36210029 written (55%)
20300000 lines of 36210029 written (56%)
20400000 lines of 36210029 written (56%)
20500000 lines of 36210029 written (56%)
20600000 lines of 36210029 written (56%)
20700000 lines of 36210029 written (57%)
20800000 lines of 36210029 written (57%)
20900000 lines of 36210029 written (57%)
21000000 lines of 36210029 written (57%)
21100000 lines of 36210029 written (58%)
21200000 lines of 36210029 written (58%)
21300000 lines of 36210029 written (58%)
21400000 lines of 36210029 written (59%)
21500000 lines of 36210029 written (59%)
21600000 lines of 36210029 written (59%)
21700000 lines of 36210029 written (59%)
21800000 lines of 36210029 written (60%)
21900000 lines of 36210029 written (60%)
22000000 lines of 36210029 written (60%)
22100000 lines of 36210029 written (61%)
22200000 lines of 36210029 written (61%)
22300000 lines of 36210029 written (61%)
22400000 lines of 36210029 written (61%)
22500000 lines of 36210029 written (62%)
22600000 lines of 36210029 written (62%)
22700000 lines of 36210029 written (62%)
22800000 lines of 36210029 written (62%)
22900000 lines of 36210029 written (63%)
23000000 lines of 36210029 written (63%)
23100000 lines of 36210029 written (63%)
23200000 lines of 36210029 written (64%)
23300000 lines of 36210029 written (64%)
23400000 lines of 36210029 written (64%)
23500000 lines of 36210029 written (64%)
23600000 lines of 36210029 written (65%)
23700000 lines of 36210029 written (65%)
23800000 lines of 36210029 written (65%)
23900000 lines of 36210029 written (66%)
24000000 lines of 36210029 written (66%)
24100000 lines of 36210029 written (66%)
24200000 lines of 36210029 written (66%)
24300000 lines of 36210029 written (67%)
24400000 lines of 36210029 written (67%)
24500000 lines of 36210029 written (67%)
24600000 lines of 36210029 written (67%)
24700000 lines of 36210029 written (68%)
24800000 lines of 36210029 written (68%)
24900000 lines of 36210029 written (68%)
25000000 lines of 36210029 written (69%)
25100000 lines of 36210029 written (69%)
25200000 lines of 36210029 written (69%)
25300000 lines of 36210029 written (69%)
25400000 lines of 36210029 written (70%)
25500000 lines of 36210029 written (70%)
25600000 lines of 36210029 written (70%)
25700000 lines of 36210029 written (70%)
25800000 lines of 36210029 written (71%)
25900000 lines of 36210029 written (71%)
26000000 lines of 36210029 written (71%)
26100000 lines of 36210029 written (72%)
26200000 lines of 36210029 written (72%)
26300000 lines of 36210029 written (72%)
26400000 lines of 36210029 written (72%)
26500000 lines of 36210029 written (73%)
26600000 lines of 36210029 written (73%)
26700000 lines of 36210029 written (73%)
26800000 lines of 36210029 written (74%)
26900000 lines of 36210029 written (74%)
27000000 lines of 36210029 written (74%)
27100000 lines of 36210029 written (74%)
27200000 lines of 36210029 written (75%)
27300000 lines of 36210029 written (75%)
27400000 lines of 36210029 written (75%)
27500000 lines of 36210029 written (75%)
27600000 lines of 36210029 written (76%)
27700000 lines of 36210029 written (76%)
27800000 lines of 36210029 written (76%)
27900000 lines of 36210029 written (77%)
28000000 lines of 36210029 written (77%)
28100000 lines of 36210029 written (77%)
28200000 lines of 36210029 written (77%)
28300000 lines of 36210029 written (78%)
28400000 lines of 36210029 written (78%)
28500000 lines of 36210029 written (78%)
28600000 lines of 36210029 written (78%)
28700000 lines of 36210029 written (79%)
28800000 lines of 36210029 written (79%)
28900000 lines of 36210029 written (79%)
29000000 lines of 36210029 written (80%)
29100000 lines of 36210029 written (80%)
29200000 lines of 36210029 written (80%)
29300000 lines of 36210029 written (80%)
29400000 lines of 36210029 written (81%)
29500000 lines of 36210029 written (81%)
29600000 lines of 36210029 written (81%)
29700000 lines of 36210029 written (82%)
29800000 lines of 36210029 written (82%)
29900000 lines of 36210029 written (82%)
30000000 lines of 36210029 written (82%)
30100000 lines of 36210029 written (83%)
30200000 lines of 36210029 written (83%)
30300000 lines of 36210029 written (83%)
30400000 lines of 36210029 written (83%)
30500000 lines of 36210029 written (84%)
30600000 lines of 36210029 written (84%)
30700000 lines of 36210029 written (84%)
30800000 lines of 36210029 written (85%)
30900000 lines of 36210029 written (85%)
31000000 lines of 36210029 written (85%)
31100000 lines of 36210029 written (85%)
31200000 lines of 36210029 written (86%)
31300000 lines of 36210029 written (86%)
31400000 lines of 36210029 written (86%)
31500000 lines of 36210029 written (86%)
31600000 lines of 36210029 written (87%)
31700000 lines of 36210029 written (87%)
31800000 lines of 36210029 written (87%)
31900000 lines of 36210029 written (88%)
32000000 lines of 36210029 written (88%)
32100000 lines of 36210029 written (88%)
32200000 lines of 36210029 written (88%)
32300000 lines of 36210029 written (89%)
32400000 lines of 36210029 written (89%)
32500000 lines of 36210029 written (89%)
32600000 lines of 36210029 written (90%)
32700000 lines of 36210029 written (90%)
32800000 lines of 36210029 written (90%)
32900000 lines of 36210029 written (90%)
33000000 lines of 36210029 written (91%)
33100000 lines of 36210029 written (91%)
33200000 lines of 36210029 written (91%)
33300000 lines of 36210029 written (91%)
33400000 lines of 36210029 written (92%)
33500000 lines of 36210029 written (92%)
33600000 lines of 36210029 written (92%)
33700000 lines of 36210029 written (93%)
33800000 lines of 36210029 written (93%)
33900000 lines of 36210029 written (93%)
34000000 lines of 36210029 written (93%)
34100000 lines of 36210029 written (94%)
34200000 lines of 36210029 written (94%)
34300000 lines of 36210029 written (94%)
34400000 lines of 36210029 written (95%)
34500000 lines of 36210029 written (95%)
34600000 lines of 36210029 written (95%)
34700000 lines of 36210029 written (95%)
34800000 lines of 36210029 written (96%)
34900000 lines of 36210029 written (96%)
35000000 lines of 36210029 written (96%)
35100000 lines of 36210029 written (96%)
35200000 lines of 36210029 written (97%)
35300000 lines of 36210029 written (97%)
35400000 lines of 36210029 written (97%)
35500000 lines of 36210029 written (98%)
35600000 lines of 36210029 written (98%)
35700000 lines of 36210029 written (98%)
35800000 lines of 36210029 written (98%)
35900000 lines of 36210029 written (99%)
36000000 lines of 36210029 written (99%)
36100000 lines of 36210029 written (99%)
36200000 lines of 36210029 written (99%)
CPU times: user 36min 49s, sys: 3min 58s, total: 40min 48s
Wall time: 42min 38s

validation data from the 10th day


In [10]:
# Row count of X_valid; denominator for the progress messages of the 'train10thday.txt' writer.
lines = X_valid.shape[0]

In [11]:
%%time
i = 0
with open('train10thday.txt', 'wb') as fw:
    for row, target in izip(X_valid.iterrows(),y_valid.iterrows()):
        indexlist = []
        out = "{0}".format(target[0])
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
        indexlist.sort()
        out += " ".join([" {0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
    
        i+=1
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4218938 written (2%)
200000 lines of 4218938 written (4%)
300000 lines of 4218938 written (7%)
400000 lines of 4218938 written (9%)
500000 lines of 4218938 written (11%)
600000 lines of 4218938 written (14%)
700000 lines of 4218938 written (16%)
800000 lines of 4218938 written (18%)
900000 lines of 4218938 written (21%)
1000000 lines of 4218938 written (23%)
1100000 lines of 4218938 written (26%)
1200000 lines of 4218938 written (28%)
1300000 lines of 4218938 written (30%)
1400000 lines of 4218938 written (33%)
1500000 lines of 4218938 written (35%)
1600000 lines of 4218938 written (37%)
1700000 lines of 4218938 written (40%)
1800000 lines of 4218938 written (42%)
1900000 lines of 4218938 written (45%)
2000000 lines of 4218938 written (47%)
2100000 lines of 4218938 written (49%)
2200000 lines of 4218938 written (52%)
2300000 lines of 4218938 written (54%)
2400000 lines of 4218938 written (56%)
2500000 lines of 4218938 written (59%)
2600000 lines of 4218938 written (61%)
2700000 lines of 4218938 written (63%)
2800000 lines of 4218938 written (66%)
2900000 lines of 4218938 written (68%)
3000000 lines of 4218938 written (71%)
3100000 lines of 4218938 written (73%)
3200000 lines of 4218938 written (75%)
3300000 lines of 4218938 written (78%)
3400000 lines of 4218938 written (80%)
3500000 lines of 4218938 written (82%)
3600000 lines of 4218938 written (85%)
3700000 lines of 4218938 written (87%)
3800000 lines of 4218938 written (90%)
3900000 lines of 4218938 written (92%)
4000000 lines of 4218938 written (94%)
4100000 lines of 4218938 written (97%)
4200000 lines of 4218938 written (99%)
CPU times: user 4min 42s, sys: 49 s, total: 5min 31s
Wall time: 5min 46s

test data version 1


In [10]:
# Row count of X_t; denominator for the progress messages of the test-file writers.
lines = X_t.shape[0]

In [12]:
%%time
i = 0
with open('test.txt', 'wb') as fw:
    for row in X_t.iterrows():
        indexlist = []
        out = "{0}".format(0)
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
            elif 'other' in indexdic[name]:
                index = indexdic[name]['other']
                indexlist.append(index)
            
        indexlist.sort()
        out += " " + " ".join(["{0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
        
        i+=1
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 2min 22s, sys: 1.14 s, total: 2min 24s
Wall time: 2min 25s

test data version 2


In [12]:
%%time
i = 0
with open('test1.txt', 'wb') as fw:
    for row in X_t.iterrows():
        indexlist = []
        out = "{0}".format(0)
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
            elif 'other' in indexdic[name]:
                index = indexdic[name]['other']
                indexlist.append(index)
            
        indexlist.sort()
        out += " " + " ".join(["{0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
        
        i+=1
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 2min 14s, sys: 928 ms, total: 2min 15s
Wall time: 2min 29s

test data version 3


In [11]:
%%time
i = 0
with open('test.txt', 'wb') as fw:
    for row in X_t.iterrows():
        indexlist = []
        out = "{0}".format(0)
        for name in names:
            value = str(row[name])
            if value in indexdic[name]:
                index = indexdic[name][value]
                indexlist.append(index)
            elif 'other' in indexdic[name]:
                index = indexdic[name]['other']
                indexlist.append(index)
            
        indexlist.sort()
        out += " " + " ".join(["{0}:{1}".format(index, 1) for index in indexlist]) + "\n"
        fw.write(out)
        
        i+=1
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 2min 14s, sys: 949 ms, total: 2min 15s
Wall time: 2min 30s

hashing


In [10]:
# Load the {value: feature index} dictionaries for the columns that stay
# one-hot encoded in the hashing pipeline (rebinds indexdic).
# The with-block closes the file even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files produced by this project.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [11]:
# Columns that keep their dictionary index in the hashing pipeline; every
# column not listed here is hashed instead.
cnames = set((
    'day', 'hour', 'banner_pos',
    'site_category', 'app_category',
    'device_type', 'device_conn_type',
    'C1', 'C15', 'C16', 'C18', 'C20',
))

In [12]:
# One shared pyhash murmur3_32 hasher, reused for every feature token.
hasher = pyhash.murmur3_32()

def hashstr(feat, bins, offset=1 + 309):
    """Map one feature token to a hashed feature index.

    feat   -- value to hash; it is converted with str() first.
    bins   -- number of hash buckets (the hash is reduced modulo bins).
    offset -- constant added to the bucket number. The default (310)
              reproduces the previously hard-coded ``+1+309``.
              NOTE(review): presumably indices up to 309 are reserved for
              the dictionary-indexed columns — confirm.

    Returns the bucket number shifted by offset, i.e. a value in
    [offset, offset + bins - 1] for positive bins.
    """
    return hasher(str(feat)) % bins + offset

def gen_hashed_fm_feats(feats, bins, offset=1 + 309):
    """Hash every token in feats with hashstr and return the indices as a list.

    The input order is preserved; no sorting is done here.
    """
    return [hashstr(feat, bins, offset) for feat in feats]

In [13]:
# Number of hash buckets passed to hashstr/gen_hashed_fm_feats.
bins = 10000000

training data


In [14]:
# Columns of the full training table; the writer loop below reads each one per row.
names = X.colnames

In [15]:
# Row count of the full training table, used for the progress percentage.
lines = X.shape[0]

In [16]:
%%time
i = 0
with open('train.txt', 'wb') as fw:
    for row, target in izip(X.iterrows(),y.iterrows()):
        out = "{0}".format(target[0])
        feats = []
        feati = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                feati.append(indexdic[name][value])
            else:
                feats.append('{0}_{1}'.format(name, value))
                
        feats = gen_hashed_fm_feats(feats, bins)
        
        feats = feats + feati
        feats.sort()
        
        out += " " + " ".join(["{0}:{1}".format(feat, 1) for feat in feats]) + "\n"
        
        fw.write(out)
        
        i+=1
        
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 34min 16s, sys: 8.37 s, total: 34min 25s
Wall time: 35min 35s

In [ ]:

test data


In [17]:
# Columns of the test table; the writer loop below reads each one per row.
names = X_t.colnames

In [18]:
# Row count of the test table, used for the progress percentage.
lines = X_t.shape[0]

In [19]:
%%time
i = 0
with open('test.txt', 'wb') as fw:
    for row in X_t.iterrows():
        out = "0"
        feats = []
        feati = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                feati.append(indexdic[name][value])
            else:
                feats.append('{0}_{1}'.format(name, value))
                
        feats = gen_hashed_fm_feats(feats, bins)
        feats = feats + feati
        feats.sort()
        
        out += " " + " ".join(["{0}:{1}".format(feat, 1) for feat in feats]) + "\n"
        
        fw.write(out)
    
        i+=1
        
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 3min 33s, sys: 925 ms, total: 3min 34s
Wall time: 3min 43s

In [ ]:

FM

GBDT dataset used in FM


In [6]:
# Load the list of categorical feature keys. The GBDT writer cells number its
# entries from 1 and emit the numbers of the entries present in each row.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# `with` guarantees the file is closed even if unpickling raises.
with open('gbdtclist2.pkl', 'rb') as f:
    clist = cPickle.load(f)

Dense Matrix
------------
The input format is:

Binary Sparse Matrix
--------------------
The input format is:

In [5]:
# First split of the columns into dense and sparse groups for the GBDT input.
# Both names are reassigned by the next cell.
densenames = [
    'day', 'hour', 'banner_pos', 'device_type', 'device_conn_type',
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
sparsenames = [
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
]

In [7]:
# Split used by the GBDT writer cells: no dense columns, and the listed
# columns are all treated as sparse categorical features.
densenames = []
sparsenames = [
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model',
    'C1', 'banner_pos', 'device_type', 'device_conn_type',
    'C15', 'C16', 'C19', 'C20',
]

In [ ]:

training set


In [8]:
# Row count of the full training table, used for the progress percentage.
lines = X.shape[0]

In [9]:
%%time
i = 0
with open('train.gbdt.dense', 'wb') as fd, open('train.gbdt.sparse', 'wb') as fs:
    for row, target in izip(X.iterrows(),y.iterrows()):
        
        outd = "{0} ".format(target[0])
        outs = "{0} ".format(target[0])
        #outd += " ".join(["{0}".format(row[name]) for name in densenames]) + "\n"
        outd += "\n"
        fd.write(outd)
        
        cat_feats = set()
        for name in sparsenames:
            key = name + '_' + str(row[name])
            cat_feats.add(key)

        outs += " ".join(["{0}".format(j) for j, feat in enumerate(clist, start=1) if feat in cat_feats]) + "\n"
        fs.write(outs)
        
        i+=1
        
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 15min 26s, sys: 2.33 s, total: 15min 28s
Wall time: 15min 29s

test set


In [10]:
# Row count of the test table, used for the progress percentage.
lines = X_t.shape[0]

In [11]:
%%time
i = 0
with open('test.gbdt.dense', 'wb') as fd, open('test.gbdt.sparse', 'wb') as fs:
    for row in X_t.iterrows():
        
        outd = "{0} ".format(0)
        outs = "{0} ".format(0)
        
        #outd += " ".join(["{0}".format(row[name]) for name in densenames]) + "\n"
        outd += "\n"
        fd.write(outd)
        
        cat_feats = set()
        for name in sparsenames:
            key = name + '_' + str(row[name])
            cat_feats.add(key)

        outs += " ".join(["{0}".format(j) for j, feat in enumerate(clist, start=1) if feat in cat_feats]) + "\n"
        fs.write(outs)
        
        i+=1
        
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 1min 33s, sys: 336 ms, total: 1min 33s
Wall time: 1min 33s

libfm data set used in FM

Data Format
===========
The input of this factorization machine solver consists of a label vector (y) and a binary sparse matrix (X). The input format is:

In [10]:
# Load the one-hot index dictionary, read as indexdic[column_name][str(value)].
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# `with` guarantees the file is closed even if unpickling raises.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

# Columns encoded through the indexdic lookup; all other columns are hashed.
cnames = set(['day', 'hour', 'banner_pos', 'site_category', 'app_category',
              'device_type', 'device_conn_type',
              'C1', 'C15', 'C16', 'C18', 'C20'])

In [11]:
hasher = pyhash.murmur3_32()


def hashstr(feat, bins):
    """Hash ``feat`` (via ``str``) into an integer in [310, bins + 309].

    310 is the original ``+1+309`` shift; presumably it keeps hashed indices
    clear of the indexdic one-hot indices -- TODO confirm.
    """
    return 310 + hasher(str(feat)) % bins


def gen_hashed_fm_feats(feats, bins):
    """Turn ``(field, feature)`` pairs into hashed-index strings.

    The field part of each pair is ignored; only the feature string is
    hashed. Output order follows input order.
    """
    return ['{0}'.format(hashstr(feat, bins)) for (_field, feat) in feats]


# Number of hash buckets used by the cells below.
bins = 1000000

training set


In [13]:
# Columns of the full training table, read per row when building train.fm.
names = X.colnames

In [14]:
# Row count of the full training table, used for the progress percentage.
lines = X.shape[0]

In [15]:
%%time
i = 0
with open('train.fm', 'wb') as fw:
    for row, target, line_gbdt in izip(X.iterrows(), y.iterrows(), open('train.gbdt.out')):
                
        out = "{0}".format(target[0])
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
                
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat)) for j, feat in enumerate(line_gbdt.strip().split()[1:13], start=1)]
        feats = featsX + featsgbdt
        
        feats = gen_hashed_fm_feats(feats, bins)
        
        out +=  ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        
        i+=1
        
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 54min 41s, sys: 14.2 s, total: 54min 55s
Wall time: 56min 14s

partial one hot encoding


In [6]:
# All training columns except 'device_ip'.
# Build a new list rather than calling .remove() on X.colnames: that would
# mutate the table's own column list through the alias, and would raise
# ValueError when the cell is run a second time.
names = [name for name in X.colnames if name != 'device_ip']

In [7]:
# Row count of the full training table, used for the progress percentage.
lines = X.shape[0]

In [8]:
# Load the one-hot index dictionary, read as indexdic[column_name][str(value)].
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# `with` guarantees the file is closed even if unpickling raises.
with open('indexdiconehot.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

# Columns encoded through the indexdic lookup; all other columns are hashed.
cnames = set(['day', 'hour', 'banner_pos', 'site_category', 'app_category',
              'device_type', 'device_conn_type',
              'C1', 'C15', 'C16', 'C18', 'C20'])

In [10]:
hasher = pyhash.murmur3_32()


def hashstr(feat, bins):
    """Hash ``feat`` (via ``str``) into an integer in [310, bins + 309].

    The +1+309 shift presumably keeps hashed indices clear of the indexdic
    one-hot indices -- TODO confirm.
    """
    bucket = hasher(str(feat)) % bins
    return bucket + 1 + 309


def gen_hashed_fm_feats(feats, bins):
    """Turn ``(field, feature)`` pairs into hashed-index strings, in order.

    The field part of each pair is ignored.
    """
    rendered = []
    for (_field, feat) in feats:
        rendered.append('{0}'.format(hashstr(feat, bins)))
    return rendered


# Number of hash buckets used by the cells below.
bins = 10000000

In [11]:
%%time
i = 0
with open('train.fm', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        out = "{0}".format(target[0])
        feats = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                feats.append((name, '{0}_{1}'.format(name, value)))
        
        
        feats = gen_hashed_fm_feats(feats, bins)
        
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        
        i+=1
        
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 30min 56s, sys: 8.99 s, total: 31min 5s
Wall time: 31min 56s

train 9 days


In [12]:
# Columns of the X_train table, read per row when building train9days.fm.
names = X_train.colnames

In [13]:
# Row count of X_train, used for the progress percentage.
lines = X_train.shape[0]

In [14]:
%%time
i = 0
with open('train9days.fm', 'wb') as fw:
    for row, target, line_gbdt in izip(X_train.iterrows(), y_train.iterrows(), open('train9days.gbdt.out')):
             
        out = "{0}".format(target[0])
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
                
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat)) for j, feat in enumerate(line_gbdt.strip().split()[1:9], start=1)]
        feats = featsX + featsgbdt
        
        feats = gen_hashed_fm_feats(feats, bins)
        
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        
        i+=1
        
        if (i % 1000000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 36210029 written (2%)
2000000 lines of 36210029 written (5%)
3000000 lines of 36210029 written (8%)
4000000 lines of 36210029 written (11%)
5000000 lines of 36210029 written (13%)
6000000 lines of 36210029 written (16%)
7000000 lines of 36210029 written (19%)
8000000 lines of 36210029 written (22%)
9000000 lines of 36210029 written (24%)
10000000 lines of 36210029 written (27%)
11000000 lines of 36210029 written (30%)
12000000 lines of 36210029 written (33%)
13000000 lines of 36210029 written (35%)
14000000 lines of 36210029 written (38%)
15000000 lines of 36210029 written (41%)
16000000 lines of 36210029 written (44%)
17000000 lines of 36210029 written (46%)
18000000 lines of 36210029 written (49%)
19000000 lines of 36210029 written (52%)
20000000 lines of 36210029 written (55%)
21000000 lines of 36210029 written (57%)
22000000 lines of 36210029 written (60%)
23000000 lines of 36210029 written (63%)
24000000 lines of 36210029 written (66%)
25000000 lines of 36210029 written (69%)
26000000 lines of 36210029 written (71%)
27000000 lines of 36210029 written (74%)
28000000 lines of 36210029 written (77%)
29000000 lines of 36210029 written (80%)
30000000 lines of 36210029 written (82%)
31000000 lines of 36210029 written (85%)
32000000 lines of 36210029 written (88%)
33000000 lines of 36210029 written (91%)
34000000 lines of 36210029 written (93%)
35000000 lines of 36210029 written (96%)
36000000 lines of 36210029 written (99%)
CPU times: user 1h 15min 14s, sys: 19.7 s, total: 1h 15min 33s
Wall time: 1h 16min 48s

train 10th day


In [15]:
# Columns of the X_valid table, read per row when building train10thday.fm.
names = X_valid.colnames

In [16]:
# Row count of X_valid, used for the progress percentage.
lines = X_valid.shape[0]

In [17]:
%%time
i = 0
with open('train10thday.fm', 'wb') as fw:
    for row, target, line_gbdt in izip(X_valid.iterrows(), y_valid.iterrows(), open('train10thday.gbdt.out')):
                
        out = "{0}".format(target[0])
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
                
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat)) for j, feat in enumerate(line_gbdt.strip().split()[1:9], start=1)]
        feats = featsX + featsgbdt
        
        feats = gen_hashed_fm_feats(feats, bins)
        
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        
        i+=1
        
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4218938 written (2%)
200000 lines of 4218938 written (4%)
300000 lines of 4218938 written (7%)
400000 lines of 4218938 written (9%)
500000 lines of 4218938 written (11%)
600000 lines of 4218938 written (14%)
700000 lines of 4218938 written (16%)
800000 lines of 4218938 written (18%)
900000 lines of 4218938 written (21%)
1000000 lines of 4218938 written (23%)
1100000 lines of 4218938 written (26%)
1200000 lines of 4218938 written (28%)
1300000 lines of 4218938 written (30%)
1400000 lines of 4218938 written (33%)
1500000 lines of 4218938 written (35%)
1600000 lines of 4218938 written (37%)
1700000 lines of 4218938 written (40%)
1800000 lines of 4218938 written (42%)
1900000 lines of 4218938 written (45%)
2000000 lines of 4218938 written (47%)
2100000 lines of 4218938 written (49%)
2200000 lines of 4218938 written (52%)
2300000 lines of 4218938 written (54%)
2400000 lines of 4218938 written (56%)
2500000 lines of 4218938 written (59%)
2600000 lines of 4218938 written (61%)
2700000 lines of 4218938 written (63%)
2800000 lines of 4218938 written (66%)
2900000 lines of 4218938 written (68%)
3000000 lines of 4218938 written (71%)
3100000 lines of 4218938 written (73%)
3200000 lines of 4218938 written (75%)
3300000 lines of 4218938 written (78%)
3400000 lines of 4218938 written (80%)
3500000 lines of 4218938 written (82%)
3600000 lines of 4218938 written (85%)
3700000 lines of 4218938 written (87%)
3800000 lines of 4218938 written (90%)
3900000 lines of 4218938 written (92%)
4000000 lines of 4218938 written (94%)
4100000 lines of 4218938 written (97%)
4200000 lines of 4218938 written (99%)
CPU times: user 8min 48s, sys: 2.11 s, total: 8min 50s
Wall time: 8min 56s

In [ ]:

test set


In [16]:
# Columns of the test table, read per row when building test.fm.
names = X_t.colnames

In [17]:
# Row count of the test table, used for the progress percentage.
lines = X_t.shape[0]

In [18]:
%%time
i = 0
with open('test.fm', 'wb') as fw:
    for row, line_gbdt in izip(X_t.iterrows(), open('test.gbdt.out')):
                
        out = "0"
        featsX = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                featsX.append((name, '{0}_{1}'.format(name, value)))
                
        featsgbdt = [('gbdt{0}'.format(j), 'gbdt{0}_{1}'.format(j, feat)) for j, feat in enumerate(line_gbdt.strip().split()[1:13], start=1)]
        feats = featsX + featsgbdt
        
        feats = gen_hashed_fm_feats(feats, bins)
        
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        
        
        i+=1
        
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 5min 53s, sys: 1.64 s, total: 5min 55s
Wall time: 6min 8s

partial one hot encoding


In [12]:
# All test columns except 'device_ip'.
# Build a new list rather than calling .remove() on X_t.colnames: that would
# mutate the table's own column list through the alias, and would raise
# ValueError when the cell is run a second time.
names = [name for name in X_t.colnames if name != 'device_ip']

In [13]:
# Row count of the test table, used for the progress percentage.
lines = X_t.shape[0]

In [14]:
%%time
i = 0
with open('test.fm', 'wb') as fw:
    for row in X_t.iterrows():
        out = "0"
        feats = []
        for name in names:
            value = str(row[name])
            if name in cnames:
                out += " {0}".format(indexdic[name][value])
            else:
                feats.append((name, '{0}_{1}'.format(name, value)))
        
        
        feats = gen_hashed_fm_feats(feats, bins)
        
        out += ' ' + ' '.join(feats) + '\n'
        fw.write(out)
        
        i+=1
            
        if (i % 100000) == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 3min 13s, sys: 896 ms, total: 3min 14s
Wall time: 3min 19s

one hot encoding


In [6]:
# Columns written by the one-hot FM cells below, in output order.
names = [
    'day', 'hour', 'banner_pos',
    'site_id', 'site_domain', 'site_category',
    'app_id', 'app_domain', 'app_category',
    'device_id', 'device_ip', 'device_model', 'device_type', 'device_conn_type',
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

training set version 1


In [7]:
# Row count of the full training table, used for the progress percentage.
lines = X.shape[0]

In [6]:
# Load the index dictionary for "version 1" of the one-hot training set.
# It is read as indexdic[column_name][str(value)], with an optional 'other'
# entry per column used for values that have no index of their own.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# `with` guarantees the file is closed even if unpickling raises.
with open('indexdicless.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [9]:
%%time
# Write 'train.fm' with one index per column: '<label> <idx> <idx> ...'.
# A value without its own entry falls back to the column's 'other' entry.
with open('train.fm', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), start=1):
        indexlist = []
        for name in names:
            value = str(row[name])
            column_index = indexdic[name]
            if value in column_index:
                key = value
            elif 'other' in column_index:
                key = 'other'
            else:
                # NOTE(review): the row is still written below, truncated at
                # this column -- confirm that this is intended.
                print 'error'
                break
            indexlist.append(column_index[key])

        rendered = " ".join("{0}".format(index) for index in indexlist)
        fw.write("{0}".format(target[0]) + " " + rendered + "\n")

        if i % 1000000 == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 22min 5s, sys: 2min 41s, total: 24min 46s
Wall time: 25min 58s

training set version 2


In [8]:
# Load the index dictionary for "version 2" of the one-hot training set.
# It is read as indexdic[column_name][str(value)], with an optional 'other'
# entry per column used for values that have no index of their own.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# `with` guarantees the file is closed even if unpickling raises.
with open('indexdicless2.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [9]:
%%time
# Write 'train1.fm' with one index per column: '<label> <idx> <idx> ...'.
# A value without its own entry falls back to the column's 'other' entry.
with open('train1.fm', 'wb') as fw:
    for i, (row, target) in enumerate(izip(X.iterrows(), y.iterrows()), start=1):
        indexlist = []
        for name in names:
            value = str(row[name])
            column_index = indexdic[name]
            if value in column_index:
                key = value
            elif 'other' in column_index:
                key = 'other'
            else:
                # NOTE(review): the row is still written below, truncated at
                # this column -- confirm that this is intended.
                print 'error'
                break
            indexlist.append(column_index[key])

        rendered = " ".join("{0}".format(index) for index in indexlist)
        fw.write("{0}".format(target[0]) + " " + rendered + "\n")

        if i % 1000000 == 0:
            print "{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines)


0 3 8 32 218 6328 7299 7916 12003 12297 52625 677713 940499 944257 944262 944267 946566 946715 946728 947120 947164 947186 947403 947438

0 3 8 32 218 6328 7299 7916 12003 12297 52625 558552 943178 944257 944261 944267 946564 946715 946728 947120 947164 947186 947298 947438

0 3 8 32 218 6328 7299 7916 12003 12297 52625 558552 942192 944257 944261 944267 946564 946715 946728 947120 947164 947186 947298 947438

0 3 8 32 218 6328 7299 7916 12003 12297 52625 197475 942974 944257 944261 944267 946566 946715 946728 947120 947164 947186 947298 947438

0 3 8 33 786 6745 7301 7916 12003 12297 52625 555671 938747 944257 944261 944267 944592 946715 946728 946742 947164 947186 947403 947414

0 3 8 32 2469 4133 7320 7916 12003 12297 52625 558552 942192 944257 944261 944267 944305 946715 946728 947139 947164 947175 947292 947458

0 3 8 32 714 5427 7320 7916 12003 12297 52625 528558 942021 944257 944261 944267 944803 946715 946728 946786 947164 947212 947403 947414

0 3 8 33 2644 5035 7320 7916 12003 12297 52625 558552 941682 944257 944261 944267 944877 946715 946728 946796 947167 947212 947403 947412

1 3 8 32 218 6328 7299 7916 12003 12297 52625 558552 940884 944257 944262 944267 946567 946715 946728 947120 947164 947186 947403 947438

0 3 8 32 1373 5923 7316 7916 12003 12297 30103 558552 939075 944256 944261 944266 945129 946715 946728 946847 947167 947179 947377 947412

0 3 8 33 2644 5035 7320 7916 12003 12297 52625 558552 943288 944257 944261 944267 944382 946715 946728 947154 947166 947212 947246 947417

0 3 8 32 218 6328 7299 7916 12003 12297 52625 915249 939376 944257 944261 944267 946561 946715 946728 947120 947164 947186 947403 947438

CPU times: user 10.4 ms, sys: 0 ns, total: 10.4 ms
Wall time: 107 ms

training set version 3


In [8]:
# Load the feature-index dictionary for training set version 3.
# It is used below as indexdic[column_name][str(value)] -> feature index.
# The context manager closes the file even if unpickling raises.
# NOTE: only unpickle files produced by this project; pickle can run arbitrary code.
with open('indexdicless3.pkl', 'rb') as f:
    indexdic = cPickle.load(f)

In [9]:
%%time
# Encode every training row as one line of 'train.fm': the target followed by
# the space-separated feature indices found in indexdic (one per column in names).
i = 0        # rows written so far
skipped = 0  # rows dropped because some value had no index
with open('train.fm', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        indexlist = []
        for name in names:
            mapping = indexdic[name]
            value = str(row[name])  # indexdic is keyed by the string form of the value
            if value in mapping:
                indexlist.append(mapping[value])
            elif 'other' in mapping:
                # Value not indexed: use the column's shared 'other' entry.
                indexlist.append(mapping['other'])
            else:
                # Neither the value nor 'other' is indexed. A row with a missing
                # feature would be silently malformed, so drop the whole row.
                print("error: no index for {0}={1!r}".format(name, value))
                skipped += 1
                break
        else:
            # Runs only when the inner loop did not break, i.e. every column was mapped.
            out = "{0}".format(target[0])
            out += " " + " ".join(["{0}".format(index) for index in indexlist]) + "\n"
            fw.write(out)

            i += 1
            if (i % 1000000) == 0:
                print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))

if skipped:
    print("{0} rows skipped because of unmapped values".format(skipped))


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 19min 54s, sys: 9.43 s, total: 20min 3s
Wall time: 21min 39s

test set version 1


In [10]:
# Row count of the test table; the test-set writers below use it as the total
# in their progress messages.
lines = X_t.shape[0]

In [9]:
%%time
# Encode the test table into 'test.fm', one line per row: a constant "0" in the
# label position followed by one feature index per column in names.
fallback_index = 617958  # emitted when a column has neither the value nor an 'other' entry
# NOTE(review): presumably an index reserved for unseen values in the matching
# indexdic version -- confirm it agrees with the dictionary currently loaded.
written = 0
with open('test.fm', 'wb') as fw:
    for row in X_t.iterrows():
        tokens = []
        for name in names:
            column_map = indexdic[name]
            key = str(row[name])
            if key in column_map:
                chosen = column_map[key]
            elif 'other' in column_map:
                chosen = column_map['other']
            else:
                chosen = fallback_index
            tokens.append("{0}".format(chosen))

        fw.write("0" + " " + " ".join(tokens) + "\n")

        written += 1
        if written % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(written, lines, 100*written/lines))


1000000 lines of 40428967 written (2%)
2000000 lines of 40428967 written (4%)
3000000 lines of 40428967 written (7%)
4000000 lines of 40428967 written (9%)
5000000 lines of 40428967 written (12%)
6000000 lines of 40428967 written (14%)
7000000 lines of 40428967 written (17%)
8000000 lines of 40428967 written (19%)
9000000 lines of 40428967 written (22%)
10000000 lines of 40428967 written (24%)
11000000 lines of 40428967 written (27%)
12000000 lines of 40428967 written (29%)
13000000 lines of 40428967 written (32%)
14000000 lines of 40428967 written (34%)
15000000 lines of 40428967 written (37%)
16000000 lines of 40428967 written (39%)
17000000 lines of 40428967 written (42%)
18000000 lines of 40428967 written (44%)
19000000 lines of 40428967 written (46%)
20000000 lines of 40428967 written (49%)
21000000 lines of 40428967 written (51%)
22000000 lines of 40428967 written (54%)
23000000 lines of 40428967 written (56%)
24000000 lines of 40428967 written (59%)
25000000 lines of 40428967 written (61%)
26000000 lines of 40428967 written (64%)
27000000 lines of 40428967 written (66%)
28000000 lines of 40428967 written (69%)
29000000 lines of 40428967 written (71%)
30000000 lines of 40428967 written (74%)
31000000 lines of 40428967 written (76%)
32000000 lines of 40428967 written (79%)
33000000 lines of 40428967 written (81%)
34000000 lines of 40428967 written (84%)
35000000 lines of 40428967 written (86%)
36000000 lines of 40428967 written (89%)
37000000 lines of 40428967 written (91%)
38000000 lines of 40428967 written (93%)
39000000 lines of 40428967 written (96%)
40000000 lines of 40428967 written (98%)
CPU times: user 22min 5s, sys: 2min 41s, total: 24min 46s
Wall time: 25min 58s

test set version 2


In [11]:
%%time
# Encode the test table into 'test1.fm', one line per row: a constant "0" in the
# label position followed by one feature index per column in names.
def _index_for(column, raw_value):
    """Return the index for raw_value in column, else 'other', else 947464."""
    table = indexdic[column]
    text = str(raw_value)
    if text in table:
        return table[text]
    if 'other' in table:
        return table['other']
    # NOTE(review): presumably an index reserved for unseen values in the
    # version-2 dictionary -- confirm.
    return 947464

count = 0
with open('test1.fm', 'wb') as fw:
    for row in X_t.iterrows():
        pieces = ["{0}".format(_index_for(name, row[name])) for name in names]
        fw.write("0" + " " + " ".join(pieces) + "\n")

        count += 1
        if count % 100000 == 0:
            print("{0} lines of {1} written ({2}%)".format(count, lines, 100*count/lines))


0 6 8 32 3115 4798 7320 7916 12003 12297 52625 398516 939205 944257 944261 944267 944278 946715 946728 947074 947167 947192 947290 947412

0 6 8 32 218 6328 7299 7916 12003 12297 52625 387280 941699 944257 944261 944267 945768 946715 946728 946953 947164 947186 947297 947426

0 6 8 32 218 6328 7299 7916 12003 12297 52625 879845 939376 944257 944261 944267 945768 946715 946728 946953 947164 947186 947297 947426

0 6 8 32 389 5923 7316 8796 12071 12293 52625 492243 938691 944257 944261 944267 944555 946715 946728 947091 947167 947224 947349 947429

0 6 8 32 389 5923 7316 10575 12107 12295 52625 558552 939376 944257 944261 944267 946166 946715 946728 946996 947164 947169 947403 947422

0 6 8 33 2437 6774 7320 7916 12003 12297 52625 626280 940109 944257 944261 944267 946004 946715 946728 946956 947166 947212 947403 947416

0 6 8 32 218 6328 7299 7916 12003 12297 52625 558552 943018 944257 944261 944267 945874 946715 946728 946976 947166 947212 947342 947412

0 6 8 32 389 5923 7316 9009 12107 12280 60652 558552 941722 944257 944263 944267 946212 946717 946731 947004 947167 947231 947276 947449

0 6 8 32 120 6695 7322 7916 12003 12297 52625 499492 940437 944257 944261 944267 946381 946715 946728 947032 947167 947212 947403 947412

0 6 8 32 218 6328 7299 7916 12003 12297 52625 881719 942192 944257 944261 944267 946559 946715 946728 947120 947164 947186 947403 947438

0 6 8 32 218 6328 7299 7916 12003 12297 52625 703329 942623 944257 944262 944267 945507 946715 946728 946889 947164 947175 947298 947422

0 6 8 32 389 5923 7316 8059 12249 12293 52625 266054 938834 944257 944261 944267 946540 946715 946728 947055 947164 947222 947359 947459

CPU times: user 0 ns, sys: 3.92 ms, total: 3.92 ms
Wall time: 121 ms

test set version 3


In [11]:
%%time
# Encode the test table into 'test.fm', one line per row: a constant "0" in the
# label position followed by one feature index per column in names.
# NOTE(review): this writes the same file name as the "test set version 1" cell,
# so whichever cell runs last wins -- confirm that is intended.
unseen_index = 636705  # emitted when a column has neither the value nor an 'other' entry
n_done = 0
with open('test.fm', 'wb') as fw:
    for row in X_t.iterrows():
        encoded = []
        for name in names:
            lookup = indexdic[name]
            as_text = str(row[name])
            # Prefer the exact value, then the column's 'other' entry, then the fallback.
            hit = lookup[as_text] if as_text in lookup else (
                lookup['other'] if 'other' in lookup else unseen_index)
            encoded.append("{0}".format(hit))

        line = "0" + " " + " ".join(encoded) + "\n"
        fw.write(line)

        n_done += 1
        if (n_done % 100000) == 0:
            print("{0} lines of {1} written ({2}%)".format(n_done, lines, 100*n_done/lines))


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 1min 59s, sys: 892 ms, total: 2min
Wall time: 2min 10s

xgboost

click rate data for 9 days


In [10]:
# Load the per-column lookup used by the xgboost writers below as
# clickratedic[column_name][value].
# The context manager closes the file even if unpickling raises.
# NOTE: only unpickle files produced by this project; pickle can run arbitrary code.
with open('clickratedic.pkl', 'rb') as f:
    clickratedic = cPickle.load(f)

train 9 days


In [11]:
# Fallback for feature values that are missing from clickratedic: the writers
# below emit math.exp(glh) for them.
# NOTE(review): presumably an overall click-rate statistic of the 9-day data -- confirm.
glh = 0.084933389586625302

In [12]:
# Use the X_train table's column names as the feature list and its row count
# as the total for the progress messages of the writer below.
names = X_train.colnames
lines = X_train.shape[0]

In [13]:
%%time
# Write X_train / y_train to 'train9days.xgb' in libSVM-style text: the target,
# then one "position:value" pair per column, where position is the 1-based
# column position in names and value is exp() of the column's clickratedic entry.
default_value = math.exp(glh)  # loop-invariant: value used for entries missing from clickratedic
i = 0
with open('train9days.xgb', 'wb') as fw:
    for row, target in izip(X_train.iterrows(), y_train.iterrows()):
        feats = []
        for j, name in enumerate(names, start=1):
            rates = clickratedic[name]
            key = row[name]  # read the cell once instead of up to three times
            feat_value = math.exp(rates[key]) if key in rates else default_value
            feats.append("{0}:{1}".format(j, feat_value))

        fw.write(str(target[0]) + ' ' + ' '.join(feats) + '\n')

        i += 1
        if (i % 1000000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))


1000000 lines of 36210029 written (2%)
2000000 lines of 36210029 written (5%)
3000000 lines of 36210029 written (8%)
4000000 lines of 36210029 written (11%)
5000000 lines of 36210029 written (13%)
6000000 lines of 36210029 written (16%)
7000000 lines of 36210029 written (19%)
8000000 lines of 36210029 written (22%)
9000000 lines of 36210029 written (24%)
10000000 lines of 36210029 written (27%)
11000000 lines of 36210029 written (30%)
12000000 lines of 36210029 written (33%)
13000000 lines of 36210029 written (35%)
14000000 lines of 36210029 written (38%)
15000000 lines of 36210029 written (41%)
16000000 lines of 36210029 written (44%)
17000000 lines of 36210029 written (46%)
18000000 lines of 36210029 written (49%)
19000000 lines of 36210029 written (52%)
20000000 lines of 36210029 written (55%)
21000000 lines of 36210029 written (57%)
22000000 lines of 36210029 written (60%)
23000000 lines of 36210029 written (63%)
24000000 lines of 36210029 written (66%)
25000000 lines of 36210029 written (69%)
26000000 lines of 36210029 written (71%)
27000000 lines of 36210029 written (74%)
28000000 lines of 36210029 written (77%)
29000000 lines of 36210029 written (80%)
30000000 lines of 36210029 written (82%)
31000000 lines of 36210029 written (85%)
32000000 lines of 36210029 written (88%)
33000000 lines of 36210029 written (91%)
34000000 lines of 36210029 written (93%)
35000000 lines of 36210029 written (96%)
36000000 lines of 36210029 written (99%)
CPU times: user 26min 7s, sys: 17.1 s, total: 26min 24s
Wall time: 29min 42s

train 10th day


In [14]:
# Use the X_valid table's column names as the feature list and its row count
# as the total for the progress messages of the writer below.
names = X_valid.colnames
lines = X_valid.shape[0]

In [15]:
%%time
# Write X_valid / y_valid to 'train10thday.xgb' in libSVM-style text: the target,
# then one "position:value" pair per column, where position is the 1-based
# column position in names and value is exp() of the column's clickratedic entry.
default_value = math.exp(glh)  # loop-invariant: value used for entries missing from clickratedic
i = 0
with open('train10thday.xgb', 'wb') as fw:
    for row, target in izip(X_valid.iterrows(), y_valid.iterrows()):
        feats = []
        for j, name in enumerate(names, start=1):
            rates = clickratedic[name]
            key = row[name]  # read the cell once instead of up to three times
            feat_value = math.exp(rates[key]) if key in rates else default_value
            feats.append("{0}:{1}".format(j, feat_value))

        fw.write(str(target[0]) + ' ' + ' '.join(feats) + '\n')

        i += 1
        if (i % 100000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))


100000 lines of 4218938 written (2%)
200000 lines of 4218938 written (4%)
300000 lines of 4218938 written (7%)
400000 lines of 4218938 written (9%)
500000 lines of 4218938 written (11%)
600000 lines of 4218938 written (14%)
700000 lines of 4218938 written (16%)
800000 lines of 4218938 written (18%)
900000 lines of 4218938 written (21%)
1000000 lines of 4218938 written (23%)
1100000 lines of 4218938 written (26%)
1200000 lines of 4218938 written (28%)
1300000 lines of 4218938 written (30%)
1400000 lines of 4218938 written (33%)
1500000 lines of 4218938 written (35%)
1600000 lines of 4218938 written (37%)
1700000 lines of 4218938 written (40%)
1800000 lines of 4218938 written (42%)
1900000 lines of 4218938 written (45%)
2000000 lines of 4218938 written (47%)
2100000 lines of 4218938 written (49%)
2200000 lines of 4218938 written (52%)
2300000 lines of 4218938 written (54%)
2400000 lines of 4218938 written (56%)
2500000 lines of 4218938 written (59%)
2600000 lines of 4218938 written (61%)
2700000 lines of 4218938 written (63%)
2800000 lines of 4218938 written (66%)
2900000 lines of 4218938 written (68%)
3000000 lines of 4218938 written (71%)
3100000 lines of 4218938 written (73%)
3200000 lines of 4218938 written (75%)
3300000 lines of 4218938 written (78%)
3400000 lines of 4218938 written (80%)
3500000 lines of 4218938 written (82%)
3600000 lines of 4218938 written (85%)
3700000 lines of 4218938 written (87%)
3800000 lines of 4218938 written (90%)
3900000 lines of 4218938 written (92%)
4000000 lines of 4218938 written (94%)
4100000 lines of 4218938 written (97%)
4200000 lines of 4218938 written (99%)
CPU times: user 2min 55s, sys: 1.85 s, total: 2min 57s
Wall time: 3min 17s

click rate data for 10 days


In [8]:
# Replace clickratedic with the lookup stored in 'clickratedicall.pkl'; the
# writers below use it as clickratedic[column_name][value].
# The context manager closes the file even if unpickling raises.
# NOTE: only unpickle files produced by this project; pickle can run arbitrary code.
with open('clickratedicall.pkl', 'rb') as f:
    clickratedic = cPickle.load(f)

training data


In [ ]:
# Fallback for feature values that are missing from clickratedic: the writers
# below emit math.exp(glh) for them.
# NOTE(review): presumably an overall click-rate statistic of the full 10-day data -- confirm.
glh = 0.084902812382023019

In [ ]:
# Use the X table's column names as the feature list and its row count as the
# total for the progress messages of the writer below.
names = X.colnames
lines = X.shape[0]

In [ ]:
%%time
# Write X / y to 'train.xgb' in libSVM-style text: the target, then one
# "position:value" pair per column, where position is the 1-based column
# position in names and value is exp() of the column's clickratedic entry.
default_value = math.exp(glh)  # loop-invariant: value used for entries missing from clickratedic
i = 0
with open('train.xgb', 'wb') as fw:
    for row, target in izip(X.iterrows(), y.iterrows()):
        feats = []
        for j, name in enumerate(names, start=1):
            rates = clickratedic[name]
            key = row[name]  # read the cell once instead of up to three times
            feat_value = math.exp(rates[key]) if key in rates else default_value
            feats.append("{0}:{1}".format(j, feat_value))

        fw.write(str(target[0]) + ' ' + ' '.join(feats) + '\n')

        i += 1
        if (i % 1000000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))

test data


In [16]:
# Use the X_t table's column names as the feature list and its row count as
# the total for the progress messages of the writer below.
names = X_t.colnames
lines = X_t.shape[0]

In [18]:
%%time
# Write X_t to 'test.xgb' in libSVM-style text: a constant "0" in the label
# position, then one "position:value" pair per column, where position is the
# 1-based column position in names and value is exp() of the column's
# clickratedic entry.
default_value = math.exp(glh)  # loop-invariant: value used for entries missing from clickratedic
i = 0
with open('test.xgb', 'wb') as fw:
    for row in X_t.iterrows():
        feats = []
        for j, name in enumerate(names, start=1):
            rates = clickratedic[name]
            key = row[name]  # read the cell once instead of up to three times
            feat_value = math.exp(rates[key]) if key in rates else default_value
            feats.append("{0}:{1}".format(j, feat_value))

        fw.write('0 ' + ' '.join(feats) + '\n')

        i += 1
        if (i % 100000) == 0:
            print("{0} lines of {1} written ({2}%)".format(i, lines, 100*i/lines))


100000 lines of 4577464 written (2%)
200000 lines of 4577464 written (4%)
300000 lines of 4577464 written (6%)
400000 lines of 4577464 written (8%)
500000 lines of 4577464 written (10%)
600000 lines of 4577464 written (13%)
700000 lines of 4577464 written (15%)
800000 lines of 4577464 written (17%)
900000 lines of 4577464 written (19%)
1000000 lines of 4577464 written (21%)
1100000 lines of 4577464 written (24%)
1200000 lines of 4577464 written (26%)
1300000 lines of 4577464 written (28%)
1400000 lines of 4577464 written (30%)
1500000 lines of 4577464 written (32%)
1600000 lines of 4577464 written (34%)
1700000 lines of 4577464 written (37%)
1800000 lines of 4577464 written (39%)
1900000 lines of 4577464 written (41%)
2000000 lines of 4577464 written (43%)
2100000 lines of 4577464 written (45%)
2200000 lines of 4577464 written (48%)
2300000 lines of 4577464 written (50%)
2400000 lines of 4577464 written (52%)
2500000 lines of 4577464 written (54%)
2600000 lines of 4577464 written (56%)
2700000 lines of 4577464 written (58%)
2800000 lines of 4577464 written (61%)
2900000 lines of 4577464 written (63%)
3000000 lines of 4577464 written (65%)
3100000 lines of 4577464 written (67%)
3200000 lines of 4577464 written (69%)
3300000 lines of 4577464 written (72%)
3400000 lines of 4577464 written (74%)
3500000 lines of 4577464 written (76%)
3600000 lines of 4577464 written (78%)
3700000 lines of 4577464 written (80%)
3800000 lines of 4577464 written (83%)
3900000 lines of 4577464 written (85%)
4000000 lines of 4577464 written (87%)
4100000 lines of 4577464 written (89%)
4200000 lines of 4577464 written (91%)
4300000 lines of 4577464 written (93%)
4400000 lines of 4577464 written (96%)
4500000 lines of 4577464 written (98%)
CPU times: user 2min 32s, sys: 2.02 s, total: 2min 34s
Wall time: 2min 52s

In [ ]: