In [1]:
import tables
import csv
import time
import numpy as np
In [2]:
class XDataTrain(tables.IsDescription):
day = tables.UInt8Col(pos=0)
hour = tables.UInt8Col(pos=1)
C1 = tables.UInt16Col(pos=2)
banner_pos = tables.UInt8Col(pos=3)
site_id = tables.StringCol(itemsize=8, pos=4)
site_domain = tables.StringCol(itemsize=8, pos=5)
site_category = tables.StringCol(itemsize=8, pos=6)
app_id = tables.StringCol(itemsize=8, pos=7)
app_domain = tables.StringCol(itemsize=8, pos=8)
app_category = tables.StringCol(itemsize=8, pos=9)
device_id = tables.StringCol(itemsize=8, pos=10)
device_ip = tables.StringCol(itemsize=8, pos=11)
device_model = tables.StringCol(itemsize=8, pos=12)
device_type = tables.UInt16Col(pos=13)
device_conn_type = tables.UInt8Col(pos=14)
C14 = tables.UInt16Col(pos=15)
C15 = tables.UInt16Col(pos=16)
C16 = tables.UInt16Col(pos=17)
C17 = tables.UInt16Col(pos=18)
C18 = tables.UInt8Col(pos=19)
C19 = tables.UInt16Col(pos=20)
C20 = tables.Int32Col(pos=21)
C21 = tables.UInt8Col(pos=22)
In [3]:
class XDataTest(tables.IsDescription):
day = tables.UInt8Col(pos=0)
hour = tables.UInt8Col(pos=1)
C1 = tables.UInt16Col(pos=2)
banner_pos = tables.UInt8Col(pos=3)
site_id = tables.StringCol(itemsize=8, pos=4)
site_domain = tables.StringCol(itemsize=8, pos=5)
site_category = tables.StringCol(itemsize=8, pos=6)
app_id = tables.StringCol(itemsize=8, pos=7)
app_domain = tables.StringCol(itemsize=8, pos=8)
app_category = tables.StringCol(itemsize=8, pos=9)
device_id = tables.StringCol(itemsize=8, pos=10)
device_ip = tables.StringCol(itemsize=8, pos=11)
device_model = tables.StringCol(itemsize=8, pos=12)
device_type = tables.UInt16Col(pos=13)
device_conn_type = tables.UInt8Col(pos=14)
C14 = tables.UInt16Col(pos=15)
C15 = tables.UInt16Col(pos=16)
C16 = tables.UInt16Col(pos=17)
C17 = tables.UInt16Col(pos=18)
C18 = tables.UInt8Col(pos=19)
C19 = tables.UInt16Col(pos=20)
C20 = tables.Int32Col(pos=21)
C21 = tables.UInt8Col(pos=22)
In [4]:
file_handler = tables.open_file("click_data.h5", mode = "w", title = "Click data")
add training and test set
In [5]:
root = file_handler.root
In [6]:
train_group = file_handler.create_group(root, "train")
test_group = file_handler.create_group(root, "test")
In [7]:
train_raw_group = file_handler.create_group(train_group, "train_raw")
test_raw_group = file_handler.create_group(test_group, "test_raw")
In [8]:
filters = tables.Filters(complib='blosc', complevel=5)
y_atom = tables.UInt8Atom()
y_shape = (0,1)
y_array = file_handler.createEArray(train_raw_group, 'y', atom=y_atom, shape=y_shape,
title="targets", filters=filters, expectedrows=40428967)
In [9]:
train_table_X = file_handler.create_table(train_raw_group, "X", XDataTrain, expectedrows=40428967)
test_table_X = file_handler.create_table(test_raw_group, "X_t", XDataTest, expectedrows=4577464)
In [10]:
file_handler
Out[10]:
add training and validation set(from training set)
In [4]:
train_raw_group = file_handler.root.train.train_raw
train_table_X_train = file_handler.create_table(train_raw_group, "X_train", XDataTrain, expectedrows=36210029)
train_table_X_valid = file_handler.create_table(train_raw_group, "X_valid", XDataTrain, expectedrows=4218938)
In [5]:
filters = tables.Filters(complib='blosc', complevel=5)
y_atom = tables.UInt8Atom()
y_shape = (0,1)
y_array_train = file_handler.createEArray(train_raw_group, 'y_train', atom=y_atom, shape=y_shape,
title="targets_train", filters=filters, expectedrows=36210029)
y_array_valid = file_handler.createEArray(train_raw_group, 'y_valid', atom=y_atom, shape=y_shape,
title="targets_valid", filters=filters, expectedrows=4218938)
training set and test set
In [10]:
reader = csv.reader(open("train.csv"))
header = reader.next()
In [11]:
train_row = train_table_X.row
flag = 0
start = time.time()
for row in reader:
train_row['day'] = (int(row[2][4:6])-19)%7
train_row['hour'] = int(row[2][6:])
train_row['C1'] = int(row[3])
train_row['banner_pos'] = int(row[4])
train_row['site_id'] = row[5]
train_row['site_domain'] = row[6]
train_row['site_category'] = row[7]
train_row['app_id'] = row[8]
train_row['app_domain'] = row[9]
train_row['app_category'] = row[10]
train_row['device_id'] = row[11]
train_row['device_ip'] = row[12]
train_row['device_model'] = row[13]
train_row['device_type'] = int(row[14])
train_row['device_conn_type'] = int(row[15])
train_row['C14'] = int(row[16])
train_row['C15'] = int(row[17])
train_row['C16'] = int(row[18])
train_row['C17'] = int(row[19])
train_row['C18'] = int(row[20])
train_row['C19'] = int(row[21])
train_row['C20'] = int(row[22])
train_row['C21'] = int(row[23])
train_row.append()
flag += 1
if flag > 100:
train_table_X.flush()
flag = 0
train_table_X.flush()
end = time.time()
print end-start, 'second'
In [12]:
reader = csv.reader(open("train.csv"))
header = reader.next()
In [13]:
start = time.time()
for row in reader:
y_array.append(np.array([[int(row[1])]]))
end = time.time()
print end-start, 'second'
In [14]:
readertest = csv.reader(open("test.csv"))
headertest = readertest.next()
In [15]:
test_row = test_table_X.row
flag = 0
start = time.time()
for row in readertest:
test_row['day'] = (int(row[1][4:6])-19)%7
test_row['hour'] = int(row[1][6:])
test_row['C1'] = int(row[2])
test_row['banner_pos'] = int(row[3])
test_row['site_id'] = row[4]
test_row['site_domain'] = row[5]
test_row['site_category'] = row[6]
test_row['app_id'] = row[7]
test_row['app_domain'] = row[8]
test_row['app_category'] = row[9]
test_row['device_id'] = row[10]
test_row['device_ip'] = row[11]
test_row['device_model'] = row[12]
test_row['device_type'] = int(row[13])
test_row['device_conn_type'] = int(row[14])
test_row['C14'] = int(row[15])
test_row['C15'] = int(row[16])
test_row['C16'] = int(row[17])
test_row['C17'] = int(row[18])
test_row['C18'] = int(row[19])
test_row['C19'] = int(row[20])
test_row['C20'] = int(row[21])
test_row['C21'] = int(row[22])
test_row.append()
flag += 1
if flag > 100:
test_table_X.flush()
flag = 0
test_table_X.flush()
end = time.time()
print end-start, 'second'
training set to training and validation
In [6]:
reader = csv.reader(open("train.csv"))
header = reader.next()
In [7]:
flag = 0
i = 0
start = time.time()
for row in reader:
if i < 36210029:
train_row = train_table_X_train.row
else:
train_row = train_table_X_valid.row
train_row['day'] = (int(row[2][4:6])-19)%7
train_row['hour'] = int(row[2][6:])
train_row['C1'] = int(row[3])
train_row['banner_pos'] = int(row[4])
train_row['site_id'] = row[5]
train_row['site_domain'] = row[6]
train_row['site_category'] = row[7]
train_row['app_id'] = row[8]
train_row['app_domain'] = row[9]
train_row['app_category'] = row[10]
train_row['device_id'] = row[11]
train_row['device_ip'] = row[12]
train_row['device_model'] = row[13]
train_row['device_type'] = int(row[14])
train_row['device_conn_type'] = int(row[15])
train_row['C14'] = int(row[16])
train_row['C15'] = int(row[17])
train_row['C16'] = int(row[18])
train_row['C17'] = int(row[19])
train_row['C18'] = int(row[20])
train_row['C19'] = int(row[21])
train_row['C20'] = int(row[22])
train_row['C21'] = int(row[23])
train_row.append()
flag += 1
if flag > 100:
if i < 36210029:
train_table_X_train.flush()
else:
train_table_X_valid.flush()
flag = 0
i += 1
train_table_X_train.flush()
train_table_X_valid.flush()
end = time.time()
print end-start, 'second'
In [14]:
reader = csv.reader(open("train.csv"))
header = reader.next()
In [15]:
i = 0
start = time.time()
for row in reader:
if i < 36210029:
y_array_train.append(np.array([[int(row[1])]]))
else:
y_array_valid.append(np.array([[int(row[1])]]))
i += 1
end = time.time()
print end-start, 'second'