In [1]:
import tables
import csv
import time
import numpy as np

In [2]:
class XDataTrain(tables.IsDescription):
    day = tables.UInt8Col(pos=0)
    hour = tables.UInt8Col(pos=1)
    C1 = tables.UInt16Col(pos=2)
    banner_pos = tables.UInt8Col(pos=3)
    site_id = tables.StringCol(itemsize=8, pos=4) 
    site_domain = tables.StringCol(itemsize=8, pos=5)
    site_category = tables.StringCol(itemsize=8, pos=6)
    app_id = tables.StringCol(itemsize=8, pos=7)
    app_domain = tables.StringCol(itemsize=8, pos=8)
    app_category = tables.StringCol(itemsize=8, pos=9)
    device_id = tables.StringCol(itemsize=8, pos=10)
    device_ip = tables.StringCol(itemsize=8, pos=11)
    device_model = tables.StringCol(itemsize=8, pos=12)
    device_type = tables.UInt16Col(pos=13)
    device_conn_type = tables.UInt8Col(pos=14)
    C14 = tables.UInt16Col(pos=15)
    C15 = tables.UInt16Col(pos=16)
    C16 = tables.UInt16Col(pos=17)
    C17 = tables.UInt16Col(pos=18)
    C18 = tables.UInt8Col(pos=19)
    C19 = tables.UInt16Col(pos=20)
    C20 = tables.Int32Col(pos=21)
    C21 = tables.UInt8Col(pos=22)

In [3]:
class XDataTest(tables.IsDescription):
    day = tables.UInt8Col(pos=0)
    hour = tables.UInt8Col(pos=1)
    C1 = tables.UInt16Col(pos=2)
    banner_pos = tables.UInt8Col(pos=3)
    site_id = tables.StringCol(itemsize=8, pos=4) 
    site_domain = tables.StringCol(itemsize=8, pos=5)
    site_category = tables.StringCol(itemsize=8, pos=6)
    app_id = tables.StringCol(itemsize=8, pos=7)
    app_domain = tables.StringCol(itemsize=8, pos=8)
    app_category = tables.StringCol(itemsize=8, pos=9)
    device_id = tables.StringCol(itemsize=8, pos=10)
    device_ip = tables.StringCol(itemsize=8, pos=11)
    device_model = tables.StringCol(itemsize=8, pos=12)
    device_type = tables.UInt16Col(pos=13)
    device_conn_type = tables.UInt8Col(pos=14)
    C14 = tables.UInt16Col(pos=15)
    C15 = tables.UInt16Col(pos=16)
    C16 = tables.UInt16Col(pos=17)
    C17 = tables.UInt16Col(pos=18)
    C18 = tables.UInt8Col(pos=19)
    C19 = tables.UInt16Col(pos=20)
    C20 = tables.Int32Col(pos=21)
    C21 = tables.UInt8Col(pos=22)

In [4]:
file_handler = tables.open_file("click_data.h5", mode = "w", title = "Click data")

add training and test set


In [5]:
root = file_handler.root

In [6]:
train_group = file_handler.create_group(root, "train")
test_group = file_handler.create_group(root, "test")

In [7]:
train_raw_group = file_handler.create_group(train_group, "train_raw")
test_raw_group = file_handler.create_group(test_group, "test_raw")

In [8]:
filters = tables.Filters(complib='blosc', complevel=5)
y_atom = tables.UInt8Atom()
y_shape = (0,1)
y_array = file_handler.createEArray(train_raw_group, 'y', atom=y_atom, shape=y_shape, 
                                    title="targets", filters=filters, expectedrows=40428967)

In [9]:
train_table_X = file_handler.create_table(train_raw_group, "X", XDataTrain, expectedrows=40428967)
test_table_X = file_handler.create_table(test_raw_group, "X_t", XDataTest, expectedrows=4577464)

In [10]:
file_handler


Out[10]:
File(filename=click_data.h5, title='Click data', mode='w', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) 'Click data'
/test (Group) u''
/train (Group) u''
/train/train_raw (Group) u''
/train/train_raw/X (Table(0,)) ''
  description := {
  "day": UInt8Col(shape=(), dflt=0, pos=0),
  "hour": UInt8Col(shape=(), dflt=0, pos=1),
  "C1": UInt16Col(shape=(), dflt=0, pos=2),
  "banner_pos": UInt8Col(shape=(), dflt=0, pos=3),
  "site_id": StringCol(itemsize=8, shape=(), dflt='', pos=4),
  "site_domain": StringCol(itemsize=8, shape=(), dflt='', pos=5),
  "site_category": StringCol(itemsize=8, shape=(), dflt='', pos=6),
  "app_id": StringCol(itemsize=8, shape=(), dflt='', pos=7),
  "app_domain": StringCol(itemsize=8, shape=(), dflt='', pos=8),
  "app_category": StringCol(itemsize=8, shape=(), dflt='', pos=9),
  "device_id": StringCol(itemsize=8, shape=(), dflt='', pos=10),
  "device_ip": StringCol(itemsize=8, shape=(), dflt='', pos=11),
  "device_model": StringCol(itemsize=8, shape=(), dflt='', pos=12),
  "device_type": UInt16Col(shape=(), dflt=0, pos=13),
  "device_conn_type": UInt8Col(shape=(), dflt=0, pos=14),
  "C14": UInt16Col(shape=(), dflt=0, pos=15),
  "C15": UInt16Col(shape=(), dflt=0, pos=16),
  "C16": UInt16Col(shape=(), dflt=0, pos=17),
  "C17": UInt16Col(shape=(), dflt=0, pos=18),
  "C18": UInt8Col(shape=(), dflt=0, pos=19),
  "C19": UInt16Col(shape=(), dflt=0, pos=20),
  "C20": Int32Col(shape=(), dflt=0, pos=21),
  "C21": UInt8Col(shape=(), dflt=0, pos=22)}
  byteorder := 'little'
  chunkshape := (5461,)
/train/train_raw/y (EArray(0, 1), shuffle, blosc(5)) 'targets'
  atom := UInt8Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'irrelevant'
  chunkshape := (131072, 1)
/test/test_raw (Group) u''
/test/test_raw/X_t (Table(0,)) ''
  description := {
  "day": UInt8Col(shape=(), dflt=0, pos=0),
  "hour": UInt8Col(shape=(), dflt=0, pos=1),
  "C1": UInt16Col(shape=(), dflt=0, pos=2),
  "banner_pos": UInt8Col(shape=(), dflt=0, pos=3),
  "site_id": StringCol(itemsize=8, shape=(), dflt='', pos=4),
  "site_domain": StringCol(itemsize=8, shape=(), dflt='', pos=5),
  "site_category": StringCol(itemsize=8, shape=(), dflt='', pos=6),
  "app_id": StringCol(itemsize=8, shape=(), dflt='', pos=7),
  "app_domain": StringCol(itemsize=8, shape=(), dflt='', pos=8),
  "app_category": StringCol(itemsize=8, shape=(), dflt='', pos=9),
  "device_id": StringCol(itemsize=8, shape=(), dflt='', pos=10),
  "device_ip": StringCol(itemsize=8, shape=(), dflt='', pos=11),
  "device_model": StringCol(itemsize=8, shape=(), dflt='', pos=12),
  "device_type": UInt16Col(shape=(), dflt=0, pos=13),
  "device_conn_type": UInt8Col(shape=(), dflt=0, pos=14),
  "C14": UInt16Col(shape=(), dflt=0, pos=15),
  "C15": UInt16Col(shape=(), dflt=0, pos=16),
  "C16": UInt16Col(shape=(), dflt=0, pos=17),
  "C17": UInt16Col(shape=(), dflt=0, pos=18),
  "C18": UInt8Col(shape=(), dflt=0, pos=19),
  "C19": UInt16Col(shape=(), dflt=0, pos=20),
  "C20": Int32Col(shape=(), dflt=0, pos=21),
  "C21": UInt8Col(shape=(), dflt=0, pos=22)}
  byteorder := 'little'
  chunkshape := (2730,)

add training and validation set(from training set)

# modify hdf5 file_handler = tables.open_file("click_data.h5", mode = "r+")

In [4]:
train_raw_group = file_handler.root.train.train_raw
train_table_X_train = file_handler.create_table(train_raw_group, "X_train", XDataTrain, expectedrows=36210029)
train_table_X_valid = file_handler.create_table(train_raw_group, "X_valid", XDataTrain, expectedrows=4218938)

In [5]:
filters = tables.Filters(complib='blosc', complevel=5)
y_atom = tables.UInt8Atom()
y_shape = (0,1)
y_array_train = file_handler.createEArray(train_raw_group, 'y_train', atom=y_atom, shape=y_shape, 
                                    title="targets_train", filters=filters, expectedrows=36210029)
y_array_valid = file_handler.createEArray(train_raw_group, 'y_valid', atom=y_atom, shape=y_shape, 
                                    title="targets_valid", filters=filters, expectedrows=4218938)

read csv data write to hdf5

training set and test set


In [10]:
reader = csv.reader(open("train.csv"))
header = reader.next()

In [11]:
train_row = train_table_X.row

flag = 0
start = time.time()

for row in reader:
    train_row['day'] = (int(row[2][4:6])-19)%7
    train_row['hour'] = int(row[2][6:])
    train_row['C1'] = int(row[3])
    train_row['banner_pos'] = int(row[4])
    train_row['site_id'] = row[5]
    train_row['site_domain'] = row[6]
    train_row['site_category'] = row[7]
    train_row['app_id'] = row[8]
    train_row['app_domain'] = row[9]
    train_row['app_category'] = row[10]
    train_row['device_id'] = row[11]
    train_row['device_ip'] = row[12]
    train_row['device_model'] = row[13]
    train_row['device_type'] = int(row[14])
    train_row['device_conn_type'] = int(row[15])
    train_row['C14'] = int(row[16])
    train_row['C15'] = int(row[17])
    train_row['C16'] = int(row[18])
    train_row['C17'] = int(row[19])
    train_row['C18'] = int(row[20])
    train_row['C19'] = int(row[21])
    train_row['C20'] = int(row[22])
    train_row['C21'] = int(row[23])
    
    train_row.append()
    
    flag += 1
    
    if flag > 100:
        train_table_X.flush()
        flag = 0
        
train_table_X.flush()

end = time.time()
print end-start, 'second'


861.269048929 second

In [12]:
reader = csv.reader(open("train.csv"))
header = reader.next()

In [13]:
start = time.time()

for row in reader:
    y_array.append(np.array([[int(row[1])]]))
    
end = time.time()
print end-start, 'second'


1107.43256712 second

In [14]:
readertest = csv.reader(open("test.csv"))
headertest = readertest.next()

In [15]:
test_row = test_table_X.row
flag = 0
start = time.time()

for row in readertest:
    test_row['day'] = (int(row[1][4:6])-19)%7
    test_row['hour'] = int(row[1][6:])
    test_row['C1'] = int(row[2])
    test_row['banner_pos'] = int(row[3])
    test_row['site_id'] = row[4]
    test_row['site_domain'] = row[5]
    test_row['site_category'] = row[6]
    test_row['app_id'] = row[7]
    test_row['app_domain'] = row[8]
    test_row['app_category'] = row[9]
    test_row['device_id'] = row[10]
    test_row['device_ip'] = row[11]
    test_row['device_model'] = row[12]
    test_row['device_type'] = int(row[13])
    test_row['device_conn_type'] = int(row[14])
    test_row['C14'] = int(row[15])
    test_row['C15'] = int(row[16])
    test_row['C16'] = int(row[17])
    test_row['C17'] = int(row[18])
    test_row['C18'] = int(row[19])
    test_row['C19'] = int(row[20])
    test_row['C20'] = int(row[21])
    test_row['C21'] = int(row[22])
    
    test_row.append()
    
    flag += 1
    
    if flag > 100:
        test_table_X.flush()
        flag = 0
        
test_table_X.flush()
end = time.time()
print end-start, 'second'


106.725503922 second

training set to training and validation


In [6]:
reader = csv.reader(open("train.csv"))
header = reader.next()

In [7]:
flag = 0
i = 0
start = time.time()

for row in reader:
    if i < 36210029:
        train_row = train_table_X_train.row
        
    else:
        train_row = train_table_X_valid.row
        

    train_row['day'] = (int(row[2][4:6])-19)%7
    train_row['hour'] = int(row[2][6:])
    train_row['C1'] = int(row[3])
    train_row['banner_pos'] = int(row[4])
    train_row['site_id'] = row[5]
    train_row['site_domain'] = row[6]
    train_row['site_category'] = row[7]
    train_row['app_id'] = row[8]
    train_row['app_domain'] = row[9]
    train_row['app_category'] = row[10]
    train_row['device_id'] = row[11]
    train_row['device_ip'] = row[12]
    train_row['device_model'] = row[13]
    train_row['device_type'] = int(row[14])
    train_row['device_conn_type'] = int(row[15])
    train_row['C14'] = int(row[16])
    train_row['C15'] = int(row[17])
    train_row['C16'] = int(row[18])
    train_row['C17'] = int(row[19])
    train_row['C18'] = int(row[20])
    train_row['C19'] = int(row[21])
    train_row['C20'] = int(row[22])
    train_row['C21'] = int(row[23])
    
    train_row.append()
    
    flag += 1
    
    if flag > 100:
        if i < 36210029:
            train_table_X_train.flush()
        else:
            train_table_X_valid.flush()
        flag = 0
        
    i += 1 
    
train_table_X_train.flush()
train_table_X_valid.flush()

end = time.time()
print end-start, 'second'


666.416548967 second

In [14]:
reader = csv.reader(open("train.csv"))
header = reader.next()

In [15]:
i = 0
start = time.time()

for row in reader:
    if i < 36210029:
        y_array_train.append(np.array([[int(row[1])]]))
    else:
        y_array_valid.append(np.array([[int(row[1])]]))
    i += 1
end = time.time()
print end-start, 'second'


1039.38131189 second