In [1]:
import tables
import time
import cPickle
import numpy as np
from scipy import sparse
import itertools
In [2]:
from nominal2onehot import nominal2onehot
In [3]:
# Open the HDF5 file read-only with PyTables; the cells below read nodes from this handle.
file_handler = tables.open_file("click_data.h5", mode = "r")
In [4]:
# Reference to the node X under /train/train_raw of the opened file.
X = file_handler.root.train.train_raw.X
In [5]:
# Reference to the node y under /train/train_raw of the opened file.
y = file_handler.root.train.train_raw.y
In [6]:
# Reference to the node X_t under /test/test_raw of the opened file.
X_t = file_handler.root.test.test_raw.X_t
10 days in training data
In [7]:
# Load `dayrows` from its pickle file. Later cells use dayrows[k] as the
# number of rows of day k+1 and sum(dayrows[:k]) as that day's first row.
# The file is opened in binary mode (pickle data is a byte stream) and the
# `with` block closes it even if unpickling fails.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)
In [8]:
# Show the object loaded from dayrows.pkl.
dayrows
Out[8]:
columns in my notebook: [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]
In [9]:
# colscate.pkl holds 25 consecutive pickles; read them in order into a list.
# The `with` block guarantees the file is closed even if one load fails.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('colscate.pkl', 'rb') as f:
    colcate = [cPickle.load(f) for i in range(25)]
columns in my notebook:
In [10]:
# Selected columns, expressed in the notebook's own column numbering.
columnindex = np.array([2,3,4,7,10,13,16,17,20,21,23,24,26])
columns in pyTables:
In [11]:
# Shift by 1 to get the positions of the same columns inside a PyTables row.
featureindex = columnindex-1
In [12]:
# Number of selected columns.
featureindex.shape[0]
Out[12]:
columns in colscate.pkl
In [13]:
# Shift by 2 to get the positions of the same columns inside the colcate list.
categoryindex = columnindex-2
Number of categories in each selected column
In [14]:
# Size of the category array (its first dimension) for every selected column.
catelen = [colcate[i].shape[0] for i in categoryindex]
In [15]:
# Show the per-column category counts.
catelen
Out[15]:
In [16]:
# Total of all category counts.
sum(catelen)
Out[16]:
In [17]:
# Length of one encoded row: the sum of the category counts of all selected columns.
rowlength = sum(catelen)
In [18]:
def one_row_transform(row):
    """Encode the selected columns of one table row into a single dense vector.

    For every selected column, `nominal2onehot` is called with the raw value
    `row[featureindex[k]]` and the category array `colcate[categoryindex[k]]`
    (presumably returning an indicator vector of length `catelen[k]`; verify
    against the `nominal2onehot` module). The results are written side by side
    into the output, in the order of `featureindex`.

    Parameters
    ----------
    row : indexable
        One record; it is indexed with the entries of `featureindex`.

    Returns
    -------
    numpy.ndarray
        float32 vector of length `rowlength`.
    """
    rvl = np.empty((rowlength,), dtype=np.float32)
    # Keep a running offset instead of re-summing catelen[:k] for every
    # feature; this function is called once per table row.
    offset = 0
    for feat, cat, width in zip(featureindex, categoryindex, catelen):
        rvl[offset:offset + width] = nominal2onehot(row[feat], colcate[cat])
        offset += width
    return rvl
the 1st day
st = time.time()
start = 0 end = dayrows[0]
val = one_row_transform(train_table_X[start]) val = sparse.lil_matrix(val, dtype=np.float32) for i in xrange(start+1, end): row = one_row_transform(train_table_X[i]) row = sparse.lil_matrix(row, dtype=np.float32) val = sparse.vstack([val,row])
en = time.time() print en-st, 'second'
In [32]:
val = sparse.lil_matrix((dayrows[0],rowlength), dtype=np.float32)
st = time.time()
for i, row_table in itertools.izip(xrange(dayrows[0]), X.iterrows(start=sum(dayrows[:0]))):
row = one_row_transform(row_table)
val[i,np.where(row==1)[0]] = 1
en = time.time()
print en-st, 'second'
In [35]:
val = sparse.lil_matrix((dayrows[1],rowlength), dtype=np.float32)
st = time.time()
for i, row_table in itertools.izip(xrange(dayrows[1]), X.iterrows(start=sum(dayrows[:1]))):
row = one_row_transform(row_table)
val[i,np.where(row==1)[0]] = 1
en = time.time()
print en-st, 'second'
The methods below are slow; see the comparison.
In [ ]:
start = sum(dayrows[:1])
end = sum(dayrows[:2])
val = sparse.lil_matrix((dayrows[1],rowlength), dtype=np.float32)
st = time.time()
row = one_row_transform(X[start])
val[i] = sparse.lil_matrix(row)
en = time.time()
print en-st, 'second'
In [23]:
start = sum(dayrows[:1])
end = sum(dayrows[:2])
val = sparse.lil_matrix((dayrows[1],rowlength), dtype=np.float32)
st = time.time()
row = one_row_transform(X[start])
val[i-start,np.where(row==1)[0]] = 1
en = time.time()
print en-st, 'second'
In [17]:
start = 0
end = dayrows[0]
val = one_row_transform(train_table_X[start])
val = sparse.lil_matrix(val, dtype=np.float32)
st = time.time()
row = one_row_transform(train_table_X[1])
row = sparse.lil_matrix(row, dtype=np.float32)
val = sparse.vstack([val,row])
en = time.time()
print en-st, 'second'
In [36]:
# Convert the matrix to CSR format.
valcsr = val.tocsr()
In [37]:
# Number of stored entries in the CSR matrix.
valcsr.nnz
Out[37]:
In [38]:
# Check the CSR matrix's sorted-indices flag.
valcsr.has_sorted_indices
Out[38]:
In [39]:
# Save the CSR matrix to day2.npz as its raw components: data, indices, indptr and shape.
np.savez('day2.npz', data=valcsr.data, indices=valcsr.indices, indptr=valcsr.indptr, shape=valcsr.shape)
In [17]:
# Unbind the name val.
del val
In [27]:
# Rebind both names to 0 so they no longer reference the previous objects.
val = 0
valcsr = 0
In [28]:
import gc
In [40]:
# Run a garbage collection pass; the cell output is the value returned by gc.collect().
gc.collect()
Out[40]:
In [3]:
# Unbind the name valcsr.
del valcsr
In [12]:
# Rebuild the CSR matrix from the components stored in day1.npz.
# np.load keeps the .npz archive open until it is closed, so read all arrays
# inside a `with` block that closes the archive afterwards.
with np.load('day1.npz') as loader:
    valcsr = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape = loader['shape'])
In [24]:
# Show the slice made of rows 0 to 5.
valcsr[0:6]
Out[24]:
In [25]:
# Show the matrix dimensions.
valcsr.shape
Out[25]:
In [13]:
%time row1 = one_row_transform(train_table_X[333])
In [15]:
# Time the allocation of an empty LIL matrix with dayrows[1] rows.
# NOTE(review): 1836 is hard-coded here, while other cells pass rowlength as the width - confirm they are equal.
%time val = sparse.lil_matrix((dayrows[1],1836), dtype=np.float32)
In [20]:
# Show the matrix summary.
val
Out[20]:
In [ ]:
st = time.time()
for j in range(1,10)
start = dayrows[j-1]
end = sum(dayrows[:j+1])
val = one_row_transform(train_table_X[start])
val = sparse.lil_matrix(val, dtype=np.float32)
for i in xrange(start+1, end):
row = one_row_transform(train_table_X[i])
row = sparse.lil_matrix(row, dtype=np.float32)
val = sparse.vstack([val,row])
valcsr = val.tocsr()
filename = 'day'+str(j+1)+'.npz'
np.savez(filename, data=valcsr.data, indices=valcsr.indices, indptr=valcsr.indptr, shape=valcsr.shape)
en = time.time()
print en-st, 'second'
In [ ]:
In [13]:
# Small COO example: V[k] is placed at row I[k], column J[k] of a 4x4 matrix.
# Written as plain statements (no pasted ">>>" prompts) so the cell is valid
# Python outside IPython's prompt stripping as well.
I = np.array([0,3,1,0])
J = np.array([0,3,1,2])
V = np.array([4,5,7,9])
B = sparse.coo_matrix((V,(I,J)),shape=(4,4))
In [14]:
# Show B as a dense matrix.
B.todense()
Out[14]:
In [ ]: