In [1]:
import tables
import time
import cPickle
import numpy as np
from scipy import sparse
import itertools

In [2]:
from nominal2onehot import nominal2onehot

open hdf5


In [3]:
file_handler = tables.open_file("click_data.h5", mode = "r")

In [4]:
X = file_handler.root.train.train_raw.X

In [5]:
y = file_handler.root.train.train_raw.y

In [6]:
X_t = file_handler.root.test.test_raw.X_t

number of rows in each day

10 days in training data


In [7]:
# Load the per-day row counts of the training data.
# Pickles are binary data, so open in 'rb'; the with-block closes the file
# even if unpickling fails.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)

In [8]:
dayrows


Out[8]:
array([4761989, 4765304, 4768061, 4776491, 4776398, 4769862, 4769217,
       4766136, 4766812, 4766081])

Categorical values found in each column (except 'day').

columns in my notebook: [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]


In [9]:
# colscate.pkl contains 25 objects pickled back to back; read them in order.
# The with-block guarantees the file is closed even if one of the loads fails.
with open('colscate.pkl', 'rb') as f:
    colcate = []
    for i in range(25):
        colcate.append(cPickle.load(f))

List of features to be one-hot encoded

columns in my notebook:


In [10]:
columnindex = np.array([2,3,4,7,10,13,16,17,20,21,23,24,26])

columns in pyTables:


In [11]:
featureindex = columnindex-1

In [12]:
featureindex.shape[0]


Out[12]:
13

columns in colscate.pkl


In [13]:
categoryindex = columnindex-2

Number of categories in each selected column


In [14]:
# Category count of every selected feature, in the order given by categoryindex
catelen = [colcate[i].shape[0] for i in categoryindex]

In [15]:
catelen


Out[15]:
[24, 7, 7, 26, 38, 15, 4, 4, 8, 9, 4, 61, 53]

In [16]:
sum(catelen)


Out[16]:
260

Encode one row as one-hot binary data


In [17]:
rowlength = sum(catelen)

In [18]:
def one_row_transform(row):
    """Encode one raw record as a dense one-hot vector.

    For every selected feature ``i`` the value ``row[featureindex[i]]`` is
    passed to ``nominal2onehot`` together with that feature's category list
    ``colcate[categoryindex[i]]``. The resulting segments are written back to
    back, in feature order, into one float32 vector.

    Parameters
    ----------
    row : indexable
        A single record; it is indexed with the positions in ``featureindex``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(rowlength,)``.
    """
    # np.empty is sufficient: the segments below cover the whole vector,
    # because rowlength == sum(catelen).
    rvl = np.empty((rowlength,), dtype=np.float32)

    # Keep a running write position instead of re-summing catelen[:i] twice
    # per feature -- this function is called once per table row.
    offset = 0
    for i in range(featureindex.shape[0]):
        width = catelen[i]
        binonehot = nominal2onehot(row[featureindex[i]], colcate[categoryindex[i]])
        rvl[offset:offset + width] = binonehot
        offset += width

    return rvl

dense to sparse and stacked

the 1st day

    st = time.time()

    start = 0
    end = dayrows[0]

    val = one_row_transform(train_table_X[start])
    val = sparse.lil_matrix(val, dtype=np.float32)
    for i in xrange(start+1, end):
        row = one_row_transform(train_table_X[i])
        row = sparse.lil_matrix(row, dtype=np.float32)
        val = sparse.vstack([val,row])

    en = time.time()
    print en-st, 'second'


In [32]:
val = sparse.lil_matrix((dayrows[0],rowlength), dtype=np.float32)

st = time.time()
for i, row_table in itertools.izip(xrange(dayrows[0]), X.iterrows(start=sum(dayrows[:0]))):
    row = one_row_transform(row_table)
    val[i,np.where(row==1)[0]] = 1
en = time.time()
print en-st, 'second'


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-32-d0b09e88fb3d> in <module>()
      3 st = time.time()
      4 for i, row_table in itertools.izip(xrange(dayrows[0]), X.iterrows(start=sum(dayrows[:0]))):
----> 5     row = one_row_transform(row_table)
      6     val[i,np.where(row==1)[0]] = 1
      7 en = time.time()

<ipython-input-18-3bb810ee8a57> in one_row_transform(row)
      4 
      5     for i in range(featureindex.shape[0]):
----> 6         binonehot = nominal2onehot(row[featureindex[i]], colcate[categoryindex[i]])
      7         rvl[sum(catelen[:i]):sum(catelen[:i+1])] = binonehot
      8 

/home/whale/Documents/click/nominal2onehot.pyc in nominal2onehot(x, xlist)
     10 
     11 def nominal2onehot(x, xlist):
---> 12     rvl = np.zeros((len(xlist),), dtype=np.float32)
     13     if x in xlist:
     14         rvl[np.where(xlist==x)]=1

KeyboardInterrupt: 

In [35]:
val = sparse.lil_matrix((dayrows[1],rowlength), dtype=np.float32)

st = time.time()
for i, row_table in itertools.izip(xrange(dayrows[1]), X.iterrows(start=sum(dayrows[:1]))):
    row = one_row_transform(row_table)
    val[i,np.where(row==1)[0]] = 1
en = time.time()
print en-st, 'second'


975.125770092 second

The methods below are slow; see the timing comparison.


In [ ]:
start = sum(dayrows[:1])
end = sum(dayrows[:2])

val = sparse.lil_matrix((dayrows[1],rowlength), dtype=np.float32)

st = time.time()

row = one_row_transform(X[start])
val[i] = sparse.lil_matrix(row)

en = time.time()
print en-st, 'second'

In [23]:
start = sum(dayrows[:1])
end = sum(dayrows[:2])

val = sparse.lil_matrix((dayrows[1],rowlength), dtype=np.float32)

st = time.time()

row = one_row_transform(X[start])
val[i-start,np.where(row==1)[0]] = 1

en = time.time()
print en-st, 'second'


0.00065279006958 second

In [17]:
start = 0
end = dayrows[0]

val = one_row_transform(train_table_X[start])
val = sparse.lil_matrix(val, dtype=np.float32)

st = time.time()

row = one_row_transform(train_table_X[1])
row = sparse.lil_matrix(row, dtype=np.float32)
val = sparse.vstack([val,row])

en = time.time()
print en-st, 'second'


0.0020740032196 second

In [36]:
valcsr = val.tocsr()

In [37]:
valcsr.nnz


Out[37]:
61948952

In [38]:
valcsr.has_sorted_indices


Out[38]:
1

In [39]:
# Persist the CSR matrix as its three component arrays plus its shape, so it
# can be rebuilt later with sparse.csr_matrix((data, indices, indptr), shape).
np.savez(
    'day2.npz',
    data=valcsr.data,
    indices=valcsr.indices,
    indptr=valcsr.indptr,
    shape=valcsr.shape,
)

In [17]:
del val

In [27]:
val = 0
valcsr = 0

In [28]:
import gc

In [40]:
gc.collect()


Out[40]:
9

In [3]:
del valcsr

In [12]:
# Rebuild the CSR matrix from the arrays stored in the archive.
# The archive is read lazily, so the matrix is constructed inside the
# with-block; the block then closes the underlying file.
with np.load('day1.npz') as loader:
    valcsr = sparse.csr_matrix(
        (loader['data'], loader['indices'], loader['indptr']),
        # the shape was saved as an array; hand scipy a plain tuple
        shape=tuple(loader['shape']),
    )

In [24]:
valcsr[0:6]


Out[24]:
<6x260 sparse matrix of type '<type 'numpy.float32'>'
	with 78 stored elements in Compressed Sparse Row format>

In [25]:
valcsr.shape


Out[25]:
(4761989, 260)

Testing


In [13]:
%time row1 = one_row_transform(train_table_X[333])


CPU times: user 1.09 ms, sys: 28 µs, total: 1.11 ms
Wall time: 744 µs

In [15]:
%time val = sparse.lil_matrix((dayrows[1],1836), dtype=np.float32)


CPU times: user 4.21 s, sys: 213 ms, total: 4.42 s
Wall time: 4.36 s

In [20]:
val


Out[20]:
<2x1836 sparse matrix of type '<type 'numpy.float32'>'
	with 36 stored elements in COOrdinate format>

In [ ]:
st = time.time()

for j in range(1,10)
    start = dayrows[j-1]
    end = sum(dayrows[:j+1])

    val = one_row_transform(train_table_X[start])
    val = sparse.lil_matrix(val, dtype=np.float32)
    for i in xrange(start+1, end):
        row = one_row_transform(train_table_X[i])
        row = sparse.lil_matrix(row, dtype=np.float32)
        val = sparse.vstack([val,row])
    valcsr = val.tocsr()
    
    filename = 'day'+str(j+1)+'.npz'
    np.savez(filename, data=valcsr.data, indices=valcsr.indices, indptr=valcsr.indptr, shape=valcsr.shape)

en = time.time()
print en-st, 'second'

In [ ]:


In [13]:
>>> I = np.array([0,3,1,0])
>>> J = np.array([0,3,1,2])
>>> V = np.array([4,5,7,9])
>>> B = sparse.coo_matrix((V,(I,J)),shape=(4,4))

In [14]:
B.todense()


Out[14]:
matrix([[4, 0, 9, 0],
        [0, 7, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 5]])

In [ ]: