In [1]:

    
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools
from scipy import sparse



In [2]:

    
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing



In [3]:

    
file_handler = tables.open_file("click_data.h5", mode = "r")



In [4]:

    
X = file_handler.root.train.train_raw.X
y = file_handler.root.train.train_raw.y
X_t = file_handler.root.test.test_raw.X_t

feature extraction



In [5]:

    
f = open('colscate.pkl', 'rb')
colcate = []
for i in range(25):
    colcate.append(cPickle.load(f))
f.close()



In [6]:

    
f = open('colscatetest5_1518.pkl', 'rb')
colcatetest = []
for i in range(17):
    colcatetest.append(cPickle.load(f))
f.close()



In [7]:

    
#featureindex = [1,2,3,6,8,9,12,13,15,16,17,19,20,21,22,23,24,25]



In [8]:

    
f = open('dayrows.pkl', 'r')
dayrows = cPickle.load(f)
f.close()



In [9]:

    
X.shape









    Out[9]:





(47686351,)

columns in my notebook to be clustered



In [10]:

    
cols = [5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 22, 25]



In [11]:

    
numbercols = len(cols)

colcate index in cols



In [12]:

    
trainindex = [3, 4, 6, 7, 9, 10, 12, 13, 16, 17, 20, 23]

colcatetest index in cols



In [13]:

    
testindex = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12, 13, 15]

Start from here



In [17]:

    
ithcol = 11

feature hashing



In [3]:

    
from sklearn import feature_extraction



In [10]:

    
a = ['df', 'er', 'ru', 'oe', 'oi', '7f', '2d']



In [13]:

    
fh = feature_extraction.FeatureHasher(n_features=4, input_type='string', \
                                      dtype=np.float32, non_negative=False)
transformed = fh.transform([row] for row in a)



In [14]:

    
transformed.todense()









    Out[14]:





matrix([[ 0.,  0.,  0.,  2.],
        [ 0.,  1.,  0.,  1.],
        [ 1.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  2.],
        [ 0., -1.,  0.,  1.],
        [ 1.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  2.]], dtype=float32)

def tokens(row): dic = {} dic['traincount'] = row[0] dic['clickrate'] = row[1] dic['testcount'] = row[2] return dic

n_features is power of 2

fh = feature_extraction.FeatureHasher(n_features=8, input_type='dict', \ dtype=np.float32, non_negative=False) transformed = fh.transform(tokens(row) for row in coli)



In [19]:

    
st = time.time()

fh = feature_extraction.FeatureHasher(n_features=128, input_type='string', \
                                      dtype=np.float32, non_negative=False)
#if non_negative=True, memory usage is awesome.
index = cols[ithcol]-1
#index = 0
transformed = fh.transform(str(row[index]) for row in X)

en = time.time()
print en-st, 'second'









    



100.359618902 second



In [20]:

    
transformed









    Out[20]:





<47686351x128 sparse matrix of type '<type 'numpy.float32'>'
	with 132964948 stored elements in Compressed Sparse Row format>



In [21]:

    
transformed[20000].todense()









    Out[21]:





matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)



In [22]:

    
dayrows









    Out[22]:





array([4761989, 4765304, 4768061, 4776491, 4776398, 4769862, 4769217,
       4766136, 4766812, 4766081])



In [23]:

    
transformed1=transformed[sum(dayrows[:0]):sum(dayrows[:1])]



In [24]:

    
transformed2=transformed[sum(dayrows[:1]):sum(dayrows[:2])]



In [25]:

    
np.savez('hashingcol'+str(cols[ithcol])+'.npz', data=transformed.data, indices=transformed.indices,\
         indptr=transformed.indptr, shape=transformed.shape)



In [26]:

    
np.savez('hashingcol'+str(cols[ithcol])+'day1.npz', data=transformed1.data, indices=transformed1.indices,\
         indptr=transformed1.indptr, shape=transformed1.shape)



In [27]:

    
np.savez('hashingcol'+str(cols[ithcol])+'day2.npz', data=transformed2.data, indices=transformed2.indices,\
         indptr=transformed2.indptr, shape=transformed2.shape)



In [ ]: