In [1]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools
from scipy import sparse

In [2]:
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing

In [3]:
file_handler = tables.open_file("click_data.h5", mode = "r")

In [4]:
X = file_handler.root.train.train_raw.X
y = file_handler.root.train.train_raw.y
X_t = file_handler.root.test.test_raw.X_t

feature extraction


In [5]:
f = open('colscate.pkl', 'rb')
colcate = []
for i in range(25):
    colcate.append(cPickle.load(f))
f.close()

In [6]:
f = open('colscatetest5_1518.pkl', 'rb')
colcatetest = []
for i in range(17):
    colcatetest.append(cPickle.load(f))
f.close()

In [7]:
#featureindex = [1,2,3,6,8,9,12,13,15,16,17,19,20,21,22,23,24,25]

In [8]:
f = open('dayrows.pkl', 'r')
dayrows = cPickle.load(f)
f.close()

In [9]:
X.shape


Out[9]:
(47686351,)

columns in my notebook to be clustered


In [10]:
cols = [5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 22, 25]

In [11]:
numbercols = len(cols)

colcate index in cols


In [12]:
trainindex = [3, 4, 6, 7, 9, 10, 12, 13, 16, 17, 20, 23]

colcatetest index in cols


In [13]:
testindex = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12, 13, 15]

Start from here


In [17]:
ithcol = 11

feature hashing


In [3]:
from sklearn import feature_extraction

In [10]:
a = ['df', 'er', 'ru', 'oe', 'oi', '7f', '2d']

In [13]:
fh = feature_extraction.FeatureHasher(n_features=4, input_type='string', \
                                      dtype=np.float32, non_negative=False)
transformed = fh.transform([row] for row in a)

In [14]:
transformed.todense()


Out[14]:
matrix([[ 0.,  0.,  0.,  2.],
        [ 0.,  1.,  0.,  1.],
        [ 1.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  2.],
        [ 0., -1.,  0.,  1.],
        [ 1.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  2.]], dtype=float32)

def tokens(row): dic = {} dic['traincount'] = row[0] dic['clickrate'] = row[1] dic['testcount'] = row[2] return dic

n_features is power of 2

fh = feature_extraction.FeatureHasher(n_features=8, input_type='dict', \ dtype=np.float32, non_negative=False) transformed = fh.transform(tokens(row) for row in coli)


In [19]:
st = time.time()

fh = feature_extraction.FeatureHasher(n_features=128, input_type='string', \
                                      dtype=np.float32, non_negative=False)
#if non_negative=True, memory usage is awesome.
index = cols[ithcol]-1
#index = 0
transformed = fh.transform(str(row[index]) for row in X)

en = time.time()
print en-st, 'second'


100.359618902 second

In [20]:
transformed


Out[20]:
<47686351x128 sparse matrix of type '<type 'numpy.float32'>'
	with 132964948 stored elements in Compressed Sparse Row format>

In [21]:
transformed[20000].todense()


Out[21]:
matrix([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0., -1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)

In [22]:
dayrows


Out[22]:
array([4761989, 4765304, 4768061, 4776491, 4776398, 4769862, 4769217,
       4766136, 4766812, 4766081])

In [23]:
transformed1=transformed[sum(dayrows[:0]):sum(dayrows[:1])]

In [24]:
transformed2=transformed[sum(dayrows[:1]):sum(dayrows[:2])]

In [25]:
np.savez('hashingcol'+str(cols[ithcol])+'.npz', data=transformed.data, indices=transformed.indices,\
         indptr=transformed.indptr, shape=transformed.shape)

In [26]:
np.savez('hashingcol'+str(cols[ithcol])+'day1.npz', data=transformed1.data, indices=transformed1.indices,\
         indptr=transformed1.indptr, shape=transformed1.shape)

In [27]:
np.savez('hashingcol'+str(cols[ithcol])+'day2.npz', data=transformed2.data, indices=transformed2.indices,\
         indptr=transformed2.indptr, shape=transformed2.shape)

In [ ]: