In [1]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools
from scipy import sparse
In [2]:
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing
In [3]:
file_handler = tables.open_file("click_data.h5", mode = "r")
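PyTables opens the file lazily, so nothing is read into memory yet. A quick way to confirm the node layout before indexing into it (a sketch; the paths mirror the cells below):
print file_handler
for node in file_handler.walk_nodes('/', classname='Array'):
    print node._v_pathname, node.shape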
In [4]:
X = file_handler.root.train.train_raw.X
y = file_handler.root.train.train_raw.y
X_t = file_handler.root.test.test_raw.X_t
In [5]:
f = open('colscate.pkl', 'rb')
colcate = []
for i in range(25):
    colcate.append(cPickle.load(f))
f.close()
In [6]:
f = open('colscatetest5_1518.pkl', 'rb')
colcatetest = []
for i in range(17):
    colcatetest.append(cPickle.load(f))
f.close()
In [7]:
#featureindex = [1,2,3,6,8,9,12,13,15,16,17,19,20,21,22,23,24,25]
In [8]:
f = open('dayrows.pkl', 'rb')
dayrows = cPickle.load(f)
f.close()
In [9]:
X.shape
Out[9]:
Columns in my notebook to be clustered:
In [10]:
cols = [5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 22, 25]
In [11]:
numbercols = len(cols)
Indices into colcate for the columns in cols:
In [12]:
trainindex = [3, 4, 6, 7, 9, 10, 12, 13, 16, 17, 20, 23]
Indices into colcatetest for the columns in cols:
In [13]:
testindex = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12, 13, 15]
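Since cols, trainindex, and testindex are parallel lists that must stay aligned, a cheap sanity check (hypothetical, not part of the original run):
# the three lists describe the same 12 columns, so their lengths must agree
assert len(cols) == len(trainindex) == len(testindex) == numbercols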
In [17]:
ithcol = 11
In [3]:
from sklearn import feature_extraction
In [10]:
a = ['df', 'er', 'ru', 'oe', 'oi', '7f', '2d']
In [13]:
fh = feature_extraction.FeatureHasher(n_features=4, input_type='string', \
dtype=np.float32, non_negative=False)
transformed = fh.transform([row] for row in a)
In [14]:
transformed.todense()
Out[14]:
def tokens(row):
    # Map a per-value summary row to named features for the dict-input hasher.
    dic = {}
    dic['traincount'] = row[0]
    dic['clickrate'] = row[1]
    dic['testcount'] = row[2]
    return dic
n_features should be a power of 2.
fh = feature_extraction.FeatureHasher(n_features=8, input_type='dict', \
    dtype=np.float32, non_negative=False)
transformed = fh.transform(tokens(row) for row in coli)
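To sanity-check what the dict-based hasher emits, a toy run on invented rows (coli below is a stand-in for one of the per-column summary lists; the numbers are made up):
coli = [(1200, 0.17, 800), (45, 0.02, 30)]  # invented (traincount, clickrate, testcount) rows
fh = feature_extraction.FeatureHasher(n_features=8, input_type='dict', \
    dtype=np.float32, non_negative=False)
print fh.transform(tokens(row) for row in coli).todense()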
In [19]:
st = time.time()
fh = feature_extraction.FeatureHasher(n_features=128, input_type='string', \
dtype=np.float32, non_negative=False)
# note: with non_negative=True, memory usage is noticeably better.
index = cols[ithcol]-1
#index = 0
# wrap each value in a list so the whole string is hashed as one token
# (as in the toy example above; a bare string would be hashed per character)
transformed = fh.transform([str(row[index])] for row in X)
en = time.time()
print en - st, 'seconds'
In [20]:
transformed
Out[20]:
In [21]:
transformed[20000].todense()
Out[21]:
In [22]:
dayrows
Out[22]:
In [23]:
transformed1 = transformed[sum(dayrows[:0]):sum(dayrows[:1])]  # rows for day 1
In [24]:
transformed2 = transformed[sum(dayrows[:1]):sum(dayrows[:2])]  # rows for day 2
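The two slices above compute cumulative offsets by hand; the same per-day split generalizes with np.cumsum (a sketch, assuming dayrows is a plain list of per-day row counts):
offsets = np.concatenate(([0], np.cumsum(dayrows)))  # offsets[d] = first row of day d+1
day_slices = [transformed[offsets[d]:offsets[d + 1]] for d in range(len(dayrows))]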
In [25]:
np.savez('hashingcol'+str(cols[ithcol])+'.npz', data=transformed.data, indices=transformed.indices,\
indptr=transformed.indptr, shape=transformed.shape)
In [26]:
np.savez('hashingcol'+str(cols[ithcol])+'day1.npz', data=transformed1.data, indices=transformed1.indices,\
indptr=transformed1.indptr, shape=transformed1.shape)
In [27]:
np.savez('hashingcol'+str(cols[ithcol])+'day2.npz', data=transformed2.data, indices=transformed2.indices,\
indptr=transformed2.indptr, shape=transformed2.shape)
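The .npz files hold the raw CSR buffers rather than a pickled matrix, so loading them back means reassembling the matrix by hand (the reverse of the save above):
loader = np.load('hashingcol' + str(cols[ithcol]) + '.npz')
restored = sparse.csr_matrix((loader['data'], loader['indices'], loader['indptr']), \
    shape=tuple(loader['shape']))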
In [ ]: