In [1]:

    
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools



In [2]:

    
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing



In [3]:

    
# Open the HDF5 click dataset read-only with PyTables. The handle is kept open
# because later cells read the train/test nodes reached through it.
file_handler = tables.open_file("click_data.h5", "r")



In [4]:

    
# Short names for the raw nodes: training samples (X), training targets (y)
# and test samples (X_t).
train_raw = file_handler.root.train.train_raw
X, y = train_raw.X, train_raw.y
X_t = file_handler.root.test.test_raw.X_t

feature extraction



In [5]:

    
# Load the 25 objects pickled back-to-back in colscate.pkl
# (presumably one array of category values per column -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('colscate.pkl', 'rb') as f:
    # `with` guarantees the file is closed even if one of the loads raises.
    colcate = [cPickle.load(f) for i in range(25)]



In [6]:

    
# Load the 17 objects pickled back-to-back in colscatetest5_1518.pkl
# (presumably the test-set counterpart of `colcate` -- TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('colscatetest5_1518.pkl', 'rb') as f:
    # `with` guarantees the file is closed even if one of the loads raises.
    colcatetest = [cPickle.load(f) for i in range(17)]



In [7]:

    
# Feature index list (18 entries); not referenced by the other cells visible here.
featureindex = [
    1, 2, 3, 6, 8, 9,
    12, 13, 15, 16, 17, 19,
    20, 21, 22, 23, 24, 25,
]



In [8]:

    
# Load the single object pickled in dayrows.pkl.
# Pickles must be opened in binary mode: text mode ('r') only happens to work
# for ASCII protocol-0 pickles on platforms without newline translation.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)



In [9]:

    
# Display the shape of the training node X bound above.
X.shape









    Out[9]:





(47686351,)

columns to be clustered



In [10]:

    
# Columns to be clustered. The counting loops below read position
# cols[ithcol] - 1 of every sample.
cols = [
    5, 6, 8, 9,
    11, 12, 14, 15,
    18, 19, 22, 25,
]



In [11]:

    
# Number of columns selected for clustering (length of `cols`).
numbercols = len(cols)

Index into `colcate` for each column in `cols`



In [12]:

    
# For each entry of `cols` (same order), the index of its entry in `colcate`.
trainindex = [
    3, 4, 6, 7,
    9, 10, 12, 13,
    16, 17, 20, 23,
]

Index into `colcatetest` for each column in `cols`



In [13]:

    
# For each entry of `cols` (same order), the index of its entry in `colcatetest`.
testindex = [
    0, 1, 3, 4,
    6, 7, 9, 10,
    11, 12, 13, 15,
]

Start from here



In [14]:

    
# Position within cols / trainindex / testindex of the column processed by the
# cells below; change it and re-run from here to process another column.
ithcol = 0



In [15]:

    
colsi = np.unique(np.concatenate((colcate[trainindex[ithcol]],colcatetest[testindex[ithcol]])))
print 'features shape: ', colsi.shape
colid = {x: [0,0,0] for x in colsi}









    



features shape:  (4801,)



In [16]:

    
#train data
start = time.time()
#flag=0
for sample, target in itertools.izip(X, y):
    cr = colid[sample[cols[ithcol]-1]]
    cr[0]+=1
    cr[1]+=target[0]
    #flag+=1
    #if flag==100:
    #    break
end = time.time()
print 'train data:', end-start, 'second'









    



train data: 102.822608948 second



In [17]:

    
#test data
start = time.time()
for sample in X_t:
    colid[sample[cols[ithcol]-1]][2]+=1        
end = time.time()
print 'test data:', end-start, 'second'









    



test data: 8.8000459671 second



In [37]:

    
# Turn the per-value counters into a float32 matrix with one row per value
# (rows follow the sorted order of `keys`) and columns
# [train count, summed target[0], test count].
keys = np.array(sorted(colid))
coli = np.zeros((len(keys), 3), dtype=np.float32)
for row, key in itertools.izip(coli, keys):
    # `row` is a view into coli, so this writes the three counters in place.
    row[:] = colid[key]



In [19]:

    
# Boolean masks over `keys` / rows of `coli`.
a = coli[:, 0] == 0.   # value never occurs in the training data
notrain = keys[a]
b = coli[:, 2] == 0    # value never occurs in the test data
notest = keys[b]
# Values present in both train and test (vectorised "not (a or b)").
c = ~(a | b)
traintest = keys[c]
# Keep the train-and-test subset under its own name. Overwriting `coli` here
# would shrink it, and the next cell filters `coli` with a mask derived from
# the full-length `a`, which then no longer matches on a top-to-bottom run.
coli_traintest = coli[c]



In [38]:

    
# Keep only the rows whose value occurs in the training data (train count > 0).
# `a` is a boolean ndarray, so negate it directly instead of looping in Python;
# this also keeps the mask boolean when `a` is empty.
c = ~a
coli = coli[c]



In [40]:

    
print notrain.shape
print notest.shape
print traintest.shape
print coli.shape









    



(86,)
(1882,)
(2833,)
(4715, 3)

Standardization



In [60]:

    
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
colistandard = scaler.fit_transform(coli) 
print 'mean:', scaler.mean_ 
print 'std:', scaler.std_









    



mean: [ 17623.609375     3064.93774414   1788.4934082 ]
std: [ 607138.625      117571.6171875   54796.1171875]



In [21]:

    
# 3-D scatter of the standardised counters (one point per column value).
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(colistandard[:, 0], colistandard[:, 1], colistandard[:, 2],
           c='r', marker='o', alpha=0.3)

ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')

# Set the camera (elevation 70, azimuth 30) BEFORE showing the figure; calling
# view_init after plt.show() does not affect the figure that was rendered.
ax.view_init(70, 30)

plt.show()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-21-d78ca3698bdb> in <module>()
      2 ax = fig.add_subplot(111, projection='3d')
      3 
----> 4 ax.scatter(colistandard[:,0], colistandard[:,1], colistandard[:,2], c='r', marker='o', alpha=0.3)
      5 
      6 ax.set_xlabel('0 Label')

NameError: name 'colistandard' is not defined

Totally Random Trees embedding



In [22]:

    
from sklearn import ensemble



In [55]:

    
# Totally random trees embedding of the two training counters
# (train count, summed target[0]); returns a sparse one-hot leaf encoding.
# The deprecated `min_density` argument is omitted: passing None was a no-op
# and later scikit-learn releases reject the keyword.
rte = ensemble.RandomTreesEmbedding(
    n_estimators=15, max_depth=5, min_samples_split=2,
    min_samples_leaf=1, max_leaf_nodes=None,
    sparse_output=True, n_jobs=4, random_state=1988,
    verbose=0)
transformed = rte.fit_transform(coli[:, :2])



In [ ]:

    
# Leaf index reached in every tree by each sample. apply() requires the input
# matrix; use the same two columns the embedding was fitted on.
rte.apply(coli[:, :2])



In [56]:

    
# Feature importances of the fitted embedding; it was fitted on coli[:, :2],
# so two values are expected.
rte.feature_importances_









    Out[56]:





array([ 0.62117192,  0.37882808])



In [57]:

    
# Display a summary of the sparse embedding returned by rte.fit_transform.
transformed









    Out[57]:





<4715x184 sparse matrix of type '<type 'numpy.float64'>'
	with 70725 stored elements in Compressed Sparse Row format>



In [58]:

    
# Densify row 3 of the sparse embedding to inspect its one-hot leaf encoding.
transformed[3].todense()









    Out[58]:





matrix([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.]])

TruncatedSVD



In [27]:

    
from sklearn import decomposition



In [59]:

    
# Project the sparse tree embedding onto its first two SVD components.
# - The deprecated `n_iterations` argument is omitted (None was a no-op and
#   later scikit-learn releases reject the keyword; `n_iter` is the live one).
# - The randomized solver is seeded so that re-running the notebook reproduces
#   the same projection (same seed as the tree embedding).
tsvd = decomposition.TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5,
                                  random_state=1988, tol=0.0)
reduced = tsvd.fit_transform(transformed)



In [60]:

    
# Display the projection returned by tsvd.fit_transform(transformed).
reduced









    Out[60]:





array([[ 3.65474164,  0.65681303],
       [ 3.87038743, -0.12993361],
       [ 3.16583829,  1.73877759],
       ..., 
       [ 1.31878686,  1.12267532],
       [ 3.87038743, -0.12993361],
       [ 3.87038743, -0.12993361]])



In [61]:

    
# Inspect row 1 of coli: [train count, summed target[0], test count] of one value.
coli[1]









    Out[61]:





array([ 2414.,    42.,   313.], dtype=float32)



In [62]:

    
# 2-D scatter of the SVD projection (one point per column value).
fig, axes = plt.subplots()
# Label the axes so the figure can be read on its own.
axes.set_xlabel('SVD component 0')
axes.set_ylabel('SVD component 1')
axes.plot(reduced[:, 0], reduced[:, 1], 'bo')
plt.show()



In [36]:

    
# 3-D scatter of a three-component SVD projection of the tree embedding.
# `reduced` is fitted with n_components=2 above, so reduced[:, 2] raised
# IndexError. Reuse `reduced` only if it really has a third column; otherwise
# fit a dedicated, seeded three-component projection for this plot.
if reduced.shape[1] >= 3:
    reduced3d = reduced
else:
    reduced3d = decomposition.TruncatedSVD(
        n_components=3, algorithm='randomized', n_iter=5,
        random_state=1988).fit_transform(transformed)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(reduced3d[:, 0], reduced3d[:, 1], reduced3d[:, 2],
           c='r', marker='o', alpha=0.3)

ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')

# Set the camera (elevation 70, azimuth 30) BEFORE showing the figure; calling
# view_init after plt.show() does not affect the figure that was rendered.
ax.view_init(70, 30)

plt.show()

Testing



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]: