In [1]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools

In [2]:
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing

In [3]:
# Open the HDF5 click data set read-only; later cells read nodes of this file
# through the handle, so it is left open here.
file_handler = tables.open_file("click_data.h5", "r")

In [4]:
# Bind the nodes stored under /train/train_raw (X, y) and /test/test_raw (X_t).
X, y, X_t = (
    file_handler.root.train.train_raw.X,
    file_handler.root.train.train_raw.y,
    file_handler.root.test.test_raw.X_t,
)

feature extraction


In [5]:
# Load the 25 objects that were pickled back-to-back into colscate.pkl.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
# The `with` block closes the file even if one of the loads raises.
with open('colscate.pkl', 'rb') as f:
    colcate = [cPickle.load(f) for i in range(25)]

In [6]:
# Load the 17 objects that were pickled back-to-back into colscatetest5_1518.pkl.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
# The `with` block closes the file even if one of the loads raises.
with open('colscatetest5_1518.pkl', 'rb') as f:
    colcatetest = [cPickle.load(f) for i in range(17)]

In [7]:
# Presumably the column numbers kept as features.
# NOTE(review): not referenced by any other visible cell, and it cannot be told
# from here whether the numbers are 0- or 1-based -- confirm before use.
featureindex = [1,2,3,6,8,9,12,13,15,16,17,19,20,21,22,23,24,25]

In [8]:
# Load the single object stored in dayrows.pkl.
# Pickles must be read in binary mode ('rb', as the other pickle cells do):
# text mode can corrupt binary pickle data on platforms that translate newlines.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)

In [9]:
# Display the shape of the training node X (a 1-D table of rows).
X.shape


Out[9]:
(47686351,)

columns to be clustered


In [10]:
# Column numbers to process. The counting loops index a row with
# cols[ithcol]-1, so each entry is one greater than the position it reads.
cols = [5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 22, 25]

In [11]:
# Number of entries in cols.
numbercols = len(cols)

Index into colcate for each column in cols


In [12]:
# For each entry of cols (same order), the position of its category list in
# colcate; used as colcate[trainindex[ithcol]].
trainindex = [3, 4, 6, 7, 9, 10, 12, 13, 16, 17, 20, 23]

Index into colcatetest for each column in cols


In [13]:
# For each entry of cols (same order), the position of its category list in
# colcatetest; used as colcatetest[testindex[ithcol]].
testindex = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12, 13, 15]

Start from here


In [14]:
# Position within cols / trainindex / testindex of the column that the
# following cells process; change it to work on a different column.
ithcol = 0

In [15]:
colsi = np.unique(np.concatenate((colcate[trainindex[ithcol]],colcatetest[testindex[ithcol]])))
print 'features shape: ', colsi.shape
colid = {x: [0,0,0] for x in colsi}


features shape:  (4801,)

In [16]:
#train data
start = time.time()
#flag=0
for sample, target in itertools.izip(X, y):
    cr = colid[sample[cols[ithcol]-1]]
    cr[0]+=1
    cr[1]+=target[0]
    #flag+=1
    #if flag==100:
    #    break
end = time.time()
print 'train data:', end-start, 'second'


train data: 102.822608948 second

In [17]:
#test data
start = time.time()
for sample in X_t:
    colid[sample[cols[ithcol]-1]][2]+=1        
end = time.time()
print 'test data:', end-start, 'second'


test data: 8.8000459671 second

In [37]:
# Flatten the counter dict into arrays: `keys` holds the sorted column values
# and row r of `coli` holds the three counters of keys[r] as float32.
keys = np.array(sorted(colid))
coli = np.zeros((len(keys), 3), dtype=np.float32)
for row, key in enumerate(keys):
    coli[row, :] = colid[key]

In [19]:
# Split the column values by where they occur. All masks are aligned
# row-for-row with `keys` and with the full `coli` table.
a = coli[:,0] == 0.   # slot 0 (train row count) is zero: value absent from train
notrain = keys[a]
b = coli[:,2] == 0    # slot 2 (test row count) is zero: value absent from test
notest = keys[b]
c = ~(a | b)          # value present in both train and test
traintest = keys[c]
# Keep the filtered rows under their own name instead of overwriting `coli`:
# the following cell applies a mask of the full length to `coli`, which only
# lines up if `coli` still holds every row. It also keeps this cell re-runnable.
coli_traintest = coli[c]

In [38]:
# Keep only the values that occur in the train data (mask `a` marks the ones
# that do not).
c = ~a
if len(coli) != len(keys):
    # `coli` was already filtered (by an earlier cell or a previous run of this
    # one), so the full-length mask would no longer line up with it. Rebuild
    # the complete table from the counters before filtering.
    coli = np.array([colid[k] for k in keys], dtype=np.float32).reshape(-1, 3)
coli = coli[c]

In [40]:
print notrain.shape
print notest.shape
print traintest.shape
print coli.shape


(86,)
(1882,)
(2833,)
(4715, 3)

Standardization


In [60]:
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
colistandard = scaler.fit_transform(coli) 
print 'mean:', scaler.mean_ 
print 'std:', scaler.std_


mean: [ 17623.609375     3064.93774414   1788.4934082 ]
std: [ 607138.625      117571.6171875   54796.1171875]

In [21]:
# 3-D scatter of the three standardized counter columns.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# alpha sets marker transparency (0.0 transparent through 1.0 opaque).
ax.scatter(colistandard[:,0], colistandard[:,1], colistandard[:,2], c='r', marker='o', alpha=0.3)

ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')

# Set the camera (elevation 70, azimuth 30) before rendering; calling
# view_init after plt.show() does not change the figure already drawn.
ax.view_init(70, 30)

plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-21-d78ca3698bdb> in <module>()
      2 ax = fig.add_subplot(111, projection='3d')
      3 
----> 4 ax.scatter(colistandard[:,0], colistandard[:,1], colistandard[:,2], c='r', marker='o', alpha=0.3)
      5 
      6 ax.set_xlabel('0 Label')

NameError: name 'colistandard' is not defined

Totally Random Trees embedding


In [22]:
from sklearn import ensemble

In [55]:
# Fit a totally-random-trees embedding on the first two counter columns of
# `coli` and keep the sparse encoding of every row.
# The deprecated `min_density` argument is no longer passed: None was its
# default and it had no effect, while releases that removed it reject it.
rte = ensemble.RandomTreesEmbedding(
    n_estimators=15,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    max_leaf_nodes=None,
    sparse_output=True,
    n_jobs=4,
    random_state=1988,  # fixed seed so the embedding is reproducible
    verbose=0,
)
transformed = rte.fit_transform(coli[:,:2])

In [ ]:
# apply() needs the samples to route through the trees; pass the same two
# columns the embedding was fitted on. Returns, for each row, the index of the
# leaf it lands in for every tree.
rte.apply(coli[:,:2])

In [56]:
# Display the feature importances of the fitted embedding (one value per
# input column).
rte.feature_importances_


Out[56]:
array([ 0.62117192,  0.37882808])

In [57]:
# Display the summary (shape, dtype, stored elements) of the sparse embedding.
transformed


Out[57]:
<4715x184 sparse matrix of type '<type 'numpy.float64'>'
	with 70725 stored elements in Compressed Sparse Row format>

In [58]:
# Densify a single row of the embedding to inspect its encoding.
transformed[3].todense()


Out[58]:
matrix([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
          0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
          0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.]])

TruncatedSVD


In [27]:
from sklearn import decomposition

In [59]:
# Project the sparse tree embedding onto its first two SVD components.
# The randomized solver is seeded (same seed as the embedding above) so the
# coordinates are reproducible between runs; the deprecated `n_iterations`
# alias of `n_iter` is no longer passed (None was its default).
tsvd = decomposition.TruncatedSVD(
    n_components=2,
    algorithm='randomized',
    n_iter=5,
    random_state=1988,
    tol=0.0,
)
reduced = tsvd.fit_transform(transformed)

In [60]:
# Display the reduced coordinates (one row per row of `transformed`).
reduced


Out[60]:
array([[ 3.65474164,  0.65681303],
       [ 3.87038743, -0.12993361],
       [ 3.16583829,  1.73877759],
       ..., 
       [ 1.31878686,  1.12267532],
       [ 3.87038743, -0.12993361],
       [ 3.87038743, -0.12993361]])

In [61]:
# Inspect the three counters stored in one row of `coli`.
coli[1]


Out[61]:
array([ 2414.,    42.,   313.], dtype=float32)

In [62]:
# 2-D scatter of the first two TruncatedSVD components of the tree embedding.
fig, axes = plt.subplots()
# Label the axes so the figure can be read on its own (they were empty strings).
axes.set_xlabel('SVD component 0')
axes.set_ylabel('SVD component 1')
axes.plot(reduced[:,0], reduced[:,1], 'bo')
plt.show()

In [36]:
# 3-D scatter of the SVD-reduced tree embedding.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# A 3-D plot needs three components, but `reduced` has only as many columns as
# the TruncatedSVD that produced it (two in the cell above), so reduced[:,2]
# would raise IndexError. Compute a dedicated 3-component projection if needed.
if reduced.shape[1] >= 3:
    reduced3d = reduced
else:
    reduced3d = decomposition.TruncatedSVD(n_components=3, random_state=1988).fit_transform(transformed)

# alpha sets marker transparency (0.0 transparent through 1.0 opaque).
ax.scatter(reduced3d[:,0], reduced3d[:,1], reduced3d[:,2], c='r', marker='o', alpha=0.3)

ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')

# Set the camera (elevation 70, azimuth 30) before rendering; calling
# view_init after plt.show() does not change the figure already drawn.
ax.view_init(70, 30)

plt.show()

Testing


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: