In [1]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools
In [2]:
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing
In [3]:
# Open the HDF5 store read-only. Nothing in the visible cells closes this
# handle; the nodes bound in the next cell are read through it.
file_handler = tables.open_file("click_data.h5", mode="r")
In [4]:
# Training nodes: rows of X are paired with rows of y by the counting loop.
X, y = (file_handler.root.train.train_raw.X,
        file_handler.root.train.train_raw.y)
# Test-side node; no matching target node is bound for it here.
X_t = file_handler.root.test.test_raw.X_t
In [5]:
# Read the 25 objects that were pickled back-to-back into colscate.pkl.
# The `with` block closes the file even if one of the loads raises.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# that were produced by a trusted run of this project.
with open('colscate.pkl', 'rb') as f:
    colcate = [cPickle.load(f) for i in range(25)]
In [6]:
# Read the 17 objects that were pickled back-to-back into
# colscatetest5_1518.pkl. The `with` block closes the file even if one of
# the loads raises.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# that were produced by a trusted run of this project.
with open('colscatetest5_1518.pkl', 'rb') as f:
    colcatetest = [cPickle.load(f) for i in range(17)]
In [7]:
# Hand-written list of 18 feature indices.
# NOTE(review): no other visible cell reads `featureindex` -- confirm it is
# still needed.
featureindex = [1,2,3,6,8,9,12,13,15,16,17,19,20,21,22,23,24,25]
In [8]:
# Load the single object stored in dayrows.pkl.
# The file is opened in binary mode, like the other pickle files in this
# notebook: text mode can corrupt binary pickle protocols on platforms that
# translate line endings. The `with` block closes the file even on error.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# that were produced by a trusted run of this project.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)
In [9]:
# Sanity check: display the shape of the training feature node.
X.shape
Out[9]:
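Columns to be clustered. Each entry of `cols` is a column number; the counting loops read the value at position `cols[ithcol] - 1` of every sample.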
In [10]:
# Column numbers of the 12 features analysed below. The counting loops
# subtract one from the chosen entry before indexing a sample row.
cols = [5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 22, 25]
In [11]:
# Number of entries in `cols`.
# NOTE(review): no other visible cell reads `numbercols`.
numbercols = len(cols)
Index into `colcate` for each entry of `cols` (same order as `cols`).
In [12]:
# trainindex[k] is the position in `colcate` that corresponds to cols[k];
# it is used to pick the train-side value list for the selected column.
trainindex = [3, 4, 6, 7, 9, 10, 12, 13, 16, 17, 20, 23]
Index into `colcatetest` for each entry of `cols` (same order as `cols`).
In [13]:
# testindex[k] is the position in `colcatetest` that corresponds to cols[k];
# it is used to pick the test-side value list for the selected column.
testindex = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12, 13, 15]
In [14]:
# Position within cols / trainindex / testindex of the column analysed by
# the cells below. Change it and re-run those cells to analyse another column.
ithcol = 0
In [15]:
colsi = np.unique(np.concatenate((colcate[trainindex[ithcol]],colcatetest[testindex[ithcol]])))
print 'features shape: ', colsi.shape
colid = {x: [0,0,0] for x in colsi}
In [16]:
#train data
start = time.time()
#flag=0
for sample, target in itertools.izip(X, y):
cr = colid[sample[cols[ithcol]-1]]
cr[0]+=1
cr[1]+=target[0]
#flag+=1
#if flag==100:
# break
end = time.time()
print 'train data:', end-start, 'second'
In [17]:
#test data
start = time.time()
for sample in X_t:
colid[sample[cols[ithcol]-1]][2]+=1
end = time.time()
print 'test data:', end-start, 'second'
In [37]:
# Freeze the accumulators into arrays: `keys` holds the distinct values in
# sorted order and row r of `coli` holds the three counters of keys[r]
# (train count, target sum, test count) as float32.
keys = np.array(sorted(colid.keys()))
coli = np.zeros((len(keys), 3), dtype=np.float32)
for row, key in enumerate(keys):
    coli[row, :] = colid[key]
In [19]:
# Split the distinct values by where they occur.
# a: value never seen in the training data (train count is zero).
a = coli[:, 0] == 0.
notrain = keys[a]
# b: value never seen in the test data (test count is zero).
b = coli[:, 2] == 0
notest = keys[b]
# c: value seen in both. Built with vectorized boolean ops so the mask is
# always a boolean array of the same length as `keys`.
c = ~(a | b)
traintest = keys[c]
# NOTE: this overwrites `coli` with the filtered rows, so the masks above are
# only valid for the first run after `coli` has been rebuilt from `colid`.
coli = coli[c]
In [38]:
# Keep every value that occurs in the training data, whether or not it also
# occurs in the test data. `a` marks values with a zero train count.
c = np.logical_not(a)
# `c` has one entry per key, but `coli` may already have been narrowed by the
# previous filtering cell. Rebuild the unfiltered matrix from `colid` so the
# mask and the rows line up in top-to-bottom execution order.
coli_all = np.zeros((len(keys), 3), dtype=np.float32)
for row, key in enumerate(keys):
    coli_all[row, :] = colid[key]
# After this line the rows of `coli` correspond to keys[c].
coli = coli_all[c]
In [40]:
print notrain.shape
print notest.shape
print traintest.shape
print coli.shape
In [60]:
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
colistandard = scaler.fit_transform(coli)
print 'mean:', scaler.mean_
print 'std:', scaler.std_
In [21]:
# 3-D scatter of the standardized statistics: one point per value, with the
# three columns of `colistandard` on the x, y and z axes.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(colistandard[:, 0], colistandard[:, 1], colistandard[:, 2],
           c='r', marker='o', alpha=0.3)
ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')
# Set the camera (elevation 70, azimuth 30) before rendering; calling it
# after plt.show() has no effect on the figure that was already drawn.
ax.view_init(70, 30)
plt.show()
In [22]:
from sklearn import ensemble
In [55]:
# Fit a random-trees embedding on the first two raw (unstandardized)
# statistics columns of `coli` and keep the sparse encoding it returns.
rte = ensemble.RandomTreesEmbedding(
    n_estimators=15,
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    max_leaf_nodes=None,
    sparse_output=True,
    n_jobs=4,
    random_state=1988,
    verbose=0,
    min_density=None,
)
transformed = rte.fit_transform(coli[:, :2])
In [ ]:
# `apply` needs the samples to route through the trees; pass the same two
# columns the embedding was fitted on.
rte.apply(coli[:, :2])
In [56]:
# Display the fitted embedding's feature-importance vector (the model was
# fitted on the two columns coli[:, :2]).
rte.feature_importances_
Out[56]:
In [57]:
# Display the embedding returned by fit_transform (sparse_output=True was
# requested when the model was built).
transformed
Out[57]:
In [58]:
# Densify row 3 of the embedding to inspect its individual entries.
transformed[3].todense()
Out[58]:
In [27]:
from sklearn import decomposition
In [59]:
# Project the sparse embedding onto two SVD components for plotting.
# The randomized solver is stochastic, so it is seeded with the same value
# as the embedding to make `reduced` and the plots reproducible across runs.
tsvd = decomposition.TruncatedSVD(n_components=2, algorithm='randomized', n_iter=5,
                                  random_state=1988, tol=0.0, n_iterations=None)
reduced = tsvd.fit_transform(transformed)
In [60]:
# Display the SVD projection of the embedding.
reduced
Out[60]:
In [61]:
# Display the raw (unstandardized) statistics row at index 1 of `coli`.
coli[1]
Out[61]:
In [62]:
# 2-D scatter of the SVD projection: one blue dot per value, first component
# on x and second on y. The axes are labelled so the figure can be read on
# its own.
fig, axes = plt.subplots()
axes.set_xlabel('SVD component 0')
axes.set_ylabel('SVD component 1')
axes.plot(reduced[:, 0], reduced[:, 1], 'bo')
plt.show()
# Uncomment to export the figure:
#fig.savefig("rf_featureimportance_more_mul.svg")
In [36]:
# A 3-D view needs three components, but `reduced` is fitted with
# n_components=2, so reduced[:, 2] would be out of bounds. Fit a separate
# three-component projection of the embedding for this plot, seeded with the
# same value as the embedding so the figure is reproducible.
tsvd3d = decomposition.TruncatedSVD(n_components=3, algorithm='randomized', n_iter=5,
                                    random_state=1988, tol=0.0)
reduced3d = tsvd3d.fit_transform(transformed)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(reduced3d[:, 0], reduced3d[:, 1], reduced3d[:, 2],
           c='r', marker='o', alpha=0.3)
ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')
# Set the camera (elevation 70, azimuth 30) before rendering; calling it
# after plt.show() has no effect on the figure that was already drawn.
ax.view_init(70, 30)
plt.show()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: