In [1]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools
In [2]:
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing
In [3]:
file_handler = tables.open_file("click_data.h5", mode = "r")
In [4]:
X = file_handler.root.train.train_raw.X
y = file_handler.root.train.train_raw.y
X_t = file_handler.root.test.test_raw.X_t
In [5]:
f = open('colscate.pkl', 'rb')
colcate = []
for i in range(25):
    colcate.append(cPickle.load(f))
f.close()
In [6]:
f = open('colscatetest5_1518.pkl', 'rb')
colcatetest = []
for i in range(17):
    colcatetest.append(cPickle.load(f))
f.close()
In [7]:
#featureindex = [1,2,3,6,8,9,12,13,15,16,17,19,20,21,22,23,24,25]
In [8]:
f = open('dayrows.pkl', 'rb')
dayrows = cPickle.load(f)
f.close()
In [9]:
X.shape
Out[9]:
Columns (1-based column numbers) to be clustered
In [10]:
cols = [5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 22, 25]
In [11]:
numbercols = len(cols)
In [12]:
numbercols
Out[12]:
Index into colcate for each column in cols
In [13]:
trainindex = [3, 4, 6, 7, 9, 10, 12, 13, 16, 17, 20, 23]
Index into colcatetest for each column in cols
In [14]:
testindex = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12, 13, 15]
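A quick sanity check (not part of the original run) that the three hard-coded index lists line up one-to-one:
In [ ]:
# cols, trainindex and testindex should all have one entry per clustered column
print len(cols), len(trainindex), len(testindex)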
In [175]:
ithcol = 11
In [176]:
colsi = np.unique(np.concatenate((colcate[trainindex[ithcol]],colcatetest[testindex[ithcol]])))
print 'features shape: ', colsi.shape
# colid maps category value -> [train count, click count, test count]
colid = {x: [0,0,0] for x in colsi}
In [179]:
X = file_handler.root.train.train_raw.X
y = file_handler.root.train.train_raw.y
X_t = file_handler.root.test.test_raw.X_t
In [180]:
#train data
start = time.time()
#flag=0
for sample, target in itertools.izip(X, y):
    cr = colid[sample[cols[ithcol]-1]]
    cr[0]+=1
    cr[1]+=target[0]
    #flag+=1
    #if flag==100:
    #    break
end = time.time()
print 'train data:', end-start, 'second'
In [181]:
#test data
start = time.time()
for sample in X_t:
    colid[sample[cols[ithcol]-1]][2]+=1
end = time.time()
print 'test data:', end-start, 'second'
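The row-by-row loops above are slow in pure Python. The cell below is a vectorized sketch of the train-side counts (not part of the original run), assuming the selected column and the click targets can be read into memory from the PyTables arrays; the variable names are illustrative.
In [ ]:
# Vectorized train counts per category (sketch; assumes the column fits in memory)
col_idx = cols[ithcol] - 1
col_values = np.asarray(X[:, col_idx])                # one raw-data column
clicks = np.asarray(y[:, 0])                          # click targets
uniques, inverse = np.unique(col_values, return_inverse=True)
train_counts = np.bincount(inverse)                   # occurrences per category
click_sums = np.bincount(inverse, weights=clicks)     # clicks per category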
Convert the count dictionary to a NumPy array
In [182]:
keys = sorted(colid.keys())
keys = np.array(keys)
coli = np.zeros((len(keys),3), dtype=np.float32)
for i in xrange(len(keys)):
    for j in range(3):
        coli[i][j] = colid[keys[i]][j]
Drop categories that are missing from either the train or the test data
In [183]:
a = coli[:,0] == 0.
notrain = keys[a]
b = coli[:,2] == 0
notest = keys[b]
c = []
for i,j in itertools.izip(a,b):
    c.append(not (i or j))
c = np.array(c)
traintest = keys[c]
coli = coli[c]
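The boolean loop above has a one-line vectorized equivalent:
In [ ]:
# Keep categories that appear in both train and test
c = ~(a | b)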
In [184]:
print notrain.shape
print notest.shape
print traintest.shape
Click counts to click rate (this code was not used; a later cell multiplies the counts back)
In [58]:
coli[:,1] = coli[:,1]/coli[:,0]
In [59]:
coli
Out[59]:
In [54]:
coli[:,1] = coli[:,1]*coli[:,0]
In [47]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(coli[:,0], coli[:,1], 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [185]:
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
colistandard = scaler.fit_transform(coli)
print 'mean:', scaler.mean_
print 'std:', scaler.std_
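A quick check (not part of the original run) that the standardization behaved as expected, i.e. roughly zero mean and unit variance per column:
In [ ]:
print colistandard.mean(axis=0)
print colistandard.std(axis=0)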
In [186]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(colistandard[:,0], colistandard[:,1], 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [24]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(colistandard[:,0], colistandard[:,1], colistandard[:,2], c='r', marker='o', alpha=0.3)
ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')
#ax.set_xlim3d(left=-1, right=1)
#ax.set_ylim3d(bottom=-1, top=1)
#ax.set_zlim3d(bottom=-1, top=1)
#u = np.linspace(0, 2 * np.pi, 100)
#v = np.linspace(0, np.pi, 100)
# gravity on earth from wikipedia
#g = 9.80665
#g=1
#x = g * np.outer(np.cos(u), np.sin(v))
#y = g * np.outer(np.sin(u), np.sin(v))
#z = g * np.outer(np.ones(np.size(u)), np.cos(v))
# alpha sets the transparency of the figure, float (0.0 transparent through 1.0 opaque)
#ax.plot_surface(x, y, z, rstride=4, cstride=4, color='y', alpha=0.3)
plt.show()
ax.view_init(70, 30)
In [52]:
from sklearn import manifold
In [187]:
colistandard.shape
Out[187]:
In [188]:
tsne = manifold.TSNE(n_components=2, perplexity=5.0, early_exaggeration=4.0, learning_rate=1000.0, \
n_iter=1000, metric='euclidean', init='pca', verbose=0, random_state=1988)
transformed = tsne.fit_transform(colistandard.astype(np.float64))
In [189]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(transformed[:,0], transformed[:,1], 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [ ]:
isomap = manifold.Isomap(n_neighbors=5, n_components=2, eigen_solver='auto', tol=0, max_iter=None, path_method='auto', \
neighbors_algorithm='auto')
isotransformed = isomap.fit_transform(colistandard)
In [29]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(isotransformed[:,0], isotransformed[:,1], 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [ ]:
lle = manifold.LocallyLinearEmbedding(n_neighbors=5, n_components=2, reg=0.001, eigen_solver='auto', \
tol=1e-06, max_iter=100, method='standard', hessian_tol=0.0001, \
modified_tol=1e-12, neighbors_algorithm='auto', random_state=1988)
lletransformed = lle.fit_transform(colistandard)
In [31]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(lletransformed[:,0], lletransformed[:,1], 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [ ]:
mds = manifold.MDS(n_components=2, metric=True, n_init=4, max_iter=300, verbose=0, \
eps=0.001, n_jobs=1, random_state=1988, dissimilarity='euclidean')
mdstransformed = mds.fit_transform(colistandard.astype(np.float64))
In [33]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(mdstransformed[:,0], mdstransformed[:,1], 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [ ]:
se = manifold.SpectralEmbedding(n_components=2, affinity='nearest_neighbors', gamma=None, \
random_state=1988, eigen_solver=None, n_neighbors=None)
setransformed = se.fit_transform(colistandard)
In [35]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(setransformed[:,0], setransformed[:,1], 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [56]:
from sklearn import cluster
In [140]:
db = cluster.DBSCAN(eps=2.3, min_samples=3, metric='euclidean', algorithm='auto', leaf_size=30, p=None, random_state=1988)
db.fit_predict(transformed, y=None)
core_samples_db = db.core_sample_indices_
labels_db = db.labels_
print 'labels:', np.unique(labels_db)
In [ ]:
db = cluster.DBSCAN(eps=1, min_samples=3, metric='euclidean', algorithm='auto', leaf_size=30, p=None, random_state=1988)
db.fit_predict(coli[:,:2], y=None)
core_samples_db = db.core_sample_indices_
labels_db = db.labels_
print 'labels:', np.unique(labels_db)
In [141]:
len(transformed[labels_db == -1])
Out[141]:
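Per-cluster sizes are a quick way to judge the eps/min_samples choice (label -1 is DBSCAN noise); a small sketch, not part of the original run:
In [ ]:
# Size of each DBSCAN cluster
for lbl in np.unique(labels_db):
    print lbl, np.sum(labels_db == lbl)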
In [142]:
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
#axes.set_xlim(left=-1, right=1)
#axes.set_ylim(bottom=-1, top=1)
unique_labels = set(labels_db)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'
    class_member_mask = (labels_db == k)
    xy = transformed[class_member_mask]
    axes.plot(xy[:,0], xy[:,1], 'o', markerfacecolor=col,
              markeredgecolor='k')
plt.show()
#ax.view_init(70, 30)
In [190]:
colis = {x: 0 for x in colsi}
In [191]:
keys
Out[191]:
In [192]:
labels = labels_db+2
In [193]:
for key in notrain:
    colis[key] = 0
for key in notest:
    colis[key] = 1
In [194]:
flag=0
clusterlabel = np.unique(labels_db).shape[0]+1
for key,k in itertools.izip(traintest, xrange(len(traintest))):
    if labels[k] == 1:
        colis[key] = clusterlabel+flag
        flag+=1
    else:
        colis[key] = labels[k]
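A small sanity check on the relabelling (illustrative only, not part of the original run):
In [ ]:
# Number of distinct labels assigned across all categories
print 'distinct labels:', len(set(colis.values()))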
Alternatively, skip the clustering and give each category its own label
In [170]:
for key,k in itertools.izip(traintest, xrange(len(traintest))):
    colis[key] = k+2
In [195]:
colis
Out[195]:
In [196]:
len(colis)
Out[196]:
In [197]:
f = open('clustercol'+str(cols[ithcol])+'.pkl', 'wb')
cPickle.dump(colis, f, -1)
f.close()
In [198]:
f = open('countcol'+str(cols[ithcol])+'.pkl', 'wb')
cPickle.dump(colid, f, -1)
f.close()
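The saved mapping can be reloaded later when the clustered column is turned into a feature; a usage sketch (clustermap is just an illustrative name):
In [ ]:
f = open('clustercol'+str(cols[ithcol])+'.pkl', 'rb')
clustermap = cPickle.load(f)
f.close()
print len(clustermap)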
In [74]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
#ax.scatter(col5standard[:,0], col5standard[:,1], col5standard[:,2], c='r', marker='o')
ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')
ax.set_xlim3d(left=-1, right=1)
ax.set_ylim3d(bottom=-1, top=1)
ax.set_zlim3d(bottom=-1, top=1)
unique_labels = set(labels_db)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'
    class_member_mask = (labels_db == k)
    xyz = colistandard[class_member_mask]
    ax.scatter(xyz[:,0], xyz[:,1], xyz[:,2], c=col, marker='o')
#u = np.linspace(0, 2 * np.pi, 100)
#v = np.linspace(0, np.pi, 100)
# gravity on earth from wikipedia
#g = 9.80665
#g=1
#x = g * np.outer(np.cos(u), np.sin(v))
#y = g * np.outer(np.sin(u), np.sin(v))
#z = g * np.outer(np.ones(np.size(u)), np.cos(v))
# alpha sets the transparency of the figure, float (0.0 transparent through 1.0 opaque)
#ax.plot_surface(x, y, z, rstride=4, cstride=4, color='y', alpha=0.3)
plt.show()
ax.view_init(70, 30)
In [ ]:
metrics.silhouette_score(transformed, labels, metric='euclidean', sample_size=None, random_state=1988)
In [ ]:
metrics.silhouette_samples(transformed, labels, metric='euclidean')
In [67]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=2, copy=True, whiten=False)
pcatransformed = pca.fit_transform(colistandard)
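The explained-variance ratio shows how much of the standardized counts the two principal components retain:
In [ ]:
print 'explained variance ratio:', pca.explained_variance_ratio_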
In [69]:
######################################
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
axes.plot(pcatransformed[:,0], pcatransformed[:,1], 'bo')
#axes.plot(pcatransformed, 'bo')
plt.show()
#fig.savefig("rf_featureimportance_more_mul.svg")
#######################################
In [ ]:
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
#axes.set_xlim(left=-1, right=1)
#axes.set_ylim(bottom=-1, top=1)
unique_labels = set(labels_db)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'
    class_member_mask = (labels_db == k)
    xyz = transformed[class_member_mask]
    axes.plot(xyz[:,0], xyz[:,1], 'o', markerfacecolor=col,
              markeredgecolor='k')
plt.show()
#ax.view_init(70, 30)
In [ ]:
In [ ]:
# not used: the method below takes too long to run
col5 = np.zeros((colcate[3].shape[0], 2), dtype=np.float32)
start = time.time()
flag=0
for sample, target in itertools.izip(X, y):
    # note: fancy indexing returns a copy, so col5 is never actually updated here
    cr = col5[np.where(colcate[3]==sample[4])[0]]
    cr[:,0]+=1
    cr[:,1]+=target[0]
    flag+=1
    if flag==100:
        break
end = time.time()
print end-start, 'second'
col5t = np.zeros((colcate[3].shape[0], 1), dtype=np.float32)
start = time.time()
for sample in X_t:
    index = np.where(colcate[3]==sample[4])[0]
    if index.nbytes != 0:
        col5t[index][0]+=1
end = time.time()
print end-start, 'second'
In [ ]: