In [1]:
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import time
import cPickle
import tables
import numpy as np
import scipy as sp
import itertools

In [2]:
from sklearn import grid_search
from sklearn import cross_validation
from sklearn import metrics
from sklearn import preprocessing

In [3]:
# Open the HDF5 store read-only. The handle is kept open for the whole
# notebook because X, y and X_t below are nodes read lazily from it.
file_handler = tables.open_file("click_data.h5", mode = "r")

In [4]:
# Node handles inside the open HDF5 file: training rows, training targets
# and test rows. Later cells iterate over these row by row.
X = file_handler.root.train.train_raw.X
y = file_handler.root.train.train_raw.y
X_t = file_handler.root.test.test_raw.X_t

feature extraction


In [5]:
# Load the 25 objects that were pickled back-to-back into colscate.pkl.
# `with` guarantees the file is closed even if unpickling fails part-way.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('colscate.pkl', 'rb') as f:
    colcate = [cPickle.load(f) for i in range(25)]

In [6]:
# Load the 17 objects that were pickled back-to-back into the test-side file.
# `with` guarantees the file is closed even if unpickling fails part-way.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('colscatetest5_1518.pkl', 'rb') as f:
    colcatetest = [cPickle.load(f) for i in range(17)]

In [7]:
#featureindex = [1,2,3,6,8,9,12,13,15,16,17,19,20,21,22,23,24,25]

In [8]:
# Pickles are binary data: open with 'rb' (text mode can corrupt the stream
# on platforms that translate line endings), and let `with` close the file.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('dayrows.pkl', 'rb') as f:
    dayrows = cPickle.load(f)

In [9]:
X.shape


Out[9]:
(47686351,)

Columns (as numbered in my notebook) to be clustered


In [10]:
# Column numbers to process. They are 1-based: the counting cells read
# sample[cols[ithcol]-1].
cols = [5, 6, 8, 9, 11, 12, 14, 15, 18, 19, 22, 25]

In [11]:
numbercols = len(cols)

In [12]:
numbercols


Out[12]:
12

Position in `colcate` of each column listed in `cols`


In [13]:
# trainindex[i] is the position in `colcate` of the category array that
# belongs to cols[i] (used as colcate[trainindex[ithcol]]).
trainindex = [3, 4, 6, 7, 9, 10, 12, 13, 16, 17, 20, 23]

Position in `colcatetest` of each column listed in `cols`


In [14]:
# testindex[i] is the position in `colcatetest` of the category array that
# belongs to cols[i] (used as colcatetest[testindex[ithcol]]).
testindex = [0, 1, 3, 4, 6, 7, 9, 10, 11, 12, 13, 15]

Start from here


In [175]:
ithcol = 11

In [176]:
colsi = np.unique(np.concatenate((colcate[trainindex[ithcol]],colcatetest[testindex[ithcol]])))
print 'features shape: ', colsi.shape
colid = {x: [0,0,0] for x in colsi}


features shape:  (171,)

In [179]:
# Re-bind the data nodes from the open HDF5 file (same statements as the
# cell near the top of the notebook) before the counting passes below.
X = file_handler.root.train.train_raw.X
y = file_handler.root.train.train_raw.y
X_t = file_handler.root.test.test_raw.X_t

In [180]:
#train data
start = time.time()
#flag=0
for sample, target in itertools.izip(X, y):
    cr = colid[sample[cols[ithcol]-1]]
    cr[0]+=1
    cr[1]+=target[0]
    #flag+=1
    #if flag==100:
    #    break
end = time.time()
print 'train data:', end-start, 'second'


train data: 101.175451994 second

In [181]:
#test data
start = time.time()
for sample in X_t:
    colid[sample[cols[ithcol]-1]][2]+=1        
end = time.time()
print 'test data:', end-start, 'second'


test data: 1.55491185188 second

dictionary to nparray


In [182]:
# Category values in ascending order, and a matching (n_categories, 3)
# float32 table holding each category's three counters row by row.
keys = np.array(sorted(colid.keys()))
coli = np.array([colid[key] for key in keys], dtype=np.float32).reshape(len(keys), 3)

Drop categories that are missing from either the train or the test data


In [183]:
# Boolean masks over the rows of coli / entries of keys.
a = coli[:, 0] == 0          # category never seen in train
b = coli[:, 2] == 0          # category never seen in test
notrain = keys[a]
notest = keys[b]
# Keep only categories present in both sets. Vectorised, so the mask stays
# boolean even when there are no categories at all (a Python-built empty list
# would turn into a float array and break the indexing below).
c = ~(a | b)
traintest = keys[c]
# NOTE: this rebinds coli to the filtered rows while keys keeps its full
# length, so re-running this cell without rebuilding coli mismatches the two.
coli = coli[c]

In [184]:
print notrain.shape
print notest.shape
print traintest.shape


(0,)
(8,)
(163,)

click counts to click rate

Do not use the code below.


In [58]:
# Turn the click count into a click rate, in place (column 1 / column 0).
coli[:, 1] /= coli[:, 0]

In [59]:
coli


Out[59]:
array([[  8.88000000e+02,   4.56081092e-01,   9.10000000e+01],
       [  2.41400000e+03,   1.73985083e-02,   3.13000000e+02],
       [  1.38680000e+04,   8.89097229e-02,   4.50000000e+01],
       ..., 
       [  1.57682000e+05,   7.53351673e-02,   1.34560000e+04],
       [  1.80000000e+01,   2.77777791e-01,   4.00000000e+00],
       [  6.00000000e+01,   3.00000012e-01,   3.00000000e+00]], dtype=float32)

In [54]:
# Undo the rate conversion, in place: multiply column 1 by column 0 again.
coli[:, 1] *= coli[:, 0]

In [47]:
# Raw per-category statistics: train row count against the click column.
fig, axes = plt.subplots()
axes.set_xlabel('train rows per category')
axes.set_ylabel('clicks per category (count, or rate after conversion)')
axes.set_title('Column %d: raw category statistics' % cols[ithcol])
axes.plot(coli[:,0], coli[:,1], 'bo')
plt.show()

Standardization


In [185]:
scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
colistandard = scaler.fit_transform(coli) 
print 'mean:', scaler.mean_ 
print 'std:', scaler.std_


mean: [ 291570.40625      50390.1484375    29260.12890625]
std: [ 1894564.25       371824.9375     194862.453125]

In [186]:
# Standardised per-category statistics: first column against second column.
fig, axes = plt.subplots()
axes.set_xlabel('train rows per category (standardized)')
axes.set_ylabel('clicks per category (standardized)')
axes.set_title('Column %d: standardized category statistics' % cols[ithcol])
axes.plot(colistandard[:,0], colistandard[:,1], 'bo')
plt.show()

In [24]:
# 3-D scatter of the three standardised per-category statistics.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(colistandard[:,0], colistandard[:,1], colistandard[:,2], c='r', marker='o', alpha=0.3)

ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')
#ax.set_xlim3d(left=-1, right=1)
#ax.set_ylim3d(bottom=-1, top=1)
#ax.set_zlim3d(bottom=-1, top=1)

# Set the camera (elevation 70, azimuth 30) BEFORE rendering; calling it
# after plt.show() has no effect on the figure that was already drawn.
ax.view_init(70, 30)

plt.show()


/home/whale/anaconda/lib/python2.7/site-packages/mpl_toolkits/mplot3d/axes3d.py:1094: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if self.button_pressed in self._rotate_btn:

feature clustering

Manifold


In [52]:
from sklearn import manifold

In [187]:
colistandard.shape


Out[187]:
(163, 3)

t-SNE


In [188]:
# 2-D t-SNE embedding of the standardised statistics (PCA initialisation,
# fixed seed). The input is passed as float64.
tsne = manifold.TSNE(
    n_components=2,
    perplexity=5.0,
    early_exaggeration=4.0,
    learning_rate=1000.0,
    n_iter=1000,
    metric='euclidean',
    init='pca',
    verbose=0,
    random_state=1988,
)
transformed = tsne.fit_transform(colistandard.astype(np.float64))

In [189]:
# Scatter of the 2-D t-SNE embedding, one point per category.
fig, axes = plt.subplots()
axes.set_xlabel('t-SNE dimension 1')
axes.set_ylabel('t-SNE dimension 2')
axes.set_title('t-SNE embedding of category statistics')
axes.plot(transformed[:,0], transformed[:,1], 'bo')
plt.show()

isomap


In [ ]:
# 2-D Isomap embedding built from each point's 5 nearest neighbours.
isomap = manifold.Isomap(
    n_neighbors=5,
    n_components=2,
    eigen_solver='auto',
    tol=0,
    max_iter=None,
    path_method='auto',
    neighbors_algorithm='auto',
)
isotransformed = isomap.fit_transform(colistandard)

In [29]:
# Scatter of the 2-D Isomap embedding, one point per category.
fig, axes = plt.subplots()
axes.set_xlabel('Isomap dimension 1')
axes.set_ylabel('Isomap dimension 2')
axes.set_title('Isomap embedding of category statistics')
axes.plot(isotransformed[:,0], isotransformed[:,1], 'bo')
plt.show()

LLE


In [ ]:
# 2-D standard locally-linear embedding from 5 nearest neighbours, fixed seed.
lle = manifold.LocallyLinearEmbedding(
    n_neighbors=5,
    n_components=2,
    reg=0.001,
    eigen_solver='auto',
    tol=1e-06,
    max_iter=100,
    method='standard',
    hessian_tol=0.0001,
    modified_tol=1e-12,
    neighbors_algorithm='auto',
    random_state=1988,
)
lletransformed = lle.fit_transform(colistandard)

In [31]:
# Scatter of the 2-D LLE embedding, one point per category.
fig, axes = plt.subplots()
axes.set_xlabel('LLE dimension 1')
axes.set_ylabel('LLE dimension 2')
axes.set_title('LLE embedding of category statistics')
axes.plot(lletransformed[:,0], lletransformed[:,1], 'bo')
plt.show()

MDS


In [ ]:
# 2-D metric MDS on Euclidean dissimilarities (4 initialisations, fixed seed).
# The input is passed as float64.
mds = manifold.MDS(
    n_components=2,
    metric=True,
    n_init=4,
    max_iter=300,
    verbose=0,
    eps=0.001,
    n_jobs=1,
    random_state=1988,
    dissimilarity='euclidean',
)
mdstransformed = mds.fit_transform(colistandard.astype(np.float64))

In [33]:
# Scatter of the 2-D MDS embedding, one point per category.
fig, axes = plt.subplots()
axes.set_xlabel('MDS dimension 1')
axes.set_ylabel('MDS dimension 2')
axes.set_title('MDS embedding of category statistics')
axes.plot(mdstransformed[:,0], mdstransformed[:,1], 'bo')
plt.show()

Spectral Embedding


In [ ]:
# 2-D spectral embedding on a nearest-neighbours affinity graph, fixed seed.
se = manifold.SpectralEmbedding(
    n_components=2,
    affinity='nearest_neighbors',
    gamma=None,
    random_state=1988,
    eigen_solver=None,
    n_neighbors=None,
)
setransformed = se.fit_transform(colistandard)

In [35]:
# Scatter of the 2-D spectral embedding, one point per category.
fig, axes = plt.subplots()
axes.set_xlabel('spectral embedding dimension 1')
axes.set_ylabel('spectral embedding dimension 2')
axes.set_title('Spectral embedding of category statistics')
axes.plot(setransformed[:,0], setransformed[:,1], 'bo')
plt.show()

DBSCAN


In [56]:
from sklearn import cluster

In [140]:
db = cluster.DBSCAN(eps=2.3, min_samples=3, metric='euclidean', algorithm='auto', leaf_size=30, p=None, random_state=1988)
db.fit_predict(transformed, y=None)
core_samples_db = db.core_sample_indices_
labels_db = db.labels_
print 'labels:', np.unique(labels_db)


labels: [ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
 125 126 127]

    db = cluster.DBSCAN(eps=1, min_samples=3, metric='euclidean', algorithm='auto', leaf_size=30, p=None, random_state=1988)
    db.fit_predict(coli[:,:2], y=None)
    core_samples_db = db.core_sample_indices_
    labels_db = db.labels_
    print 'labels:', np.unique(labels_db)


In [141]:
len(transformed[labels_db == -1])


Out[141]:
35

In [142]:
# t-SNE embedding coloured by DBSCAN cluster; noise points (-1) in black.
fig, axes = plt.subplots()
axes.set_xlabel('t-SNE dimension 1')
axes.set_ylabel('t-SNE dimension 2')
axes.set_title('DBSCAN clusters on the t-SNE embedding')
#axes.set_xlim(left=-1, right=1)
#axes.set_ylim(bottom=-1, top=1)

# Sorted unique labels, so colours follow the label order along the colormap.
unique_labels = np.unique(labels_db)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels_db == k)

    xy = transformed[class_member_mask]

    axes.plot(xy[:,0], xy[:,1], 'o', markerfacecolor=col,
             markeredgecolor='k')

plt.show()

Storage


In [190]:
# Category value -> cluster label, every category starting at label 0.
colis = dict.fromkeys(colsi, 0)

In [191]:
keys


Out[191]:
array([    -1, 100000, 100001, 100002, 100003, 100004, 100005, 100006,
       100008, 100010, 100012, 100013, 100016, 100019, 100020, 100021,
       100022, 100024, 100025, 100026, 100027, 100028, 100029, 100031,
       100032, 100033, 100034, 100037, 100039, 100040, 100041, 100043,
       100046, 100048, 100049, 100050, 100051, 100052, 100053, 100054,
       100055, 100056, 100057, 100058, 100059, 100060, 100061, 100062,
       100063, 100064, 100065, 100068, 100070, 100071, 100072, 100073,
       100074, 100075, 100076, 100077, 100078, 100079, 100081, 100082,
       100083, 100084, 100086, 100087, 100088, 100090, 100091, 100093,
       100094, 100095, 100096, 100097, 100098, 100099, 100100, 100101,
       100103, 100105, 100106, 100107, 100108, 100109, 100111, 100112,
       100113, 100114, 100117, 100119, 100121, 100122, 100123, 100124,
       100126, 100128, 100130, 100131, 100132, 100133, 100134, 100135,
       100137, 100138, 100139, 100141, 100143, 100144, 100148, 100149,
       100150, 100151, 100152, 100153, 100155, 100156, 100157, 100160,
       100161, 100162, 100163, 100165, 100166, 100168, 100169, 100170,
       100171, 100172, 100173, 100175, 100176, 100177, 100178, 100179,
       100181, 100182, 100183, 100185, 100186, 100187, 100188, 100189,
       100190, 100191, 100192, 100193, 100194, 100195, 100198, 100199,
       100200, 100202, 100205, 100206, 100210, 100212, 100213, 100215,
       100217, 100221, 100224, 100225, 100228, 100229, 100233, 100241,
       100244, 100246, 100248])

In [192]:
# Shift the DBSCAN labels up by 2, so noise (-1) becomes 1 and cluster ids
# start at 2. The following cells store 0 for `notrain` keys and 1 for
# `notest` keys, and re-number entries whose shifted label is 1.
labels = labels_db+2

In [193]:
# Fixed labels for categories missing on one side: 0 when absent from train,
# 1 when absent from test (applied second, so it wins if a key is in both).
colis.update(dict.fromkeys(notrain, 0))
colis.update(dict.fromkeys(notest, 1))

In [194]:
# Store the cluster label of every category present in both train and test.
# labels[k] must describe traintest[k]; refuse to continue if the clustering
# was produced for a different set of categories (e.g. labels left over from
# an earlier column), otherwise wrong labels would be stored silently.
if len(labels) != len(traintest):
    raise ValueError('labels has %d entries but traintest has %d; '
                     're-run the clustering for this column'
                     % (len(labels), len(traintest)))

flag=0
# First label value used for noise points: number of distinct DBSCAN labels + 1.
clusterlabel = np.unique(labels_db).shape[0]+1
for k, key in enumerate(traintest):
    if labels[k] == 1:
        # Shifted noise label (-1 + 2): give each noise point its own label.
        colis[key] = clusterlabel+flag
        flag+=1
    else:
        colis[key] = labels[k]

Alternatively, skip the clustering and give every category its own label


In [170]:
# No clustering: the k-th category of traintest simply gets label k+2.
for k, key in enumerate(traintest):
    colis[key] = k+2

In [195]:
colis


Out[195]:
{-1: 121,
 100000: 121,
 100001: 121,
 100002: 121,
 100003: 70,
 100004: 3,
 100005: 130,
 100006: 9,
 100008: 1,
 100010: 9,
 100012: 131,
 100013: 9,
 100016: 9,
 100019: 20,
 100020: 51,
 100021: 33,
 100022: 33,
 100024: 33,
 100025: 33,
 100026: 14,
 100027: 1,
 100028: 89,
 100029: 33,
 100031: 1,
 100032: 1,
 100033: 1,
 100034: 89,
 100037: 33,
 100039: 33,
 100040: 89,
 100041: 33,
 100043: 33,
 100046: 33,
 100048: 33,
 100049: 3,
 100050: 49,
 100051: 82,
 100052: 82,
 100053: 5,
 100054: 82,
 100055: 82,
 100056: 39,
 100057: 39,
 100058: 39,
 100059: 1,
 100060: 103,
 100061: 103,
 100062: 103,
 100063: 12,
 100064: 12,
 100065: 103,
 100068: 132,
 100070: 59,
 100071: 84,
 100072: 84,
 100073: 12,
 100074: 87,
 100075: 37,
 100076: 37,
 100077: 20,
 100078: 60,
 100079: 60,
 100081: 51,
 100082: 70,
 100083: 51,
 100084: 51,
 100086: 12,
 100087: 51,
 100088: 49,
 100090: 49,
 100091: 19,
 100093: 11,
 100094: 11,
 100095: 19,
 100096: 19,
 100097: 19,
 100098: 19,
 100099: 19,
 100100: 11,
 100101: 11,
 100103: 98,
 100105: 98,
 100106: 124,
 100107: 46,
 100108: 46,
 100109: 127,
 100111: 14,
 100112: 87,
 100113: 87,
 100114: 87,
 100117: 87,
 100119: 87,
 100121: 87,
 100122: 37,
 100123: 37,
 100124: 93,
 100126: 93,
 100128: 93,
 100130: 71,
 100131: 71,
 100132: 22,
 100133: 83,
 100134: 81,
 100135: 5,
 100137: 5,
 100138: 40,
 100139: 80,
 100141: 80,
 100143: 80,
 100144: 80,
 100148: 106,
 100149: 106,
 100150: 3,
 100151: 3,
 100152: 26,
 100153: 26,
 100155: 9,
 100156: 50,
 100157: 50,
 100160: 50,
 100161: 82,
 100162: 82,
 100163: 82,
 100165: 103,
 100166: 128,
 100168: 128,
 100169: 128,
 100170: 128,
 100171: 107,
 100172: 107,
 100173: 107,
 100175: 46,
 100176: 46,
 100177: 46,
 100178: 46,
 100179: 70,
 100181: 70,
 100182: 62,
 100183: 70,
 100185: 112,
 100186: 112,
 100187: 1,
 100188: 112,
 100189: 112,
 100190: 39,
 100191: 39,
 100192: 39,
 100193: 39,
 100194: 9,
 100195: 97,
 100198: 1,
 100199: 97,
 100200: 39,
 100202: 93,
 100205: 133,
 100206: 91,
 100210: 96,
 100212: 39,
 100213: 98,
 100215: 58,
 100217: 82,
 100221: 103,
 100224: 12,
 100225: 12,
 100228: 12,
 100229: 84,
 100233: 41,
 100241: 41,
 100244: 40,
 100246: 83,
 100248: 5}

In [196]:
len(colis)


Out[196]:
171

In [197]:
# Persist the category -> cluster-label mapping for this column, using the
# highest pickle protocol. `with` closes the file even if dumping fails.
with open('clustercol'+str(cols[ithcol])+'.pkl', 'wb') as f:
    cPickle.dump(colis, f, cPickle.HIGHEST_PROTOCOL)

In [198]:
# Persist the per-category counters for this column, using the highest
# pickle protocol. `with` closes the file even if dumping fails.
with open('countcol'+str(cols[ithcol])+'.pkl', 'wb') as f:
    cPickle.dump(colid, f, cPickle.HIGHEST_PROTOCOL)

Testing


In [74]:
# 3-D scatter of the standardised statistics coloured by DBSCAN cluster;
# noise points (-1) are drawn in black. Axes are clipped to [-1, 1].
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.set_xlabel('0 Label')
ax.set_ylabel('1 Label')
ax.set_zlabel('2 Label')
ax.set_xlim3d(left=-1, right=1)
ax.set_ylim3d(bottom=-1, top=1)
ax.set_zlim3d(bottom=-1, top=1)

unique_labels = set(labels_db)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels_db == k)

    xyz = colistandard[class_member_mask]

    ax.scatter(xyz[:,0], xyz[:,1], xyz[:,2], c=col, marker='o')

# Set the camera (elevation 70, azimuth 30) BEFORE rendering; calling it
# after plt.show() has no effect on the figure that was already drawn.
ax.view_init(70, 30)

plt.show()

Silhouette Coefficient


In [ ]:
# Mean silhouette of the clustering. It is scored on `transformed`, the
# samples DBSCAN was fitted on; `X` is the raw training table and does not
# line up with `labels`. A stray trailing "`y" (a syntax error) was removed.
metrics.silhouette_score(transformed, labels, metric='euclidean', sample_size=None, random_state=1988)

In [ ]:
# Per-sample silhouette values, scored on `transformed` (the samples DBSCAN
# was fitted on) rather than on the raw training table `X`.
metrics.silhouette_samples(transformed, labels, metric='euclidean')

PCA


In [67]:
# NOTE(review): this import would normally live in the import cell at the
# top of the notebook.
from sklearn import decomposition

# Project the standardised statistics onto two principal components,
# without whitening.
pca = decomposition.PCA(n_components=2, copy=True, whiten=False)

pcatransformed = pca.fit_transform(colistandard)

In [69]:
# Scatter of the 2-D PCA projection, one point per category.
fig, axes = plt.subplots()
axes.set_xlabel('principal component 1')
axes.set_ylabel('principal component 2')
axes.set_title('PCA projection of category statistics')
axes.plot(pcatransformed[:,0], pcatransformed[:,1], 'bo')
plt.show()

In [ ]:
# Scatter coloured by DBSCAN label, with noise (-1) drawn in black.
# NOTE(review): this cell sits under the PCA heading but plots `transformed`
# (the t-SNE embedding), not `pcatransformed` -- confirm which was intended.
fig, axes = plt.subplots()
axes.set_xlabel('')
axes.set_ylabel('')
#axes.set_xlim(left=-1, right=1)
#axes.set_ylim(bottom=-1, top=1)

unique_labels = set(labels_db)
# One colour per distinct label, spread evenly over the Spectral colormap.
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))


for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    # Rows of the embedding that belong to label k.
    class_member_mask = (labels_db == k)

    xyz = transformed[class_member_mask]

    axes.plot(xyz[:,0], xyz[:,1], 'o', markerfacecolor=col,
             markeredgecolor='k')

plt.show()

#ax.view_init(70, 30)

In [ ]:


In [ ]:
#the method below cost too long time

col5 = np.zeros((colcate[3].shape[0], 2), dtype=np.float32)

start = time.time()
flag=0
for sample, target in itertools.izip(X, y):
    cr = col5[np.where(colcate[3]==sample[4])[0]]
    cr[:,0]+=1
    cr[:,1]+=target[0]
    flag+=1
    if flag==100:
        break
end = time.time()
print end-start, 'second'

col5t = np.zeros((colcate[3].shape[0], 1), dtype=np.float32)

start = time.time()
for sample in X_t:
    index = np.where(colcate[3]==sample[4])[0]
    if index.nbytes != 0:
        col5t[index][0]+=1        
end = time.time()
print end-start, 'second'

In [ ]: