In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
import time
from sklearn.cluster import DBSCAN

In [4]:
bmatrix = np.load('bmatrix_train_date.npy')

In [5]:
bmatrix.shape


Out[5]:
(2367495, 52)
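
A quick sanity check (a sketch, not part of the original run) that the matrix really is 0/1 before treating it as binary:

In [ ]:
# sketch: verify the loaded matrix is strictly binary
np.array_equal(bmatrix, bmatrix.astype(bool))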

In [6]:
col_names = ['S'+str(i) for i in range(52)]

In [7]:
# keep only rows with at least one active station (drop all-zero rows)
df = pd.DataFrame(bmatrix[bmatrix.sum(axis=1) > 0], columns=col_names, dtype=int)
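
How many all-zero rows the filter drops is not shown in the original run; a quick sketch to check:

In [ ]:
# sketch: count the rows removed by the all-zero filter above
int((bmatrix.sum(axis=1) == 0).sum())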

In [8]:
df.head()


Out[8]:
S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 ... S42 S43 S44 S45 S46 S47 S48 S49 S50 S51
0 1 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 1 1 0 0 1 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 1 0 1 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 52 columns


In [9]:
# subsample half of the rows to keep the clustering runs tractable
df = df.sample(frac=0.5)

df.shape


Out[9]:
(591583, 52)

In [10]:
split_index = df.shape[0] // 2  # floor division: the slicing below needs an int

In [11]:
split_index


Out[11]:
295791

In [12]:
# positional 50/50 split into train and test halves
train_X = df.iloc[:-split_index]
test_X = df.iloc[-split_index:]

In [32]:
train_X.shape


Out[32]:
(295792, 52)
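
An equivalent shuffled split via scikit-learn's helper (a sketch; sklearn.model_selection requires scikit-learn >= 0.18, and shuffling changes which rows land in each half, unlike the positional slice above):

In [ ]:
# sketch: the same 50/50 split using train_test_split
from sklearn.model_selection import train_test_split
train_X, test_X = train_test_split(df, test_size=0.5, random_state=0)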

In [14]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [33]:
%time db = DBSCAN(eps=0.3, min_samples=10).fit(train_X)


Wall time: 24min 53s
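
Note that the rows are 0/1 vectors, so the Euclidean distance between any two distinct rows is at least 1; with eps=0.3, DBSCAN can only group exact duplicates, and each "cluster" is a set of at least min_samples identical rows. A distance designed for binary data may be more meaningful; a sketch (the eps value is illustrative, not tuned):

In [ ]:
# sketch: Jaccard distance treats each row as the set of its active stations
db_jac = DBSCAN(eps=0.3, min_samples=10, metric='jaccard').fit(train_X.values.astype(bool))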

In [34]:
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

In [35]:
print('Estimated number of clusters: %d' % n_clusters_)


Estimated number of clusters: 967
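
The cluster count alone hides how much of the data ends up as noise; a quick sketch of the noise count and cluster sizes:

In [ ]:
# sketch: count noise points (-1) and inspect cluster sizes
n_noise_ = int((labels == -1).sum())
unique, counts = np.unique(labels[labels != -1], return_counts=True)
print('Noise points: %d' % n_noise_)
print('Largest cluster: %d points' % counts.max())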

In [20]:
import sys

In [30]:
# note: getsizeof on the class object measures only the class itself
# (a few hundred bytes), not a fitted model, hence 0 KB after floor division
sys.getsizeof(KMeans) // 1024


Out[30]:
0
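
A fitted model's footprint can instead be approximated by its pickled size (a rough sketch; the pickle length is a proxy, not an exact memory measurement):

In [ ]:
# sketch: approximate the fitted DBSCAN model's size via pickle
import pickle
len(pickle.dumps(db)) // 1024  # size in KB, roughly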

In [31]:
# free the unused test half before the next long DBSCAN run
del test_X


In [ ]:
%time db = DBSCAN(eps=0.3, min_samples=10).fit(train_X)


Wall time: 25min 47s
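
Each DBSCAN fit above takes roughly 25 minutes; when a fixed cluster count is acceptable, MiniBatchKMeans (already imported at the top) is far faster on data of this size. A sketch with an illustrative k:

In [ ]:
# sketch: a fast k-means baseline; k=268 mirrors the DBSCAN cluster count above
mbk = MiniBatchKMeans(n_clusters=268, batch_size=1000, random_state=0)
%time mbk_labels = mbk.fit_predict(train_X)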

In [37]:
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

In [38]:
print('Estimated number of clusters: %d' % n_clusters_)


Estimated number of clusters: 268
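
DBSCAN has no predict() for unseen rows; a common workaround is to assign each new point to the cluster of its nearest core sample. A sketch using the pairwise_distances_argmin imported earlier (new_X is a hypothetical array of held-out rows; test_X was deleted above):

In [ ]:
# sketch: label new points by their nearest DBSCAN core sample
core_points = train_X.values[core_samples_mask]
core_labels = labels[core_samples_mask]
# new_X: hypothetical (n, 52) array of unseen rows, not defined in this notebook
nearest_core = pairwise_distances_argmin(new_X, core_points)
new_labels = core_labels[nearest_core]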

In [42]:
# note: this re-initializes the mask to all False, overwriting the mask
# set from db.core_sample_indices_ in the cell above
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)

In [43]:
core_samples_mask


Out[43]:
array([False, False, False, ..., False, False, False], dtype=bool)

In [44]:
labels


Out[44]:
array([  0,   1,   2, ...,  19,  92, 138], dtype=int64)

In [46]:
len(set(labels))  # 268 clusters plus the -1 noise label


Out[46]:
269

In [48]:
import pickle

In [ ]:
# persist the fitted model; a with-block closes the file handle cleanly
with open('dbscan_cluster_1.pickle', 'wb') as f:
    pickle.dump(db, f)
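
To reload the model later (a sketch):

In [ ]:
# sketch: load the pickled DBSCAN model back
with open('dbscan_cluster_1.pickle', 'rb') as f:
    db = pickle.load(f)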
