In [1]:
import numpy as np
In [2]:
import pandas as pd
In [3]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
import time
from sklearn.cluster import DBSCAN
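MiniBatchKMeans and pairwise_distances_argmin are imported here but never used later in the session, which suggests a KMeans speed comparison was planned. A minimal sketch of that comparison, using the imports above on made-up random data (X, km, mbk, and the timing variables below are illustrative, not part of the session):

In [ ]:
# Hypothetical KMeans vs MiniBatchKMeans timing on random binary data
X = np.random.randint(0, 2, size=(10000, 52))
t0 = time.time()
km = KMeans(n_clusters=8, n_init=10).fit(X)
t_km = time.time() - t0
t0 = time.time()
mbk = MiniBatchKMeans(n_clusters=8, batch_size=1000, n_init=10).fit(X)
t_mbk = time.time() - t0
# Match each full-KMeans centroid to its closest mini-batch centroid
order = pairwise_distances_argmin(km.cluster_centers_, mbk.cluster_centers_)
print('KMeans: %.2fs  MiniBatchKMeans: %.2fs' % (t_km, t_mbk))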
In [4]:
bmatrix = np.load('bmatrix_train_date.npy')
In [5]:
bmatrix.shape
Out[5]:
In [6]:
col_names = ['S'+str(i) for i in range(52)]
In [7]:
df = pd.DataFrame(bmatrix[bmatrix.sum(axis=1) > 0], columns=col_names, dtype=int)  # drop all-zero rows
In [8]:
df.head()
Out[8]:
In [9]:
df = df.sample(frac=0.5)  # randomly down-sample to half the rows
df.shape
Out[9]:
In [10]:
split_index = df.shape[0] // 2  # integer division; a float here would break the slicing below under Python 3
In [11]:
split_index
Out[11]:
In [12]:
train_X = df[:-split_index]  # first half of the shuffled rows
test_X = df[-split_index:]   # held-out second half
In [32]:
train_X.shape
Out[32]:
In [14]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
In [33]:
%time db = DBSCAN(eps=0.3, min_samples=10).fit(train_X)
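eps=0.3 and min_samples=10 are unexplained here. A common heuristic for picking eps is the k-distance plot: sort each point's distance to its min_samples-th neighbour and look for the elbow. A minimal sketch, assuming matplotlib is available:

In [ ]:
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
nn = NearestNeighbors(n_neighbors=10).fit(train_X)
dists, _ = nn.kneighbors(train_X)
# Column -1 is the distance to the 10th neighbour (the query point itself
# counts as the first); the elbow of this curve suggests a value for eps
plt.plot(np.sort(dists[:, -1]))
plt.ylabel('10-NN distance')
plt.show()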
In [34]:
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
In [35]:
print('Estimated number of clusters: %d' % n_clusters_)
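The metrics module imported above can grade the clustering beyond a raw cluster count. A short sketch; note that silhouette_score needs at least two clusters and computes pairwise distances, so it may be slow or memory-hungry at this scale:

In [ ]:
n_noise_ = list(labels).count(-1)
print('Noise points: %d' % n_noise_)
if n_clusters_ >= 2:
    # Pairwise-distance based; consider a subsample if train_X is large
    print('Silhouette: %.3f' % metrics.silhouette_score(train_X, labels))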
In [20]:
import sys
In [30]:
sys.getsizeof(KMeans) / 1024  # size in KiB of the class object itself, not of any data or fitted model
Out[30]:
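sys.getsizeof on the KMeans class measures only the class object (roughly 1 KiB), not what is actually filling memory. If the point was to check memory pressure before deleting data, pandas can report the DataFrame footprint directly; a quick sketch:

In [ ]:
# Total bytes held by train_X, converted to MiB
train_X.memory_usage(deep=True).sum() / 1024 ** 2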
In [31]:
del test_X  # free memory before re-running DBSCAN
In [ ]:
%time db = DBSCAN(eps=0.3, min_samples=10).fit(train_X)  # same fit, re-run after freeing test_X
In [37]:
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
In [38]:
print('Estimated number of clusters: %d' % n_clusters_)
In [42]:
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
In [43]:
core_samples_mask
Out[43]:
In [44]:
labels
Out[44]:
In [46]:
len(set(labels))
Out[46]:
In [48]:
import pickle
In [ ]:
with open('dbscan_cluster_1.pickle', 'wb') as f:  # persist the fitted model
    pickle.dump(db, f)
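For completeness, a sketch of the round trip: reload the pickled model and confirm the labels survive (db_loaded is a name introduced here for illustration):

In [ ]:
with open('dbscan_cluster_1.pickle', 'rb') as f:
    db_loaded = pickle.load(f)
print((db_loaded.labels_ == labels).all())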