In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
import time
from sklearn.cluster import DBSCAN

In [4]:
bmatrix = np.load('bmatrix_train_date.npy')

In [5]:
bmatrix.shape


Out[5]:
(2367495, 52)
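
A quick sanity check (a sketch, not part of the original run) that the matrix really is 0/1 before treating it as binary:

In [ ]:
# sketch: verify the loaded matrix is strictly binary
np.array_equal(bmatrix, bmatrix.astype(bool))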

In [6]:
col_names = ['S'+str(i) for i in range(52)]

In [7]:
# keep only rows with at least one active station (drop all-zero rows)
df = pd.DataFrame(bmatrix[bmatrix.sum(axis=1) > 0], columns=col_names, dtype=int)
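
How many all-zero rows the filter drops is not shown in the original run; a quick sketch to check:

In [ ]:
# sketch: count the rows removed by the all-zero filter above
int((bmatrix.sum(axis=1) == 0).sum())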

In [8]:
df.head()


Out[8]:
S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 ... S42 S43 S44 S45 S46 S47 S48 S49 S50 S51
0 1 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 1 1 0 0 1 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 1 0 1 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 52 columns


In [9]:
# subsample half of the rows to keep the clustering runs tractable
df = df.sample(frac=0.5)

df.shape


Out[9]:
(591583, 52)

In [10]:
split_index = df.shape[0] // 2  # floor division: the slicing below needs an int

In [11]:
split_index


Out[11]:
295791

In [12]:
# positional 50/50 split into train and test halves
train_X = df.iloc[:-split_index]
test_X = df.iloc[-split_index:]

In [32]:
train_X.shape


Out[32]:
(295792, 52)
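
An equivalent shuffled split via scikit-learn's helper (a sketch; sklearn.model_selection requires scikit-learn >= 0.18, and shuffling changes which rows land in each half, unlike the positional slice above):

In [ ]:
# sketch: the same 50/50 split using train_test_split
from sklearn.model_selection import train_test_split
train_X, test_X = train_test_split(df, test_size=0.5, random_state=0)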

In [14]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [33]:
%time db = DBSCAN(eps=0.3, min_samples=10).fit(train_X)


Wall time: 24min 53s
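
Note that the rows are 0/1 vectors, so the Euclidean distance between any two distinct rows is at least 1; with eps=0.3, DBSCAN can only group exact duplicates, and each "cluster" is a set of at least min_samples identical rows. A distance designed for binary data may be more meaningful; a sketch (the eps value is illustrative, not tuned):

In [ ]:
# sketch: Jaccard distance treats each row as the set of its active stations
db_jac = DBSCAN(eps=0.3, min_samples=10, metric='jaccard').fit(train_X.values.astype(bool))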

In [34]:
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

In [35]:
print('Estimated number of clusters: %d' % n_clusters_)


Estimated number of clusters: 967
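
The cluster count alone hides how much of the data ends up as noise; a quick sketch of the noise count and cluster sizes:

In [ ]:
# sketch: count noise points (-1) and inspect cluster sizes
n_noise_ = int((labels == -1).sum())
unique, counts = np.unique(labels[labels != -1], return_counts=True)
print('Noise points: %d' % n_noise_)
print('Largest cluster: %d points' % counts.max())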

In [20]:
import sys

In [30]:
# note: getsizeof on the class object measures only the class itself
# (a few hundred bytes), not a fitted model, hence 0 KB after floor division
sys.getsizeof(KMeans) // 1024


Out[30]:
0
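
A fitted model's footprint can instead be approximated by its pickled size (a rough sketch; the pickle length is a proxy, not an exact memory measurement):

In [ ]:
# sketch: approximate the fitted DBSCAN model's size via pickle
import pickle
len(pickle.dumps(db)) // 1024  # size in KB, roughly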

In [31]:
# free the unused test half before the next long DBSCAN run
del test_X


In [ ]:
%time db = DBSCAN(eps=0.3, min_samples=10).fit(train_X)


Wall time: 25min 47s
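
Each DBSCAN fit above takes roughly 25 minutes; when a fixed cluster count is acceptable, MiniBatchKMeans (already imported at the top) is far faster on data of this size. A sketch with an illustrative k:

In [ ]:
# sketch: a fast k-means baseline; k=268 mirrors the DBSCAN cluster count above
mbk = MiniBatchKMeans(n_clusters=268, batch_size=1000, random_state=0)
%time mbk_labels = mbk.fit_predict(train_X)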

In [37]:
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

In [38]:
print('Estimated number of clusters: %d' % n_clusters_)


Estimated number of clusters: 268
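
DBSCAN has no predict() for unseen rows; a common workaround is to assign each new point to the cluster of its nearest core sample. A sketch using the pairwise_distances_argmin imported earlier (new_X is a hypothetical array of held-out rows; test_X was deleted above):

In [ ]:
# sketch: label new points by their nearest DBSCAN core sample
core_points = train_X.values[core_samples_mask]
core_labels = labels[core_samples_mask]
# new_X: hypothetical (n, 52) array of unseen rows, not defined in this notebook
nearest_core = pairwise_distances_argmin(new_X, core_points)
new_labels = core_labels[nearest_core]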

In [42]:
# note: this re-initializes the mask to all False, overwriting the mask
# set from db.core_sample_indices_ in the cell above
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)

In [43]:
core_samples_mask


Out[43]:
array([False, False, False, ..., False, False, False], dtype=bool)

In [44]:
labels


Out[44]:
array([  0,   1,   2, ...,  19,  92, 138], dtype=int64)

In [46]:
len(set(labels))  # 268 clusters plus the -1 noise label


Out[46]:
269

In [48]:
import pickle

In [ ]:
# persist the fitted model; a with-block closes the file handle cleanly
with open('dbscan_cluster_1.pickle', 'wb') as f:
    pickle.dump(db, f)
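
To reload the model later (a sketch):

In [ ]:
# sketch: load the pickled DBSCAN model back
with open('dbscan_cluster_1.pickle', 'rb') as f:
    db = pickle.load(f)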
