notebook.community

Edit and run



In [2]:

    
import numpy as np
import pandas as pd
import pickle
import hdbscan



In [3]:

    
hdbscan_model = pickle.load(open('hdbscan_cluster_jac_10.pickle', 'rb'))



In [6]:

    
col_names = ['S'+str(i) for i in range(52)]



In [10]:

    
bmatrix_date = np.load('bmatrix_train_date.npy')
path_df = pd.DataFrame(data=bmatrix_date, columns=col_names, dtype=int)
del bmatrix_date



In [11]:

    
path_df.head()









    Out[11]:







  
    
      
      S0
      S1
      S2
      S3
      S4
      S5
      S6
      S7
      S8
      S9
      ...
      S42
      S43
      S44
      S45
      S46
      S47
      S48
      S49
      S50
      S51
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      1
      1
      1
      0
      1
      0
      0
      1
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 52 columns



In [12]:

    
path_df['idx'] = path_df.index + 1



In [15]:

    
path_df = path_df.set_index('idx')



In [16]:

    
path_df_train = path_df[path_df.sum(axis=1) > 0]



In [20]:

    
test_labels, strengths = hdbscan.approximate_predict(hdbscan_model, path_df_train.values)
test_labels









    Out[20]:





array([665, 375, 670, ..., 778, 839, 811])



In [29]:

    
%%time

hdbscan_model_man_10 = pickle.load(open('hdbscan_cluster_man_10.pickle', 'rb'))









    



Wall time: 23.9 s



In [44]:

    
test_labels_man_10, strengths = hdbscan.approximate_predict(hdbscan_model_man_10, path_df_train.values)
test_labels_man_10









    Out[44]:





array([ 570,  714, 1150, ...,  871,  890,  864])



In [49]:

    
test_labels_man_10.shape









    Out[49]:





(1183165L,)



In [50]:

    
path_df_train.shape









    Out[50]:





(1183165, 53)



In [51]:

    
path_df_train['product'] = test_labels_man_10









    



d:\Anaconda\envs\Deep2\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.



In [52]:

    
path_df_train.to_csv("product_man_10.csv")



In [ ]:

	S0	S1	S2	S4	S7	S8	...
0	0	0	0	0	0	0	...
1	0	0	0	0	0	0	...
2	0	0	0	0	0	0	...
3	1	1	1	1	1	1	...
4	0	0	0	0	0	0	...

	S0	S1	S2	S4	S7	S8	...
0	0	0	0	0	0	0	...
1	0	0	0	0	0	0	...
2	0	0	0	0	0	0	...
3	1	1	1	1	1	1	...
4	0	0	0	0	0	0	...

	S0	S1	S2	S4	S7	S8	...
0	0	0	0	0	0	0	...
1	0	0	0	0	0	0	...
2	0	0	0	0	0	0	...
3	1	1	1	1	1	1	...
4	0	0	0	0	0	0	...