In [2]:
import numpy as np
import pandas as pd
import pickle
import hdbscan

In [3]:
hdbscan_model = pickle.load(open('hdbscan_cluster_jac_10.pickle', 'rb'))

In [6]:
col_names = ['S'+str(i) for i in range(52)]

In [10]:
bmatrix_date = np.load('bmatrix_train_date.npy')
path_df = pd.DataFrame(data=bmatrix_date, columns=col_names, dtype=int)
del bmatrix_date

In [11]:
path_df.head()


Out[11]:
S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 ... S42 S43 S44 S45 S46 S47 S48 S49 S50 S51
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 52 columns


In [12]:
path_df['idx'] = path_df.index + 1

In [15]:
path_df = path_df.set_index('idx')

In [16]:
path_df_train = path_df[path_df.sum(axis=1) > 0]

In [20]:
test_labels, strengths = hdbscan.approximate_predict(hdbscan_model, path_df_train.values)
test_labels


Out[20]:
array([665, 375, 670, ..., 778, 839, 811])

In [29]:
%%time

hdbscan_model_man_10 = pickle.load(open('hdbscan_cluster_man_10.pickle', 'rb'))


Wall time: 23.9 s

In [44]:
test_labels_man_10, strengths = hdbscan.approximate_predict(hdbscan_model_man_10, path_df_train.values)
test_labels_man_10


Out[44]:
array([ 570,  714, 1150, ...,  871,  890,  864])

In [49]:
test_labels_man_10.shape


Out[49]:
(1183165L,)

In [50]:
path_df_train.shape


Out[50]:
(1183165, 53)

In [51]:
path_df_train['product'] = test_labels_man_10


d:\Anaconda\envs\Deep2\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.

In [52]:
path_df_train.to_csv("product_man_10.csv")

In [ ]: