In [2]:
import numpy as np

In [3]:
# bmatrix = np.loadtxt('bmatrix_train_date.npy', delimiter=',')

In [4]:
bmatrix_date = np.load('bmatrix_train_date.npy')

In [5]:
bmatrix_date.shape


Out[5]:
(2367495L, 52L)

In [6]:
bmatrix[0:5].astype(int)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-e5312fc1a5ca> in <module>()
----> 1 bmatrix[0:5].astype(int)

NameError: name 'bmatrix' is not defined

In [ ]:
x = bmatrix[0:10].astype(int)

In [ ]:
x = bmatrix[bmatrix.sum(axis=1) > 0]

In [ ]:
c = bmatrix.sum(axis=1).astype(int)

In [ ]:
c > 0

In [ ]:
from collections import Counter
cc = Counter(c)

In [ ]:
cc[0]

In [ ]:
t = 0
for k in cc:
    t += cc[k]
print t
2367495-(t-1170)

In [ ]:
c[:-10]

In [ ]:
x[:-10]

In [7]:
import pandas as pd

In [8]:
col_names = ['S'+str(i) for i in range(52)]

In [10]:
df = pd.DataFrame(bmatrix_date[bmatrix_date.sum(axis=1) > 0], columns=col_names, dtype=int)

In [11]:
df.head()


Out[11]:
S0 S1 S2 S3 S4 S5 S6 S7 S8 S9 ... S42 S43 S44 S45 S46 S47 S48 S49 S50 S51
0 1 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 1 1 0 0 1 1 0 1 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 1 1 0 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 1 0 1 1 0 0 1 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 52 columns


In [12]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1183165 entries, 0 to 1183164
Data columns (total 52 columns):
S0     1183165 non-null int32
S1     1183165 non-null int32
S2     1183165 non-null int32
S3     1183165 non-null int32
S4     1183165 non-null int32
S5     1183165 non-null int32
S6     1183165 non-null int32
S7     1183165 non-null int32
S8     1183165 non-null int32
S9     1183165 non-null int32
S10    1183165 non-null int32
S11    1183165 non-null int32
S12    1183165 non-null int32
S13    1183165 non-null int32
S14    1183165 non-null int32
S15    1183165 non-null int32
S16    1183165 non-null int32
S17    1183165 non-null int32
S18    1183165 non-null int32
S19    1183165 non-null int32
S20    1183165 non-null int32
S21    1183165 non-null int32
S22    1183165 non-null int32
S23    1183165 non-null int32
S24    1183165 non-null int32
S25    1183165 non-null int32
S26    1183165 non-null int32
S27    1183165 non-null int32
S28    1183165 non-null int32
S29    1183165 non-null int32
S30    1183165 non-null int32
S31    1183165 non-null int32
S32    1183165 non-null int32
S33    1183165 non-null int32
S34    1183165 non-null int32
S35    1183165 non-null int32
S36    1183165 non-null int32
S37    1183165 non-null int32
S38    1183165 non-null int32
S39    1183165 non-null int32
S40    1183165 non-null int32
S41    1183165 non-null int32
S42    1183165 non-null int32
S43    1183165 non-null int32
S44    1183165 non-null int32
S45    1183165 non-null int32
S46    1183165 non-null int32
S47    1183165 non-null int32
S48    1183165 non-null int32
S49    1183165 non-null int32
S50    1183165 non-null int32
S51    1183165 non-null int32
dtypes: int32(52)
memory usage: 234.7 MB

In [ ]:
df.to_csv('process_paths.csv')

In [ ]:
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
import time
from sklearn.cluster import DBSCAN

In [60]:
df = df.sample(frac=1)

df.shape


Out[60]:
(1183165, 52)

In [61]:
split_index = df.shape[0] / 2

In [62]:
split_index


Out[62]:
591582

In [63]:
train_X = df[:-split_index]
test_X = df[-split_index:]

In [64]:
train_X.shape


Out[64]:
(591583, 52)

In [ ]:
mbk = MiniBatchKMeans(init='k-means++', n_clusters=1000, batch_size=1000,
                      n_init=10, max_no_improvement=10, verbose=0)
t0 = time.time()
mbk.fit(train_X.values)
t_mini_batch = time.time() - t0

In [ ]:
mbk_means_cluster_centers = np.sort(mbk.cluster_centers_, axis=0)
mbk_means_labels = pairwise_distances_argmin(train_X, mbk_means_cluster_centers)

In [ ]:


In [ ]:
mbk_means_labels

In [ ]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [ ]:
#db = DBSCAN(eps=0.3, min_samples=10).fit(train_X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

In [ ]:
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))

In [25]:
import hdbscan

In [26]:
hdbscan_ = hdbscan.HDBSCAN()

In [65]:
%time clusterer = hdbscan.HDBSCAN(min_cluster_size=50, prediction_data=True, metric='jaccard', core_dist_n_jobs=8).fit(train_X[:591583/2])


Wall time: 18h 6min 32s

In [66]:
clusterer.labels_


Out[66]:
array([303, 233, 178, ..., 166, 432,  -1], dtype=int64)

In [67]:
clusterer.labels_.max()


Out[67]:
528

In [53]:
from collections import Counter

In [68]:
c = Counter(clusterer.labels_)

In [69]:
c.most_common()


Out[69]:
[(189, 14526),
 (185, 14303),
 (-1, 14088),
 (167, 5949),
 (195, 5590),
 (187, 5397),
 (188, 5189),
 (304, 2712),
 (178, 2704),
 (180, 2695),
 (274, 2677),
 (295, 2674),
 (276, 2663),
 (380, 2660),
 (277, 2657),
 (289, 2636),
 (271, 2634),
 (313, 2630),
 (381, 2627),
 (305, 2626),
 (421, 2622),
 (299, 2621),
 (301, 2618),
 (288, 2617),
 (269, 2607),
 (272, 2606),
 (270, 2598),
 (273, 2586),
 (307, 2585),
 (275, 2581),
 (287, 2572),
 (300, 2571),
 (419, 2569),
 (293, 2561),
 (291, 2559),
 (420, 2559),
 (292, 2549),
 (294, 2541),
 (354, 2538),
 (392, 2534),
 (425, 2534),
 (298, 2531),
 (303, 2530),
 (297, 2528),
 (424, 2528),
 (413, 2521),
 (384, 2517),
 (374, 2511),
 (422, 2508),
 (290, 2505),
 (346, 2503),
 (423, 2490),
 (360, 2486),
 (286, 2477),
 (296, 2448),
 (302, 2434),
 (418, 2413),
 (130, 1446),
 (131, 1394),
 (163, 1032),
 (191, 1012),
 (172, 1009),
 (141, 1008),
 (118, 1006),
 (164, 1006),
 (250, 1002),
 (158, 1000),
 (128, 993),
 (120, 992),
 (268, 990),
 (155, 988),
 (146, 985),
 (149, 982),
 (123, 981),
 (237, 980),
 (181, 977),
 (209, 977),
 (211, 977),
 (208, 972),
 (190, 971),
 (133, 966),
 (160, 964),
 (240, 964),
 (257, 963),
 (222, 960),
 (175, 957),
 (216, 956),
 (251, 956),
 (170, 951),
 (126, 949),
 (226, 948),
 (242, 946),
 (244, 944),
 (245, 944),
 (260, 942),
 (258, 941),
 (212, 940),
 (204, 939),
 (236, 935),
 (248, 934),
 (224, 933),
 (267, 929),
 (137, 927),
 (153, 924),
 (201, 917),
 (262, 906),
 (227, 904),
 (230, 898),
 (233, 878),
 (177, 357),
 (162, 354),
 (1, 287),
 (436, 279),
 (411, 277),
 (203, 256),
 (365, 255),
 (368, 254),
 (376, 253),
 (400, 250),
 (115, 247),
 (438, 247),
 (387, 246),
 (429, 246),
 (476, 244),
 (343, 243),
 (278, 242),
 (478, 242),
 (491, 242),
 (310, 240),
 (457, 240),
 (417, 237),
 (280, 236),
 (410, 236),
 (504, 236),
 (426, 235),
 (511, 235),
 (477, 234),
 (502, 233),
 (214, 231),
 (362, 231),
 (503, 230),
 (337, 229),
 (464, 229),
 (283, 227),
 (312, 227),
 (351, 227),
 (179, 226),
 (348, 226),
 (350, 226),
 (407, 226),
 (472, 226),
 (324, 224),
 (355, 223),
 (516, 223),
 (388, 222),
 (398, 222),
 (352, 221),
 (402, 221),
 (517, 220),
 (379, 219),
 (462, 218),
 (427, 217),
 (466, 217),
 (169, 216),
 (326, 216),
 (361, 216),
 (389, 216),
 (446, 216),
 (508, 216),
 (395, 215),
 (444, 214),
 (334, 213),
 (338, 213),
 (382, 213),
 (403, 213),
 (442, 213),
 (507, 211),
 (114, 210),
 (349, 210),
 (474, 209),
 (459, 207),
 (318, 206),
 (356, 206),
 (469, 205),
 (465, 204),
 (488, 204),
 (366, 203),
 (496, 202),
 (344, 201),
 (386, 201),
 (447, 201),
 (506, 201),
 (480, 200),
 (285, 198),
 (322, 198),
 (479, 197),
 (329, 196),
 (358, 196),
 (399, 195),
 (456, 194),
 (449, 193),
 (450, 193),
 (467, 191),
 (484, 191),
 (357, 190),
 (432, 189),
 (373, 188),
 (414, 187),
 (435, 186),
 (363, 185),
 (364, 182),
 (486, 181),
 (434, 180),
 (315, 178),
 (497, 178),
 (282, 172),
 (317, 171),
 (197, 170),
 (519, 167),
 (150, 165),
 (159, 164),
 (498, 164),
 (377, 153),
 (279, 148),
 (342, 148),
 (328, 145),
 (404, 143),
 (437, 140),
 (505, 140),
 (106, 138),
 (512, 138),
 (441, 137),
 (347, 136),
 (499, 135),
 (510, 134),
 (416, 133),
 (215, 131),
 (408, 131),
 (323, 130),
 (431, 130),
 (433, 130),
 (453, 130),
 (210, 129),
 (309, 129),
 (319, 129),
 (378, 129),
 (205, 128),
 (341, 128),
 (370, 128),
 (452, 128),
 (489, 128),
 (129, 126),
 (461, 126),
 (454, 125),
 (105, 124),
 (249, 124),
 (391, 122),
 (475, 122),
 (481, 122),
 (333, 120),
 (514, 120),
 (31, 119),
 (111, 119),
 (371, 119),
 (495, 119),
 (213, 118),
 (316, 118),
 (207, 116),
 (15, 115),
 (493, 115),
 (51, 114),
 (234, 114),
 (266, 114),
 (228, 113),
 (330, 113),
 (171, 112),
 (110, 111),
 (161, 111),
 (332, 111),
 (394, 111),
 (406, 111),
 (2, 109),
 (335, 109),
 (492, 109),
 (36, 108),
 (57, 108),
 (254, 108),
 (12, 107),
 (72, 107),
 (30, 104),
 (85, 104),
 (308, 104),
 (528, 104),
 (38, 103),
 (53, 103),
 (39, 102),
 (90, 102),
 (526, 102),
 (3, 101),
 (4, 101),
 (18, 100),
 (25, 100),
 (28, 100),
 (54, 100),
 (6, 99),
 (37, 99),
 (71, 99),
 (91, 99),
 (439, 99),
 (19, 98),
 (40, 98),
 (45, 98),
 (95, 98),
 (468, 98),
 (20, 97),
 (46, 97),
 (13, 96),
 (42, 96),
 (52, 96),
 (62, 96),
 (84, 96),
 (61, 94),
 (63, 94),
 (74, 94),
 (86, 94),
 (7, 93),
 (9, 93),
 (58, 93),
 (77, 93),
 (92, 93),
 (22, 92),
 (29, 92),
 (82, 92),
 (87, 92),
 (97, 92),
 (99, 92),
 (487, 92),
 (5, 91),
 (27, 91),
 (50, 91),
 (75, 91),
 (78, 91),
 (281, 91),
 (59, 90),
 (93, 90),
 (96, 90),
 (26, 89),
 (33, 89),
 (43, 89),
 (56, 89),
 (65, 89),
 (79, 89),
 (80, 89),
 (17, 88),
 (32, 88),
 (35, 88),
 (10, 87),
 (34, 87),
 (397, 87),
 (0, 86),
 (21, 86),
 (94, 86),
 (47, 85),
 (152, 85),
 (202, 85),
 (11, 84),
 (14, 84),
 (24, 84),
 (41, 84),
 (49, 84),
 (483, 84),
 (23, 83),
 (83, 83),
 (88, 83),
 (67, 82),
 (68, 82),
 (70, 82),
 (430, 82),
 (108, 81),
 (154, 81),
 (8, 80),
 (73, 80),
 (101, 80),
 (125, 80),
 (238, 80),
 (345, 80),
 (393, 80),
 (509, 80),
 (44, 79),
 (55, 79),
 (60, 79),
 (69, 79),
 (199, 79),
 (440, 79),
 (458, 79),
 (66, 78),
 (232, 78),
 (265, 78),
 (415, 78),
 (390, 77),
 (16, 76),
 (48, 76),
 (64, 76),
 (81, 76),
 (143, 76),
 (145, 76),
 (173, 76),
 (200, 76),
 (255, 76),
 (311, 76),
 (314, 76),
 (336, 76),
 (359, 76),
 (112, 75),
 (455, 75),
 (515, 75),
 (107, 74),
 (176, 74),
 (428, 74),
 (451, 74),
 (500, 74),
 (89, 73),
 (102, 73),
 (239, 73),
 (253, 73),
 (284, 73),
 (367, 73),
 (385, 73),
 (443, 73),
 (513, 73),
 (235, 72),
 (243, 72),
 (372, 72),
 (460, 72),
 (229, 71),
 (306, 71),
 (521, 71),
 (98, 70),
 (100, 70),
 (103, 70),
 (138, 70),
 (223, 70),
 (485, 70),
 (76, 69),
 (136, 69),
 (157, 69),
 (196, 69),
 (448, 69),
 (327, 68),
 (470, 68),
 (471, 68),
 (113, 67),
 (135, 67),
 (166, 67),
 (183, 67),
 (246, 67),
 (121, 66),
 (331, 66),
 (494, 66),
 (340, 65),
 (396, 65),
 (501, 65),
 (104, 64),
 (116, 64),
 (144, 64),
 (151, 64),
 (221, 64),
 (259, 64),
 (369, 64),
 (193, 63),
 (353, 63),
 (401, 63),
 (405, 63),
 (463, 63),
 (134, 62),
 (194, 62),
 (482, 62),
 (174, 61),
 (186, 61),
 (218, 61),
 (252, 61),
 (124, 60),
 (139, 60),
 (206, 60),
 (109, 59),
 (142, 59),
 (156, 59),
 (217, 59),
 (527, 59),
 (147, 58),
 (165, 58),
 (241, 58),
 (119, 57),
 (132, 57),
 (140, 57),
 (168, 57),
 (225, 57),
 (412, 57),
 (122, 56),
 (256, 56),
 (261, 56),
 (522, 56),
 (127, 55),
 (220, 55),
 (321, 55),
 (383, 55),
 (117, 54),
 (182, 54),
 (198, 54),
 (263, 54),
 (339, 54),
 (409, 54),
 (490, 54),
 (518, 54),
 (192, 53),
 (264, 53),
 (325, 53),
 (525, 53),
 (148, 52),
 (231, 52),
 (247, 52),
 (375, 52),
 (445, 52),
 (473, 52),
 (523, 52),
 (524, 52),
 (184, 51),
 (219, 51),
 (320, 51),
 (520, 50)]

In [38]:
%time clusterer2 = hdbscan.HDBSCAN(min_cluster_size=5, prediction_data=True, metric='jaccard', core_dist_n_jobs=8).fit(train_X)


Wall time: 2h 52min 57s

In [39]:
clusterer2.labels_.max()


Out[39]:
1888

In [40]:
c2 = Counter(clusterer2.labels_)

In [41]:
c2.most_common()


Out[41]:
[(1528, 28587),
 (1477, 27880),
 (1491, 11961),
 (1495, 11488),
 (1535, 10545),
 (1524, 10212),
 (-1, 5504),
 (1864, 5325),
 (1852, 5323),
 (1676, 5301),
 (1458, 5295),
 (1503, 5262),
 (1562, 5243),
 (1221, 5234),
 (1813, 5233),
 (541, 5230),
 (1264, 5217),
 (1158, 5208),
 (1772, 5199),
 (1804, 5193),
 (1820, 5187),
 (1445, 5182),
 (1549, 5178),
 (1484, 5169),
 (1824, 5161),
 (1649, 5154),
 (1570, 5146),
 (659, 5143),
 (1801, 5132),
 (1505, 5118),
 (1752, 5115),
 (1674, 5104),
 (1214, 5102),
 (1319, 5087),
 (1806, 5070),
 (1208, 5067),
 (1883, 5058),
 (1653, 5054),
 (1473, 5048),
 (1343, 5039),
 (716, 5037),
 (1880, 5037),
 (1155, 5031),
 (538, 5010),
 (1683, 5000),
 (1193, 4999),
 (1888, 4999),
 (1637, 4980),
 (782, 4971),
 (1302, 4970),
 (977, 4968),
 (1284, 4951),
 (999, 4938),
 (1116, 4938),
 (773, 4910),
 (873, 4876),
 (1075, 4850),
 (1509, 3029),
 (1526, 2971),
 (1513, 2132),
 (1449, 2007),
 (1122, 1963),
 (1056, 1962),
 (1730, 1958),
 (479, 1950),
 (1049, 1948),
 (960, 1940),
 (1172, 1939),
 (1233, 1939),
 (1539, 1916),
 (1467, 1912),
 (991, 1909),
 (1522, 1909),
 (1024, 1908),
 (1797, 1906),
 (948, 1905),
 (1064, 1894),
 (1120, 1892),
 (1795, 1891),
 (875, 1884),
 (1018, 1880),
 (1273, 1880),
 (864, 1879),
 (1323, 1873),
 (1211, 1869),
 (584, 1865),
 (1553, 1865),
 (1079, 1859),
 (1005, 1846),
 (1341, 1841),
 (1269, 1836),
 (1252, 1823),
 (1098, 1820),
 (1033, 1818),
 (555, 1817),
 (1332, 1816),
 (810, 1812),
 (1050, 1811),
 (641, 1809),
 (788, 1809),
 (415, 1808),
 (1815, 1808),
 (821, 1807),
 (1066, 1807),
 (674, 1796),
 (797, 1762),
 (800, 1758),
 (1289, 1747),
 (779, 1726),
 (1479, 705),
 (1453, 679),
 (884, 482),
 (1183, 475),
 (1247, 473),
 (1775, 473),
 (1421, 469),
 (517, 466),
 (1799, 466),
 (643, 462),
 (1839, 461),
 (1756, 460),
 (1151, 459),
 (1259, 458),
 (1808, 458),
 (492, 455),
 (1017, 455),
 (1109, 455),
 (1812, 454),
 (1245, 452),
 (1481, 452),
 (1565, 452),
 (637, 450),
 (1455, 449),
 (1561, 449),
 (1626, 449),
 (1777, 449),
 (515, 448),
 (1619, 447),
 (1696, 447),
 (895, 446),
 (1531, 446),
 (1176, 445),
 (645, 443),
 (880, 443),
 (1238, 443),
 (1579, 443),
 (1866, 443),
 (1044, 440),
 (1547, 440),
 (1597, 440),
 (1858, 438),
 (1591, 437),
 (1441, 436),
 (629, 435),
 (1278, 435),
 (1088, 433),
 (1413, 433),
 (1728, 433),
 (1742, 432),
 (1293, 431),
 (1346, 431),
 (1714, 430),
 (758, 428),
 (1854, 428),
 (914, 427),
 (1779, 427),
 (1038, 426),
 (1127, 426),
 (1557, 426),
 (781, 425),
 (1657, 425),
 (1706, 425),
 (1736, 425),
 (795, 424),
 (1588, 424),
 (1875, 424),
 (1071, 422),
 (1712, 422),
 (763, 420),
 (867, 420),
 (1220, 420),
 (624, 418),
 (784, 418),
 (1059, 417),
 (1393, 417),
 (1469, 417),
 (562, 416),
 (1396, 416),
 (837, 413),
 (1007, 413),
 (1294, 411),
 (513, 410),
 (1000, 409),
 (503, 407),
 (1825, 407),
 (1860, 407),
 (703, 404),
 (1644, 404),
 (1382, 403),
 (1207, 402),
 (1555, 397),
 (1629, 397),
 (1805, 396),
 (1585, 395),
 (543, 394),
 (1291, 391),
 (1832, 391),
 (1428, 390),
 (1687, 388),
 (1096, 386),
 (1143, 379),
 (1512, 375),
 (0, 372),
 (217, 366),
 (1400, 346),
 (278, 342),
 (1367, 328),
 (234, 316),
 (981, 287),
 (1135, 286),
 (1781, 286),
 (1327, 283),
 (570, 280),
 (1527, 277),
 (502, 269),
 (1320, 269),
 (1735, 269),
 (1338, 268),
 (1709, 268),
 (237, 266),
 (1844, 266),
 (902, 265),
 (1092, 264),
 (1236, 263),
 (451, 260),
 (1280, 260),
 (1786, 260),
 (573, 259),
 (1125, 259),
 (904, 258),
 (1542, 258),
 (1435, 256),
 (1634, 256),
 (1567, 255),
 (1206, 254),
 (985, 253),
 (1433, 252),
 (1131, 247),
 (1700, 245),
 (1438, 243),
 (285, 241),
 (661, 241),
 (676, 241),
 (1662, 241),
 (1106, 239),
 (627, 235),
 (791, 235),
 (1693, 234),
 (1851, 234),
 (1887, 234),
 (976, 233),
 (667, 229),
 (1870, 224),
 (670, 221),
 (721, 220),
 (1725, 219),
 (88, 218),
 (399, 218),
 (1197, 215),
 (110, 213),
 (418, 212),
 (514, 212),
 (183, 204),
 (339, 204),
 (449, 202),
 (396, 200),
 (484, 200),
 (358, 199),
 (372, 198),
 (446, 198),
 (678, 198),
 (443, 197),
 (171, 195),
 (434, 195),
 (445, 195),
 (458, 195),
 (316, 194),
 (249, 192),
 (341, 192),
 (725, 192),
 (359, 191),
 (410, 191),
 (687, 191),
 (268, 190),
 (578, 190),
 (594, 190),
 (134, 189),
 (143, 189),
 (247, 189),
 (169, 188),
 (529, 188),
 (720, 188),
 (84, 187),
 (506, 187),
 (334, 186),
 (751, 185),
 (649, 184),
 (321, 183),
 (386, 183),
 (497, 183),
 (141, 182),
 (369, 182),
 (370, 182),
 (650, 182),
 (680, 182),
 (314, 181),
 (361, 181),
 (105, 180),
 (132, 180),
 (376, 180),
 (378, 180),
 (162, 179),
 (436, 179),
 (200, 178),
 (457, 178),
 (462, 178),
 (98, 177),
 (270, 177),
 (360, 177),
 (633, 177),
 (408, 176),
 (651, 176),
 (686, 176),
 (123, 175),
 (313, 175),
 (752, 175),
 (350, 174),
 (618, 174),
 (89, 173),
 (496, 173),
 (743, 173),
 (1461, 173),
 (345, 172),
 (384, 172),
 (709, 172),
 (1876, 171),
 (326, 170),
 (575, 170),
 (601, 170),
 (368, 169),
 (566, 169),
 (607, 169),
 (87, 167),
 (475, 167),
 (564, 167),
 (212, 166),
 (286, 165),
 (498, 165),
 (1646, 164),
 (130, 163),
 (481, 163),
 (1154, 163),
 (374, 162),
 (432, 162),
 (1829, 162),
 (380, 161),
 (711, 161),
 (539, 160),
 (256, 158),
 (1013, 158),
 (1425, 158),
 (828, 155),
 (486, 154),
 (1311, 154),
 (1446, 154),
 (1707, 154),
 (472, 153),
 (1475, 153),
 (1862, 153),
 (1738, 150),
 (291, 149),
 (320, 149),
 (917, 149),
 (956, 149),
 (1168, 149),
 (1344, 149),
 (1457, 148),
 (1631, 148),
 (1833, 148),
 (549, 147),
 (1416, 147),
 (886, 146),
 (615, 145),
 (613, 144),
 (1159, 144),
 (1161, 144),
 (1608, 144),
 (907, 143),
 (1099, 143),
 (1380, 143),
 (1651, 142),
 (1885, 142),
 (1511, 141),
 (1593, 141),
 (102, 140),
 (1232, 140),
 (1451, 139),
 (1769, 139),
 (297, 137),
 (1166, 137),
 (1655, 137),
 (1666, 137),
 (1747, 137),
 (257, 136),
 (347, 136),
 (702, 136),
 (1177, 136),
 (1282, 136),
 (928, 135),
 (940, 135),
 (1419, 135),
 (1506, 135),
 (1684, 135),
 (808, 134),
 (949, 134),
 (996, 134),
 (1554, 134),
 (1685, 134),
 (280, 133),
 (592, 133),
 (872, 133),
 (1359, 133),
 (1260, 132),
 (1394, 132),
 (239, 131),
 (861, 131),
 (1202, 130),
 (1726, 130),
 (1144, 129),
 (1160, 129),
 (1342, 129),
 (854, 128),
 (787, 127),
 (148, 126),
 (464, 125),
 (1019, 125),
 (684, 124),
 (1173, 124),
 (811, 123),
 (932, 123),
 (1412, 123),
 (1492, 123),
 (259, 122),
 (1003, 122),
 (1032, 122),
 (1430, 122),
 (1538, 122),
 (242, 121),
 (353, 121),
 (1286, 121),
 (1355, 121),
 (488, 120),
 (789, 120),
 (1215, 120),
 (1217, 120),
 (236, 119),
 (1388, 119),
 (522, 118),
 (970, 118),
 (1085, 118),
 (1137, 118),
 (1180, 118),
 (1255, 118),
 (809, 116),
 (764, 115),
 (1134, 114),
 (583, 113),
 (918, 113),
 (1065, 113),
 (1682, 113),
 (559, 112),
 (413, 111),
 (196, 110),
 (768, 110),
 (1148, 110),
 (1514, 109),
 (908, 108),
 (176, 107),
 (277, 107),
 (128, 106),
 (1023, 106),
 (1534, 106),
 (1729, 106),
 (924, 105),
 (951, 105),
 (964, 105),
 (1129, 105),
 (127, 104),
 (610, 104),
 (874, 104),
 (1480, 104),
 (1537, 104),
 (1614, 104),
 (137, 103),
 (145, 103),
 (153, 103),
 (992, 103),
 (708, 102),
 (847, 102),
 (962, 102),
 (755, 101),
 (909, 101),
 (673, 100),
 (225, 99),
 (229, 99),
 (580, 99),
 (657, 99),
 (817, 99),
 (252, 98),
 (290, 98),
 (1353, 98),
 (191, 97),
 (989, 97),
 (1431, 97),
 (157, 96),
 (189, 96),
 (786, 96),
 (973, 96),
 (1063, 96),
 (1118, 96),
 (194, 95),
 (275, 95),
 (1201, 95),
 (1272, 95),
 (41, 94),
 (177, 94),
 (827, 94),
 (1054, 94),
 (1718, 94),
 (111, 93),
 (190, 93),
 (215, 93),
 (230, 93),
 (731, 93),
 (769, 93),
 (1027, 93),
 (1599, 93),
 (274, 92),
 (750, 92),
 (972, 92),
 (1331, 92),
 (1551, 92),
 (1869, 92),
 (282, 91),
 (354, 91),
 (404, 91),
 (950, 91),
 (1334, 91),
 (1652, 91),
 (118, 90),
 (152, 90),
 (531, 90),
 (855, 90),
 (893, 90),
 (912, 90),
 (164, 89),
 (165, 89),
 (185, 89),
 (206, 89),
 (214, 89),
 (921, 89),
 (1104, 89),
 (1817, 89),
 (34, 88),
 (75, 88),
 (76, 88),
 (231, 88),
 (1257, 88),
 (1659, 88),
 (1754, 88),
 (1767, 88),
 (136, 87),
 (305, 87),
 (640, 87),
 (693, 87),
 (717, 87),
 (766, 87),
 (1021, 87),
 (1493, 87),
 (55, 86),
 (68, 86),
 (204, 86),
 (205, 86),
 (318, 86),
 (647, 86),
 (1228, 86),
 (1329, 86),
 (1454, 86),
 (59, 85),
 (224, 85),
 (251, 85),
 (253, 85),
 (560, 85),
 (771, 85),
 (1191, 85),
 (1192, 85),
 (166, 84),
 (289, 84),
 (307, 84),
 (508, 84),
 (796, 84),
 (856, 84),
 (1231, 84),
 (187, 83),
 (192, 83),
 (226, 83),
 (228, 83),
 (403, 83),
 (685, 83),
 (1048, 83),
 (1586, 83),
 (60, 82),
 (178, 82),
 (213, 82),
 (276, 82),
 (1139, 82),
 (1181, 82),
 (61, 81),
 (69, 81),
 (94, 81),
 (126, 81),
 (150, 81),
 (151, 81),
 (216, 81),
 (530, 81),
 (834, 81),
 (1234, 81),
 (101, 80),
 (121, 80),
 (158, 80),
 (173, 80),
 (186, 80),
 (227, 80),
 (296, 80),
 (572, 80),
 (946, 80),
 (963, 80),
 (1403, 80),
 (1422, 80),
 (1507, 80),
 (1643, 80),
 (96, 79),
 (184, 79),
 (427, 79),
 (1052, 79),
 (1368, 79),
 (1759, 79),
 (288, 78),
 (746, 78),
 (805, 78),
 (1664, 78),
 (1836, 78),
 (93, 77),
 (117, 77),
 (193, 77),
 (272, 77),
 (309, 77),
 (1034, 77),
 (1169, 77),
 (1442, 77),
 (1884, 77),
 (119, 76),
 (865, 76),
 (898, 76),
 (1012, 76),
 (1194, 76),
 (1670, 76),
 (1814, 76),
 (163, 75),
 (831, 75),
 (897, 75),
 (1395, 75),
 (1464, 75),
 (120, 74),
 (220, 74),
 (303, 74),
 (304, 74),
 (1340, 74),
 (1548, 74),
 (1601, 74),
 (167, 73),
 (310, 73),
 (862, 73),
 (939, 73),
 (1149, 73),
 (1318, 73),
 (1470, 73),
 (1501, 73),
 (1645, 73),
 (188, 72),
 (664, 72),
 (1375, 72),
 (1823, 72),
 (591, 71),
 (144, 70),
 (174, 70),
 (295, 70),
 (317, 70),
 (1300, 70),
 (66, 69),
 (80, 69),
 (149, 69),
 (944, 69),
 (1336, 69),
 (1489, 69),
 (273, 68),
 (987, 68),
 (95, 67),
 (207, 67),
 (1276, 67),
 (175, 66),
 (930, 66),
 (1533, 66),
 (1761, 66),
 (1031, 65),
 (1605, 65),
 (390, 64),
 (1218, 64),
 (1702, 64),
 (219, 63),
 (1427, 63),
 (135, 62),
 (271, 62),
 (1080, 62),
 (840, 61),
 (1004, 61),
 (1376, 61),
 (590, 59),
 (706, 59),
 (1086, 59),
 (1504, 59),
 (598, 57),
 (1248, 57),
 (1136, 56),
 (1500, 56),
 (478, 55),
 (919, 55),
 (520, 54),
 (1268, 54),
 (581, 53),
 (602, 52),
 (648, 51),
 (723, 51),
 (747, 51),
 (974, 51),
 (1051, 51),
 (1162, 51),
 (72, 50),
 (414, 50),
 (841, 50),
 (1330, 50),
 (1717, 50),
 (785, 49),
 (988, 49),
 (1443, 49),
 (1487, 49),
 (232, 48),
 (744, 48),
 (965, 48),
 (1497, 48),
 (1498, 48),
 (952, 47),
 (986, 47),
 (1378, 47),
 (1386, 47),
 (947, 46),
 (1463, 46),
 (1694, 46),
 (156, 45),
 (756, 45),
 (1488, 45),
 (1525, 45),
 (402, 44),
 (510, 44),
 (938, 44),
 (1067, 44),
 (1543, 44),
 (1787, 44),
 (554, 43),
 (842, 43),
 (820, 42),
 (1733, 42),
 (804, 41),
 (835, 41),
 (891, 41),
 (1241, 41),
 (1285, 41),
 (1076, 40),
 (1212, 40),
 (1708, 40),
 (18, 39),
 (238, 39),
 (1306, 39),
 (1448, 39),
 (1740, 39),
 (690, 38),
 (1002, 38),
 (1196, 38),
 (195, 37),
 (961, 37),
 (1022, 37),
 (1209, 37),
 (551, 36),
 (683, 36),
 (732, 36),
 (770, 36),
 (1224, 36),
 (1350, 36),
 (1574, 36),
 (258, 35),
 (887, 35),
 (1147, 35),
 (1210, 35),
 (1405, 35),
 (1418, 35),
 (509, 34),
 (546, 34),
 (603, 34),
 (815, 34),
 (849, 34),
 (1077, 34),
 (1112, 34),
 (1521, 34),
 (241, 33),
 (518, 33),
 (681, 33),
 (885, 33),
 (925, 33),
 (1377, 33),
 (14, 32),
 (440, 32),
 (582, 32),
 (959, 32),
 (1046, 32),
 (1262, 32),
 (1609, 32),
 (17, 31),
 (823, 31),
 (1288, 31),
 (1322, 31),
 (1749, 31),
 (16, 30),
 (489, 30),
 (739, 30),
 (818, 30),
 (910, 30),
 (1299, 30),
 (1518, 30),
 (1764, 30),
 (1796, 30),
 (8, 29),
 (106, 29),
 (299, 29),
 (388, 29),
 (454, 29),
 (799, 29),
 (833, 29),
 (838, 29),
 (845, 29),
 (900, 29),
 (1290, 29),
 (1620, 29),
 (29, 28),
 (67, 28),
 (179, 28),
 (429, 28),
 (466, 28),
 (682, 28),
 (727, 28),
 (969, 28),
 (1128, 28),
 (1203, 28),
 (1314, 28),
 (1372, 28),
 (1502, 28),
 (1830, 28),
 (7, 27),
 (13, 27),
 (254, 27),
 (355, 27),
 (452, 27),
 (523, 27),
 (966, 27),
 (1061, 27),
 (1821, 27),
 (1848, 27),
 (6, 26),
 (19, 26),
 (39, 26),
 (40, 26),
 (70, 26),
 (138, 26),
 (638, 26),
 (754, 26),
 (822, 26),
 (1398, 26),
 (1583, 26),
 (74, 25),
 (547, 25),
 (550, 25),
 (668, 25),
 (724, 25),
 (881, 25),
 (1254, 25),
 (1584, 25),
 (1638, 25),
 (1679, 25),
 (1766, 25),
 (1811, 25),
 (4, 24),
 (9, 24),
 (20, 24),
 (22, 24),
 (139, 24),
 (792, 24),
 (1025, 24),
 (1039, 24),
 (1133, 24),
 (1263, 24),
 (1281, 24),
 (1287, 24),
 (1552, 24),
 (1715, 24),
 (54, 23),
 (911, 23),
 (1008, 23),
 (1409, 23),
 (1508, 23),
 (1598, 23),
 (1648, 23),
 (3, 22),
 (235, 22),
 (382, 22),
 (653, 22),
 (671, 22),
 (780, 22),
 (812, 22),
 (1170, 22),
 (1324, 22),
 (1436, 22),
 (1450, 22),
 (1575, 22),
 (1630, 22),
 (1681, 22),
 (1800, 22),
 (1818, 22),
 (1886, 22),
 (11, 21),
 (15, 21),
 (21, 21),
 (113, 21),
 (168, 21),
 (1541, 21),
 (1641, 21),
 (30, 20),
 (222, 20),
 (501, 20),
 (561, 20),
 (926, 20),
 (1295, 20),
 (23, 19),
 (25, 19),
 (255, 19),
 (263, 19),
 (634, 19),
 (689, 19),
 (695, 19),
 (726, 19),
 (933, 19),
 (1047, 19),
 (1360, 19),
 (1385, 19),
 (1515, 19),
 (1877, 19),
 (78, 18),
 (261, 18),
 (540, 18),
 (565, 18),
 (644, 18),
 (652, 18),
 (830, 18),
 (945, 18),
 (997, 18),
 (1015, 18),
 (1182, 18),
 (1459, 18),
 (1532, 18),
 (1671, 18),
 (31, 17),
 (46, 17),
 ...]

In [ ]:
train_X[clusterer2.labels_==484][col_names[20:30]]

In [ ]:
clusterer2.labels_==484

In [42]:
import pickle

In [70]:
pickle.dump(clusterer, open('hdbscan_cluster_jac_50.pickle','wb'))

In [44]:
pickle.dump(clusterer2, open('hdbscan_cluster_man_05.pickle','wb'))

In [ ]:
del clusterer
del clusterer2