In [1]:
cd executible/


/Users/alexeygilman/repos/Cu_transition_time_course-/executible

In [2]:
%run Cu_transition_functionalized.py

In [3]:
import hdbscan
import time
from sklearn import metrics

In [4]:
df1_raw_FM40 = raw_data_cleanup("5G_counts.tsv")


columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']

df2_TPM = TPM_counts(df1_raw_FM40, "start_coord", "end_coord",columns, remove_zero = True)  #TPM counts
df2_TPM_log2 = log_2_transform(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM log 2 transformed 
df2_TPM_mean = mean_center(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM mean centered 

df3_pearson_r = congruency_table(df2_TPM, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1", step = df2_TPM.shape[0])
df3_euclidean_mean = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1")
df3_euclidean_log2 = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1" )

print("The shape of the TPM table is ", df2_TPM.shape)
print("The shape of the pearson_r matrix is ", df3_pearson_r.shape)


5G_counts.tsv was located in the directory
5G_counts.tsv was imported into dataframe
QC columns were removed from dataframe
All non FM40 data were removed from dataframe
All FM40 columns were sorted by timecourse sequence
Clean-up of raw data complete
The shape of the TPM table is  (4480, 16)
The shape of the pearson_r matrix is  (4480, 4480)

Clustering pearsons_r with HDBSCAN


In [5]:
# Clustering the pearsons_R with N/A vlaues removed 

hdb_t1 = time.time()
hdb_pearson_r = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10).fit(df3_pearson_r)
hdb_pearson_r_labels = hdb_pearson_r.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)


time to cluster 4.785267114639282

In [6]:
print(np.unique(hdb_pearson_r_labels)) # unique bins, zero is noise
print(np.bincount(hdb_pearson_r_labels[hdb_pearson_r_labels!=-1]))


[-1  0  1]
[224  15]

In [24]:
pearson_clusters = {i: np.where(hdb_pearson_r_labels == i)[0] for i in range(2)}

In [23]:
pearson_clusters


Out[23]:
{0: array([  75,  109,  124,  125,  208,  217,  236,  237,  250,  278,  281,
         282,  318,  319,  320,  341,  342,  344,  404,  405,  406,  407,
         408,  499,  500,  501,  502,  503,  517,  525,  526,  530,  608,
         657,  660,  661,  663,  665,  666,  669,  673,  683,  686,  687,
         688,  690,  691,  692,  693,  694,  696,  699,  700,  738,  847,
         938,  983,  990,  992,  993,  995, 1028, 1107, 1123, 1151, 1152,
        1153, 1318, 1319, 1328, 1330, 1331, 1342, 1343, 1403, 1425, 1469,
        1503, 1512, 1544, 1550, 1577, 1632, 1633, 1641, 1643, 1804, 1805,
        1820, 1845, 1855, 1866, 1878, 1954, 1960, 1961, 1962, 1989, 2014,
        2110, 2156, 2162, 2165, 2175, 2189, 2218, 2234, 2318, 2332, 2334,
        2336, 2337, 2340, 2380, 2382, 2399, 2458, 2466, 2477, 2478, 2483,
        2506, 2518, 2540, 2586, 2587, 2603, 2605, 2606, 2654, 2725, 2727,
        2743, 2805, 2817, 2818, 2819, 2825, 3062, 3063, 3143, 3297, 3301,
        3363, 3391, 3414, 3437, 3483, 3484, 3596, 3597, 3599, 3602, 3603,
        3604, 3605, 3617, 3630, 3647, 3648, 3670, 3717, 3718, 3719, 3720,
        3723, 3724, 3725, 3726, 3727, 3734, 3735, 3736, 3738, 3740, 3741,
        3743, 3744, 3757, 3773, 3774, 3775, 3834, 3854, 3855, 3857, 3863,
        3865, 3866, 3877, 3884, 3915, 3951, 3965, 4046, 4082, 4083, 4087,
        4092, 4094, 4095, 4117, 4119, 4121, 4149, 4152, 4159, 4242, 4254,
        4261, 4283, 4304, 4306, 4353, 4354, 4355, 4382, 4390, 4398, 4427,
        4429, 4435, 4450, 4457]),
 1: array([1507, 2112, 2113, 2115, 2765, 3102, 3103, 3332, 3407, 4036, 4038,
        4098, 4391, 4394, 4397])}

In [25]:
#pd.set_option('display.height', 500)  #These two commands allow for the display of max of 500 rows - exploring genes
#pd.set_option('display.max_rows', 500) 
df2_TPM.iloc[pearson_clusters[1],:] #the genes that were clustered together [0,1]


Out[25]:
product type gene_symbol locus start_coord end_coord note translation 5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
locus_tag
MBURv2_160283 putative ferric siderophore transport system, ... CDS NaN MBURv2 3576162 3576686 Evidence 3 : Function proposed based on presen... NaN 118.243568 112.275736 110.964306 100.836391 77.197657 70.032566 73.893920 95.436505
MBURv2_20180 ferrous iron transporter, protein A CDS feoA MBURv2 281921 282154 Evidence 2a : Function of homologous gene expe... NaN 210.424105 158.632845 181.080545 126.342956 61.051857 70.194605 95.464947 118.980515
MBURv2_20181 fused ferrous iron transporter, protein B: GTP... CDS feoB MBURv2 282154 284478 Evidence 2a : Function of homologous gene expe... NaN 95.819838 86.744847 91.146440 71.621996 32.461622 39.674872 55.479556 64.157192
MBURv2_20183 conserved protein of unknown function CDS NaN MBURv2 284953 286617 Evidence 4 : Homologs of previously reported g... NaN 92.724781 77.558180 86.194789 70.352941 49.814896 47.463966 55.260844 59.371024
MBURv2_210346 Histidine kinase CDS NaN MBURv2 4474339 4475010 NaN NaN 70.518396 68.822867 66.867096 63.493064 49.910520 54.874815 53.845876 59.197819
MBURv2_240071 conserved protein of unknown function CDS NaN MBURv2 4822700 4823671 Evidence 4 : Homologs of previously reported g... NaN 115.521215 125.266171 122.873477 171.859978 242.070964 230.034969 215.806091 173.416287
MBURv2_240072 conserved protein of unknown function CDS NaN MBURv2 4823671 4825122 Evidence 4 : Homologs of previously reported g... NaN 152.884221 166.004637 159.391668 244.378006 339.549453 333.040188 298.420330 260.795367
MBURv2_30040 conserved exported protein of unknown function CDS NaN MBURv2 648480 649751 Evidence 4 : Homologs of previously reported g... NaN 346.196665 456.748233 416.782502 555.641953 1007.956458 965.279974 855.040811 766.323303
MBURv2_30115 Integrase/recombinase CDS int MBURv2 749393 750415 NaN NaN 18.954208 19.738377 18.230891 15.962341 11.687684 12.175099 14.009989 14.678014
MBURv2_60111 nitrite reductase, large subunit, NAD(P)H-binding CDS nirB MBURv2 1491766 1494321 Evidence 2a : Function of homologous gene expe... NaN 605.258423 691.437830 667.622137 817.519655 1077.573416 1024.202207 988.430960 914.548557
MBURv2_60113 Response regulator receiver and ANTAR domain p... CDS NaN MBURv2 1496503 1497081 NaN NaN 23.493069 27.461091 27.963444 34.230125 49.110936 51.477168 44.846438 43.132774
MBURv2_60177 Transcriptional antiterminator, Rof CDS NaN MBURv2 1561635 1561889 NaN NaN 245.862088 217.642197 254.978512 131.689317 45.544737 44.577813 81.965303 131.877279
MBURv2_80064 fused DNA-binding response regulator in two-co... CDS glnG MBURv2 1901015 1902424 Evidence 2a : Function of homologous gene expe... NaN 108.514855 116.924797 121.151452 163.911171 229.124357 225.811603 183.639002 180.891342
MBURv2_80067 nitrogen assimilation regulatory protein for G... CDS glnK MBURv2 1906036 1906374 Evidence 2a : Function of homologous gene expe... NaN 1236.258838 1480.016871 1362.990010 2351.761430 3215.524758 3305.708022 3031.353754 2351.879330
MBURv2_80070 ammonium transporter CDS amtB MBURv2 1908452 1909744 Evidence 2a : Function of homologous gene expe... NaN 178.796228 216.015070 211.287476 293.525608 434.560734 446.470951 376.033774 368.793380

Looks like there are two clusters, some expression and zero expression across samples.

Clustering mean centered euclidean distance with with HDBSCAN


In [27]:
df3_euclidean_mean.hist()


Out[27]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x113686b00>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x115b5dda0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x115baf9e8>, ...,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1181f8780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x118324cc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11836df28>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1183afd30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1183fd978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x118532be0>, ...,
        <matplotlib.axes._subplots.AxesSubplot object at 0x119eecb70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a036f98>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a078748>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x11a0c6198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a1008d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11a150518>, ...,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd07630>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd55278>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11bd912e8>],
       ..., 
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1aad462e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1aad88240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1aadc8a90>, ...,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1ad083cf8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1ad0c6588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1ad109a20>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1ad159978>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1ad1a3048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1ad458b38>, ...,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1aedabcc0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1aede3198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1aef380f0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1aef6e940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1af12c2b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1aefcf0b8>, ...,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1b0c3b240>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1b0c66780>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1b0c58da0>]], dtype=object)

In [10]:
# Clustering the mean centered euclidean distance of TPM counts 

hdb_t1 = time.time()
hdb_euclidean_mean = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10).fit(df3_euclidean_mean)
hdb_euclidean_mean_labels = hdb_euclidean_mean.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)


time to cluster 6.022979021072388

In [11]:
print(np.unique(hdb_euclidean_mean_labels))
print(np.bincount(hdb_euclidean_mean_labels[hdb_euclidean_mean_labels!=-1]))


[-1]
[]

In [12]:
euclidean_mean_clusters = {i: np.where(hdb_euclidean_mean_labels == i)[0] for i in range(2)}
df2_TPM.iloc[euclidean_mean_clusters[1],:]


Out[12]:
product type gene_symbol locus start_coord end_coord note translation 5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
locus_tag

Looks like 2 clusters - both with zero expression.

looks like wether it is a numpy array or pandas dataframe, the result is the same. lets now try to get index of the clustered points.

Clustering log transformed euclidean distance with with HDBSCAN


In [28]:
df3_euclidean_log2


Out[28]:
locus_tag MBURv2_100001 MBURv2_100002 MBURv2_100003 MBURv2_10001 MBURv2_10002 MBURv2_10003 MBURv2_10004 MBURv2_10005 MBURv2_10006 MBURv2_10007 ... MBURv2_tRNA4 MBURv2_tRNA40 MBURv2_tRNA41 MBURv2_tRNA42 MBURv2_tRNA43 MBURv2_tRNA44 MBURv2_tRNA5 MBURv2_tRNA6 MBURv2_tRNA7 MBURv2_tRNA8
locus_tag
MBURv2_100001 0.000000 5.223666 7.344067 14.843396 29.298972 21.745323 5.826292 5.193588 21.612292 13.313414 ... 26.117059 419.208451 302.921397 208.678333 61.568151 34.697223 345.013663 373.585068 158.429465 560.223957
MBURv2_100002 5.223666 0.000000 7.404187 11.260013 28.311112 25.650899 4.964426 5.368636 21.517591 12.377645 ... 29.484939 418.969169 302.496445 210.394089 62.091403 35.519929 344.939793 371.929211 160.390886 558.506612
MBURv2_100003 7.344067 7.404187 0.000000 11.144207 33.202231 23.468108 6.460714 8.801506 26.465912 17.807287 ... 25.927345 422.591759 306.608212 211.532877 63.378991 38.293495 348.362274 376.123186 162.422325 562.705638
MBURv2_10001 14.843396 11.260013 11.144207 0.000000 36.561595 32.567248 11.834242 15.719222 30.130630 21.260764 ... 31.907818 425.770363 308.543398 217.381023 69.715512 44.564529 351.601790 377.287805 169.289609 563.031356
MBURv2_10002 29.298972 28.311112 33.202231 36.561595 0.000000 42.318096 29.711239 26.956130 12.577875 17.564065 ... 52.600629 392.874864 279.141782 191.489245 46.816720 22.642093 322.403709 350.375007 146.718629 533.465735
MBURv2_10003 21.745323 25.650899 23.468108 32.567248 42.318096 0.000000 24.672872 23.686298 35.243224 30.588730 ... 20.153934 426.626657 309.720338 209.958893 67.288269 42.042220 349.383536 381.680300 153.974846 569.166693
MBURv2_10004 5.826292 4.964426 6.460714 11.834242 29.711239 24.672872 0.000000 6.982259 22.569002 14.056489 ... 26.930440 419.792728 302.463486 209.973733 63.109560 34.868167 344.580788 372.060187 159.402700 558.712223
MBURv2_10005 5.193588 5.368636 8.801506 15.719222 26.956130 23.686298 6.982259 0.000000 21.278581 12.748837 ... 29.101352 416.368657 300.264236 207.267639 57.952570 31.571145 341.959065 369.408899 155.940651 557.035614
MBURv2_10006 21.612292 21.517591 26.465912 30.130630 12.577875 35.243224 22.569002 21.278581 0.000000 9.313694 ... 43.521775 402.952193 288.522213 198.545200 55.964345 28.080435 332.237623 360.649670 152.974739 543.894001
MBURv2_10007 13.313414 12.377645 17.807287 21.260764 17.564065 30.588730 14.056489 12.748837 9.313694 0.000000 ... 37.379907 409.496997 294.428108 203.728809 57.434688 29.863178 337.654534 364.971505 156.377022 549.842995
MBURv2_10008 5.151832 3.527069 8.214563 13.148613 26.531868 25.798741 4.328001 5.765226 19.148144 10.245706 ... 29.550887 417.325488 301.081284 208.738904 61.177857 33.690741 343.527125 370.709311 159.281358 557.094958
MBURv2_10009 5.098532 4.023633 8.954840 14.298731 26.063487 25.260119 4.512657 4.106327 19.379939 10.660253 ... 29.480401 416.345368 299.660501 207.617770 59.878550 31.928729 341.855846 369.258604 157.065790 556.085646
MBURv2_10010 4.926435 5.004352 5.177241 12.330534 31.084286 23.654025 4.333495 5.925424 23.958112 15.192813 ... 25.592905 421.204966 304.917675 211.613220 62.842726 36.009687 346.714979 373.259983 160.344005 561.206832
MBURv2_10011 5.104089 3.419565 5.589917 11.694331 28.919057 25.388140 3.533459 4.963500 22.338442 13.394031 ... 28.354303 418.814756 302.546269 209.569689 61.151941 34.592945 344.617729 371.699040 159.695125 558.680952
MBURv2_10012 5.353349 4.335242 5.736744 11.513034 29.566707 25.266159 4.216527 5.667249 23.257519 14.480321 ... 28.551417 418.618687 301.854251 208.395201 61.033044 34.745621 344.021013 372.397410 159.166847 558.566800
MBURv2_10013 4.767168 2.904000 6.313091 12.050952 28.237107 25.548689 4.186581 4.386920 21.740342 12.753933 ... 29.026937 418.066858 301.845215 208.882883 60.564481 34.181762 344.020796 371.399311 159.277771 558.145870
MBURv2_10014 7.137315 4.573779 4.853084 9.072458 31.739107 25.784811 5.859765 8.083902 24.744124 15.651721 ... 27.796783 422.370803 306.382679 213.464757 64.358179 38.653216 348.603566 374.675381 163.614499 561.814913
MBURv2_10015 8.821576 7.634443 8.059400 10.890085 34.120947 23.488656 7.852092 11.123362 26.063890 17.560773 ... 24.976554 425.891701 308.913673 216.596630 68.493547 41.183690 351.172301 377.035327 164.809559 564.345631
MBURv2_10016 5.473072 4.640750 6.709995 13.228401 29.600020 23.341955 7.120622 6.014501 22.546320 13.722806 ... 27.750761 420.593808 305.220634 211.877660 62.178194 36.609192 347.205541 373.457470 161.286224 560.952354
MBURv2_10017 16.484242 15.216898 17.248377 17.184570 38.217767 27.355731 16.989735 17.944885 29.897758 22.285965 ... 27.269387 430.318642 314.069104 223.997088 73.531781 46.734393 356.503632 378.658753 169.043603 568.583008
MBURv2_10018 14.719229 12.653665 15.701557 18.470981 24.502257 32.894569 11.991885 12.170560 20.030998 13.145792 ... 34.778681 412.930571 297.100743 207.977142 58.113204 29.698566 338.962733 362.400244 156.633173 551.485149
MBURv2_10019 7.102828 6.078040 4.059463 10.827932 32.609974 23.512314 4.584478 7.668536 25.990928 17.198762 ... 25.515186 422.671662 305.742069 212.630346 64.044925 37.312389 347.416381 374.070513 161.171731 561.636255
MBURv2_10020 13.342915 10.779001 13.910215 17.017882 27.099766 29.007983 10.519584 10.059551 22.922785 15.228528 ... 32.079102 416.576860 299.618006 211.030145 59.997550 31.428062 340.945743 364.048508 156.404548 553.938846
MBURv2_10021 15.809889 16.304014 18.876677 25.297322 25.343368 28.480633 16.531385 12.930709 21.101930 15.827576 ... 32.603369 412.271905 297.914770 206.873601 54.683456 27.153217 338.574340 361.815627 151.696294 553.646611
MBURv2_10022 48.706590 48.057810 52.109685 56.060319 28.035638 56.189754 47.868454 46.014111 33.874448 39.029004 ... 64.530073 386.020025 271.373271 193.208714 53.966673 28.848718 311.948597 331.503029 136.339993 521.825252
MBURv2_10023 17.345098 16.513345 21.468757 22.759344 23.458081 31.277151 15.672589 18.065964 15.472613 12.637325 ... 36.104015 412.512668 292.918452 205.322444 64.960770 32.260503 336.531768 365.332080 155.048010 548.917820
MBURv2_10024 22.072849 21.269868 22.740729 28.163360 19.475415 31.497065 21.655572 20.610949 17.531862 15.960287 ... 42.118978 407.671006 294.558853 201.968313 54.336837 30.759292 336.314530 363.013479 156.090720 546.831545
MBURv2_10025 5.843282 6.871662 4.477948 13.070060 32.393181 20.603738 5.418681 7.488392 25.552959 17.172906 ... 24.512952 422.169634 305.116236 210.814710 63.406847 36.695945 346.725948 375.007036 159.698864 561.748556
MBURv2_10026 5.789831 6.246067 5.793123 13.005690 29.802820 22.436749 6.676438 8.228959 22.595212 14.372844 ... 28.105768 419.916980 303.758392 208.904746 62.432530 36.788103 346.060773 375.312493 160.985626 560.090725
MBURv2_10027 5.012880 3.927759 4.278387 10.509528 31.547935 23.715766 4.513973 6.442083 24.676551 15.750225 ... 26.620072 421.578129 304.869190 211.431882 63.405540 37.190341 346.946129 374.513045 161.138404 561.269852
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
MBURv2_tRNA17 89.598686 89.569546 92.759656 96.699079 71.982421 95.585421 88.510859 86.197666 80.070636 83.652526 ... 101.767019 351.970195 228.824117 163.622679 63.979192 56.702727 264.224739 291.267918 93.575526 486.762888
MBURv2_tRNA18 528.611516 526.668562 531.344919 530.181888 503.510613 541.573212 527.131137 525.511121 513.253973 518.514647 ... 548.989866 242.002337 269.042789 420.108531 490.962993 501.397772 272.178172 271.203277 460.464934 154.186542
MBURv2_tRNA19 45.001836 42.096171 45.984124 38.793854 47.010446 61.604272 41.612055 45.657097 43.266400 41.371464 ... 60.345644 415.339953 290.226140 214.024830 86.086118 56.948881 338.080158 366.446783 171.563743 542.983220
MBURv2_tRNA2 415.297577 416.393509 413.827333 418.146270 407.299425 412.983467 415.480054 415.123490 411.884815 414.005997 ... 424.329740 425.279357 396.710160 294.614779 386.389530 404.556761 408.563156 545.642954 400.694299 555.650298
MBURv2_tRNA21 184.256895 188.594409 184.410970 192.908915 201.989348 166.807344 187.809673 185.965860 196.128847 193.338233 ... 166.317274 519.483861 423.205160 287.545142 197.042766 193.947217 446.175888 491.771084 228.587445 692.860408
MBURv2_tRNA24 40.410035 39.474865 43.109352 44.868987 29.625714 50.849241 38.785253 37.753109 35.204007 35.790965 ... 58.637018 388.851935 265.367618 182.391183 50.646849 23.777437 307.669126 342.309612 132.322112 523.770497
MBURv2_tRNA27 1307.095350 1309.510973 1311.610325 1319.097838 1290.518093 1302.592800 1308.377514 1305.237997 1296.571333 1302.560038 ... 1310.692689 1030.104551 1086.255756 1145.353302 1264.962557 1275.659750 1030.809307 1092.566062 1157.036870 1046.090207
MBURv2_tRNA28 1750.818854 1750.350383 1754.848092 1757.611217 1727.163563 1756.299883 1750.618791 1747.178421 1736.771021 1742.370515 ... 1764.368238 1401.851348 1487.074290 1621.686576 1705.265267 1718.110614 1440.433784 1397.821662 1624.524665 1293.385865
MBURv2_tRNA29 1688.357563 1688.442695 1690.953012 1693.879444 1680.062415 1689.243765 1687.783661 1685.341327 1684.071790 1685.716953 ... 1686.419649 1616.928434 1617.928622 1690.393253 1668.582254 1667.701908 1578.798323 1448.707111 1603.347421 1598.527364
MBURv2_tRNA3 725.961575 726.736261 727.417878 730.447746 711.158594 726.581631 726.584013 725.218716 717.714374 721.865962 ... 739.554304 518.549743 541.842120 545.903423 690.350803 707.537303 537.204700 712.623951 657.084881 571.029864
MBURv2_tRNA30 1623.630931 1625.769305 1627.184656 1634.631891 1613.404920 1617.411856 1624.693388 1620.931312 1618.942891 1622.155984 ... 1621.538115 1449.605414 1478.228040 1525.963343 1586.039357 1595.292326 1415.937648 1401.826045 1480.514318 1473.249123
MBURv2_tRNA31 832.246686 834.562517 837.168203 842.688550 825.939926 829.750315 833.996759 830.575036 828.331758 831.194569 ... 829.009070 763.051880 746.532921 778.224488 808.368366 810.359452 714.758590 704.042644 716.926989 866.732078
MBURv2_tRNA32 91.256892 95.537997 95.163054 104.436681 92.847570 81.544943 93.530602 91.311219 91.643710 93.634404 ... 85.794106 395.897546 278.624793 169.475581 86.211996 75.442714 309.327317 361.332470 97.964651 551.754700
MBURv2_tRNA33 3714.406269 3712.633916 3716.643675 3717.342013 3689.917833 3721.891486 3712.451690 3710.463526 3700.358970 3705.298006 ... 3729.726831 3379.113976 3452.057772 3600.104992 3673.116737 3683.248598 3408.852386 3347.065546 3607.494981 3191.876275
MBURv2_tRNA34 2264.902744 2263.421788 2266.834227 2268.274163 2240.712340 2271.734821 2263.077758 2260.929503 2251.556771 2256.354522 ... 2280.253488 1919.575639 1996.873806 2135.592962 2219.641075 2232.713911 1949.219983 1905.103791 2153.009631 1742.271018
MBURv2_tRNA35 2525.574860 2523.732895 2527.659293 2527.733191 2500.503631 2534.356695 2523.579040 2521.979155 2511.086524 2516.215405 ... 2543.164161 2171.345149 2245.106825 2392.639565 2483.894455 2495.196254 2208.186487 2180.960039 2424.366480 1976.152881
MBURv2_tRNA36 526.928633 526.754539 529.963080 534.091200 506.457276 527.833869 526.012373 523.224923 515.596497 520.326172 ... 535.688676 352.247894 333.347056 439.337753 486.500870 493.994969 277.716358 218.947926 398.314453 336.182733
MBURv2_tRNA37 108.273438 105.302777 109.834925 106.079529 87.772027 124.280327 105.519978 106.052311 95.142485 98.618472 ... 129.565015 350.336706 225.431969 184.333360 101.980090 93.031462 277.365701 304.436675 164.489077 466.179068
MBURv2_tRNA38 444.225044 449.016741 446.033069 454.440061 452.364214 428.875487 446.147166 446.392012 448.229542 449.626373 ... 429.337661 612.881904 519.301571 402.591704 445.035667 438.663268 520.895381 642.002342 391.583424 771.744075
MBURv2_tRNA39 884.540437 885.837114 887.716339 893.193527 863.189283 885.146760 885.457252 882.753338 871.371429 877.783238 ... 897.804248 529.374047 627.316643 680.056539 835.597544 855.003043 589.737471 718.921385 767.269618 555.384858
MBURv2_tRNA4 26.117059 29.484939 25.927345 31.907818 52.600629 20.153934 26.930440 29.101352 43.521775 37.379907 ... 0.000000 438.825428 319.976013 221.675919 79.331880 51.108117 359.763697 388.578615 162.922679 579.852901
MBURv2_tRNA40 419.208451 418.969169 422.591759 425.770363 392.874864 426.626657 419.792728 416.368657 402.952193 409.496997 ... 438.825428 0.000000 177.771848 259.679241 367.531752 389.149017 173.087842 257.997541 333.585789 251.853620
MBURv2_tRNA41 302.921397 302.496445 306.608212 308.543398 279.141782 309.720338 302.463486 300.264236 288.522213 294.428108 ... 319.976013 177.771848 0.000000 171.270321 262.597191 272.509764 72.889486 211.780618 207.629045 290.536740
MBURv2_tRNA42 208.678333 210.394089 211.532877 217.381023 191.489245 209.958893 209.973733 207.267639 198.545200 203.728809 ... 221.675919 259.679241 171.270321 0.000000 163.613686 182.512782 200.596450 322.729339 139.946644 438.990067
MBURv2_tRNA43 61.568151 62.091403 63.378991 69.715512 46.816720 67.288269 63.109560 57.952570 55.964345 57.434688 ... 79.331880 367.531752 262.597191 163.613686 0.000000 38.349715 299.803018 334.153508 124.675989 519.912965
MBURv2_tRNA44 34.697223 35.519929 38.293495 44.564529 22.642093 42.042220 34.868167 31.571145 28.080435 29.863178 ... 51.108117 389.149017 272.509764 182.512782 38.349715 0.000000 312.294313 343.007076 128.543379 531.028663
MBURv2_tRNA5 345.013663 344.939793 348.362274 351.601790 322.403709 349.383536 344.580788 341.959065 332.237623 337.654534 ... 359.763697 173.087842 72.889486 200.596450 299.803018 312.294313 0.000000 195.888914 225.452239 275.748471
MBURv2_tRNA6 373.585068 371.929211 376.123186 377.287805 350.375007 381.680300 372.060187 369.408899 360.649670 364.971505 ... 388.578615 257.997541 211.780618 322.729339 334.153508 343.007076 195.888914 0.000000 288.852254 286.536007
MBURv2_tRNA7 158.429465 160.390886 162.422325 169.289609 146.718629 153.974846 159.402700 155.940651 152.974739 156.377022 ... 162.922679 333.585789 207.629045 139.946644 124.675989 128.543379 225.452239 288.852254 0.000000 479.344639
MBURv2_tRNA8 560.223957 558.506612 562.705638 563.031356 533.465735 569.166693 558.712223 557.035614 543.894001 549.842995 ... 579.852901 251.853620 290.536740 438.990067 519.912965 531.028663 275.748471 286.536007 479.344639 0.000000

4480 rows × 4480 columns


In [13]:
# Clustering the log2 transformed euclidean distance of TPM counts 

hdb_t1 = time.time()
hdb_euclidean_log2 = hdbscan.HDBSCAN(metric = "precomputed", min_cluster_size=10).fit(df3_euclidean_log2)
hdb_euclidean_log2_labels = hdb_euclidean_log2.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)


time to cluster 4.981569051742554

In [14]:
print(np.unique(hdb_euclidean_log2_labels))
print(np.bincount(hdb_euclidean_log2_labels[hdb_euclidean_log2_labels!=-1]))


[-1]
[]

In [15]:
euclidean_log2_clusters = {i: np.where(hdb_euclidean_log2_labels == i)[0] for i in range(2)}
df2_TPM.iloc[euclidean_log2_clusters[1],:]


Out[15]:
product type gene_symbol locus start_coord end_coord note translation 5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
locus_tag

Clustering using built-in HDBSCAN euclidean distance metric (mean centered and scaled to unit variance)


In [16]:
df2_TPM_values = df2_TPM.loc[:,"5GB1_FM40_T0m_TR2":"5GB1_FM40_T180m_TR1"] #isolating the data values 
    
df2_TPM_values_T = df2_TPM_values.T #transposing the data
standard_scaler = StandardScaler()

TPM_counts_mean_centered = standard_scaler.fit_transform(df2_TPM_values_T) #mean centering the data 
TPM_counts_mean_centered = pd.DataFrame(TPM_counts_mean_centered) #back to Dataframe

#transposing back to original form and reincerting indeces and columns 
my_index = df2_TPM_values.index
my_columns = df2_TPM_values.columns

TPM_counts_mean_centered = TPM_counts_mean_centered.T
TPM_counts_mean_centered.set_index(my_index, inplace=True)
TPM_counts_mean_centered.columns = my_columns

In [17]:
# Clustering the pearsons_R with N/A vlaues removed 

hdb_t1 = time.time()
hdb_euclidean = hdbscan.HDBSCAN(metric = "euclidean", min_cluster_size=5).fit(TPM_counts_mean_centered)
hdb_euclidean_labels = hdb_euclidean.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)


time to cluster 1.2530548572540283

In [18]:
print(np.unique(hdb_euclidean_labels))
print(np.bincount(hdb_euclidean_labels[hdb_euclidean_labels!=-1]))


[-1  0  1  2  3  4  5  6]
[   5    6    9  569    6 1538    6]
lets look at some clusters

Euclidean_standard_scaled_clusters = {i: np.where(hdb_euclidean_labels == i)[0] for i in range(7)} df2_TPM.iloc[Euclidean_standard_scaled_clusters[0],:]


In [30]:
Euclidean_standard_scaled_clusters = {i: np.where(hdb_euclidean_labels == i)[0] for i in range(7)}
df2_TPM.iloc[Euclidean_standard_scaled_clusters[1],:]


Out[30]:
product type gene_symbol locus start_coord end_coord note translation 5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
locus_tag
MBURv2_130823 conserved hypothetical protein; putative membr... CDS NaN MBURv2 3057071 3057622 Evidence 4 : Homologs of previously reported g... NaN 54.074431 42.385029 49.380410 45.924264 54.430108 52.911230 42.512312 49.776228
MBURv2_160196 DNA replication and repair protein RecF CDS recF MBURv2 3476509 3477588 NaN NaN 39.716085 33.586759 38.475054 34.019740 41.174621 39.432168 34.425668 39.151422
MBURv2_210033 histidyl tRNA synthetase CDS hisS MBURv2 4129786 4131057 Evidence 2a : Function of homologous gene expe... NaN 184.219401 153.374251 180.859518 160.574002 190.176907 182.024224 158.874769 178.341500
MBURv2_260039 hydroxymethylbilane synthase CDS hemC MBURv2 5063172 5064098 Evidence 2a : Function of homologous gene expe... NaN 186.700990 151.238240 172.944983 160.882580 188.444713 184.348282 153.582663 184.702961
MBURv2_60016 carbamoyl phosphate synthetase small subunit, ... CDS carA MBURv2 1374314 1375447 Evidence 2a : Function of homologous gene expe... NaN 417.342736 361.297562 405.602208 364.758505 454.132299 433.434899 357.685236 404.088838
MBURv2_60050 Na+/H+ antiporter NhaD CDS nhaD MBURv2 1419064 1420479 NaN NaN 196.773053 167.449917 183.707146 168.936189 219.249940 198.774079 178.440470 188.004395
Euclidean_standard_scaled_clusters

Clustering log2 transformed data using built-in HDBSCAN euclidean distance metric (mean centered and scaled to unit variance)


In [19]:
df2_TPM_log2_scale= df2_TPM_log2.T #transposing the data
standard_scaler = StandardScaler()

TPM_log2_mean_scaled = standard_scaler.fit_transform(df2_TPM_log2_scale) #mean centering the data 
TPM_log2_mean_scaled = pd.DataFrame(TPM_log2_mean_scaled) #back to Dataframe

#transposing back to original form and reincerting indeces and columns 
my_index = df2_TPM_values.index
my_columns = df2_TPM_values.columns

TPM_log2_mean_scaled = TPM_log2_mean_scaled.T
TPM_log2_mean_scaled.set_index(my_index, inplace=True)
TPM_log2_mean_scaled.columns = my_columns

In [20]:
# Clustering the pearsons_R with N/A vlaues removed 

hdb_t1 = time.time()
hdb_log2_euclidean = hdbscan.HDBSCAN(metric = "euclidean", min_cluster_size=5).fit(TPM_log2_mean_scaled)
hdb_log2_euclidean = hdb_log2_euclidean.labels_
hdb_elapsed_time = time.time() - hdb_t1
print("time to cluster", hdb_elapsed_time)


time to cluster 1.0190801620483398

In [21]:
print(np.unique(hdb_log2_euclidean))
print(np.bincount(hdb_log2_euclidean[hdb_log2_euclidean!=-1]))


[0 1]
[   8 4472]

In [ ]: