In [1]:
cd ../executible/


/Users/alexeygilman/repos/Cu_transition_time_course-/executible

In [2]:
%run Cu_transition_functionalized.py

In [3]:
df1_raw_FM40 = raw_data_cleanup("5G_counts.tsv")


columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']

df2_TPM = TPM_counts(df1_raw_FM40, "start_coord", "end_coord",columns, remove_zero = True)  #TPM counts
df2_TPM_log2 = log_2_transform(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM log 2 transformed 
df2_TPM_mean = mean_center(df2_TPM, "5GB1_FM40_T0m_TR2","5GB1_FM40_T180m_TR1") #TPM mean centered 

df3_pearson_r = congruency_table(df2_TPM, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1", step = df2_TPM.shape[0])
df3_euclidean_mean = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1")
df3_euclidean_log2 = euclidean_distance(df2_TPM_mean, "5GB1_FM40_T0m_TR2" , "5GB1_FM40_T180m_TR1" )

print("The shape of the TPM table is ", df2_TPM.shape)
print("The shape of the pearson_r matrix is ", df3_pearson_r.shape)


5G_counts.tsv was located in the directory
5G_counts.tsv was imported into dataframe
QC columns were removed from dataframe
All non FM40 data were removed from dataframe
All FM40 columns were sorted by timecourse sequence
Clean-up of raw data complete
The shape of the TPM table is  (4480, 16)
The shape of the pearson_r matrix is  (4480, 4480)

Historgram of raw data (TPM)


In [4]:
%matplotlib inline

In [5]:
df2_TPM_values = df2_TPM.loc[:,"5GB1_FM40_T0m_TR2":"5GB1_FM40_T180m_TR1"]
df2_TPM_values


Out[5]:
5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
locus_tag
MBURv2_100001 24.037381 19.388720 19.472578 20.618024 22.190693 20.189929 23.793199 21.197329
MBURv2_100002 7.344755 6.717352 3.700437 5.487055 3.996920 5.438920 4.915027 4.634447
MBURv2_100003 5.293517 3.812551 8.308673 2.372780 3.703710 2.939957 5.977735 0.939415
MBURv2_10001 14.858356 21.889301 17.962332 17.028791 13.191805 11.878101 11.058811 7.491024
MBURv2_10002 89.572248 87.557206 79.063709 99.523823 76.788120 104.245961 95.143906 105.923083
MBURv2_10003 113.068430 88.417142 98.559710 86.786918 104.395747 100.544474 111.386799 95.223404
MBURv2_10004 25.648352 24.401808 24.747034 26.128833 25.869279 27.132933 25.879122 21.871753
MBURv2_10005 22.297302 20.984785 19.619608 18.773890 21.282302 23.224037 22.635527 23.842506
MBURv2_10006 36.396059 26.023126 22.323652 44.069260 24.098416 39.400048 36.049981 41.600548
MBURv2_10007 20.143340 14.745406 11.247106 24.267950 11.512615 21.225053 18.401381 22.889647
MBURv2_10008 11.580831 9.197464 8.748884 12.469220 8.946006 11.460757 10.476239 10.181261
MBURv2_10009 14.828528 13.508870 11.664323 15.024491 13.869439 16.614155 14.931139 14.747629
MBURv2_10010 9.572940 6.566400 9.115164 6.905058 9.366779 8.046342 7.165447 6.736872
MBURv2_10011 4.470721 4.264058 4.801905 4.421453 4.038296 4.992245 4.530373 3.752895
MBURv2_10012 3.487920 5.521111 3.977287 3.908587 4.223751 4.221992 6.463597 2.856851
MBURv2_10013 5.018507 5.245979 4.566663 5.019768 4.319043 5.252161 5.502673 4.950706
MBURv2_10014 20.454360 19.060072 20.275982 17.839686 17.101530 16.968715 16.306752 15.838166
MBURv2_10015 26.360125 19.849017 20.674505 19.307832 20.091929 19.220243 17.461280 14.373755
MBURv2_10016 23.170053 17.866771 19.543940 17.196543 17.188522 18.316637 17.823307 19.170844
MBURv2_10017 48.902660 36.453399 35.572057 34.669799 37.677973 33.488041 26.343918 32.544445
MBURv2_10018 23.737398 25.080547 27.753276 31.490924 26.749574 34.978560 22.317962 31.074723
MBURv2_10019 18.517686 16.976580 19.045605 14.632146 17.579527 18.019855 16.532363 13.060714
MBURv2_10020 24.024225 23.626548 23.557157 22.488816 23.514604 32.865722 18.522146 24.827395
MBURv2_10021 29.692040 21.808389 28.230940 27.360109 31.130611 36.507817 24.709793 38.805558
MBURv2_10022 96.209126 80.186630 82.250455 105.792255 85.242146 132.293947 91.431841 115.618788
MBURv2_10023 52.762325 47.912540 38.340475 58.901854 47.660068 58.089143 52.059163 45.753342
MBURv2_10024 90.827452 79.500965 87.834543 88.998824 71.028470 97.820864 90.333875 90.524501
MBURv2_10025 21.283873 17.843615 19.669862 15.492861 19.162589 19.364575 21.273089 14.635096
MBURv2_10026 33.485767 29.903050 30.668632 29.972300 27.260100 29.436447 34.563738 27.470311
MBURv2_10027 18.273279 17.450707 16.849999 14.887130 16.593877 15.602263 17.156475 13.791518
... ... ... ... ... ... ... ... ...
MBURv2_tRNA17 82.725140 106.593766 88.990261 122.159169 127.120091 183.921364 129.213472 155.726567
MBURv2_tRNA18 165.063715 518.650264 208.992279 600.399320 297.515106 681.296260 449.336939 635.254629
MBURv2_tRNA19 240.373815 266.949303 230.898382 277.060641 246.489099 255.699862 245.304528 231.571882
MBURv2_tRNA2 923.892922 1037.300528 1123.839125 1056.980044 867.301613 1092.077828 1394.865251 1055.098941
MBURv2_tRNA21 592.158985 442.821785 578.244075 432.693470 601.985111 478.907477 592.292665 545.751277
MBURv2_tRNA24 52.271766 76.159066 49.240577 72.685662 65.404147 93.944977 83.012890 73.805073
MBURv2_tRNA27 1321.664261 1138.994162 1123.794180 1568.858729 1864.163541 2283.621094 2180.502562 2255.817087
MBURv2_tRNA28 1850.532738 2332.896748 1685.389878 2536.181316 2405.810033 3514.182013 2458.178456 3457.024862
MBURv2_tRNA29 2658.801473 2440.304960 2639.388451 2639.125095 3579.172518 3825.618534 2128.671610 3637.649487
MBURv2_tRNA3 528.059301 860.518930 588.890686 842.298553 530.796924 896.362224 1362.962510 880.243996
MBURv2_tRNA30 3201.926810 2924.169279 3120.726641 3008.061240 4111.568612 4394.074617 3718.379577 4332.903065
MBURv2_tRNA31 1476.682412 1350.718018 1286.313768 1497.965980 2100.636962 1843.507527 1602.654483 2070.408716
MBURv2_tRNA32 184.286591 142.111247 169.014954 173.590463 235.810871 214.731376 242.432109 218.029667
MBURv2_tRNA33 1332.106699 2909.850713 1603.173029 3164.009115 2068.180769 5586.343071 2542.362369 4140.817432
MBURv2_tRNA34 886.782579 1959.257055 1286.987936 2056.779200 1440.694363 3576.089717 1979.236088 2756.428834
MBURv2_tRNA35 388.885473 1816.071399 666.078618 1981.404394 873.161759 3265.498775 1709.750334 2328.352280
MBURv2_tRNA36 437.251415 490.715631 409.229121 531.032895 583.743138 999.489793 596.026809 794.250928
MBURv2_tRNA37 339.957252 418.482290 346.014866 437.824223 350.156898 435.113578 386.053027 397.464020
MBURv2_tRNA38 1006.870171 763.587658 966.259533 929.697950 1190.883589 1004.426588 1317.440923 944.709356
MBURv2_tRNA39 574.226335 758.450079 616.838040 993.655755 706.987688 1226.935527 1447.411609 1281.093569
MBURv2_tRNA4 42.733123 21.198915 37.928553 25.653763 51.611436 29.666835 36.479713 21.667545
MBURv2_tRNA40 95.868387 238.642759 115.957006 338.753440 121.259945 416.506747 382.402032 514.515088
MBURv2_tRNA41 355.313346 498.585555 327.915627 538.376913 431.264322 638.593163 581.563338 594.981213
MBURv2_tRNA42 119.082968 174.919839 149.611505 222.115981 170.839785 255.991822 352.112532 294.750830
MBURv2_tRNA43 28.187980 45.750612 47.774870 37.371293 34.259315 69.823969 72.330600 95.115762
MBURv2_tRNA44 25.664433 27.795938 25.323944 35.571252 35.834456 58.765340 47.285949 53.935375
MBURv2_tRNA5 228.927443 376.869604 227.571316 398.488453 338.588817 582.035045 495.779408 515.281293
MBURv2_tRNA6 204.361767 343.470744 226.342861 358.889567 295.157178 608.441789 280.723655 514.500011
MBURv2_tRNA7 233.681415 239.045069 204.947267 230.077887 307.546267 361.344322 337.611507 333.200760
MBURv2_tRNA8 67.438209 330.447906 67.539440 415.270289 75.136907 629.184122 357.903327 466.275401

4480 rows × 8 columns


In [6]:
df2_TPM_values.describe()


Out[6]:
5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
count 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000
mean 223.212355 223.212637 223.212795 223.214017 223.212380 223.212530 223.213161 223.213154
std 1152.060115 1148.631981 1199.539401 1197.234575 1273.559525 1210.463857 1113.872291 1323.682884
min 0.046969 0.048326 0.040957 0.052634 0.054771 0.043477 0.017680 0.041677
25% 28.773103 27.152989 26.820393 25.717298 23.940108 24.267359 24.899048 22.366498
50% 65.454814 61.283415 60.593187 58.463796 55.551369 55.484343 57.089228 52.620759
75% 136.247090 129.407769 129.317888 125.427803 125.157696 121.476406 126.784235 115.605307
max 38276.484324 33395.468885 38156.821484 40112.163709 53334.428836 45789.636731 34735.533389 55623.888672

In [121]:
df2_TPM.idxmax?

In [138]:
index = df2_TPM_values.sort("5GB1_FM40_T0m_TR2", ascending = False).index.tolist()
top_expressed = df2_TPM.loc[index].iloc[:20,:]
top_expressed.to_csv("top_expressed.csv")
top_expressed

#note: looking with mitch at gene MCBURv2_200002 - this is low expressed in all other transcriptomcis data except for
#fm81 where we added extra NO3 and its within the top 15 top expressed genes 

#gene 20471 is top expressed (within top 15 genes across all transcripomics samples that we have)


/Users/alexeygilman/anaconda/lib/python3.4/site-packages/ipykernel/__main__.py:1: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  if __name__ == '__main__':
Out[138]:
product type gene_symbol locus start_coord end_coord note translation 5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
locus_tag
MBURv2_200002 conserved protein of unknown function CDS NaN MBURv2 4089082 4089993 Evidence 4 : Homologs of previously reported g... NaN 38276.484324 33395.468885 38156.821484 34194.603990 29495.091672 26024.515218 25707.369485 19990.740574
MBURv2_210045 Particulate methane monooxygenase, C subunit CDS pmoC MBURv2 4145308 4146060 NaN NaN 34771.145240 30260.438551 35975.798188 40112.163709 53334.428836 45789.636731 34735.533389 55623.888672
MBURv2_130051 Methane monooxygenase component A alpha chain CDS mmoX MBURv2 2195408 2196985 NaN NaN 20876.047754 23334.700740 23554.421849 14831.488460 5236.638378 1206.833556 8886.181868 116.863397
MBURv2_210294 Methanol dehydrogenase [cytochrome c] subunit 2 CDS moxI MBURv2 4428610 4428894 NaN NaN 19513.752449 22060.136681 20446.725888 20908.798025 20034.727359 17919.427508 28439.225545 15807.184089
MBURv2_60380 Copper-repressible polypeptide CDS corA MBURv2 1788771 1789478 NaN NaN 14817.898886 13864.528615 14259.683730 13831.749325 15987.438513 13671.508292 12265.626813 10985.775012
MBURv2_210291 Methanol dehydrogenase [cytochrome c] subunit 1 CDS moxF MBURv2 4425383 4427191 NaN NaN 13590.696508 17995.517973 13917.409231 15841.555101 15445.970422 13572.525008 17521.826598 11629.983396
MBURv2_130049 Methane monooxygenase regulatory protein B CDS mmoB MBURv2 2193680 2194105 NaN NaN 13526.005086 17575.288572 17778.694312 11085.860188 3487.855641 743.191354 5752.450643 99.746681
MBURv2_130048 Methane monooxygenase component A gamma chain CDS mmoZ MBURv2 2193167 2193670 NaN NaN 13506.480648 16596.656923 16160.640965 9796.221967 3227.173118 703.930027 5362.338302 93.619968
MBURv2_210002 conserved exported protein of unknown function CDS NaN MBURv2 4091448 4091843 Evidence 4 : Homologs of previously reported g... NaN 13158.834189 12324.813785 11666.985452 11869.829886 13053.837277 13239.511781 12188.596640 8935.398178
MBURv2_160206 conserved exported protein of unknown function CDS NaN MBURv2 3487107 3487493 Evidence 4 : Homologs of previously reported g... NaN 12853.359995 8335.764865 12108.173320 8478.818374 10585.420214 8189.860667 7780.659138 6006.728337
MBURv2_130050 Methane monooxygenase component A beta chain CDS mmoY MBURv2 2194116 2195294 NaN NaN 12292.466793 16356.956614 14267.780334 9190.830875 2887.720299 640.353220 6107.690148 90.035631
MBURv2_210049 conserved protein of unknown function CDS NaN MBURv2 4154134 4154265 Evidence 4 : Homologs of previously reported g... NaN 12079.451895 11188.054669 12131.868994 13086.127050 12153.232550 14019.227620 13550.226666 14531.361293
MBURv2_210293 Cytochrome c-L CDS moxG MBURv2 4428119 4428592 NaN NaN 8286.929523 9314.841129 8757.279975 8589.671851 10849.361384 7838.814362 11135.942374 7112.372912
MBURv2_30142 conserved exported protein of unknown function CDS NaN MBURv2 777161 777496 Evidence 4 : Homologs of previously reported g... NaN 8183.369135 7849.765303 7914.715771 8389.706902 7332.818976 8663.907745 10141.324341 8488.331092
MBURv2_20471 putative lipoprotein CDS NaN MBURv2 582565 582843 Evidence 3 : Function proposed based on presen... NaN 7963.715630 5644.308977 8425.977153 5732.261326 5959.033371 4603.002597 5735.281545 4538.394367
MBURv2_210047 Particulate methane monooxygenase beta subunit CDS pmoB MBURv2 4147006 4148250 Evidence 1c : Function experimentally demonstr... NaN 7928.513074 8422.733585 8766.228380 18596.223948 24161.925562 26263.909885 18860.734976 32813.518120
MBURv2_60085 Formaldehyde-activating enzyme CDS fae MBURv2 1464085 1464594 NaN NaN 7782.099634 6087.106098 7376.795574 6539.665659 8794.769279 9085.662071 6882.483346 8987.998841
MBURv2_250064 stress protein, member of the CspA-family CDS cspC MBURv2 4929048 4929257 Evidence 2a : Function of homologous gene expe... NaN 7689.329406 7677.357274 7458.372623 8244.691880 6028.008091 10424.337136 5941.951633 10333.658241
MBURv2_210292 Protein MoxJ CDS moxJ MBURv2 4427257 4428102 NaN NaN 6645.041608 7625.123228 7414.621469 6797.254614 9605.161924 6365.786218 7434.370439 5746.529419
MBURv2_240078 conserved protein of unknown function CDS NaN MBURv2 4832847 4833041 Evidence 4 : Homologs of previously reported g... NaN 6578.942282 6780.391509 7116.136963 15123.673891 12958.629921 14331.971843 11549.935286 13292.663539

In [10]:
df2_TPM_values.plot.hist(bins=100)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1125f77f0>

In [11]:
import matplotlib.pyplot as plt 
plt.style.use("seaborn-white")

In [ ]:


In [12]:
"""
columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']
"""

x1 = df2_TPM_values["5GB1_FM40_T0m_TR2"]
x2 = df2_TPM_values['5GB1_FM40_T10m_TR3']
x3 = df2_TPM_values['5GB1_FM40_T20m_TR2']
x4 = df2_TPM_values['5GB1_FM40_T40m_TR1']
x5 = df2_TPM_values['5GB1_FM40_T60m_TR1']
x6 = df2_TPM_values['5GB1_FM40_T90m_TR2']
x7 = df2_TPM_values['5GB1_FM40_T150m_TR1_remake']
x8 = df2_TPM_values['5GB1_FM40_T180m_TR1']


kwargs = dict(histtype = "stepfilled", alpha = 0.1, normed = True, bins = 1000) 

axes = plt.gca()
axes.set_xlim([0,5000])
axes.set_ylim([0,0.001])

plt.hist(x1, **kwargs)
plt.hist(x2, **kwargs)
plt.hist(x3, **kwargs)
plt.hist(x4, **kwargs)
plt.hist(x5, **kwargs)
plt.hist(x6, **kwargs)
plt.hist(x7, **kwargs)
plt.hist(x8, **kwargs)


Out[12]:
(array([  9.31399724e-03,   4.00489842e-03,   1.50484660e-03,
          9.02907961e-04,   4.25369973e-04,   2.96956396e-04,
          2.36762532e-04,   1.04336031e-04,   9.22972583e-05,
          1.16374804e-04,   5.21680155e-05,   4.41421670e-05,
          4.01292427e-05,   5.21680155e-05,   4.41421670e-05,
          3.21033942e-05,   3.61163184e-05,   2.80904699e-05,
          2.40775456e-05,   1.60516971e-05,   2.40775456e-05,
          1.60516971e-05,   1.20387728e-05,   1.60516971e-05,
          1.20387728e-05,   1.60516971e-05,   1.20387728e-05,
          1.20387728e-05,   1.20387728e-05,   8.02584854e-06,
          1.60516971e-05,   3.21033942e-05,   8.02584854e-06,
          8.02584854e-06,   1.20387728e-05,   4.01292427e-06,
          1.20387728e-05,   8.02584854e-06,   2.00646214e-05,
          2.00646214e-05,   8.02584854e-06,   8.02584854e-06,
          2.80904699e-05,   1.60516971e-05,   8.02584854e-06,
          1.60516971e-05,   1.20387728e-05,   8.02584854e-06,
          1.20387728e-05,   4.01292427e-06,   8.02584854e-06,
          8.02584854e-06,   8.02584854e-06,   1.20387728e-05,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          4.01292427e-06,   1.20387728e-05,   8.02584854e-06,
          4.01292427e-06,   0.00000000e+00,   8.02584854e-06,
          0.00000000e+00,   4.01292427e-06,   4.01292427e-06,
          4.01292427e-06,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   8.02584854e-06,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          8.02584854e-06,   4.01292427e-06,   0.00000000e+00,
          4.01292427e-06,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   8.02584854e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.01292427e-06,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   4.01292427e-06,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   4.01292427e-06,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.01292427e-06]),
 array([  4.16766818e-02,   5.56655237e+01,   1.11289371e+02, ...,
          5.55126410e+04,   5.55682648e+04,   5.56238887e+04]),
 <a list of 1 Patch objects>)

Standard scale across rows and replot historgram


In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [14]:
#will start with standard scalar - need to scale along rows, therefore transposing the data to complete standard scale. 

df2_TPM_values_T = df2_TPM_values.T #transposing the data
standard_scaler = StandardScaler()

array_stand_scale = standard_scaler.fit_transform(df2_TPM_values_T)
df2a_stand_scale = pd.DataFrame(array_stand_scale)
df2a_stand_scale = df2a_stand_scale.T

df2a_stand_scale.columns = df2_TPM_values.columns
df2a_stand_scale.index = df2_TPM_values.index

In [15]:
df2a_stand_scale.describe()


Out[15]:
5GB1_FM40_T0m_TR2 5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
count 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000
mean 0.702837 0.197448 0.068517 -0.042205 -0.281593 -0.167623 -0.026290 -0.451091
std 0.995684 0.872936 0.894152 0.771213 0.935251 0.952852 0.960458 1.139213
min -2.098707 -2.370383 -2.298431 -2.347316 -2.464224 -2.392335 -2.502359 -2.568586
25% -0.060390 -0.467155 -0.687562 -0.599912 -0.982392 -0.946406 -0.709415 -1.373641
50% 0.915573 0.260343 0.166170 -0.075194 -0.328976 -0.334578 -0.063412 -0.727074
75% 1.512664 0.846053 0.790491 0.462960 0.365077 0.576012 0.637181 0.354870
max 2.571021 2.402075 2.449422 2.415837 2.336944 2.556948 2.524248 2.601184

In [16]:
"""
columns = ['5GB1_FM40_T0m_TR2', '5GB1_FM40_T10m_TR3', '5GB1_FM40_T20m_TR2', '5GB1_FM40_T40m_TR1',
           '5GB1_FM40_T60m_TR1', '5GB1_FM40_T90m_TR2', '5GB1_FM40_T150m_TR1_remake', '5GB1_FM40_T180m_TR1']
"""

x1 = df2a_stand_scale["5GB1_FM40_T0m_TR2"]
x2 = df2a_stand_scale['5GB1_FM40_T10m_TR3']
x3 = df2a_stand_scale['5GB1_FM40_T20m_TR2']
x4 = df2a_stand_scale['5GB1_FM40_T40m_TR1']
x5 = df2a_stand_scale['5GB1_FM40_T60m_TR1']
x6 = df2a_stand_scale['5GB1_FM40_T90m_TR2']
x7 = df2a_stand_scale['5GB1_FM40_T150m_TR1_remake']
x8 = df2a_stand_scale['5GB1_FM40_T180m_TR1']


kwargs = dict(histtype = "stepfilled", alpha = 0.1, normed = True, bins = 1000) 

""" 
axes = plt.gca()
axes.set_xlim([0,5000])
axes.set_ylim([0,0.001])
"""

plt.hist(x1, **kwargs)
#plt.hist(x2, **kwargs)
#plt.hist(x3, **kwargs)
#plt.hist(x4, **kwargs)
#plt.hist(x5, **kwargs)
#plt.hist(x6, **kwargs)
#plt.hist(x7, **kwargs)
#plt.hist(x8, **kwargs)


Out[16]:
(array([ 0.04780028,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.04780028,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.04780028,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.04780028,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.09560055,  0.        ,  0.04780028,  0.09560055,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.04780028,  0.04780028,
         0.        ,  0.04780028,  0.        ,  0.        ,  0.        ,
         0.04780028,  0.        ,  0.        ,  0.        ,  0.        ,
         0.09560055,  0.        ,  0.        ,  0.04780028,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.04780028,  0.        ,
         0.04780028,  0.        ,  0.        ,  0.        ,  0.        ,
         0.04780028,  0.        ,  0.04780028,  0.04780028,  0.04780028,
         0.09560055,  0.04780028,  0.04780028,  0.        ,  0.        ,
         0.09560055,  0.        ,  0.        ,  0.14340083,  0.09560055,
         0.04780028,  0.04780028,  0.04780028,  0.14340083,  0.        ,
         0.        ,  0.04780028,  0.04780028,  0.04780028,  0.        ,
         0.        ,  0.        ,  0.        ,  0.04780028,  0.04780028,
         0.09560055,  0.09560055,  0.09560055,  0.        ,  0.        ,
         0.        ,  0.09560055,  0.09560055,  0.1912011 ,  0.04780028,
         0.04780028,  0.14340083,  0.1912011 ,  0.09560055,  0.09560055,
         0.09560055,  0.14340083,  0.14340083,  0.09560055,  0.04780028,
         0.33460193,  0.04780028,  0.        ,  0.04780028,  0.1912011 ,
         0.1912011 ,  0.09560055,  0.09560055,  0.14340083,  0.04780028,
         0.04780028,  0.        ,  0.09560055,  0.09560055,  0.        ,
         0.14340083,  0.09560055,  0.09560055,  0.23900138,  0.1912011 ,
         0.09560055,  0.1912011 ,  0.        ,  0.09560055,  0.04780028,
         0.14340083,  0.09560055,  0.28680165,  0.14340083,  0.28680165,
         0.09560055,  0.1912011 ,  0.09560055,  0.09560055,  0.14340083,
         0.1912011 ,  0.1912011 ,  0.1912011 ,  0.14340083,  0.14340083,
         0.14340083,  0.1912011 ,  0.1912011 ,  0.09560055,  0.28680165,
         0.14340083,  0.14340083,  0.1912011 ,  0.14340083,  0.14340083,
         0.1912011 ,  0.14340083,  0.28680165,  0.28680165,  0.04780028,
         0.3824022 ,  0.23900138,  0.3824022 ,  0.09560055,  0.1912011 ,
         0.23900138,  0.09560055,  0.28680165,  0.04780028,  0.23900138,
         0.09560055,  0.23900138,  0.1912011 ,  0.04780028,  0.1912011 ,
         0.1912011 ,  0.1912011 ,  0.14340083,  0.09560055,  0.1912011 ,
         0.28680165,  0.3824022 ,  0.28680165,  0.28680165,  0.3824022 ,
         0.23900138,  0.04780028,  0.33460193,  0.14340083,  0.23900138,
         0.14340083,  0.14340083,  0.09560055,  0.1912011 ,  0.14340083,
         0.1912011 ,  0.28680165,  0.1912011 ,  0.1912011 ,  0.28680165,
         0.1912011 ,  0.09560055,  0.1912011 ,  0.14340083,  0.1912011 ,
         0.23900138,  0.23900138,  0.09560055,  0.28680165,  0.28680165,
         0.23900138,  0.14340083,  0.1912011 ,  0.14340083,  0.23900138,
         0.43020248,  0.14340083,  0.14340083,  0.14340083,  0.1912011 ,
         0.33460193,  0.1912011 ,  0.1912011 ,  0.14340083,  0.        ,
         0.1912011 ,  0.1912011 ,  0.3824022 ,  0.23900138,  0.04780028,
         0.        ,  0.09560055,  0.09560055,  0.14340083,  0.1912011 ,
         0.23900138,  0.3824022 ,  0.43020248,  0.28680165,  0.23900138,
         0.33460193,  0.09560055,  0.43020248,  0.1912011 ,  0.23900138,
         0.23900138,  0.1912011 ,  0.14340083,  0.04780028,  0.28680165,
         0.1912011 ,  0.1912011 ,  0.14340083,  0.23900138,  0.3824022 ,
         0.14340083,  0.1912011 ,  0.14340083,  0.14340083,  0.14340083,
         0.1912011 ,  0.23900138,  0.14340083,  0.14340083,  0.04780028,
         0.23900138,  0.09560055,  0.14340083,  0.28680165,  0.23900138,
         0.14340083,  0.14340083,  0.14340083,  0.04780028,  0.1912011 ,
         0.23900138,  0.        ,  0.14340083,  0.1912011 ,  0.28680165,
         0.14340083,  0.1912011 ,  0.33460193,  0.1912011 ,  0.23900138,
         0.1912011 ,  0.04780028,  0.23900138,  0.14340083,  0.14340083,
         0.23900138,  0.09560055,  0.09560055,  0.04780028,  0.1912011 ,
         0.1912011 ,  0.33460193,  0.23900138,  0.14340083,  0.23900138,
         0.1912011 ,  0.09560055,  0.23900138,  0.23900138,  0.33460193,
         0.3824022 ,  0.09560055,  0.09560055,  0.33460193,  0.23900138,
         0.28680165,  0.04780028,  0.14340083,  0.09560055,  0.04780028,
         0.14340083,  0.14340083,  0.23900138,  0.14340083,  0.3824022 ,
         0.23900138,  0.23900138,  0.1912011 ,  0.14340083,  0.09560055,
         0.23900138,  0.14340083,  0.28680165,  0.1912011 ,  0.28680165,
         0.09560055,  0.09560055,  0.14340083,  0.23900138,  0.14340083,
         0.28680165,  0.04780028,  0.09560055,  0.14340083,  0.1912011 ,
         0.14340083,  0.23900138,  0.09560055,  0.1912011 ,  0.28680165,
         0.1912011 ,  0.14340083,  0.04780028,  0.14340083,  0.14340083,
         0.33460193,  0.28680165,  0.1912011 ,  0.1912011 ,  0.28680165,
         0.14340083,  0.33460193,  0.23900138,  0.23900138,  0.09560055,
         0.14340083,  0.33460193,  0.14340083,  0.28680165,  0.23900138,
         0.        ,  0.23900138,  0.23900138,  0.09560055,  0.1912011 ,
         0.28680165,  0.09560055,  0.09560055,  0.23900138,  0.1912011 ,
         0.28680165,  0.1912011 ,  0.1912011 ,  0.09560055,  0.28680165,
         0.23900138,  0.23900138,  0.28680165,  0.04780028,  0.04780028,
         0.23900138,  0.28680165,  0.09560055,  0.09560055,  0.09560055,
         0.14340083,  0.14340083,  0.09560055,  0.1912011 ,  0.        ,
         0.04780028,  0.33460193,  0.09560055,  0.28680165,  0.1912011 ,
         0.14340083,  0.14340083,  0.14340083,  0.33460193,  0.28680165,
         0.1912011 ,  0.1912011 ,  0.52580303,  0.23900138,  0.28680165,
         0.23900138,  0.23900138,  0.09560055,  0.14340083,  0.1912011 ,
         0.23900138,  0.09560055,  0.23900138,  0.3824022 ,  0.3824022 ,
         0.1912011 ,  0.14340083,  0.09560055,  0.04780028,  0.1912011 ,
         0.1912011 ,  0.52580303,  0.09560055,  0.1912011 ,  0.14340083,
         0.28680165,  0.14340083,  0.23900138,  0.09560055,  0.04780028,
         0.09560055,  0.09560055,  0.23900138,  0.09560055,  0.23900138,
         0.28680165,  0.23900138,  0.23900138,  0.23900138,  0.14340083,
         0.23900138,  0.33460193,  0.1912011 ,  0.28680165,  0.09560055,
         0.28680165,  0.43020248,  0.33460193,  0.28680165,  0.28680165,
         0.14340083,  0.23900138,  0.14340083,  0.14340083,  0.09560055,
         0.04780028,  0.14340083,  0.1912011 ,  0.47800275,  0.14340083,
         0.14340083,  0.28680165,  0.23900138,  0.47800275,  0.14340083,
         0.28680165,  0.14340083,  0.14340083,  0.23900138,  0.09560055,
         0.28680165,  0.1912011 ,  0.09560055,  0.14340083,  0.47800275,
         0.1912011 ,  0.23900138,  0.33460193,  0.14340083,  0.33460193,
         0.43020248,  0.28680165,  0.43020248,  0.43020248,  0.43020248,
         0.1912011 ,  0.23900138,  0.33460193,  0.23900138,  0.09560055,
         0.33460193,  0.3824022 ,  0.33460193,  0.3824022 ,  0.33460193,
         0.28680165,  0.52580303,  0.28680165,  0.1912011 ,  0.33460193,
         0.1912011 ,  0.1912011 ,  0.43020248,  0.1912011 ,  0.43020248,
         0.23900138,  0.28680165,  0.33460193,  0.14340083,  0.23900138,
         0.1912011 ,  0.1912011 ,  0.1912011 ,  0.3824022 ,  0.1912011 ,
         0.3824022 ,  0.14340083,  0.23900138,  0.        ,  0.3824022 ,
         0.28680165,  0.1912011 ,  0.1912011 ,  0.1912011 ,  0.28680165,
         0.3824022 ,  0.33460193,  0.47800275,  0.28680165,  0.23900138,
         0.1912011 ,  0.47800275,  0.3824022 ,  0.23900138,  0.23900138,
         0.23900138,  0.28680165,  0.28680165,  0.47800275,  0.33460193,
         0.14340083,  0.33460193,  0.47800275,  0.43020248,  0.1912011 ,
         0.28680165,  0.23900138,  0.43020248,  0.43020248,  0.43020248,
         0.33460193,  0.23900138,  0.28680165,  0.33460193,  0.52580303,
         0.33460193,  0.28680165,  0.33460193,  0.1912011 ,  0.1912011 ,
         0.28680165,  0.52580303,  0.57360331,  0.52580303,  0.3824022 ,
         0.33460193,  0.43020248,  0.1912011 ,  0.3824022 ,  0.33460193,
         0.47800275,  0.1912011 ,  0.28680165,  0.        ,  0.28680165,
         0.33460193,  0.28680165,  0.43020248,  0.33460193,  0.28680165,
         0.57360331,  0.3824022 ,  0.3824022 ,  0.23900138,  0.33460193,
         0.28680165,  0.57360331,  0.3824022 ,  0.3824022 ,  0.1912011 ,
         0.43020248,  0.3824022 ,  0.52580303,  0.33460193,  0.47800275,
         0.28680165,  0.43020248,  0.47800275,  0.43020248,  0.47800275,
         0.1912011 ,  0.52580303,  0.33460193,  0.14340083,  0.1912011 ,
         0.57360331,  0.23900138,  0.47800275,  0.52580303,  0.28680165,
         0.3824022 ,  0.52580303,  0.66920386,  0.3824022 ,  0.3824022 ,
         0.57360331,  0.3824022 ,  0.23900138,  0.3824022 ,  0.33460193,
         0.1912011 ,  0.57360331,  0.43020248,  0.28680165,  0.33460193,
         0.47800275,  0.71700413,  0.28680165,  0.43020248,  0.43020248,
         0.43020248,  0.52580303,  0.33460193,  0.28680165,  0.3824022 ,
         0.57360331,  0.62140358,  0.23900138,  0.28680165,  0.3824022 ,
         0.23900138,  0.43020248,  0.33460193,  0.52580303,  0.43020248,
         0.28680165,  0.33460193,  0.52580303,  0.23900138,  0.66920386,
         0.3824022 ,  0.47800275,  0.43020248,  0.76480441,  0.62140358,
         0.47800275,  0.3824022 ,  0.1912011 ,  0.3824022 ,  0.43020248,
         0.52580303,  0.1912011 ,  0.47800275,  0.52580303,  0.23900138,
         0.33460193,  0.47800275,  0.47800275,  0.28680165,  0.81260468,
         0.57360331,  0.52580303,  0.43020248,  0.47800275,  0.3824022 ,
         0.33460193,  0.33460193,  0.47800275,  0.3824022 ,  0.28680165,
         0.43020248,  0.57360331,  0.1912011 ,  0.33460193,  0.28680165,
         0.33460193,  0.23900138,  0.52580303,  0.76480441,  0.71700413,
         0.47800275,  0.52580303,  0.43020248,  0.52580303,  0.28680165,
         0.33460193,  0.47800275,  0.62140358,  0.62140358,  0.43020248,
         0.1912011 ,  0.43020248,  0.47800275,  0.33460193,  0.28680165,
         0.66920386,  0.57360331,  0.57360331,  0.47800275,  0.52580303,
         0.47800275,  0.62140358,  0.3824022 ,  0.14340083,  0.1912011 ,
         0.43020248,  0.52580303,  0.47800275,  0.28680165,  0.3824022 ,
         0.62140358,  0.66920386,  0.47800275,  0.33460193,  0.71700413,
         0.47800275,  0.71700413,  0.66920386,  0.62140358,  0.1912011 ,
         0.43020248,  0.23900138,  0.14340083,  0.47800275,  0.71700413,
         0.43020248,  0.43020248,  0.23900138,  0.43020248,  0.57360331,
         0.28680165,  0.76480441,  0.33460193,  0.43020248,  0.57360331,
         0.33460193,  0.62140358,  0.43020248,  0.33460193,  0.28680165,
         0.3824022 ,  0.47800275,  0.33460193,  0.28680165,  0.57360331,
         0.33460193,  0.28680165,  0.3824022 ,  0.43020248,  0.33460193,
         0.43020248,  0.47800275,  0.47800275,  0.3824022 ,  0.52580303,
         0.43020248,  0.28680165,  0.3824022 ,  0.23900138,  0.43020248,
         0.52580303,  0.52580303,  0.43020248,  0.33460193,  0.47800275,
         0.66920386,  0.43020248,  0.23900138,  0.3824022 ,  0.28680165,
         0.33460193,  0.47800275,  0.1912011 ,  0.14340083,  0.81260468,
         0.3824022 ,  0.57360331,  0.3824022 ,  0.71700413,  0.28680165,
         0.33460193,  0.1912011 ,  0.23900138,  0.28680165,  0.23900138,
         0.3824022 ,  0.3824022 ,  0.1912011 ,  0.1912011 ,  0.23900138,
         0.23900138,  0.1912011 ,  0.28680165,  0.3824022 ,  0.23900138,
         0.1912011 ,  0.28680165,  0.3824022 ,  0.1912011 ,  0.23900138,
         0.1912011 ,  0.3824022 ,  0.28680165,  0.23900138,  0.28680165,
         0.14340083,  0.23900138,  0.28680165,  0.23900138,  0.33460193,
         0.1912011 ,  0.09560055,  0.23900138,  0.33460193,  0.28680165,
         0.04780028,  0.1912011 ,  0.3824022 ,  0.09560055,  0.09560055,
         0.23900138,  0.04780028,  0.23900138,  0.09560055,  0.1912011 ,
         0.43020248,  0.1912011 ,  0.1912011 ,  0.14340083,  0.28680165,
         0.04780028,  0.23900138,  0.23900138,  0.09560055,  0.09560055,
         0.        ,  0.09560055,  0.33460193,  0.1912011 ,  0.1912011 ,
         0.09560055,  0.23900138,  0.33460193,  0.09560055,  0.14340083,
         0.1912011 ,  0.09560055,  0.1912011 ,  0.1912011 ,  0.14340083,
         0.1912011 ,  0.04780028,  0.09560055,  0.1912011 ,  0.        ,
         0.        ,  0.04780028,  0.23900138,  0.04780028,  0.14340083,
         0.09560055,  0.09560055,  0.28680165,  0.09560055,  0.04780028,
         0.04780028,  0.09560055,  0.        ,  0.        ,  0.09560055,
         0.        ,  0.04780028,  0.09560055,  0.04780028,  0.04780028,
         0.        ,  0.04780028,  0.        ,  0.04780028,  0.09560055,
         0.09560055,  0.04780028,  0.04780028,  0.        ,  0.        ,
         0.04780028,  0.        ,  0.        ,  0.04780028,  0.14340083,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.04780028,  0.04780028,  0.        ,  0.        ,
         0.04780028,  0.        ,  0.        ,  0.        ,  0.04780028,
         0.04780028,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.04780028,  0.        ,  0.04780028]),
 array([-2.09870676, -2.09403703, -2.0893673 , ...,  2.5616818 ,
         2.56635153,  2.57102126]),
 <a list of 1 Patch objects>)

Would like to plot every standard scaled histogram by itself. Found a function from stack exchagne about making histograms for every column


In [17]:
def draw_histograms(df, variables, n_rows, n_cols):
    fig=plt.figure(figsize = (20,10))
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(ax=ax, histtype = "stepfilled", alpha = 0.3, normed = True, bins = 1000) 
        ax.set_title(var_name)
    plt.show()

testa = df2a_stand_scale
draw_histograms(testa, testa.columns, 2, 4)


Min/Max scale across rows and replot the histograms


In [18]:
df2_TPM_values_T = df2_TPM_values.T #transposing the data
min_max_scalar = MinMaxScaler()

df2b_mean_max_rows = min_max_scalar.fit_transform(df2_TPM_values_T)
df2b_mean_max_rows = pd.DataFrame(df2b_mean_max_rows.T)

df2b_mean_max_rows.columns = df2_TPM_values.columns
df2b_mean_max_rows.index = df2_TPM_values.index

In [19]:
testb = df2b_mean_max_rows
draw_histograms(testb, testb.columns, 2, 4)


Min/Max scale across columns and replot the histograms


In [20]:
min_max_scalar = MinMaxScaler()

df2c_mean_max_columns = min_max_scalar.fit_transform(df2_TPM_values)
df2c_mean_max_columns = pd.DataFrame(df2c_mean_max_columns)

df2c_mean_max_columns.columns = df2_TPM_values.columns
df2c_mean_max_columns.index = df2_TPM_values.index

In [21]:
def draw_histograms(df, variables, n_rows, n_cols):
    fig=plt.figure(figsize = (20,10))
    
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(ax=ax, histtype = "stepfilled", alpha = 0.3, normed = True, bins = 100) 
        ax.set_title(var_name)
        ax.set_ylim([0, 5])
    plt.show()

testc = df2c_mean_max_columns
draw_histograms(testc, testc.columns, 2, 4)


replicating Daves PCA


In [22]:
from sklearn.decomposition import PCA

In [23]:
pca = PCA(n_components=2)
pca.fit(df2c_mean_max_columns)
print(pca.explained_variance_ratio_) #how much of the dimensionality reduction is 
pcat = pca.transform(df2c_mean_max_columns)
plt.figure()
plt.scatter(pcat[:,0], pcat[:,1])


[ 0.9199641   0.06735895]
Out[23]:
<matplotlib.collections.PathCollection at 0x115500588>

In [24]:
pca = PCA(n_components=2)
pca.fit(df2c_mean_max_columns)
print(pca.explained_variance_ratio_) #how much of the dimensionality reduction is 
pcat = pca.transform(df2c_mean_max_columns)
plt.figure()
plt.xlim(-.1, .4)
plt.ylim(-.2, .2)
plt.scatter(pcat[:,0], pcat[:,1])


[ 0.9199641   0.06735895]
Out[24]:
<matplotlib.collections.PathCollection at 0x113a33080>

Applying PCA to standard scaled data


In [25]:
pca = PCA(n_components=2)
pca.fit(df2a_stand_scale)
print(pca.explained_variance_ratio_) #how much of the dimensionality reduction is 
pcat = pca.transform(df2a_stand_scale)
#plt.figure()
plt.scatter(pcat[:,0], pcat[:,1])


[ 0.3942109   0.18442032]
Out[25]:
<matplotlib.collections.PathCollection at 0x113bb5c50>

Reviewing Daves gridsearch scan through the epsioln parameter in DBSCAN


In [26]:
#!/usr/bin/env python3

import math
import sys

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

df = df2_TPM_values
#print(df.head())


#dfr = df / df.loc("5GB1_FM40_T0m_TR2")
#print(dfr.head())

#sys.exit()

# PCA and clustering of dfr

#df.plot.hist(bins=100)
#plt.savefig("hist.pdf")

#X = StandardScaler().fit_transform(df)
X = MinMaxScaler().fit_transform(df)
"""
dfX = pd.DataFrame(X)
dfX.plot.hist(bins=100)
print(dfX.head())
plt.savefig("histX.pdf")


pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_) 
pcat = pca.transform(X)
plt.figure()
#plt.xlim(-5, 5)
#plt.ylim(-5, 5)
plt.scatter(pcat[:,0], pcat[:,1])
plt.show()
plt.savefig("pca.pdf")
 """

for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(X)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1}
eps = 0.000100 - min_samples = 3 - number of clusters: 14
{0, 1, -1}
eps = 0.000100 - min_samples = 5 - number of clusters: 2
{0, 1, 2, -1}
eps = 0.005000 - min_samples = 3 - number of clusters: 3
{0, -1}
eps = 0.005000 - min_samples = 5 - number of clusters: 1
{0, 1, 2, 3, 4, -1}
eps = 0.010000 - min_samples = 3 - number of clusters: 5
{0, 1, -1}
eps = 0.010000 - min_samples = 5 - number of clusters: 2
{0, 1, -1}
eps = 0.050000 - min_samples = 3 - number of clusters: 2
{0, -1}
eps = 0.050000 - min_samples = 5 - number of clusters: 1
{0, 1, -1}
eps = 0.075000 - min_samples = 3 - number of clusters: 2
{0, -1}
eps = 0.075000 - min_samples = 5 - number of clusters: 1
{0, 1, -1}
eps = 0.100000 - min_samples = 3 - number of clusters: 2
{0, -1}
eps = 0.100000 - min_samples = 5 - number of clusters: 1

What do the 14 clusters look like?


In [27]:
X = MinMaxScaler().fit_transform(df2_TPM_values)
db_TPM_values = DBSCAN(eps=0.0001, min_samples=3).fit(X)
labels_TPM_values = db_TPM_values.labels_
print(np.unique(labels_TPM_values))
print(np.bincount(labels_TPM_values[labels_TPM_values!=-1]))


[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13]
[755   3   3   7   3   4   3   6   3   3   3   3   4   3]

In [156]:
labels_TPM_values


Out[156]:
array([-1,  0,  0, ..., -1, -1, -1])

Applying Daves grid search to mean centered data across rows


In [140]:
#!/usr/bin/env python3

import math
import sys

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

df = df2a_stand_scale
#print(df.head())


#dfr = df / df.loc("5GB1_FM40_T0m_TR2")
#print(dfr.head())

#sys.exit()

# PCA and clustering of dfr

#df.plot.hist(bins=100)
#plt.savefig("hist.pdf")

#X = StandardScaler().fit_transform(df)
#X = MinMaxScaler().fit_transform(df)
"""
dfX = pd.DataFrame(X)
dfX.plot.hist(bins=100)
print(dfX.head())
plt.savefig("histX.pdf")


pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_) 
pcat = pca.transform(X)
plt.figure()
#plt.xlim(-5, 5)
#plt.ylim(-5, 5)
plt.scatter(pcat[:,0], pcat[:,1])
plt.show()
plt.savefig("pca.pdf")
 """

for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(df)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))


{-1}
eps = 0.000100 - min_samples = 3 - number of clusters: 0
{-1}
eps = 0.000100 - min_samples = 5 - number of clusters: 0
{-1}
eps = 0.005000 - min_samples = 3 - number of clusters: 0
{-1}
eps = 0.005000 - min_samples = 5 - number of clusters: 0
{-1}
eps = 0.010000 - min_samples = 3 - number of clusters: 0
{-1}
eps = 0.010000 - min_samples = 5 - number of clusters: 0
{-1}
eps = 0.050000 - min_samples = 3 - number of clusters: 0
{-1}
eps = 0.050000 - min_samples = 5 - number of clusters: 0
{-1}
eps = 0.075000 - min_samples = 3 - number of clusters: 0
{-1}
eps = 0.075000 - min_samples = 5 - number of clusters: 0
{-1}
eps = 0.100000 - min_samples = 3 - number of clusters: 0
{-1}
eps = 0.100000 - min_samples = 5 - number of clusters: 0

Applying Daves grid search to mean centered data across columns

It doesnt make sense to me to mean center across columns


In [141]:
#!/usr/bin/env python3

import math
import sys

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

df = df2_TPM_values
#print(df.head())


#dfr = df / df.loc("5GB1_FM40_T0m_TR2")
#print(dfr.head())

#sys.exit()

# PCA and clustering of dfr

#df.plot.hist(bins=100)
#plt.savefig("hist.pdf")

X = StandardScaler().fit_transform(df)
#X = MinMaxScaler().fit_transform(df)
"""
dfX = pd.DataFrame(X)
dfX.plot.hist(bins=100)
print(dfX.head())
plt.savefig("histX.pdf")


pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_) 
pcat = pca.transform(X)
plt.figure()
#plt.xlim(-5, 5)
#plt.ylim(-5, 5)
plt.scatter(pcat[:,0], pcat[:,1])
plt.show()
plt.savefig("pca.pdf")
 """

for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(X)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))


{-1}
eps = 0.000100 - min_samples = 3 - number of clusters: 0
{-1}
eps = 0.000100 - min_samples = 5 - number of clusters: 0
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, -1}
eps = 0.005000 - min_samples = 3 - number of clusters: 21
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1}
eps = 0.005000 - min_samples = 5 - number of clusters: 10
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, -1}
eps = 0.010000 - min_samples = 3 - number of clusters: 33
{0, 1, 2, 3, 4, 5, 6, -1}
eps = 0.010000 - min_samples = 5 - number of clusters: 7
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1}
eps = 0.050000 - min_samples = 3 - number of clusters: 10
{0, 1, 2, 3, 4, -1}
eps = 0.050000 - min_samples = 5 - number of clusters: 5
{0, 1, 2, 3, 4, 5, -1}
eps = 0.075000 - min_samples = 3 - number of clusters: 6
{0, 1, -1}
eps = 0.075000 - min_samples = 5 - number of clusters: 2
{0, 1, 2, -1}
eps = 0.100000 - min_samples = 3 - number of clusters: 3
{0, 1, -1}
eps = 0.100000 - min_samples = 5 - number of clusters: 2

Log2 fold change over timepoint zero


In [29]:
df2_TPM_log2.columns
tp_zero = df2_TPM_log2["5GB1_FM40_T0m_TR2"]
df2_TPM_log2_diff = df2_TPM_log2.subtract(df2_TPM_log2["5GB1_FM40_T0m_TR2"], axis = "index")
df2_TPM_log2_diff = df2_TPM_log2_diff.loc[:,'5GB1_FM40_T10m_TR3':'5GB1_FM40_T180m_TR1']
df2_TPM_log2_diff.describe()


Out[29]:
5GB1_FM40_T10m_TR3 5GB1_FM40_T20m_TR2 5GB1_FM40_T40m_TR1 5GB1_FM40_T60m_TR1 5GB1_FM40_T90m_TR2 5GB1_FM40_T150m_TR1_remake 5GB1_FM40_T180m_TR1
count 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000 4480.000000
mean -0.073165 -0.097126 -0.120614 -0.181631 -0.178782 -0.133880 -0.250784
std 0.280964 0.192104 0.341647 0.371661 0.522685 0.399440 0.625970
min -2.007806 -1.952493 -2.987708 -3.220118 -4.932123 -3.107027 -7.630993
25% -0.219900 -0.188297 -0.290906 -0.354218 -0.433002 -0.348005 -0.516284
50% -0.079705 -0.097245 -0.131879 -0.196607 -0.188377 -0.143833 -0.256458
75% 0.055538 -0.006195 0.034844 -0.017946 0.067917 0.066259 0.027040
max 2.795991 1.695479 2.994549 1.963696 3.307840 2.407932 3.376823

In [30]:
def draw_histograms(df, variables, n_rows, n_cols):
    fig=plt.figure(figsize = (20,10))
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(ax=ax, histtype = "stepfilled", alpha = 0.3, normed = True, bins = 1000) 
        ax.set_title(var_name)
    plt.show()

testa = df2_TPM_log2_diff
draw_histograms(testa, testa.columns, 2, 4)


PCA this distribution


In [31]:
pca = PCA(n_components=2)
pca.fit(df2_TPM_log2_diff)
print(pca.explained_variance_ratio_) #how much of the dimensionality reduction is 
pcat = pca.transform(df2_TPM_log2_diff)
#plt.figure()
plt.scatter(pcat[:,0], pcat[:,1])


[ 0.73622532  0.10011064]
Out[31]:
<matplotlib.collections.PathCollection at 0x113bac780>

In [32]:
#!/usr/bin/env python3

import math
import sys

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

df = df2_TPM_log2_diff
#print(df.head())


#dfr = df / df.loc("5GB1_FM40_T0m_TR2")
#print(dfr.head())

#sys.exit()

# PCA and clustering of dfr

#df.plot.hist(bins=100)
#plt.savefig("hist.pdf")

#X = StandardScaler().fit_transform(df)
#X = MinMaxScaler().fit_transform(df)
X = df
"""
dfX = pd.DataFrame(X)
dfX.plot.hist(bins=100)
print(dfX.head())
plt.savefig("histX.pdf")


pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_) 
pcat = pca.transform(X)
plt.figure()
#plt.xlim(-5, 5)
#plt.ylim(-5, 5)
plt.scatter(pcat[:,0], pcat[:,1])
plt.show()
plt.savefig("pca.pdf")
 """

for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(X)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print(np.unique(labels))
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))


{0, -1}
[-1  0]
eps = 0.000100 - min_samples = 3 - number of clusters: 1
{0, -1}
[-1  0]
eps = 0.000100 - min_samples = 5 - number of clusters: 1
{0, -1}
[-1  0]
eps = 0.005000 - min_samples = 3 - number of clusters: 1
{0, -1}
[-1  0]
eps = 0.005000 - min_samples = 5 - number of clusters: 1
{0, -1}
[-1  0]
eps = 0.010000 - min_samples = 3 - number of clusters: 1
{0, -1}
[-1  0]
eps = 0.010000 - min_samples = 5 - number of clusters: 1
{0, -1}
[-1  0]
eps = 0.050000 - min_samples = 3 - number of clusters: 1
{0, -1}
[-1  0]
eps = 0.050000 - min_samples = 5 - number of clusters: 1
{0, 1, 2, 3, -1}
[-1  0  1  2  3]
eps = 0.075000 - min_samples = 3 - number of clusters: 4
{0, -1}
[-1  0]
eps = 0.075000 - min_samples = 5 - number of clusters: 1
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, -1}
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37]
eps = 0.100000 - min_samples = 3 - number of clusters: 38
{0, 1, -1}
[-1  0  1]
eps = 0.100000 - min_samples = 5 - number of clusters: 2

ploting the 38 clusters unto the log2fold diff PCA

Lets explore the eps = 0.075 and min_samples =5


In [33]:
dbscan_log2diff = DBSCAN(eps=0.1, min_samples=3).fit(df2_TPM_log2_diff)
labels_log2diff = dbscan_log2diff.labels_
print(np.unique(labels_log2diff))
print(np.bincount(labels_log2diff[labels_log2diff!=-1]))


[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37]
[ 5  3  4  4  4  4  3  3 10  3  4  5  6  5  3  4  3  6  3  3  4  4  3  3  3
  4  4  3  3  3  3  7  3  3  3  3  3  3]

In [34]:
#cluster index
clusters_38 = {i: np.where(labels_log2diff == i)[0] for i in range(38)}

In [35]:
clusters_38


Out[35]:
{0: array([ 118,  134, 1090, 2535, 4300]),
 1: array([ 188, 2995, 3101]),
 2: array([ 197, 2089, 2408, 4384]),
 3: array([ 233, 1014, 1514, 2737]),
 4: array([ 315,  808, 3357, 4047]),
 5: array([ 165,  350, 1457, 4402]),
 6: array([ 548, 4039, 4444]),
 7: array([ 646,  917, 1402]),
 8: array([ 651,  744, 1221, 1656, 1863, 2522, 2810, 2870, 3458, 4369]),
 9: array([ 801, 3922, 4288]),
 10: array([ 806, 2599, 3299, 3316]),
 11: array([ 222,  988, 1673, 2352, 3001]),
 12: array([1004, 1942, 2120, 2404, 3024, 3825]),
 13: array([1368, 1769, 2212, 2496, 3567]),
 14: array([1141, 1813, 2447]),
 15: array([ 441, 1815, 2267, 3533]),
 16: array([1898, 2140, 2429]),
 17: array([ 878, 1988, 2135, 3762, 3958, 4419]),
 18: array([2223, 2471, 3540]),
 19: array([ 417, 2348, 2771]),
 20: array([ 139, 2469, 2498, 3766]),
 21: array([ 940, 1337, 2683, 3058]),
 22: array([ 567, 1796, 2794]),
 23: array([2822, 3178, 3553]),
 24: array([2837, 3123, 3504]),
 25: array([2098, 2845, 3131, 3887]),
 26: array([ 229, 3047, 3968, 4405]),
 27: array([ 799, 1228, 3095]),
 28: array([ 506, 1499, 3114]),
 29: array([ 159,  421, 3125]),
 30: array([ 882, 2148, 3126]),
 31: array([ 133,  415, 2731, 3141, 3228, 3351, 4362]),
 32: array([ 453,  761, 3476]),
 33: array([ 136, 3924, 4360]),
 34: array([ 261, 2834, 3932]),
 35: array([ 633, 2361, 4126]),
 36: array([1497, 2745, 4316]),
 37: array([2368, 4374, 4377])}

In [144]:
pca = PCA(n_components=2)
pca.fit(df2_TPM_log2_diff)
print(pca.explained_variance_ratio_) #how much of the dimensionality reduction is 
pcat = pca.transform(df2_TPM_log2_diff)
pcat = pd.DataFrame(pcat)
pcat[2] = labels_log2diff

#plt.figure()
#plt.scatter(pcat[:,0], pcat[:,1])


[ 0.73622532  0.10011064]

In [151]:
pcat[2].max()


Out[151]:
37

In [154]:
def dfScatter(df, xcol=0, ycol=1, catcol=2):
    fig, ax = plt.subplots()
    categories = np.unique(df[catcol])
    colors = np.linspace(0, 1, len(categories))
    colordict = dict(zip(categories, colors))
    
    df["Color"] = df[catcol].apply(lambda x: colordict[x])
    ax.scatter(df[xcol], df[ycol], c = df.Color)
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    
    return fig

In [155]:
dfScatter(pcat)


Out[155]:

In [104]:
pcat2 = pcat

In [105]:
pcat2.iloc[0:1000,2] = 0
#pcat2.iloc[1000:3000,2] = 1
#pcat2.iloc[3000:,2]= 2

dfScatter(pcat)


In [ ]:


In [ ]:

Code from Dave


In [ ]:
#!/usr/bin/env python3

import math
import sys

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

df = df2_TPM_values
print(df.head())


#dfr = df / df.loc("5GB1_FM40_T0m_TR2")
#print(dfr.head())

#sys.exit()

# PCA and clustering of dfr

df.plot.hist(bins=100)
plt.savefig("hist.pdf")

#X = StandardScaler().fit_transform(df)
X = MinMaxScaler().fit_transform(df)

dfX = pd.DataFrame(X)
dfX.plot.hist(bins=100)
print(dfX.head())
plt.savefig("histX.pdf")


pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_) 
pcat = pca.transform(X)
plt.figure()
#plt.xlim(-5, 5)
#plt.ylim(-5, 5)
plt.scatter(pcat[:,0], pcat[:,1])
plt.show()
plt.savefig("pca.pdf")


for eps in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=0.3, min_samples=3).fit(X)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps, ms, n_clusters_))

Code from Dave with mods


In [ ]:
#!/usr/bin/env python3

import math
import sys

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

df = df2_TPM_values
#print(df.head())


#dfr = df / df.loc("5GB1_FM40_T0m_TR2")
#print(dfr.head())

#sys.exit()

# PCA and clustering of dfr

#df.plot.hist(bins=100)
#plt.savefig("hist.pdf")

#X = StandardScaler().fit_transform(df)
X = MinMaxScaler().fit_transform(df)
"""
dfX = pd.DataFrame(X)
dfX.plot.hist(bins=100)
print(dfX.head())
plt.savefig("histX.pdf")


pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_) 
pcat = pca.transform(X)
plt.figure()
#plt.xlim(-5, 5)
#plt.ylim(-5, 5)
plt.scatter(pcat[:,0], pcat[:,1])
plt.show()
plt.savefig("pca.pdf")
 """

for eps_value in [0.0001, 0.005, 0.01, 0.05, 0.075, 0.1]:
    for ms in [3, 5]:
        db = DBSCAN(eps=eps_value, min_samples=ms).fit(array_stand_scale)
        print(set(db.labels_))
        labels = db.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('eps = %f - min_samples = %d - number of clusters: %d' % (eps_value, ms, n_clusters_))