In [1]:
import subprocess
import sys

# Install mwtab with the same interpreter that runs this kernel
# (`python -m pip`), passing an argument list so no shell is involved.
# The return code (0 on success) is the cell's displayed value.
subprocess.call([sys.executable, '-m', 'pip', 'install', 'mwtab'])


Out[1]:
0

In [2]:
%load_ext autoreload
import src.project_fxns.organize_xcms as xcms_fxns
import src.data.preprocessing as preproc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import src.data.data_exploration as explore
import pandas as pd
import scipy.stats as stats

from sklearn.decomposition import PCA
from sklearn import preprocessing
%matplotlib inline
%autoreload 2

In [3]:
# Input files for the MTBLS315 uhplc_pos data set.
xcms_result_path = '/home/data/processed/MTBLS315/uhplc_pos/xcms_result.tsv'
a_table_path = '/home/data/raw/MTBLS315/a_UPLC_POS_nmfi_and_bsi_diagnosis.txt'
s_table_path = '/home/data/raw/MTBLS315/s_NMFI and BSI diagnosis.txt'
# Metadata column that holds each sample's class.
class_col = 'Factor Value[patient group]'

# Load the XCMS result and strip the 'X' prefix from its column names.
mtbls315_pos = xcms_fxns.Xcms_organize(xcms_result_path)
mtbls315_pos.remove_column_prefix(prefix='X')

# Build the class <-> sample-name mapping from the two metadata tables.
mtbls315_pos.mtbls_class_mapping(a_table_path, s_table_path, class_col)

# Encode the class labels as numbers.
mtbls315_pos.class_encoder()


Were you expecting 3 classes? because thats how many you have. look at self.sample_classes if this is unexpected

In [4]:
# Collapse the three patient groups into two: malaria vs. everything else.
groups = mtbls315_pos.class_dict
mal = groups['malaria']
no_mal = np.concatenate([
    groups['bacterial bloodstream infection'],
    groups['non-malarial febrile illness'],
])
new_class_dict = {'malaria': mal, 'non-malarial fever': no_mal}

In [5]:
# Display the intensity table: one row per sample, one column per "mz:rt" feature.
mtbls315_pos.feature_table


Out[5]:
100.075432918:130.5724 100.111791713:203.2736 100.91639733:45.6545 100.993050242:47.0331 101.008165437:39.0944 101.078669783:131.3189 101.095649335:945.9343 101.09567528:911.90055 101.095628546:1008.97985 101.095615198:974.8181 ... 994.377480083:630.4277 994.520108451:630.5829 994.663855991:630.8517 994.676556507:768.2737 994.711878029:1088.5343 994.806739435:630.4151 995.678509494:768.7573 996.642078383:879.4825 997.148884536:50.5859 998.48957211:982.2369
1001_P 4.124568e+04 4559.335425 10515.311682 38648.733397 9.922261e+06 143028.077748 1.117487e+06 7.318810e+05 3.534157e+05 1.065830e+06 ... 26196.711572 19110.949173 22247.840612 18149.024556 6261.200917 22621.254276 29839.237864 10744.035883 29166.468399 21550.114535
1002_P 1.609962e+05 7564.583385 19518.114811 66646.047225 2.366362e+05 94980.936472 1.254100e+06 2.108862e+05 1.110922e+06 1.333910e+06 ... 34861.625759 28069.457961 28739.672692 3334.598552 1658.139839 13207.837248 3261.719194 6318.615497 24345.438911 27478.652377
1003_P 1.010516e+05 16847.598916 20852.688848 73565.663671 7.021896e+06 98878.411262 2.083945e+06 1.165444e+06 1.098844e+06 1.534413e+06 ... 62245.635751 53002.144933 66379.285036 183025.356211 1766.838341 51351.422849 29094.288482 1521.646334 7315.075072 22549.041767
1004_P 1.878054e+05 3809.195651 13274.860513 80341.805688 1.155961e+07 76717.047933 8.845204e+05 7.304386e+05 9.577298e+05 1.338343e+06 ... 98086.965666 97156.194831 71840.855259 234576.046160 42136.190378 38274.218876 32460.707996 0.000000 36253.496775 18923.970138
1005_P 2.756288e+04 4935.104360 7155.077796 22357.844412 9.122485e+06 79906.211010 9.480340e+05 1.157906e+06 1.635425e+06 1.061791e+06 ... 64699.651105 61597.953881 51722.520059 254126.568567 4307.846401 44321.208296 0.000000 21306.866018 60731.056887 27153.899249
1006_P 6.088004e+06 86401.817081 23812.273844 80174.170292 2.271798e+05 219715.598558 1.149859e+06 1.176853e+06 9.993485e+05 2.699038e+06 ... 27635.663893 30350.938760 22075.733844 61229.095711 31504.373458 18956.755556 22847.560086 27971.388513 0.000000 15464.507132
1007_P 8.003690e+04 10143.368962 30196.736697 76783.883707 3.251242e+05 60060.385145 1.189075e+06 1.097152e+06 1.078930e+06 1.275878e+06 ... 22061.527667 13221.472695 23359.396577 703974.357187 17033.699489 0.000000 103390.960142 0.000000 25958.988093 32530.282928
1009_P 1.061486e+06 16702.159146 42420.188235 82968.172248 2.514958e+05 71447.264429 9.483942e+05 1.171721e+06 1.492515e+06 1.378631e+06 ... 30724.388220 20883.566602 22422.071706 138251.680852 2871.742979 12603.043652 16357.450962 1212.539392 17861.630098 0.000000
1010_P 9.996735e+04 12036.565716 22173.481810 82995.010757 7.787296e+06 65290.220015 1.204461e+06 1.079012e+06 1.045018e+06 1.269189e+06 ... 322355.824969 325358.958661 269896.622034 937412.294898 7951.024358 191146.579123 159432.195254 4189.998752 56516.148042 28420.350740
1011_P 8.258854e+04 7336.787830 7668.110141 67607.614566 7.879581e+06 91510.955075 1.120919e+06 2.231713e+06 1.040064e+06 1.410651e+06 ... 10865.142632 9987.921912 3183.033782 1441.660315 13250.875060 10363.853655 10042.800115 8759.171496 3456.279728 21768.570526
1012_P 2.486627e+04 0.000000 23798.645905 74318.926899 2.495282e+05 121150.182427 9.765861e+05 1.355930e+05 1.979297e+06 1.601035e+06 ... 39128.227695 32345.069643 28196.937296 310405.447950 10353.211787 16176.470673 42297.489961 14701.094110 62239.179648 7302.869522
1013_P 2.474193e+05 3042.058107 30019.777303 71372.348477 3.302758e+05 97202.942576 8.476398e+05 1.158097e+06 1.024790e+06 1.706655e+06 ... 80715.785832 66219.156667 60386.423948 38595.907721 1514.626463 43281.337814 7373.208245 4747.807271 16797.889922 27474.872301
1014_P 2.463496e+06 25957.567193 38099.147506 85947.722141 8.616325e+06 84360.142775 2.019288e+06 6.167707e+05 1.093245e+06 1.394724e+06 ... 61096.424634 63933.460434 39113.673434 238756.536646 23785.573524 50127.626497 35359.946288 15384.819463 9737.330080 15549.341143
1016_P 1.645971e+05 12750.282581 24899.764226 57443.849863 2.938372e+05 72740.012726 1.101518e+06 1.024618e+06 9.825788e+05 1.195288e+06 ... 59980.215067 48155.965003 41189.407306 41211.633292 2690.149634 27603.473365 0.000000 10301.801409 20403.051324 37023.938723
1017_P 3.091691e+05 58475.568604 31477.171364 25749.371942 2.216011e+05 317229.930481 1.148231e+06 9.914817e+05 8.962007e+05 1.688914e+06 ... 52068.336627 41441.226548 40988.571374 0.000000 717.838128 26759.358383 13153.129823 2668.684287 50367.540052 36007.374030
1018_P 1.137619e+05 20466.381243 13757.920725 908544.198101 2.788682e+05 50367.161126 8.120569e+05 7.701100e+05 1.119750e+06 1.250080e+06 ... 60019.130990 60607.804588 60011.830901 143679.873103 2016.795208 45122.772012 42538.453832 2245.291779 44235.812422 22255.122700
1019_P 7.267249e+04 12968.163271 25024.646722 82151.382618 3.512341e+05 87400.971952 2.113907e+06 2.042242e+06 9.970629e+05 9.990057e+05 ... 28083.515124 21863.153131 21006.372071 12901.596286 44354.442711 12986.122733 21527.060488 6424.226698 42038.997946 27529.143226
1020_P 3.222506e+04 14298.465570 23066.787036 73850.044685 2.962115e+05 60912.404154 6.632809e+05 9.533900e+05 1.891441e+06 1.693328e+06 ... 92213.167381 109568.459744 77468.217398 692511.475128 1213.242895 64647.913507 139933.971759 21347.792975 0.000000 35095.753322
1021_P 2.327809e+04 8340.653367 25455.303522 80895.822086 3.032674e+05 79895.629925 1.019961e+06 6.519891e+05 1.368225e+05 1.347813e+06 ... 28288.848990 30016.997173 24560.364840 517806.747270 5278.259633 10741.247414 81616.568701 14895.062966 9162.825715 20640.681741
1023_P 3.337373e+05 11795.515373 28443.539078 75984.586800 2.906894e+05 107033.087672 9.052235e+05 6.810129e+05 1.076645e+06 1.462631e+06 ... 138659.358464 162706.433702 149216.682200 868819.359738 3081.575900 97540.832845 148401.798190 5873.173149 36830.071481 17927.447718
1024_P 2.117277e+05 10346.058859 8454.406641 166778.017820 2.129828e+04 87331.616836 1.293672e+06 1.063622e+06 1.121348e+06 1.363596e+06 ... 80712.920970 48060.987411 49918.497158 511996.851729 4267.853397 40451.171993 85473.852180 10834.051454 31437.333201 13694.224913
1026_P 1.870968e+05 2419.009623 6465.235402 98130.813547 7.978295e+06 125003.213980 9.955556e+05 1.502324e+06 1.139035e+06 1.394946e+06 ... 1414.034271 0.000000 0.000000 1258.470810 982.316001 1926.428473 0.000000 9515.452569 39057.918092 14224.362030
1027_P 1.803382e+05 22608.257619 42094.312413 50414.543008 1.757186e+05 169427.720115 1.659208e+06 9.800125e+05 1.552092e+05 1.578041e+06 ... 21459.509410 24891.850030 16823.509382 264482.203981 45498.884924 12875.311044 62199.649235 2305.600091 35121.645026 20740.705649
1029_P 4.647967e+04 10568.923425 14521.450040 134887.748376 5.965082e+04 187039.980096 1.286727e+06 1.322187e+06 1.172244e+06 1.522669e+06 ... 8033.342250 8104.361292 6269.629696 105390.501988 2401.285917 1766.797879 15751.466979 39210.052275 0.000000 21609.531543
1030_P 2.581104e+04 7082.214844 15890.686765 65972.268403 3.173662e+05 110283.985098 2.474894e+06 8.688967e+05 2.179352e+05 1.286139e+06 ... 4583.975415 3523.627356 3707.695620 3791.343645 0.000000 6721.228704 1845.806587 20511.382442 39677.424190 30094.599576
1031_P 2.562884e+05 16662.152529 24521.804173 87972.421845 7.035345e+06 69312.845380 1.304332e+06 2.065069e+06 1.145935e+06 1.789892e+06 ... 53533.947052 64354.329842 38015.243636 69525.587518 6697.747912 23809.939718 7961.834623 4193.987205 20606.807473 25641.312108
1032_P 3.683516e+04 2849.116176 12335.602591 54663.415436 3.555183e+05 198510.892796 1.321202e+06 2.600974e+06 1.554602e+06 1.256757e+06 ... 27900.303419 15406.714299 13404.018849 415174.084512 1323.079280 12573.687998 72150.338036 3695.762089 0.000000 14690.473805
1033_P 3.994242e+04 10082.942519 20605.407887 67647.715299 3.702749e+04 104775.841626 9.467869e+05 1.137089e+06 1.442528e+06 1.329574e+06 ... 14532.383457 17143.261942 8333.169643 14356.901077 4048.748063 11759.265994 0.000000 1226.147800 3399.352821 33191.721306
1034_P 6.075605e+04 13116.059289 6792.845468 24873.225480 3.762094e+06 74151.194711 5.903634e+05 8.980376e+05 1.186792e+06 1.780189e+06 ... 25101.674955 11361.210289 10528.168441 7002.949024 13667.904785 16566.765485 7005.500557 834.607617 55372.475107 23834.699871
1035_P 8.074896e+04 17548.896563 28565.531684 48962.683438 3.258958e+05 73987.990600 1.168607e+06 1.123596e+06 1.236940e+06 1.409021e+06 ... 16020.773749 10569.810896 7572.842185 204290.232838 0.000000 1392.324782 27271.100358 0.000000 9492.034900 14679.934355
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1037_P 2.655596e+05 18292.892086 28387.397784 30236.142927 6.585410e+06 109265.274019 7.104049e+05 6.479000e+05 1.218787e+06 1.100249e+06 ... 39814.416426 46084.680855 45641.135402 9916.118976 3445.100160 34577.616868 0.000000 11437.452213 30614.836704 13381.937473
1038_P 2.778069e+04 15852.034150 17373.107230 20612.846972 8.371762e+06 366396.742891 1.344942e+06 1.216631e+06 9.981331e+05 1.295448e+06 ... 3560.791918 4006.949519 0.000000 6070.829806 19885.510995 2288.062418 0.000000 8784.341156 3544.466299 12061.230661
1039_P 4.266949e+04 3440.195922 30774.493389 52406.909691 2.485044e+05 180260.577483 2.457697e+06 2.015792e+06 1.228701e+06 1.450465e+06 ... 35340.192972 25820.981921 45849.967988 9976.382606 10601.753058 11688.702376 5561.219502 7174.462170 28388.478276 24708.274921
1040_P 8.292188e+05 17672.620903 13960.192456 79132.020850 4.288374e+04 22623.538786 1.241101e+06 7.128372e+05 1.683029e+06 1.478027e+06 ... 27340.968124 17083.528180 21305.578012 44393.762119 1376.503331 8549.366507 9788.350752 1212.204484 37356.115258 24558.718243
1041_P 7.313394e+04 12687.835896 25628.952510 96147.720667 3.295228e+05 62879.059126 1.388139e+06 1.240360e+06 6.269453e+05 1.302438e+06 ... 11024.677479 19574.810219 25756.735712 153618.626188 14278.330256 7471.489254 34767.063867 8526.264247 19209.104998 22498.564485
1042_P 1.821248e+04 13598.457234 23299.229289 81931.729834 2.488093e+05 81148.696977 1.216540e+06 3.830966e+05 1.084623e+06 2.000357e+06 ... 38859.722255 39359.686186 42290.816651 226545.462038 0.000000 28939.885479 34470.344314 8397.605049 21717.002787 25432.218277
1043_P 1.037403e+07 115521.423668 9295.530055 46495.671125 2.446635e+05 371368.317755 1.493897e+06 1.760604e+06 1.250195e+06 1.937830e+06 ... 23946.466811 20899.223786 8463.541593 2784.520931 618.144825 20843.527896 2215.936240 1968.784105 12319.643938 22064.343148
1044_P 9.948547e+05 23941.447973 12859.039625 54736.608794 1.538106e+05 123971.624135 5.593056e+05 1.649587e+06 2.807542e+05 2.232554e+06 ... 39124.968423 43131.019039 36348.419590 25170.057250 0.000000 17496.161945 4626.864228 2578.957136 33406.550978 26348.956165
1045_P 7.115300e+04 11582.677662 15777.912682 46862.098518 5.766879e+06 115991.650292 1.735859e+06 9.202376e+05 1.099512e+06 2.354299e+06 ... 20841.129831 18085.724408 11702.417364 34403.635053 5265.043575 4962.863042 0.000000 593.303950 5990.832267 34445.381649
1046_P 7.032138e+06 21166.255878 19721.284206 63850.608073 2.044078e+05 269445.731771 1.218245e+06 7.583727e+05 1.271308e+06 1.137304e+06 ... 10044.796577 8205.246504 3227.089438 22898.586285 652.439679 5307.666469 12364.954583 7860.489246 46428.488324 24451.865210
1048_P 8.427240e+04 16443.639921 23618.103613 63188.362343 1.540756e+05 91754.401273 1.947695e+06 8.573310e+05 1.322096e+06 1.319559e+06 ... 16264.414626 16704.496584 10101.377470 107764.087629 1070.835680 7015.120526 8366.822790 0.000000 14935.630655 20022.076288
1049_P 3.808075e+04 20832.818159 20113.292219 45145.826417 2.795221e+04 170021.874115 1.166936e+06 3.339735e+05 1.016814e+06 1.477956e+06 ... 2996.861301 2474.696778 6325.058983 9437.407958 1503.896539 931.687985 0.000000 15779.135972 7905.038118 22221.713738
1050_P 4.062318e+04 15559.742582 15683.818053 68581.637001 7.705592e+06 113825.917924 1.233735e+06 1.826777e+06 1.658257e+06 9.591426e+05 ... 1227.675583 0.000000 2674.883207 12271.398688 0.000000 0.000000 0.000000 0.000000 33666.785316 14562.459563
1051_P 1.325975e+07 138374.096802 14213.663496 77165.051665 2.974601e+05 551581.278080 1.225455e+06 1.194537e+06 1.095804e+06 1.417069e+06 ... 19060.720442 15002.541606 7768.906610 27406.498631 719.304706 5345.347080 12159.749122 0.000000 26038.096716 20311.014066
1052_P 6.841291e+04 26191.528907 20133.428705 69578.457443 3.240738e+05 44979.818389 1.168522e+06 1.939675e+06 1.042508e+06 1.086816e+06 ... 30886.601042 38576.252171 49670.778889 682970.544788 0.000000 20113.619985 133900.920580 22690.698481 21482.980252 16658.968360
1053_P 3.546758e+04 30253.686632 19672.697118 80089.429312 2.297169e+05 27500.594774 1.451091e+06 8.790974e+05 4.988744e+05 1.511260e+06 ... 12726.638086 22433.448335 15092.785261 229425.530938 0.000000 15860.133117 38979.182961 32478.008059 15727.805893 21400.304027
1054_P 1.789312e+05 48938.265457 4384.247730 69277.350761 6.860785e+06 155315.791932 1.994946e+06 1.468027e+06 9.544007e+05 1.606719e+06 ... 15900.389097 12999.898900 10546.810156 134249.753453 0.000000 13023.491499 20859.536972 11360.255992 32943.477100 25460.762430
1055_P 6.291803e+04 21497.501974 14885.173555 24953.242805 1.894337e+04 72581.613976 1.174310e+06 2.579142e+06 4.529910e+06 1.004264e+06 ... 101568.496753 121353.147416 97952.518597 559554.164050 11507.139565 51509.577178 84316.708448 5875.925195 53833.225045 14548.842359
1056_P 1.528215e+05 11870.076052 27645.406483 59069.521921 1.782167e+05 62114.533079 1.233003e+06 1.746494e+06 1.748746e+06 1.552603e+06 ... 41022.569366 17532.648357 17832.698809 6346.038758 11530.517425 14033.134750 14714.609320 0.000000 50496.985297 21814.133239
1057_P 8.422757e+04 13064.314749 11904.832671 15595.459913 1.869999e+05 66527.913317 1.530563e+06 9.230784e+05 1.140610e+06 9.096058e+05 ... 26524.780616 32857.976071 18734.728938 391714.632761 11011.561930 16302.230081 67880.376452 3180.170039 47368.896256 13318.795255
1058_P 1.034685e+05 8028.389777 22656.325004 53155.081098 4.923646e+06 55089.326719 2.701194e+06 1.237155e+06 8.949378e+05 1.359182e+06 ... 134237.350677 115730.166134 112349.966641 479008.831625 0.000000 58559.436574 74641.765437 2243.070247 39154.952956 20051.473160
1059_P 7.104665e+04 10698.177874 15459.548643 50300.621397 1.703407e+05 60121.288076 1.702521e+06 9.154164e+05 9.711397e+05 1.377425e+06 ... 37920.550172 31963.212627 29401.882274 101458.605222 1757.177914 22297.852630 4364.313262 0.000000 38939.024967 19165.651715
1060_P 1.897284e+05 5104.217142 10790.277198 56710.021135 2.238471e+05 62089.689879 1.252075e+06 2.154158e+06 9.360905e+05 8.939686e+05 ... 7459.138052 6884.477947 0.000000 16280.659175 0.000000 4915.496816 0.000000 4353.504923 37702.911943 13441.549595
1061_P 5.424944e+04 6849.324940 13988.381635 83400.897896 4.135435e+04 61191.085966 1.940338e+06 1.180713e+06 9.907773e+05 1.043167e+06 ... 3194.314472 5680.327819 8742.465101 76732.559742 859.204921 8703.358000 14550.192685 11805.253438 26066.557235 0.000000
1062_P 3.495749e+05 12573.583742 19333.586394 68785.617773 2.134557e+05 79278.360085 1.437416e+06 1.319319e+06 1.956470e+06 1.023652e+06 ... 7658.746568 16381.276441 6257.253941 78057.071022 1900.651008 9563.560915 4190.118164 12274.101979 20626.937718 13256.310721
1064_P 1.888923e+04 10289.069728 8038.189819 50370.640007 1.526264e+05 50500.125702 1.026474e+06 9.994862e+05 9.299864e+05 1.539845e+06 ... 76088.503278 90537.243457 51867.707440 76651.893305 0.000000 37446.251572 0.000000 8904.397431 46989.163695 10274.000543
1065_P 5.167056e+05 12545.624897 10208.962316 35403.535963 4.489790e+06 10331.818072 2.136149e+06 1.828746e+06 1.315835e+06 1.294394e+06 ... 9513.396327 8055.719621 9819.429100 138529.216838 4034.614198 12594.441611 29091.396364 0.000000 44742.463711 21028.855697
1066_P 8.199373e+05 15910.335471 2709.740693 41535.255628 1.500409e+05 24184.830090 3.442821e+05 1.324995e+06 1.090809e+06 7.307519e+05 ... 15682.835057 12542.477827 33169.361611 193928.065947 4157.509484 22863.089698 22140.673940 0.000000 36843.299698 25546.779878
1067_P 5.601352e+04 11376.854382 27342.055008 49806.803547 6.002846e+06 61900.004929 2.101270e+06 1.951734e+06 1.438264e+06 9.776893e+05 ... 108027.382558 92788.835260 72320.865298 147289.957345 11296.135281 55833.326509 18936.428559 0.000000 21939.133732 8340.705928
1068_P 4.497657e+04 10449.405357 9832.196492 17916.364597 3.990578e+04 48977.977768 1.151815e+06 1.567110e+06 9.563455e+05 1.253748e+06 ... 141451.494905 150709.609294 130055.521019 439757.072985 5848.318703 80676.851478 79824.676495 0.000000 38790.694160 12091.281787

61 rows × 22097 columns

Time to explore the raw data and preprocess it.


In [12]:
# Plot feature sparsity with the two-class grouping (malaria vs. non-malarial
# fever). The result is bound to a name, so nothing is echoed as cell output.
sparse_graph = explore.plot_feature_sparsity(mtbls315_pos.feature_table,
                                            new_class_dict)



In [7]:
# Report how many NaNs the raw table holds, run the fill step, then show the
# NaN count of the filled table as the cell's output.
n_missing_before = mtbls315_pos.feature_table.isnull().sum().sum()
print('Original number of nans %s ' % n_missing_before)

zero_filled = explore.fill_zero_nan(mtbls315_pos.feature_table)
zero_filled.isnull().sum().sum()


Original number of nans 0 
Out[7]:
0

In [8]:
# Smallest value in each row (sample) of the filled table. The saved output
# shows the same minimum for every sample -- presumably the constant that
# explore.fill_zero_nan substitutes for zeros (TODO confirm in that helper).
zero_filled.min(axis=1)


Out[8]:
1001_P    217.13836
1002_P    217.13836
1003_P    217.13836
1004_P    217.13836
1005_P    217.13836
1006_P    217.13836
1007_P    217.13836
1009_P    217.13836
1010_P    217.13836
1011_P    217.13836
1012_P    217.13836
1013_P    217.13836
1014_P    217.13836
1016_P    217.13836
1017_P    217.13836
1018_P    217.13836
1019_P    217.13836
1020_P    217.13836
1021_P    217.13836
1023_P    217.13836
1024_P    217.13836
1026_P    217.13836
1027_P    217.13836
1029_P    217.13836
1030_P    217.13836
1031_P    217.13836
1032_P    217.13836
1033_P    217.13836
1034_P    217.13836
1035_P    217.13836
            ...    
1037_P    217.13836
1038_P    217.13836
1039_P    217.13836
1040_P    217.13836
1041_P    217.13836
1042_P    217.13836
1043_P    217.13836
1044_P    217.13836
1045_P    217.13836
1046_P    217.13836
1048_P    217.13836
1049_P    217.13836
1050_P    217.13836
1051_P    217.13836
1052_P    217.13836
1053_P    217.13836
1054_P    217.13836
1055_P    217.13836
1056_P    217.13836
1057_P    217.13836
1058_P    217.13836
1059_P    217.13836
1060_P    217.13836
1061_P    217.13836
1062_P    217.13836
1064_P    217.13836
1065_P    217.13836
1066_P    217.13836
1067_P    217.13836
1068_P    217.13836
Length: 61, dtype: float64

In [9]:
# Pass the filled table through explore.tidy, then replace its 'value' column
# with the log10 of that column.
# NOTE(review): assumes explore.tidy returns a long-format frame whose 'value'
# column holds the intensities -- confirm against the helper.
tidy = explore.tidy(zero_filled)
tidy['value'] = np.log10(tidy['value'])

In [10]:
# Plot sample/feature intensities from the log10 tidy table, grouped by the
# original three-class class_dict. The returned axes are kept in `axes`.
axes = explore.sample_feature_intensity(tidy, 
                                mtbls315_pos.class_dict)



In [10]:
# Hand the axes from the previous plot to explore.save_axes with the target
# directory '/home/deletable'.
# NOTE(review): the list of letters presumably names the saved files, one per
# axis -- confirm against explore.save_axes.
explore.save_axes(axes, '/home/deletable', 
                 ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

In [11]:
# Same helper as above, drawn with seaborn's boxplot and with no class grouping
# passed. The bare call echoes the returned list of axes as cell output.
explore.sample_feature_intensity(tidy, plot_type=sns.boxplot)


Out[11]:
[<matplotlib.axes._subplots.AxesSubplot at 0x7f540b70e510>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53ff4255d0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53ff35bd50>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53ff010850>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53fef39bd0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53fecc3f90>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53feaf7650>]

In [53]:
# Sparseness per feature: the fraction of samples (rows) in which the
# feature's value is below 1e-15, i.e. effectively zero.
n_samples = mtbls315_pos.feature_table.shape[0]
zero_mask = mtbls315_pos.feature_table < 1e-15
feat_sparsity = zero_mask.sum(axis=0) / n_samples

explore.distplot_classes(feat_sparsity)


Out[53]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f1c1863dd10>

In [51]:
# Sparseness per sample: the fraction of features (columns) whose value in
# that sample is below 1e-15, i.e. effectively zero.
sample_sparsity = ((mtbls315_pos.feature_table < 1e-15).sum(axis=1)
                   / mtbls315_pos.feature_table.shape[1])

# Plot the value computed in this cell. The original passed `sparsity`, a name
# that is only assigned in a later cell, so the cell relied on stale kernel state.
explore.distplot_classes(sample_sparsity, new_class_dict)


non-malarial fever
malaria
Out[51]:
[<matplotlib.axes._subplots.AxesSubplot at 0x7f1c1863de90>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f1c1863de90>]

In [48]:
# Sparseness per sample: share of columns whose value is below 1e-15.
feature_table = mtbls315_pos.feature_table
n_features = feature_table.shape[1]
zero_mask = feature_table < 1e-15
sparsity = zero_mask.sum(axis=1) / n_features

# Feature-sparsity plot using the original three-class grouping.
explore.plot_feature_sparsity(feature_table, mtbls315_pos.class_dict)


non-malarial febrile illness
malaria
bacterial bloodstream infection
Out[48]:
[<matplotlib.axes._subplots.AxesSubplot at 0x7f1c19142a10>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f1c19142a10>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f1c19142a10>]

In [47]:
# Feature-sparsity plot using the two-class grouping
# (malaria vs. non-malarial fever).
explore.plot_feature_sparsity(mtbls315_pos.feature_table,
                              new_class_dict, 
                              #bins=10
                             )


non-malarial fever
malaria
Out[47]:
[<matplotlib.axes._subplots.AxesSubplot at 0x7f1c183d4310>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f1c183d4310>]

In [41]:
# Per-class distribution plot of the log10-transformed filled table, with
# np.mean passed as the aggregation function and 100 histogram bins.
# NOTE(review): the axis over which `fxn` is applied is decided inside
# explore.distplot_classes -- confirm it is the per-feature mean.
explore.distplot_classes(np.log10(zero_filled),
                         new_class_dict,
                         fxn=np.mean,
                         axlabel='log10(Mean Intensity)',
                        bins=100)


non-malarial fever
malaria
Out[41]:
[<matplotlib.axes._subplots.AxesSubplot at 0x7f1c19044410>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f1c19044410>]

In [14]:
# plot intensities
# Uses the log10-transformed filled table and the original three-class
# grouping; the commented-out line below is an earlier single-histogram version.
#sns.distplot(np.log10(zero_filled.mean(axis=0)), bins=100)
explore.plot_mean_intensity(np.log10(zero_filled), 
                            class_dict=mtbls315_pos.class_dict,
                           axlabel='log10(Intensity)')


non-malarial febrile illness
malaria
bacterial bloodstream infection
Out[14]:
[<matplotlib.axes._subplots.AxesSubplot at 0x7f53fe4c77d0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53fe4c77d0>,
 <matplotlib.axes._subplots.AxesSubplot at 0x7f53fe4c77d0>]

In [15]:
# Run scipy's Mann-Whitney U test through explore.two_group_stat on the filled
# table, using the two-class grouping.
# NOTE(review): later cells read element [1] of each result as a p-value and
# index mw_vals by feature label, so this presumably returns one
# (statistic, p-value) pair per feature -- confirm against the helper.
mw_vals = explore.two_group_stat(zero_filled, new_class_dict,
                       stats.mannwhitneyu)

In [16]:
# Collect the second element of every test result (used as the p-value).
mw_pvals = np.array([i[1] for i in mw_vals])
# Histogram of the doubled p-values.
# NOTE(review): the factor 2 presumably converts a one-sided p-value into a
# two-sided one -- confirm against the default of the installed
# scipy.stats.mannwhitneyu.
sns.distplot(mw_pvals*2, bins=50, kde=False)


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f53fdff7450>

In [18]:
# Split the features into `ngroups` strata of increasing standard deviation
# and draw the Mann-Whitney p-value histogram of each stratum in its own panel.
ngroups = 10
n_cols = 3
# float() keeps this a true division on Python 2 as well as Python 3.
n_rows = int(np.ceil(ngroups / float(n_cols)))
print(n_rows)

# Features sorted by standard deviation, cut into `ngroups` consecutive chunks.
std_groups = np.array_split(zero_filled.std(axis=0).sort_values(),
                            ngroups)

# squeeze=False keeps `axes` two-dimensional even for a single-row grid, so
# axes[row, col] is always valid.
fig, axes = plt.subplots(n_rows, n_cols,
                         sharex=True, sharey=True, squeeze=False)
for i, arr in enumerate(std_groups):
    row, col = divmod(i, n_cols)
    # NOTE(review): other cells plot these p-values multiplied by 2; here they
    # are used as returned -- confirm which scaling is intended.
    pvals = [val[1] for val in mw_vals[arr.index]]
    ax = sns.distplot(pvals, bins=50, kde=False, ax=axes[row, col])
    # Panels are numbered 1..ngroups from lowest to highest standard deviation.
    ax.set_title(i + 1)


4.0

In [55]:
# Extract the second element of each test result via .apply (so mw_vals must be
# an object that provides .apply).
# NOTE(review): this rebinds mw_pvals, which an earlier cell created as a NumPy
# array. The saved NameError shows the cell was last run in a kernel where
# mw_vals did not exist yet.
mw_pvals = mw_vals.apply(lambda x: x[1])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-55-1e9b61a27e05> in <module>()
----> 1 mw_pvals = mw_vals.apply(lambda x: x[1])

NameError: name 'mw_vals' is not defined

In [54]:
# Stratified p-value plots via the project helper. Arguments are the filled
# table, the per-feature standard deviation, the doubled p-values, the label
# 'MW pval', and nine groups.
# NOTE(review): the saved output is a NameError because mw_pvals was undefined
# when the cell was last run; it needs the mw_pvals cell to run first.
axes = explore.plot_pvals_stratified(zero_filled, 
                              zero_filled.std(axis=0),
                              mw_pvals*2,
                             'MW pval', ngroups=9)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-54-ca2c0f5743e3> in <module>()
      1 axes = explore.plot_pvals_stratified(zero_filled, 
      2                               zero_filled.std(axis=0),
----> 3                               mw_pvals*2,
      4                              'MW pval', ngroups=9)

NameError: name 'mw_pvals' is not defined

In [42]:
# Placeholder loop over the two-class grouping. dict.items() works on both
# Python 2 and Python 3, whereas .iteritems() exists only on Python 2.
for class_label, samples in new_class_dict.items():
    # TODO: split by class and plot
    #   - mean intensity distribution
    #   - intensity-std distribution
    # For now the rows of each class are only selected; nothing is plotted.
    zero_filled.loc[samples]

In [45]:
# Mean of every column (feature) across all samples in the filled table.
np.mean(zero_filled, axis=0)


Out[45]:
100.075432918:130.5724      8.024552e+05
100.111791713:203.2736      1.908554e+04
100.91639733:45.6545        1.935106e+04
100.993050242:47.0331       7.715497e+04
101.008165437:39.0944       2.302296e+06
101.078669783:131.3189      1.139960e+05
101.095649335:945.9343      1.325477e+06
101.09567528:911.90055      1.215631e+06
101.095628546:1008.97985    1.161039e+06
101.095615198:974.8181      1.399420e+06
101.095694694:1254.7685     2.700523e+06
101.095693253:1244.447      2.201434e+06
101.095666822:1025.0084     1.562755e+06
101.095601801:957.7579      1.030166e+06
101.095673092:1074.7635     1.599309e+06
101.095684028:1144.4827     1.442147e+06
101.095681595:812.1951      4.850251e+05
101.095640977:994.8119      1.178163e+06
101.095695064:878.8229      9.891524e+05
101.095704125:1111.705      1.174082e+06
101.095692954:1160.8976     1.217025e+06
101.095706591:1129.94795    1.219049e+06
101.095687723:984.8368      1.144841e+06
101.095715891:1175.8072     1.394085e+06
101.095716756:1283.0166     5.541692e+05
101.095669775:1042.8779     1.210147e+06
101.095697941:1050.1948     1.199373e+06
101.09571701:858.7704       6.404125e+05
101.095726786:836.11035     5.880417e+05
101.095704844:1272.82555    9.681475e+05
                                ...     
987.560752198:674.82365     2.949138e+04
989.618263756:768.273       2.722189e+04
990.119638664:768.2372      2.758304e+04
990.928497919:539.0733      2.413065e+04
991.115183169:68.25975      2.805897e+04
991.129110423:539.1816      4.037979e+04
991.32952565:539.0871       4.121510e+04
991.529710762:539.1293      3.220639e+04
991.652191558:810.6372      1.396812e+04
991.667935419:768.7313      6.291164e+06
991.730046704:538.5651      1.966377e+04
991.929460621:539.2491      1.050127e+04
992.671177339:768.5866      3.280605e+06
993.302487576:348.22345     1.395529e+04
993.502806799:348.713       1.949872e+04
993.673744942:768.32585     9.237232e+05
993.703379621:348.478       1.980537e+04
993.904511387:348.997       1.423662e+04
994.104861127:349.4035      7.764587e+03
994.234004836:630.7918      2.935423e+04
994.377480083:630.4277      4.403043e+04
994.520108451:630.5829      4.218310e+04
994.663855991:630.8517      3.657581e+04
994.676556507:768.2737      1.896739e+05
994.711878029:1088.5343     7.043251e+03
994.806739435:630.4151      2.532533e+04
995.678509494:768.7573      3.241843e+04
996.642078383:879.4825      7.626922e+03
997.148884536:50.5859       2.766075e+04
998.48957211:982.2369       2.075946e+04
Length: 22097, dtype: float64

In [36]:
# Per-feature peak descriptors taken from the full XCMS table. Selecting the
# columns directly avoids transposing the whole frame twice, which is wasteful
# and can coerce the columns to object dtype when the frame has mixed dtypes.
covariate_cols = ['mz', 'rt', 'mzmin', 'mzmax', 'rtmin', 'rtmax']
covariates = mtbls315_pos.all_data.loc[:, covariate_cols]
covariates


Out[36]:
mz rt mzmin mzmax rtmin rtmax
100.075432918:130.5724 100.075433 130.57240 100.072203 100.078128 128.6014 132.4525
100.111791713:203.2736 100.111792 203.27360 100.111448 100.112040 202.4389 204.3650
100.91639733:45.6545 100.916397 45.65450 100.915915 100.916678 44.0424 49.4263
100.993050242:47.0331 100.993050 47.03310 100.992683 100.993208 46.0313 47.6765
101.008165437:39.0944 101.008165 39.09440 101.002420 101.012186 35.1102 43.8041
101.078669783:131.3189 101.078670 131.31890 101.078432 101.078842 129.2271 132.6766
101.095649335:945.9343 101.095649 945.93430 101.095199 101.095799 938.9658 951.3502
101.09567528:911.90055 101.095675 911.90055 101.095285 101.095862 903.5930 915.5464
101.095628546:1008.97985 101.095629 1008.97985 101.095316 101.095786 1000.7369 1016.4439
101.095615198:974.8181 101.095615 974.81810 101.095172 101.095783 969.2397 980.7654
101.095694694:1254.7685 101.095695 1254.76850 101.095355 101.095847 1250.8848 1260.9501
101.095693253:1244.447 101.095693 1244.44700 101.095306 101.095818 1233.0953 1250.2940
101.095666822:1025.0084 101.095667 1025.00840 101.095221 101.095803 1018.2467 1034.8922
101.095601801:957.7579 101.095602 957.75790 101.095280 101.095714 953.0018 963.9393
101.095673092:1074.7635 101.095673 1074.76350 101.095255 101.095821 1063.9496 1090.1290
101.095684028:1144.4827 101.095684 1144.48270 101.095216 101.095823 1135.6083 1152.6233
101.095681595:812.1951 101.095682 812.19510 101.095275 101.095826 805.4979 825.2273
101.095640977:994.8119 101.095641 994.81190 101.095262 101.095780 990.3489 999.5530
101.095695064:878.8229 101.095695 878.82290 101.095323 101.095878 867.1387 892.1970
101.095704125:1111.705 101.095704 1111.70500 101.095293 101.095852 1100.1215 1120.2270
101.095692954:1160.8976 101.095693 1160.89760 101.095474 101.095855 1154.0703 1167.1771
101.095706591:1129.94795 101.095707 1129.94795 101.095281 101.095830 1125.5219 1133.0796
101.095687723:984.8368 101.095688 984.83680 101.095529 101.095771 982.3311 987.4808
101.095715891:1175.8072 101.095716 1175.80720 101.095592 101.095820 1170.0488 1181.6916
101.095716756:1283.0166 101.095717 1283.01660 101.095374 101.095829 1279.1634 1287.4736
101.095669775:1042.8779 101.095670 1042.87790 101.095266 101.095797 1036.5669 1046.8438
101.095697941:1050.1948 101.095698 1050.19480 101.095283 101.095802 1047.9585 1052.9463
101.09571701:858.7704 101.095717 858.77040 101.095359 101.095857 854.4209 866.3015
101.095726786:836.11035 101.095727 836.11035 101.095356 101.095903 832.3125 838.5297
101.095704844:1272.82555 101.095705 1272.82555 101.095509 101.095827 1267.7330 1277.1186
... ... ... ... ... ... ...
987.560752198:674.82365 987.560752 674.82365 987.557672 987.561894 674.4028 677.0058
989.618263756:768.273 989.618264 768.27300 989.615487 989.621374 767.7240 768.3319
990.119638664:768.2372 990.119639 768.23720 990.116483 990.120948 767.7240 768.8285
990.928497919:539.0733 990.928498 539.07330 990.924868 990.932267 537.7272 540.1866
991.115183169:68.25975 991.115183 68.25975 991.110464 991.118108 65.2043 75.4105
991.129110423:539.1816 991.129110 539.18160 991.124752 991.131888 537.7985 540.1984
991.32952565:539.0871 991.329526 539.08710 991.324981 991.333056 537.0988 540.8758
991.529710762:539.1293 991.529711 539.12930 991.524962 991.532134 537.8051 540.9729
991.652191558:810.6372 991.652192 810.63720 991.648885 991.654576 808.9442 811.3304
991.667935419:768.7313 991.667935 768.73130 991.664176 991.669755 767.6750 769.6035
991.730046704:538.5651 991.730047 538.56510 991.725678 991.735010 537.3487 540.0389
991.929460621:539.2491 991.929461 539.24910 991.927535 991.930827 538.1806 539.7027
992.671177339:768.5866 992.671177 768.58660 992.666959 992.672987 767.6750 769.5414
993.302487576:348.22345 993.302488 348.22345 993.296323 993.306278 345.1466 350.2546
993.502806799:348.713 993.502807 348.71300 993.497784 993.506390 345.1466 350.2546
993.673744942:768.32585 993.673745 768.32585 993.669399 993.675309 767.6750 769.3274
993.703379621:348.478 993.703380 348.47800 993.696475 993.705291 345.7341 350.8933
993.904511387:348.997 993.904511 348.99700 993.896493 993.907071 345.7341 350.2546
994.104861127:349.4035 994.104861 349.40350 994.098871 994.107441 345.7341 349.5221
994.234004836:630.7918 994.234005 630.79180 994.230312 994.238566 628.6599 631.8694
994.377480083:630.4277 994.377480 630.42770 994.374149 994.379017 629.0254 633.6621
994.520108451:630.5829 994.520108 630.58290 994.515497 994.522774 628.5653 633.6621
994.663855991:630.8517 994.663856 630.85170 994.661387 994.665090 628.6599 633.6621
994.676556507:768.2737 994.676557 768.27370 994.671756 994.678288 767.7636 769.3274
994.711878029:1088.5343 994.711878 1088.53430 994.707565 994.714689 1087.3724 1090.7173
994.806739435:630.4151 994.806739 630.41510 994.804766 994.814478 628.9565 634.3069
995.678509494:768.7573 995.678509 768.75730 995.674540 995.680167 768.2477 768.8285
996.642078383:879.4825 996.642078 879.48250 996.636883 996.645895 878.1771 881.4147
997.148884536:50.5859 997.148885 50.58590 997.144884 997.152919 47.8403 55.1680
998.48957211:982.2369 998.489572 982.23690 998.485650 998.493349 980.2638 985.5128

22097 rows × 6 columns


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [41]:
# Fit a two-component PCA on the filled table and keep the projected samples.
# NOTE(review): PCA is fit on zero_filled as-is; no log transform or scaling is
# applied in this cell -- confirm that is intended.
pca = PCA(n_components=2)
pca_out = pca.fit_transform(zero_filled)

In [125]:
# First and second principal-component scores of every sample.
x = [i[0] for i in pca_out]
y = [i[1] for i in pca_out]
# Scatter the samples, coloured by their encoded class.
# NOTE(review): y_coded is assigned in a cell further down the notebook, so
# this cell fails on a fresh top-to-bottom run.
# NOTE(review): plt.scatter returns the plotted collection, not an Axes, so the
# name `ax` is misleading.
ax = plt.scatter(x,y, 
            c=y_coded,
           )



In [90]:
# `le` is not defined in any earlier cell, so create the encoder here to let
# the cell run on a fresh kernel.
# NOTE(review): assumed to be sklearn's LabelEncoder, consistent with the
# fit_transform / inverse_transform calls and the otherwise unused
# `from sklearn import preprocessing` import -- confirm.
le = preprocessing.LabelEncoder()
# Integer code for each sample's class label.
y_coded = le.fit_transform(mtbls315_pos.sample_classes[mtbls315_pos.class_label_col])

In [91]:
# Map the integer codes back to their original class-label strings.
le.inverse_transform(y_coded)


Out[91]:
array(['malaria', 'malaria', 'malaria', 'malaria',
       'non-malarial febrile illness', 'malaria',
       'bacterial bloodstream infection', 'malaria',
       'non-malarial febrile illness', 'bacterial bloodstream infection',
       'bacterial bloodstream infection', 'malaria', 'malaria', 'malaria',
       'malaria', 'malaria', 'bacterial bloodstream infection',
       'non-malarial febrile illness', 'non-malarial febrile illness',
       'non-malarial febrile illness', 'non-malarial febrile illness',
       'bacterial bloodstream infection', 'malaria', 'malaria', 'malaria',
       'malaria', 'non-malarial febrile illness',
       'bacterial bloodstream infection', 'non-malarial febrile illness',
       'malaria', 'non-malarial febrile illness', 'malaria', 'malaria',
       'malaria', 'malaria', 'malaria', 'malaria',
       'non-malarial febrile illness', 'malaria',
       'bacterial bloodstream infection', 'malaria', 'malaria',
       'bacterial bloodstream infection',
       'bacterial bloodstream infection', 'malaria',
       'non-malarial febrile illness', 'bacterial bloodstream infection',
       'non-malarial febrile illness', 'non-malarial febrile illness',
       'malaria', 'bacterial bloodstream infection', 'malaria', 'malaria',
       'non-malarial febrile illness', 'bacterial bloodstream infection',
       'non-malarial febrile illness', 'malaria', 'malaria', 'malaria',
       'malaria', 'malaria'], dtype=object)

In [88]:
# NOTE(review): the saved output shows integer class codes, but the only
# visible assignment to `y` (in the scatter cell) holds PCA scores. The
# displayed value came from earlier kernel state and will differ on a clean
# re-run.
y


Out[88]:
array([1, 1, 1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 0, 1,
       1, 1, 1, 2, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 0, 0, 1, 2,
       0, 2, 2, 1, 0, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1])

In [ ]: