In [15]:
from scipy.stats import kendalltau as kTau
import matplotlib.pyplot as plt

# from sklearn.externals.joblib import Memory
# memory = Memory(cachedir='/tmp',verbose=0)

import jupyternotify
ip = get_ipython()
ip.register_magics(jupyternotify.JupyterNotifyMagics)

# %autonotify -a 30



In [16]:
# This is probably unnecesary ¯\_(ツ)_/¯
def ODF2DF(GP_ODF):
    GP_ODF = GP_ODF[['Rank','Feature']]
    GP_ODF.sort_values('Rank', inplace=true)
    GP_ODF.set_index('Rank', inplace=True)
    return GP_ODF

Data

Using CMS: Gold Standard


In [6]:
# Requires GenePattern Notebook: pip install genepattern-notebook
import gp
import genepattern

# Username and password removed for security reasons.
genepattern.GPAuthWidget(genepattern.register_session("https://genepattern.broadinstitute.org/gp", "", ""))



In [7]:
comparativemarkerselection_task = gp.GPTask(genepattern.get_session(0), 'urn:lsid:broad.mit.edu:cancer.software.genepattern.module.analysis:00044')
comparativemarkerselection_job_spec = comparativemarkerselection_task.make_job_spec()
comparativemarkerselection_job_spec.set_parameter("input.file", "https://software.broadinstitute.org/cancer/software/genepattern/data/all_aml/all_aml_test.gct")
comparativemarkerselection_job_spec.set_parameter("cls.file", "https://software.broadinstitute.org/cancer/software/genepattern/data/all_aml/all_aml_test.cls")
comparativemarkerselection_job_spec.set_parameter("confounding.variable.cls.file", "")
comparativemarkerselection_job_spec.set_parameter("test.direction", "2")
comparativemarkerselection_job_spec.set_parameter("test.statistic", "0")
comparativemarkerselection_job_spec.set_parameter("min.std", "")
comparativemarkerselection_job_spec.set_parameter("number.of.permutations", "10000")
comparativemarkerselection_job_spec.set_parameter("log.transformed.data", "false")
comparativemarkerselection_job_spec.set_parameter("complete", "false")
comparativemarkerselection_job_spec.set_parameter("balanced", "false")
comparativemarkerselection_job_spec.set_parameter("random.seed", "779948241")
comparativemarkerselection_job_spec.set_parameter("smooth.p.values", "true")
comparativemarkerselection_job_spec.set_parameter("phenotype.test", "one versus all")
comparativemarkerselection_job_spec.set_parameter("output.filename", "<input.file_basename>.comp.marker.odf")
genepattern.GPTaskWidget(comparativemarkerselection_task)



In [8]:
job1587350 = gp.GPJob(genepattern.get_session(0), 1587350)
genepattern.GPJobWidget(job1587350)



In [17]:
# The code below will only run if pandas is installed: http://pandas.pydata.org
from gp.data import ODF
all_aml_test_comp_marker_odf_1587350 = ODF(job1587350.get_file("all_aml_test.comp.marker.odf"))
all_aml_test_comp_marker_odf_1587350


Out[17]:
Rank Feature Description Score Feature P Feature P Low Feature P High FDR(BH) Q Value Bonferroni maxT FWER Fold Change ALL Mean ALL Std AML Mean AML Std k
0 1 M89957_at IGB Immunoglobulin-associated beta (B29) 8.335325 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0000 0.0000 -5.856628 2011.333333 1200.105468 -343.428571 396.421049 0
1 2 J05243_at SPTAN1 Spectrin, alpha, non-erythrocytic 1 (al... 7.305599 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 3.773193 718.523810 299.612186 190.428571 115.366783 0
2 3 M11722_at Terminal transferase mRNA 7.287828 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 70.797073 5067.047619 3134.301939 71.571429 169.242073 0
3 4 M31523_at TCF3 Transcription factor 3 (E2A immunoglobuli... 7.285013 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 4.714167 1488.666667 720.431352 315.785714 129.907765 0
4 5 M84371_rna1_s_at CD19 gene 7.249615 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 3.475639 1929.476190 807.017820 555.142857 262.578005 0
5 7 D88270_at GB DEF = (lambda) DNA for immunoglobin light c... 6.464445 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0012 0.0012 59.118573 4024.285714 2802.251811 68.071429 92.032991 0
6 8 U05259_rna1_at MB-1 gene 6.425300 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0012 0.0012 10.178065 5177.000000 3281.358271 508.642857 460.668011 0
7 9 M92287_at CCND3 Cyclin D3 6.389370 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0014 0.0014 5.087627 4570.142857 2573.249391 898.285714 457.413200 0
8 11 U29175_at Transcriptional activator hSNF2b 6.106756 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0031 0.0031 2.449352 1188.285714 459.793121 485.142857 211.345818 0
9 14 Z49194_at OBF-1 mRNA for octamer binding factor 1 5.713933 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0076 0.0076 -11.227687 440.285714 375.058414 -39.214286 69.363234 0
10 15 X59417_at PROTEASOME IOTA CHAIN 5.650425 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0101 0.0101 3.601700 3933.571429 2206.576434 1092.142857 542.501451 0
11 16 X66401_cds1_at LMP2 gene extracted from H.sapiens genes TAP1,... 5.499817 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0158 0.0158 2.684612 1610.000000 739.640994 599.714286 328.174304 0
12 17 M31211_s_at MYL1 Myosin light chain (alkali) 5.460969 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0177 0.0177 2.829403 462.809524 185.501110 163.571429 138.187005 0
13 18 U51240_at KIAA0085 gene, partial cds 5.455191 0.0004 0.00003 0.000640 0.016576 0.011780 1.0 0.0181 0.0181 2.327868 7474.619048 3016.387516 3210.928571 1576.865170 1
14 20 X58529_at IGHM Immunoglobulin mu 5.425563 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0200 0.0202 6.656980 5432.571429 3517.526483 816.071429 1373.782956 0
15 22 M33680_at 26-kDa cell surface protein TAPA-1 mRNA 5.295728 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0294 0.0297 2.622641 6025.142857 2583.860006 2297.357143 1576.775854 0
16 24 Y08612_at RABAPTIN-5 protein 5.255236 0.0004 0.00003 0.000640 0.016576 0.011780 1.0 0.0338 0.0341 2.087321 367.666667 144.434183 176.142857 68.463193 1
17 26 X69111_at ID3 Inhibitor of DNA binding 3, dominant negat... 5.225453 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0365 0.0369 3.951904 1889.857143 1210.983744 478.214286 209.906638 0
18 27 X67951_at PAGA Proliferation-associated gene A (natural ... 5.180374 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0401 0.0405 4.097582 4673.000000 3066.699594 1140.428571 490.242125 0
19 29 U88964_at HEM45 mRNA 5.140885 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0459 0.0464 2.165290 1648.095238 673.612270 761.142857 337.973112 0
20 31 U07139_at CAB3b mRNA for calcium channel beta3 subunit 5.117449 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0500 0.0506 -0.808652 110.380952 181.315878 -136.500000 103.278377 0
21 33 HG1612-HT1612_at Macmarcks 5.094685 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0522 0.0528 2.753363 3635.619048 1668.185705 1320.428571 1017.789741 0
22 36 X97267_rna1_s_at LPAP gene 4.964845 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0736 0.0745 4.712416 2786.047619 1849.861656 591.214286 674.313004 0
23 37 X82240_rna1_at TCL1 gene (T cell leukemia) extracted from H.s... 4.954403 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0759 0.0768 -120.650362 6342.761905 5910.101453 -52.571429 203.690834 0
24 39 J03473_at ADPRT ADP-ribosyltransferase (NAD+; poly (ADP-... 4.945180 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0781 0.0790 2.254494 1430.476190 704.483614 634.500000 178.462558 0
25 41 S50223_at HKR-T1 4.931526 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0799 0.0807 -6.278867 205.857143 206.483725 -32.785714 66.032834 0
26 42 U94855_at Translation initiation factor 3 47 kDa subunit... 4.922441 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0818 0.0826 1.890199 3802.000000 1518.813386 2011.428571 560.895399 0
27 43 D30742_at CAMK4 Calcium/calmodulin-dependent protein kin... 4.913935 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0844 0.0852 2.348602 238.047619 83.182015 101.357143 78.867569 0
28 44 M74719_at SEF2-1A protein (SEF2-1A) mRNA, 5' end 4.899401 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0878 0.0886 4.561467 1978.047619 1374.866265 433.642857 361.866658 0
29 46 M28170_at CD19 CD19 antigen 4.820329 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.1081 0.1092 -9.601329 206.428571 183.235251 -21.500000 94.438217 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7099 78 M14328_s_at ENO1 Enolase 1, (alpha) -4.278580 0.0004 0.00003 0.000640 0.016576 0.011780 1.0 0.3826 0.3866 1.775402 6287.428571 2476.978292 11162.714286 3753.265069 9999
7100 77 M62762_at ATP6C Vacuolar H+ ATPase proton channel subunit -4.284413 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.3787 0.3824 1.920622 1415.761905 576.413992 2719.142857 1036.411699 10000
7101 76 M93056_at LEUKOCYTE ELASTASE INHIBITOR -4.300358 0.0004 0.00003 0.000640 0.016576 0.011780 1.0 0.3650 0.3689 3.942776 480.571429 678.199570 1894.785714 1098.838000 9999
7102 71 D11327_s_at PTPN7 Protein tyrosine phosphatase, non-recept... -4.349638 0.0004 0.00003 0.000640 0.016576 0.011780 1.0 0.3307 0.3340 3.344542 146.571429 153.909575 490.214286 267.568303 9999
7103 69 U10868_at ALDH7 Aldehyde dehydrogenase 7 -4.383202 0.0004 0.00003 0.000640 0.016576 0.011780 1.0 0.3078 0.3110 2.036434 588.142857 301.177736 1197.714286 458.578309 9999
7104 68 M63138_at CTSD Cathepsin D (lysosomal aspartyl protease) -4.399921 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.2974 0.3007 3.692145 1521.047619 584.929011 5615.928571 3449.347284 10000
7105 67 M19507_at MPO Myeloperoxidase -4.409131 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.2910 0.2943 20.551466 560.238095 785.642661 11513.714286 9273.129821 10000
7106 65 Y07604_at Nucleoside-diphosphate kinase -4.453201 0.0004 0.00003 0.000640 0.016576 0.011780 1.0 0.2640 0.2668 2.003732 854.904762 483.931700 1713.000000 603.071497 9999
7107 57 J02783_at P4HB Procollagen-proline, 2-oxoglutarate 4-dio... -4.496895 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.2406 0.2430 3.170091 495.952381 419.798223 1572.214286 827.312905 10000
7108 56 X16546_at RNS2 Ribonuclease 2 (eosinophil-derived neurot... -4.529169 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.2239 0.2262 3.452787 281.904762 166.651704 973.357143 554.782509 10000
7109 52 M27891_at CST3 Cystatin C (amyloid angiopathy and cerebr... -4.656582 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.1617 0.1634 42.876270 243.809524 477.041048 10453.642857 8194.555440 10000
7110 49 U05572_s_at MANB Mannosidase alpha-B (lysosomal) -4.711378 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.1424 0.1436 -4.822661 -95.190476 286.420429 459.071429 372.918317 10000
7111 48 X05908_at ANX1 Annexin I (lipocortin I) -4.775031 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.1194 0.1205 12.963248 132.809524 344.828163 1721.642857 1212.737057 10000
7112 47 M32304_s_at TIMP2 Tissue inhibitor of metalloproteinase 2 -4.781974 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.1167 0.1178 2.571388 506.952381 263.555208 1303.571429 584.990429 10000
7113 45 M23197_at CD33 CD33 antigen (differentiation antigen) -4.821914 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.1080 0.1091 5.357715 178.380952 88.155814 955.714286 598.876824 10000
7114 40 U59878_at Low-Mr GTP-binding protein (RAB32) mRNA, parti... -4.937669 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0792 0.0800 3.902114 189.238095 88.497969 738.428571 409.843444 10000
7115 38 L09717_at LAMP2 Lysosome-associated membrane protein 2 {... -4.947793 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0775 0.0784 4.013771 44.952381 60.780323 180.428571 89.628905 10000
7116 35 L41559_at PCBD 6-pyruvoyl-tetrahydropterin synthase/dime... -5.010096 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0640 0.0648 135.362069 1.380952 94.627943 186.928571 115.032079 10000
7117 34 M22960_at PPGB Protective protein for beta-galactosidase... -5.085313 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0534 0.0541 4.102370 415.857143 269.889475 1706.000000 923.325428 10000
7118 32 X61587_at ARHG Ras homolog gene family, member G (rho G) -5.094950 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0521 0.0527 5.924811 358.142857 569.906684 2121.928571 1208.829724 10000
7119 30 M84526_at DF D component of complement (adipsin) -5.124765 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0486 0.0492 -68.529247 -93.619048 168.850371 6415.642857 4750.496000 10000
7120 28 M14636_at PYGL Glycogen phosphorylase L (liver form) -5.167731 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0417 0.0422 6.423604 45.190476 60.271568 290.285714 170.499460 10000
7121 25 X12447_at ALDOA Aldolase A -5.241978 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0347 0.0351 1.932889 5173.380952 2364.026110 9999.571429 2853.315419 10000
7122 23 Z29067_at Nek3 mRNA for protein kinase -5.279477 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0307 0.0310 11.653409 12.571429 75.233351 146.500000 72.359944 10000
7123 21 L09209_s_at APLP2 Amyloid beta (A4) precursor-like protein 2 -5.420151 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0203 0.0205 4.849143 580.809524 438.118320 2816.428571 1501.269602 10000
7124 19 U60319_at HLA-H MHC protein HLA-H (hereditary haemochrom... -5.431076 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0197 0.0198 -1.185282 -45.619048 47.170410 54.071429 56.864839 10000
7125 13 X17042_at PRG1 Proteoglycan 1, secretory granule -5.778204 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0066 0.0066 3.457207 1739.285714 1722.343901 6013.071429 2383.543370 10000
7126 12 X95735_at Zyxin -6.016839 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0037 0.0037 8.504233 410.619048 668.831031 3492.000000 1836.736738 10000
7127 10 M63959_at LRPAP1 Low density lipoprotein-related protein... -6.259319 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0019 0.0019 2.296480 656.190476 269.201898 1506.928571 458.594586 10000
7128 6 U46499_at GLUTATHIONE S-TRANSFERASE, MICROSOMAL -6.799713 0.0002 0.00000 0.000299 0.011590 0.010518 1.0 0.0005 0.0005 29.698819 48.380952 56.772772 1436.857143 762.625108 10000

7129 rows × 18 columns


In [18]:
cms_scores = all_aml_test_comp_marker_odf_1587350.dataframe
cms_scores.sort_values(by='Rank',inplace=True)
cms_scores


Out[18]:
Rank Feature Description Score Feature P Feature P Low Feature P High FDR(BH) Q Value Bonferroni maxT FWER Fold Change ALL Mean ALL Std AML Mean AML Std k
0 1 M89957_at IGB Immunoglobulin-associated beta (B29) 8.335325 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0000 0.0000 -5.856628 2011.333333 1200.105468 -343.428571 396.421049 0
1 2 J05243_at SPTAN1 Spectrin, alpha, non-erythrocytic 1 (al... 7.305599 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 3.773193 718.523810 299.612186 190.428571 115.366783 0
2 3 M11722_at Terminal transferase mRNA 7.287828 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 70.797073 5067.047619 3134.301939 71.571429 169.242073 0
3 4 M31523_at TCF3 Transcription factor 3 (E2A immunoglobuli... 7.285013 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 4.714167 1488.666667 720.431352 315.785714 129.907765 0
4 5 M84371_rna1_s_at CD19 gene 7.249615 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0001 0.0001 3.475639 1929.476190 807.017820 555.142857 262.578005 0
7128 6 U46499_at GLUTATHIONE S-TRANSFERASE, MICROSOMAL -6.799713 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0005 0.0005 29.698819 48.380952 56.772772 1436.857143 762.625108 10000
5 7 D88270_at GB DEF = (lambda) DNA for immunoglobin light c... 6.464445 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0012 0.0012 59.118573 4024.285714 2802.251811 68.071429 92.032991 0
6 8 U05259_rna1_at MB-1 gene 6.425300 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0012 0.0012 10.178065 5177.000000 3281.358271 508.642857 460.668011 0
7 9 M92287_at CCND3 Cyclin D3 6.389370 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0014 0.0014 5.087627 4570.142857 2573.249391 898.285714 457.413200 0
7127 10 M63959_at LRPAP1 Low density lipoprotein-related protein... -6.259319 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0019 0.0019 2.296480 656.190476 269.201898 1506.928571 458.594586 10000
8 11 U29175_at Transcriptional activator hSNF2b 6.106756 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0031 0.0031 2.449352 1188.285714 459.793121 485.142857 211.345818 0
7126 12 X95735_at Zyxin -6.016839 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0037 0.0037 8.504233 410.619048 668.831031 3492.000000 1836.736738 10000
7125 13 X17042_at PRG1 Proteoglycan 1, secretory granule -5.778204 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0066 0.0066 3.457207 1739.285714 1722.343901 6013.071429 2383.543370 10000
9 14 Z49194_at OBF-1 mRNA for octamer binding factor 1 5.713933 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0076 0.0076 -11.227687 440.285714 375.058414 -39.214286 69.363234 0
10 15 X59417_at PROTEASOME IOTA CHAIN 5.650425 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0101 0.0101 3.601700 3933.571429 2206.576434 1092.142857 542.501451 0
11 16 X66401_cds1_at LMP2 gene extracted from H.sapiens genes TAP1,... 5.499817 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0158 0.0158 2.684612 1610.000000 739.640994 599.714286 328.174304 0
12 17 M31211_s_at MYL1 Myosin light chain (alkali) 5.460969 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0177 0.0177 2.829403 462.809524 185.501110 163.571429 138.187005 0
13 18 U51240_at KIAA0085 gene, partial cds 5.455191 0.000400 0.000030 0.000640 0.016576 0.011780 1.0 0.0181 0.0181 2.327868 7474.619048 3016.387516 3210.928571 1576.865170 1
7124 19 U60319_at HLA-H MHC protein HLA-H (hereditary haemochrom... -5.431076 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0197 0.0198 -1.185282 -45.619048 47.170410 54.071429 56.864839 10000
14 20 X58529_at IGHM Immunoglobulin mu 5.425563 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0200 0.0202 6.656980 5432.571429 3517.526483 816.071429 1373.782956 0
7123 21 L09209_s_at APLP2 Amyloid beta (A4) precursor-like protein 2 -5.420151 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0203 0.0205 4.849143 580.809524 438.118320 2816.428571 1501.269602 10000
15 22 M33680_at 26-kDa cell surface protein TAPA-1 mRNA 5.295728 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0294 0.0297 2.622641 6025.142857 2583.860006 2297.357143 1576.775854 0
7122 23 Z29067_at Nek3 mRNA for protein kinase -5.279477 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0307 0.0310 11.653409 12.571429 75.233351 146.500000 72.359944 10000
16 24 Y08612_at RABAPTIN-5 protein 5.255236 0.000400 0.000030 0.000640 0.016576 0.011780 1.0 0.0338 0.0341 2.087321 367.666667 144.434183 176.142857 68.463193 1
7121 25 X12447_at ALDOA Aldolase A -5.241978 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0347 0.0351 1.932889 5173.380952 2364.026110 9999.571429 2853.315419 10000
17 26 X69111_at ID3 Inhibitor of DNA binding 3, dominant negat... 5.225453 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0365 0.0369 3.951904 1889.857143 1210.983744 478.214286 209.906638 0
18 27 X67951_at PAGA Proliferation-associated gene A (natural ... 5.180374 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0401 0.0405 4.097582 4673.000000 3066.699594 1140.428571 490.242125 0
7120 28 M14636_at PYGL Glycogen phosphorylase L (liver form) -5.167731 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0417 0.0422 6.423604 45.190476 60.271568 290.285714 170.499460 10000
19 29 U88964_at HEM45 mRNA 5.140885 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0459 0.0464 2.165290 1648.095238 673.612270 761.142857 337.973112 0
7119 30 M84526_at DF D component of complement (adipsin) -5.124765 0.000200 0.000000 0.000299 0.011590 0.010518 1.0 0.0486 0.0492 -68.529247 -93.619048 168.850371 6415.642857 4750.496000 10000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3482 7100 S73205_at GB DEF = Insulin activator factor [human, panc... -0.008349 0.985003 0.982491 0.987261 0.994733 0.667916 1.0 1.0000 1.0000 1.013672 12.190476 40.156717 12.357143 67.112543 5075
3453 7101 X59766_at AZGP1 Zinc-alpha-2-glycoprotein 1 0.008136 0.992202 0.990346 0.993802 0.996736 0.669301 1.0 1.0000 1.0000 1.021277 13.714286 90.495935 13.428571 108.646013 4961
3454 7102 U47621_at Nucleolar autoantigen No55 mRNA 0.007673 0.969806 0.966324 0.973035 0.990775 0.665306 1.0 1.0000 1.0000 2.333333 1.333333 280.266718 0.571429 292.715854 4849
3481 7103 HG3976-HT4246_at Pou-Domain Dna Binding Factor Pit1, Pituitary-... -0.007589 0.921816 0.916429 0.926954 0.973996 0.653992 1.0 1.0000 1.0000 1.002226 96.285714 107.844398 96.500000 58.372478 5391
3480 7104 U65002_at Zinc finger protein PLAG1 mRNA -0.006979 0.991602 0.989682 0.993266 0.996695 0.669233 1.0 1.0000 1.0000 0.995086 -19.380952 53.325863 -19.285714 26.675914 4958
3479 7105 X81372_at Biphenyl hydrolase-related protein -0.006885 0.974005 0.970758 0.976999 0.991813 0.665955 1.0 1.0000 1.0000 1.003117 106.952381 92.746146 107.285714 164.569568 5130
3478 7106 U58032_at GB DEF = Myotubularin related protein 1 (MTMR1... -0.006750 0.991602 0.989682 0.993266 0.996695 0.669233 1.0 1.0000 1.0000 1.003073 77.476190 81.879557 77.714286 113.787541 5042
3477 7107 M74826_at GAD2 Glutamate decarboxylase 2 -0.005795 0.982204 0.979483 0.984670 0.993976 0.667407 1.0 1.0000 1.0000 1.001053 158.333333 94.794163 158.500000 74.758843 5089
3455 7108 U81787_at Wnt10B mRNA 0.005695 0.987403 0.985086 0.989464 0.995643 0.668526 1.0 1.0000 1.0000 1.000978 195.047619 89.307601 194.857143 101.691734 4937
3456 7109 X16281_at ZNF44 Zinc finger protein 44 (KOX 7) 0.005660 0.991202 0.989240 0.992908 0.996695 0.669233 1.0 1.0000 1.0000 0.997725 -41.761905 49.761335 -41.857143 48.095829 5044
3476 7110 Z46629_at SOX9 SRY (sex-determining region Y)-box 9 (cam... -0.005463 0.993001 0.991235 0.994512 0.996917 0.669429 1.0 1.0000 1.0000 1.011628 8.190476 52.335092 8.285714 49.278531 4965
3457 7111 X63563_at RPS13 RNA polymerase II polypeptide B (140 kD) 0.004951 0.991202 0.989240 0.992908 0.996695 0.669233 1.0 1.0000 1.0000 1.001141 313.428571 195.114216 313.071429 217.898870 4956
3475 7112 L75847_at ZNF45 Zinc finger protein 45 (a Kruppel-associ... -0.004928 0.997600 0.996506 0.998439 0.999283 0.670971 1.0 1.0000 1.0000 1.003914 24.333333 38.497186 24.428571 65.123214 5012
3474 7113 L77559_at GB DEF = DGS-B partial mRNA -0.004650 1.000000 0.999701 1.000000 1.000000 0.671452 1.0 1.0000 1.0000 0.998031 -60.476190 77.445219 -60.357143 71.950418 5000
3458 7114 U12595_at Tumor necrosis factor type 1 receptor associat... 0.003895 0.981604 0.978840 0.984113 0.993976 0.667407 1.0 1.0000 1.0000 1.004320 71.952381 155.581964 71.642857 268.831834 4908
3459 7115 X60487_at GB DEF = H4/h gene for H4 histone 0.003891 0.992601 0.990790 0.994157 0.996796 0.669301 1.0 1.0000 1.0000 1.001554 61.380952 45.892784 61.285714 83.561519 5037
3460 7116 U57911_at Fetal brain (239FB) mRNA, from the WAGR region 0.003789 0.983003 0.980340 0.985412 0.994141 0.667518 1.0 1.0000 1.0000 1.004274 11.190476 33.853536 11.142857 38.036131 5085
3461 7117 M18700_s_at ELASTASE IIIA PRECURSOR 0.003198 0.990802 0.988799 0.992549 0.996695 0.669233 1.0 1.0000 1.0000 1.000846 309.761905 173.297693 309.500000 271.842927 5046
3473 7118 U67171_at GB DEF = Selenoprotein W (selW) mRNA -0.002712 0.988002 0.985738 0.990012 0.995745 0.668636 1.0 1.0000 1.0000 1.000810 499.523810 323.232829 499.928571 492.047913 5060
3462 7119 U52828_s_at Delta-catenin mRNA, partial cds 0.002499 0.993801 0.992130 0.995216 0.997158 0.669544 1.0 1.0000 1.0000 1.000605 118.142857 98.639893 118.071429 70.389536 5031
3472 7120 D31833_s_at AVPR1B Arginine vasopressin receptor 1B -0.002246 0.988202 0.985955 0.990194 0.995745 0.668636 1.0 1.0000 1.0000 0.999245 -126.095238 111.217762 -126.000000 130.072169 5059
3471 7121 X12662_rna1_at Arginase gene exon 1 and flanking regions (EC ... -0.001772 0.990002 0.987921 0.991828 0.996431 0.669198 1.0 1.0000 1.0000 1.001314 36.238095 79.984314 36.285714 76.460577 5050
3470 7122 J04156_at IL7 Interleukin 7 -0.001696 0.996601 0.995326 0.997620 0.998822 0.670756 1.0 1.0000 1.0000 0.998705 -18.380952 33.893180 -18.357143 44.663885 4983
3463 7123 U04898_at RORA RAR-related orphan receptor A 0.001460 0.990602 0.988579 0.992369 0.996695 0.669233 1.0 1.0000 1.0000 0.997653 -20.238095 84.060041 -20.285714 100.862762 5047
3469 7124 U79295_at Clone 23961 mRNA sequence -0.001441 0.994401 0.992806 0.995740 0.997620 0.669854 1.0 1.0000 1.0000 1.000956 49.809524 64.448909 49.857143 111.892708 4972
3464 7125 U82303_at GB DEF = Unknown protein mRNA, partial cds 0.001299 0.985803 0.983354 0.987997 0.994733 0.667916 1.0 1.0000 1.0000 1.000227 314.857143 169.164797 314.785714 152.441756 5071
3465 7126 HG721-HT4827_s_at Placental Protein 14, Endometrial Alpha 2 Glob... 0.000854 0.990002 0.987921 0.991828 0.996431 0.669198 1.0 1.0000 1.0000 1.000220 216.190476 113.471414 216.142857 186.935142 5050
3468 7127 U38291_rna1_at Microtubule-associated protein 1a (MAP1A) geno... -0.000607 0.999400 0.998774 0.999765 0.999681 0.671238 1.0 1.0000 1.0000 1.000130 182.476190 114.806628 182.500000 113.017187 4997
3466 7128 U72511_at B-cell receptor associated protein (hBAP) mRNA... 0.000284 0.989802 0.987702 0.991647 0.996431 0.669198 1.0 1.0000 1.0000 1.000028 1681.619048 561.958760 1681.571429 428.851187 4949
3467 7129 U52077_s_at GB DEF = Mariner1 transposase gene, complete c... 0.000000 0.983603 0.980984 0.985967 0.994202 0.667559 1.0 1.0000 1.0000 1.000000 200.428571 97.493370 200.428571 135.909310 5082

7129 rows × 18 columns


Using CCALnoir


In [19]:
import cuzcatlan as cusca
import pandas as pd
import numpy as np
from cuzcatlan import differential_gene_expression
import urllib.request

In [20]:
%%time
TOP = 500
permuations=1000

RUN = True

data_url = "https://software.broadinstitute.org/cancer/software/genepattern/data/all_aml/all_aml_test.gct"
pheno_url = "https://software.broadinstitute.org/cancer/software/genepattern/data/all_aml/all_aml_test.cls"

data_df = pd.read_table(data_url, header=2, index_col=0)
data_df.drop('Description', axis=1, inplace=True)
url_file, __ = urllib.request.urlretrieve(pheno_url)
temp = open(url_file)
temp.readline()
temp.readline()
classes = [int(i) for i in temp.readline().strip('\n').split(' ')]
classes = pd.Series(classes, index=data_df.columns)


CPU times: user 99.9 ms, sys: 21 ms, total: 121 ms
Wall time: 2.11 s

In [21]:
%%notify
%%time
raw_scores = differential_gene_expression(phenotypes=pheno_url, gene_expression=data_url, 
                                      output_filename='DE_test', ranking_method=cusca.custom_pearson_corr,
                                      number_of_permutations=10)


Dropping 0 axis-1 slices ...
Computing match score with <function custom_pearson_corr at 0x113e54e18> (1 process) ...
Computing MoEs with 30 samplings ...
Computing p-values and FDRs with 10 permutations ...
	1/10 ...
	2/10 ...
	3/10 ...
	4/10 ...
	5/10 ...
	6/10 ...
	7/10 ...
	8/10 ...
	9/10 ...
	10/10 ...
	10/10 - done.
CPU times: user 2.7 s, sys: 186 ms, total: 2.89 s
Wall time: 9.34 s

In [22]:
ccal_scores = raw_scores.copy()
ccal_scores['abs_score'] = abs(ccal_scores['Score'])
ccal_scores['Feature'] = ccal_scores.index
ccal_scores.sort_values('abs_score', ascending=False, inplace=True)
ccal_scores.reset_index(inplace=True)
ccal_scores['Rank'] = ccal_scores.index +1
print(ccal_scores)


                   Name         Score   0.95 MoE   p-value       FDR  \
0             U46499_at  8.245435e-01        NaN  0.000014  0.002381   
1             X95735_at  7.756710e-01        NaN  0.000014  0.002381   
2             M89957_at -7.756137e-01        NaN  0.000014  0.002273   
3             M63959_at  7.696607e-01        NaN  0.000014  0.002381   
4           L09209_s_at  7.475753e-01        NaN  0.000014  0.002381   
5             M84526_at  7.400529e-01        NaN  0.000014  0.002381   
6             J05243_at -7.371616e-01        NaN  0.000014  0.002273   
7             X17042_at  7.316505e-01        NaN  0.000014  0.002381   
8      M84371_rna1_s_at -7.298155e-01        NaN  0.000014  0.002273   
9             M14636_at  7.268337e-01        NaN  0.000014  0.002381   
10            M22960_at  7.260735e-01        NaN  0.000014  0.002381   
11            M31523_at -7.221959e-01  0.0228747  0.000014  0.002273   
12            U59878_at  7.209896e-01        NaN  0.000014  0.002381   
13            M11722_at -7.181291e-01        NaN  0.000014  0.002273   
14            M23197_at  7.162585e-01        NaN  0.000014  0.002381   
15            X61587_at  7.114731e-01        NaN  0.000014  0.002381   
16            M27891_at  7.067257e-01        NaN  0.000014  0.002381   
17            X05908_at  7.046788e-01        NaN  0.000014  0.002381   
18            U60319_at  7.007208e-01        NaN  0.000014  0.002381   
19          M32304_s_at  6.908595e-01        NaN  0.000014  0.002381   
20            X12447_at  6.880660e-01        NaN  0.000014  0.002381   
21            M19507_at  6.865670e-01        NaN  0.000014  0.002381   
22            X16546_at  6.844507e-01  0.0291901  0.000014  0.002381   
23            M63138_at  6.825158e-01        NaN  0.000014  0.002381   
24            L09717_at  6.809467e-01        NaN  0.000014  0.002381   
25            U29175_at -6.807426e-01  0.0185103  0.000014  0.002273   
26       U05259_rna1_at -6.755212e-01        NaN  0.000014  0.002273   
27            M92287_at -6.752193e-01        NaN  0.000014  0.002273   
28            D88270_at -6.749073e-01  0.0228069  0.000014  0.002273   
29            Z29067_at  6.737274e-01        NaN  0.000014  0.002381   
...                 ...           ...        ...       ...       ...   
7099          X59766_at -1.470199e-03        NaN  0.484949  0.940479   
7100          M29540_at  1.457900e-03        NaN  0.478440  0.935234   
7101          U47621_at -1.347656e-03        NaN  0.484584  0.940027   
7102          X81372_at  1.334454e-03        NaN  0.478594  0.935280   
7103          U58032_at  1.254863e-03        NaN  0.478777  0.935342   
7104   HG3976-HT4246_at  1.180234e-03        NaN  0.478889  0.935342   
7105          U65002_at  1.073424e-03        NaN  0.479099  0.935413   
7106          U81787_at -1.018220e-03        NaN  0.483967  0.939085   
7107          X16281_at -9.783761e-04        NaN  0.483841  0.939085   
7108          M74826_at  9.615364e-04        NaN  0.479366  0.935413   
7109          L75847_at  9.480223e-04        NaN  0.479408  0.935413   
7110          Z46629_at  9.393421e-04        NaN  0.479450  0.935413   
7111          X63563_at -8.815716e-04        NaN  0.483658  0.938998   
7112          L77559_at  7.973411e-04        NaN  0.479801  0.935841   
7113          X60487_at -7.571721e-04        NaN  0.483392  0.938736   
7114          U12595_at -7.518560e-04        NaN  0.483378  0.938736   
7115          U57911_at -6.756185e-04        NaN  0.483252  0.938736   
7116        M18700_s_at -6.074864e-04        NaN  0.483083  0.938736   
7117          U67171_at  5.126211e-04        NaN  0.480418  0.936557   
7118        U52828_s_at -4.067761e-04        NaN  0.482410  0.937851   
7119        D31833_s_at  4.037733e-04        NaN  0.480586  0.936557   
7120          J04156_at  3.120324e-04        NaN  0.480741  0.936557   
7121     X12662_rna1_at  3.056031e-04        NaN  0.480755  0.936557   
7122          U79295_at  2.783558e-04        NaN  0.480825  0.936557   
7123          U04898_at -2.638654e-04        NaN  0.482045  0.937398   
7124          U82303_at -2.213848e-04        NaN  0.481989  0.937398   
7125  HG721-HT4827_s_at -1.635845e-04        NaN  0.481793  0.937398   
7126     U38291_rna1_at  1.052758e-04        NaN  0.481119  0.936875   
7127          U72511_at -4.677248e-05        NaN  0.481554  0.937210   
7128        U52077_s_at -3.737679e-17        NaN  0.481526  0.937210   

         abs_score            Feature  Rank  
0     8.245435e-01          U46499_at     1  
1     7.756710e-01          X95735_at     2  
2     7.756137e-01          M89957_at     3  
3     7.696607e-01          M63959_at     4  
4     7.475753e-01        L09209_s_at     5  
5     7.400529e-01          M84526_at     6  
6     7.371616e-01          J05243_at     7  
7     7.316505e-01          X17042_at     8  
8     7.298155e-01   M84371_rna1_s_at     9  
9     7.268337e-01          M14636_at    10  
10    7.260735e-01          M22960_at    11  
11    7.221959e-01          M31523_at    12  
12    7.209896e-01          U59878_at    13  
13    7.181291e-01          M11722_at    14  
14    7.162585e-01          M23197_at    15  
15    7.114731e-01          X61587_at    16  
16    7.067257e-01          M27891_at    17  
17    7.046788e-01          X05908_at    18  
18    7.007208e-01          U60319_at    19  
19    6.908595e-01        M32304_s_at    20  
20    6.880660e-01          X12447_at    21  
21    6.865670e-01          M19507_at    22  
22    6.844507e-01          X16546_at    23  
23    6.825158e-01          M63138_at    24  
24    6.809467e-01          L09717_at    25  
25    6.807426e-01          U29175_at    26  
26    6.755212e-01     U05259_rna1_at    27  
27    6.752193e-01          M92287_at    28  
28    6.749073e-01          D88270_at    29  
29    6.737274e-01          Z29067_at    30  
...            ...                ...   ...  
7099  1.470199e-03          X59766_at  7100  
7100  1.457900e-03          M29540_at  7101  
7101  1.347656e-03          U47621_at  7102  
7102  1.334454e-03          X81372_at  7103  
7103  1.254863e-03          U58032_at  7104  
7104  1.180234e-03   HG3976-HT4246_at  7105  
7105  1.073424e-03          U65002_at  7106  
7106  1.018220e-03          U81787_at  7107  
7107  9.783761e-04          X16281_at  7108  
7108  9.615364e-04          M74826_at  7109  
7109  9.480223e-04          L75847_at  7110  
7110  9.393421e-04          Z46629_at  7111  
7111  8.815716e-04          X63563_at  7112  
7112  7.973411e-04          L77559_at  7113  
7113  7.571721e-04          X60487_at  7114  
7114  7.518560e-04          U12595_at  7115  
7115  6.756185e-04          U57911_at  7116  
7116  6.074864e-04        M18700_s_at  7117  
7117  5.126211e-04          U67171_at  7118  
7118  4.067761e-04        U52828_s_at  7119  
7119  4.037733e-04        D31833_s_at  7120  
7120  3.120324e-04          J04156_at  7121  
7121  3.056031e-04     X12662_rna1_at  7122  
7122  2.783558e-04          U79295_at  7123  
7123  2.638654e-04          U04898_at  7124  
7124  2.213848e-04          U82303_at  7125  
7125  1.635845e-04  HG721-HT4827_s_at  7126  
7126  1.052758e-04     U38291_rna1_at  7127  
7127  4.677248e-05          U72511_at  7128  
7128  3.737679e-17        U52077_s_at  7129  

[7129 rows x 8 columns]

In [23]:
%%time
%%notify
raw_ic_scores = differential_gene_expression(phenotypes=pheno_url, gene_expression=data_url, 
                                      output_filename='DE_test', ranking_method=cusca.compute_information_coefficient,
                                      number_of_permutations=10)


Dropping 0 axis-1 slices ...
Computing match score with <function compute_information_coefficient at 0x114d3cc80> (1 process) ...
Computing MoEs with 30 samplings ...
Computing p-values and FDRs with 10 permutations ...
	1/10 ...
	2/10 ...
	3/10 ...
	4/10 ...
	5/10 ...
	6/10 ...
	7/10 ...
	8/10 ...
	9/10 ...
	10/10 ...
	10/10 - done.
CPU times: user 3.14 s, sys: 188 ms, total: 3.33 s
Wall time: 1min 40s

In [25]:
ccal_ic_scores = raw_ic_scores.copy()
ccal_ic_scores['abs_score'] = abs(ccal_ic_scores['Score'])
ccal_ic_scores['Feature'] = ccal_ic_scores.index
ccal_ic_scores.sort_values('abs_score', ascending=False, inplace=True)
ccal_ic_scores.reset_index(inplace=True)
ccal_ic_scores['Rank'] = ccal_ic_scores.index +1
print(ccal_ic_scores)


                  Name     Score   0.95 MoE   p-value       FDR  abs_score  \
0            U46499_at  0.753714        NaN  0.000014  0.002778   0.753714   
1            M84526_at  0.649866        NaN  0.000014  0.002778   0.649866   
2            M89957_at -0.645258        NaN  0.000014  0.002941   0.645258   
3            M27891_at  0.634431        NaN  0.000014  0.002778   0.634431   
4            U59878_at  0.613581        NaN  0.000014  0.002778   0.613581   
5            X95735_at  0.608938        NaN  0.000014  0.002778   0.608938   
6            M11722_at -0.607143        NaN  0.000014  0.002941   0.607143   
7     M84371_rna1_s_at -0.601197        NaN  0.000014  0.002941   0.601197   
8            M31523_at -0.596433  0.0230147  0.000014  0.002941   0.596433   
9            M22960_at  0.591253        NaN  0.000014  0.002778   0.591253   
10           M63959_at  0.583427        NaN  0.000014  0.002778   0.583427   
11           M19507_at  0.579449        NaN  0.000014  0.002778   0.579449   
12         L09209_s_at  0.577150        NaN  0.000014  0.002778   0.577150   
13           X16546_at  0.569908  0.0278228  0.000014  0.002778   0.569908   
14           M63138_at  0.569470        NaN  0.000014  0.002778   0.569470   
15           J05243_at -0.566304        NaN  0.000014  0.002941   0.566304   
16           M11147_at  0.564826        NaN  0.000014  0.002778   0.564826   
17           X17042_at  0.562581        NaN  0.000014  0.002778   0.562581   
18           M92357_at  0.561667        NaN  0.000014  0.002778   0.561667   
19           M23197_at  0.556172        NaN  0.000014  0.002778   0.556172   
20           X05908_at  0.554616        NaN  0.000014  0.002778   0.554616   
21           M14636_at  0.546838        NaN  0.000014  0.002778   0.546838   
22           M92287_at -0.542418        NaN  0.000014  0.002941   0.542418   
23      M96326_rna1_at  0.534105        NaN  0.000014  0.002778   0.534105   
24           D88270_at -0.532985    0.02201  0.000014  0.002941   0.532985   
25         M32304_s_at  0.531376        NaN  0.000014  0.002778   0.531376   
26      U05259_rna1_at -0.529745        NaN  0.000014  0.002941   0.529745   
27           X12447_at  0.527874        NaN  0.000014  0.002778   0.527874   
28           U60319_at  0.519148        NaN  0.000014  0.002778   0.519148   
29           Z49194_at -0.518524        NaN  0.000014  0.002941   0.518524   
...                ...       ...        ...       ...       ...        ...   
7099         U78107_at -0.048218        NaN  0.485159  0.940631   0.048218   
7100         X70476_at -0.048158        NaN  0.485159  0.940631   0.048158   
7101       X57351_s_at -0.047757        NaN  0.485033  0.940631   0.047757   
7102         U57911_at -0.047616        NaN  0.484949  0.940631   0.047616   
7103         D42087_at -0.047045        NaN  0.484710  0.940631   0.047045   
7104         U07681_at  0.046782        NaN  0.478230  0.934164   0.046782   
7105         U00943_at -0.046711        NaN  0.484598  0.940631   0.046711   
7106       U24389_s_at  0.046648        NaN  0.478286  0.934164   0.046648   
7107         D67029_at  0.046491        NaN  0.478342  0.934164   0.046491   
7108         X64269_at  0.046176        NaN  0.478468  0.934164   0.046176   
7109         U34880_at  0.045413        NaN  0.478679  0.934164   0.045413   
7110  HG2264-HT2360_at  0.045404        NaN  0.478679  0.934164   0.045404   
7111         U79254_at  0.044575        NaN  0.479015  0.934565   0.044575   
7112         M94345_at -0.044011        NaN  0.483771  0.939472   0.044011   
7113         U65093_at  0.043777        NaN  0.479240  0.934747   0.043777   
7114         U96131_at -0.043237        NaN  0.483630  0.939455   0.043237   
7115         U44975_at -0.043093        NaN  0.483588  0.939455   0.043093   
7116       X03794_s_at  0.042390        NaN  0.479576  0.935148   0.042390   
7117  HG4593-HT4998_at -0.042009        NaN  0.483280  0.939286   0.042009   
7118    X66114_rna1_at -0.041481        NaN  0.483209  0.939286   0.041481   
7119         S69369_at -0.041401        NaN  0.483195  0.939286   0.041401   
7120         U18291_at  0.040940        NaN  0.479899  0.935521   0.040940   
7121         U79734_at  0.039445        NaN  0.480081  0.935621   0.039445   
7122         L40411_at -0.038328        NaN  0.482648  0.938827   0.038328   
7123         U18919_at  0.036377        NaN  0.480446  0.935874   0.036377   
7124         U05659_at  0.036083        NaN  0.480474  0.935874   0.036083   
7125         D86971_at -0.034714        NaN  0.482087  0.937991   0.034714   
7126         L05515_at  0.026648        NaN  0.481161  0.936957   0.026648   
7127         M14565_at -0.024418        NaN  0.481540  0.937183   0.024418   
7128         D87685_at  0.022770        NaN  0.481302  0.936974   0.022770   

               Feature  Rank  
0            U46499_at     1  
1            M84526_at     2  
2            M89957_at     3  
3            M27891_at     4  
4            U59878_at     5  
5            X95735_at     6  
6            M11722_at     7  
7     M84371_rna1_s_at     8  
8            M31523_at     9  
9            M22960_at    10  
10           M63959_at    11  
11           M19507_at    12  
12         L09209_s_at    13  
13           X16546_at    14  
14           M63138_at    15  
15           J05243_at    16  
16           M11147_at    17  
17           X17042_at    18  
18           M92357_at    19  
19           M23197_at    20  
20           X05908_at    21  
21           M14636_at    22  
22           M92287_at    23  
23      M96326_rna1_at    24  
24           D88270_at    25  
25         M32304_s_at    26  
26      U05259_rna1_at    27  
27           X12447_at    28  
28           U60319_at    29  
29           Z49194_at    30  
...                ...   ...  
7099         U78107_at  7100  
7100         X70476_at  7101  
7101       X57351_s_at  7102  
7102         U57911_at  7103  
7103         D42087_at  7104  
7104         U07681_at  7105  
7105         U00943_at  7106  
7106       U24389_s_at  7107  
7107         D67029_at  7108  
7108         X64269_at  7109  
7109         U34880_at  7110  
7110  HG2264-HT2360_at  7111  
7111         U79254_at  7112  
7112         M94345_at  7113  
7113         U65093_at  7114  
7114         U96131_at  7115  
7115         U44975_at  7116  
7116       X03794_s_at  7117  
7117  HG4593-HT4998_at  7118  
7118    X66114_rna1_at  7119  
7119         S69369_at  7120  
7120         U18291_at  7121  
7121         U79734_at  7122  
7122         L40411_at  7123  
7123         U18919_at  7124  
7124         U05659_at  7125  
7125         D86971_at  7126  
7126         L05515_at  7127  
7127         M14565_at  7128  
7128         D87685_at  7129  

[7129 rows x 8 columns]

Comparing results

CMS vs CCAL_correlation


In [26]:
# @memory.cache
def custom_metric(list_1, list_2):
    temp = list_1 - list_2
    temp.fillna(len(temp), inplace=True)
    # Metric is 0 if perfect overlap, 1 if list are reversed. It can be larger than one!
    return sum(abs(temp))/ np.floor(list_1.shape[0]**2/2)

In [29]:
# @memory.cache
def map_df1_to_df2(df_1, df_2):
    to_return = df_1.copy()
    df_2_copy = df_2.copy()
    
    to_return.sort_values(by='Rank', inplace=True)
    to_return.set_index('Feature', inplace=True)
    df_2_copy.sort_values(by='Rank', inplace=True)
    df_2_copy.set_index('Feature', inplace=True)
    
    df_2_copy.rename(columns={'Rank': 'new_Rank'}, inplace=True)
    to_return_2 = to_return.join(df_2_copy)
    
    return to_return_2

In [36]:
def compute_overlap(reference_df, new_df, col='index'):
    if col == 'index':
        common = (list(set(reference_df.index) & set(new_df.index)))
    else:
        common = (list(set(reference_df[col]) & set(new_df[col])))

    overlap = 100*len(common)/len(reference_df)
    return overlap

In [41]:
# @memory.cache
def compare_ranks(df_a, df_b, number_of_genes=5, verbose=False):
    # Not ssuming both df's are ranked already!
    subset_a = df_a.head(number_of_genes)[['Feature', 'Rank']]
    subset_b = df_b.head(number_of_genes)[['Feature', 'Rank']]

    a_in_b = map_df1_to_df2(subset_a, df_b[['Feature','Rank']])
    b_in_a = map_df1_to_df2(subset_b, df_a[['Feature','Rank']]) 

    metric_1 = custom_metric(a_in_b['Rank'], a_in_b['new_Rank'])
    metric_2 = custom_metric(b_in_a['Rank'], b_in_a['new_Rank'])
    
    overlap = compute_overlap(subset_a, subset_b, col='Feature')
    
    if verbose:
        print(a_in_b) 
        print(b_in_a)
        
    return ((metric_1 + metric_2)/2, overlap)

In [37]:
# @memory.cache
def compare_multiple_ranks(df_a, df_b, max_number_of_genes=10, verbose=False):

    # This is the largest subset we will consider
    subset_a = df_a.head(max_number_of_genes)[['Feature', 'Rank']]
    subset_b = df_b.head(max_number_of_genes)[['Feature', 'Rank']]
    
    df_a_to_use = df_a[['Feature','Rank']]
    df_b_to_use = df_b[['Feature','Rank']]
    
    indexes = []
    metrics = []
    overlap = []
    for i in range(max_number_of_genes, 0, -1):
        
        if i == max_number_of_genes:
            subset_a_to_use = subset_a
            subset_b_to_use = subset_b
        else:
            subset_a_to_use = subset_a_to_use.drop(subset_a_to_use.index[i])
            subset_b_to_use = subset_b_to_use.drop(subset_b_to_use.index[i])

        a_in_b = map_df1_to_df2(subset_a_to_use, df_b_to_use)
        b_in_a = map_df1_to_df2(subset_b_to_use, df_a_to_use)
        
        overlap.append(compute_overlap(subset_a_to_use, subset_b_to_use, col='Feature'))

        metric_1 = custom_metric(a_in_b['Rank'], a_in_b['new_Rank'])
        metric_2 = custom_metric(b_in_a['Rank'], b_in_a['new_Rank'])
        
        indexes.append(i)
#         print(i, metric_1, metric_2)
        metrics.append((metric_1 + metric_2)/2)
    
    if verbose:
        print('Depreciated!')
        
    return indexes, metrics, overlap

In [48]:
%%time
ixs, mets, over = compare_multiple_ranks(cms_scores, ccal_scores, max_number_of_genes=5, verbose=False)
print(ixs)
print(mets)
print(over)


[5, 4, 3, 2, 1]
[2.875, 3.0625, 4.375, 5.5, inf]
[20.0, 25.0, 33.333333333333336, 0.0, 0.0]
CPU times: user 51.7 ms, sys: 2.46 ms, total: 54.2 ms
Wall time: 52.9 ms
/Users/edjuaro/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: divide by zero encountered in double_scalars
  

In [47]:
%%time
m1, ov = compare_ranks(cms_scores, ccal_ic_scores, number_of_genes=10, verbose=True)
print("\nMetric =",m1, "Overlap=", ov)


                  Rank  new_Rank
Feature                         
M89957_at            1         3
J05243_at            2        16
M11722_at            3         7
M31523_at            4         9
M84371_rna1_s_at     5         8
U46499_at            6         1
D88270_at            7        25
U05259_rna1_at       8        27
M92287_at            9        23
M63959_at           10        11
                  Rank  new_Rank
Feature                         
U46499_at            1         6
M84526_at            2        30
M89957_at            3         1
M27891_at            4        52
U59878_at            5        40
X95735_at            6        12
M11722_at            7         3
M84371_rna1_s_at     8         5
M31523_at            9         4
M22960_at           10        34

Metric = 2.45 Overlap= 50.0
CPU times: user 15.4 ms, sys: 2.4 ms, total: 17.8 ms
Wall time: 15.8 ms

CMS vs CCAL_ic

CCAL_correlation vs CCAL_ic

CMS vs CCAL_ic


In [94]:
%%time
plt.clf()
fig, axs = plt.subplots(2,1,dpi=150)

# for i in range(int(len(scores)/2)):
# for i in range(1000):
#     if i ==0:
#         continue
#     metric = compare_ranks(cms_scores, ccal_ic_scores, number_of_genes=i)
#     fig.gca().scatter(i,metric,color='k')
# fig.gca().set_ylim(-0.1,8)

ixs, mets,over = compare_multiple_ranks(cms_scores, ccal_ic_scores, max_number_of_genes=500, verbose=False)
axs[0].scatter(ixs,mets,color='k')
axs[0].set_ylim(-0.1,8)
axs[0].set_ylabel('Custom metric')
axs[1].scatter(ixs,over,color='k')
axs[1].set_ylabel('% Overlap')
axs[1].set_xlabel('Top n genes')
axs[0].set_title("CMS vs CCAL_IC")
fig


CPU times: user 4.07 s, sys: 52 ms, total: 4.13 s
Wall time: 4.13 s
/Users/edjuaro/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: divide by zero encountered in double_scalars
  

In [95]:
fig


Out[95]:

CMS vs CCAL_correlation


In [96]:
%%time
plt.close('all')
plt.clf()
fig2, axs2 = plt.subplots(2,1,dpi=150)

# for i in range(int(len(scores)/2)):
# for i in range(1000):
#     if i ==0:
#         continue
#     metric = compare_ranks(cms_scores, ccal_scores, number_of_genes=i)
#     fig2.gca().scatter(i,metric,color='k')
ixs, mets, over = compare_multiple_ranks(cms_scores, ccal_scores, max_number_of_genes=100, verbose=False)
axs2[0].scatter(ixs,mets,color='k')
axs2[0].set_ylim(-0.1,8)
axs2[0].set_ylabel('Custom metric')
axs2[1].scatter(ixs,over,color='k')
axs2[1].set_ylabel('% Overlap')
axs2[1].set_xlabel('Top n genes')
axs2[0].set_title("CMS vs CCAL_PC")


CPU times: user 812 ms, sys: 13.6 ms, total: 826 ms
Wall time: 823 ms
/Users/edjuaro/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: divide by zero encountered in double_scalars
  

In [97]:
fig2


Out[97]:

CCAL_correlation vs CCAL_ic


In [90]:
%%time
fig3, axs3 = plt.subplots(2,1,dpi=150)
# for i in range(int(len(scores)/2)):
# for i in range(1000):
#     if i ==0:
#         continue
#     metric = compare_ranks(ccal_ic_scores, ccal_scores, number_of_genes=i)
#     fig2.gca().scatter(i,metric,color='k')
# fig2.gca().set_ylim(-0.1,8)
ixs, mets, over = compare_multiple_ranks(ccal_ic_scores, ccal_scores, max_number_of_genes=100, verbose=False)
axs3[0].scatter(ixs,mets,color='k')
axs3[0].set_ylim(-0.1,8)
axs3[0].set_ylabel('Custom metric')
axs3[1].scatter(ixs,over,color='k')
axs3[1].set_ylabel('% Overlap')
axs3[1].set_xlabel('Top n genes')


CPU times: user 977 ms, sys: 14.6 ms, total: 992 ms
Wall time: 991 ms
/Users/edjuaro/miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: RuntimeWarning: invalid value encountered in double_scalars
  

In [91]:
fig3


Out[91]:

In [ ]: