In [1]:
import pandas as pd
import numpy as np
import scipy as sp

In [2]:
ls


Feb1_majority_label.py            Jan27.py                          eluted_peptide_prediction.py
Feb5_hamming_isomap.py            Jan28.py                          elution_compact.csv
Feb5_mds.py                       Jan30_exclude_hla_a2.py           eval_dataset.py
Feb7_tumor_specific_antigens.py   Jan31_bigram.py                   iedb.py
Feb7_tumor_vs_self.py             LICENSE                           imma.py
IEDB_TCELL_HUMAN_IMM.txt          README.md                         immuno_enhance.py
IEDB_TCELL_HUMAN_NON.txt          Toxin_Protein_Table.txt           mds_9mer_cytotoxicity.png
IEDB_duplicates.png               Tumor_Mutant_Antigens_HLA_I.txt   mds_cytotoxicity_9mer.png
IEDB_noisy_labels.png             Tumor_Self_Antigens_HLA_I.txt     pipeline/
IMMA2_imm.txt                     Untitled0.ipynb                   pipeline2/
IMMA2_non.txt                     amino_acid.py                     prop_of_mhc.py
Jan17.py                          amino_acid_properties.txt         reduced_alphabet.py
Jan18.py                          conv.py                           s1.csv
Jan21_toxin.py                    danafarber_verified_antigens.txt  s2.csv
Jan21_toxin_positional.py         data/                             seq_feature_tests.py
Jan22.py                          data.py                           toxin.py
Jan23.py                          df_tumor_antigens.py              toxins.txt
Jan24.py                          dimensionality_reduction.py       viz/

In [3]:
df = pd.read_csv("elution_compact.csv", skipinitialspace=True)


/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/parsers.py:1070: DtypeWarning: Columns (5,11,12,20,21,22,23,24,25,30,31,33,34,35,36,37,38,43,44,46,47) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

In [8]:
df.count()


Out[8]:
Elution ID                                       267658
Reference ID                                     267658
Reference Type                                   267658
PubMed ID                                         87865
Author                                           267658
Journal                                           87865
Year                                             267658
Epitope ID                                       267658
Epitope Object Type                              267658
Epitope Object Description                       267658
Epitope Linear Sequence                          267364
Epitope Modification                               1371
Epitope Modified Residues                          1371
Epitope Starting Position                        241740
Epitope Ending Position                          241740
Epitope Source Molecule Accession                253700
Epitope Source Molecule Name                     253700
Epitope Source Organism ID                       253765
Epitope Source Organism Name                     253765
Host Organism ID                                  11374
Host Organism Name                                11374
Host Geolocation                                     36
In Vivo 1 Process Type                            10831
In Vivo 1 Immunogen Object Type                     352
In Vivo 1 Immunogen Object Description              352
In Vivo 1 Immunogen Linear Sequence                   3
In Vivo 1 Immunogen Modification                      0
In Vivo 1 Immunogen Modified Residues                 0
In Vivo 1 Immunogen Starting Position                 3
In Vivo 1 Immunogen Ending Position                   3
In Vivo 1 Immunogen Source Molecule Accession         3
In Vivo 1 Immunogen Source Molecule Name              3
In Vivo 1 Immunogen Source Organism ID              352
In Vivo 1 Immunogen Source Organism Name            352
In Vivo 1 Immunogen Epitope Relation                354
In Vitro Process Type                              1319
In Vitro Immunogen Object Type                     1313
In Vitro Immunogen Object Description              1313
In Vitro Immunogen Linear Sequence                   71
In Vitro Immunogen Modification                       0
In Vitro Immunogen Modified Residues                  0
In Vitro Immunogen Starting Position                 65
In Vitro Immunogen Ending Position                   65
In Vitro Immunogen Source Molecule Accession         65
In Vitro Immunogen Source Molecule Name              65
In Vitro Immunogen Source Organism ID              1307
In Vitro Immunogen Source Organism Name            1307
In Vitro Immunogen Epitope Relation                1313
MHC Allele ID                                    267119
MHC Allele Name                                  267119
Method/Technique ID                              267658
Method/Technique                                 267658
Assay Group                                      267658
Qualitative Measure                              267651
Unnamed: 54                                           0
Length: 55, dtype: int64

In [9]:
df.head()


Out[9]:
Elution ID Reference ID Reference Type PubMed ID Author Journal Year Epitope ID Epitope Object Type Epitope Object Description Epitope Linear Sequence Epitope Modification Epitope Modified Residues Epitope Starting Position Epitope Ending Position Epitope Source Molecule Accession Epitope Source Molecule Name Epitope Source Organism ID Epitope Source Organism Name Host Organism ID
0 26 274 Literature 15448372 Yi-Hsiang Huang; Mi-Hua Tao; Cheng-po Hu; Wan-... J Gen Virol 2004 31803 Linear peptide KLEDLERDL KLEDLERDL NaN NaN 26 34 11022742 large delta antigen 10000523 Hepatitis delta virus TW2667 NaN ...
1 115 299 Literature 15140958 Yue-Dan Wang; Wan-Yee Fion Sin; Guo-Bing Xu; H... J Virol 2004 36724 Linear peptide LITGRLQSL LITGRLQSL NaN NaN 978 986 30173397 Spike glycoprotein precursor 227859 SARS coronavirus NaN ...
2 143 304 Literature 15102821 Alberto Diaz-Qui�onez; Natalia Martin-Orozco; ... Infect Immun 2004 66114 Linear peptide TRVAFAGL TRVAFAGL NaN NaN 94 101 7428872 outer membrane porin C precursor - Salmonella ... 90371 Salmonella enterica subsp. enterica serovar Ty... NaN ...
3 144 304 Literature 15102821 Alberto Diaz-Qui�onez; Natalia Martin-Orozco; ... Infect Immun 2004 55063 Linear peptide RNTDFFGL RNTDFFGL NaN NaN 153 160 7428872 outer membrane porin C precursor - Salmonella ... 90371 Salmonella enterica subsp. enterica serovar Ty... NaN ...
4 247 329 Literature 15104671 C Sylvester-Hvid; M Nielsen; K Lamberth; G R�d... Tissue Antigens 2004 14829 Linear peptide EVMPVSMAK EVMPVSMAK NaN NaN 707 715 30173397 Spike glycoprotein precursor 227859 SARS coronavirus NaN ...

5 rows × 55 columns


In [10]:
df['Host Organism Name'].value_counts()


Out[10]:
Homo sapiens               7847
Mus musculus NOD           1095
Mus musculus C57BL/6        852
Mus musculus                714
Mus musculus C57BL/6N       587
B6.ERAAP null               160
Mus musculus B6.P            56
Gallus gallus                34
Mus musculus BALB/c          10
Mus musculus BALB.B           3
Mus musculus SNF1             3
Sus scrofa                    2
Mus musculus B10 X 129        2
Mus musculus C3H              2
Mus musculus SV40 Tg          2
Mus musculus C57BL/10         2
Rattus norvegicus Lewis       1
Mus musculus NOD/Lt           1
Pan troglodytes               1
dtype: int64

In [11]:
df["Epitope Source Organism Name"].value_counts()


Out[11]:
Vaccinia virus WR                                        35120
Homo sapiens                                             22361
Phleum pratense                                          17819
Mycobacterium tuberculosis                                9989
SARS coronavirus Tor2                                     9290
Zaire ebolavirus                                          7069
Mus musculus                                              4992
Lymphocytic choriomeningitis virus (strain Armstrong)     4251
Giardia lamblia ATCC 50803                                4204
SARS coronavirus                                          3914
Vaccinia virus Copenhagen                                 3035
Sabia virus                                               2881
Junin virus                                               2852
Guanarito virus                                           2759
Hepatitis B virus                                         2656
...
Influenza A virus (A/Chicken/Nanchang/2-220/2001(H3N6))    1
Hepatitis C virus genotype 4                               1
Hepatitis C virus genotype 5                               1
Plasmodium cynomolgi strain Berok                          1
Influenza A virus (A/New York/348/2003(H1N1))              1
Mumps virus                                                1
Influenza A virus (A/Weiss/1943(H1N1))                     1
Zea mays                                                   1
Moorella thermoacetica ATCC 39073                          1
Influenza A virus (A/turkey/Wisconsin/1968(H5N9))          1
Orcinus orca                                               1
Encephalomyocarditis virus                                 1
Autographa californica nucleopolyhedrovirus                1
Influenza A virus (A/Kitakyushu/93(H3N2))                  1
Human herpesvirus 5 strain Merlin                          1
Length: 1456, dtype: int64

In [28]:
df["MHC Allele Name"].value_counts()


Out[28]:
HLA-A*02:01       23550
HLA-DRB1*01:01     9635
HLA-A*03:01        7921
HLA-A*11:01        7214
HLA-A*68:02        6455
HLA-A*02:03        6312
HLA-B*15:01        6029
HLA-A*02:06        5774
HLA-B*07:02        5601
HLA-A*31:01        5539
HLA-A*01:01        5485
H-2-Db             5295
H-2-Kb             5215
HLA-A*02:02        5121
HLA-DRB1*04:01     5005
...
HLA-DQ5                      1
HLA-Cw3                      1
HLA-DQ9                      1
bMR1                         1
chCD1-2                      1
HLA-DP E69K mutant           1
HLA-A*02:01 W167A mutant     1
HLA-A*02:01 T163A mutant     1
H-2-Dbm13                    1
HLA-B53                      1
BoLA-DQ                      1
RT1-Ac                       1
HLA-DQA1*01:01/DQB1*05:03    1
HLA-B*35:02                  1
HLA-DRB1*13:05               1
Length: 472, dtype: int64

In [22]:
df2 = pd.read_csv("tcell_compact.csv", skipinitialspace=True)

In [452]:
reload(iedb)
import iedb
min_count = 0
hla_type = None

In [453]:
tcell = iedb.load_tcell(min_count=min_count, hla_type = hla_type)


Class I MHC Entries 60202
Class II MHC Entries 95524
Human entries 133735
Human Class I MHCs 54432
Human Class II MHCs 65320
Dropping 3824 null sequences
Dropping 83 bad sequences
Filtered sequences epitope sequences 132503

In [454]:
mhc = iedb.load_mhc(min_count=min_count, hla_type = hla_type)


Class I MHC Entries 166713
Class II MHC Entries 72612
Human entries 239619
Human Class I MHCs 166708
Human Class II MHCs 72612
Dropping 294 null sequences
Dropping 45 bad sequences
Filtered sequences epitope sequences 239475

In [455]:
df = pd.DataFrame({'mhc':mhc, 'tcell':tcell})
df.index.name = 'epitope'

In [456]:
both = ~(df.mhc.isnull() | df.tcell.isnull())
both.sum()


Out[456]:
9493

In [457]:
df[both]


Out[457]:
mhc tcell
epitope
AAAGAEAGKATTEEQ 0.240000 0.000000
AAAGLAAAAPLESRQ 0.828571 0.500000
AAALGIGTDSVILIKCDERG 0.000000 0.000000
AAASVPAADKFKTFE 0.840000 0.000000
AAATATATAAVGAAT 0.400000 0.000000
AAAWYLWEV 1.000000 1.000000
AACIVGCENV 0.000000 0.000000
AADHAAPEDKYEAFV 0.400000 0.000000
AADHCPVVEVNGVTI 0.000000 0.000000
AAEAMEVA 1.000000 0.000000
AAEKLLEKVPSDVLEMYKAI 0.857143 1.000000
AAESSSKAALTSKLD 0.600000 0.000000
AAFDRKSDAK 1.000000 0.666667
AAFEDLRVL 1.000000 1.000000
AAFKIAATAANSAPA 1.000000 1.000000
AAFNNAIKAGTGGAY 0.600000 0.125000
AAFTSSSKAATAKAP 0.800000 0.000000
AAGAATTAAGAASGA 0.480000 0.000000
AAGAAVKGV 0.166667 0.000000
AAGGHNAVFNFPPNG 0.000000 0.333333
AAGIGILTV 0.909091 0.882353
AAGLQDCTMLV 0.000000 0.000000
AAGTAAQAAVVRFQE 0.666667 0.750000
AAGVPPADKYRTFVA 0.600000 0.000000
AAHARFVAA 1.000000 1.000000
AAIGLSMAGSSAMILAAYHP 1.000000 0.666667
AAKDASIPTATIRRH 0.000000 0.000000
AAKEDFLGCLVKEIP 0.760000 0.000000
AAKPAAAATATATAA 0.320000 0.000000
AALAAAAGVPPADKY 0.720000 0.125000
AALFYTHRFNASGCS 0.000000 1.000000
AALGLWLSV 1.000000 0.000000
AALGVATAAQITAGI 0.000000 1.000000
AALLVVAVGLRV 0.500000 1.000000
AALLVVAVGLRVVCAKYALA 0.800000 1.000000
AANIRALNVPPSLDCRY 0.000000 0.000000
AANKQKQELDEISTN 0.074074 0.333333
AANPHATFGV 1.000000 0.000000
AANWILRGTSFVYVP 0.769231 1.000000
AAPAAGYTPATPAAP 0.440000 0.000000
AAPANDKFTVFEAAF 0.920000 0.250000
AAPANPGLIIGA 0.400000 1.000000
AAPGAGYTPATPAAP 0.400000 0.000000
AAPLSWSKDIYNYME 0.880000 0.000000
AAQNRFTAIATTQQAGSNNL 1.000000 1.000000
AARDRFPGL 1.000000 1.000000
AARLFKAFILDGDKL 0.971429 1.000000
AARVTAILSSLTVTQLLRRL 1.000000 0.000000
AASGAATVAAGGYKV 0.360000 0.000000
AASGADGTYDITKLG 0.280000 0.000000
AASTLLYATV 1.000000 0.500000
AATAAAAAAVDRGDP 0.333333 0.000000
AATAVMAASASAQSVPASRQ 1.000000 1.000000
AATEVELKERKHRIEDAVRN 0.000000 0.000000
AATGAATAATGGYKV 0.400000 0.055556
AAVDLSHFL 0.000000 0.000000
AAVEELKAL 1.000000 0.000000
AAVGATPEAKFDSFV 0.640000 0.000000
AAVLFAATAAAAAAV 0.777778 0.000000
AAVLLLVTHY 1.000000 1.000000
... ...

9493 rows × 2 columns


In [458]:
dfb = df[both]

In [459]:
pylab.scatter(dfb.tcell, dfb.mhc)


Out[459]:
<matplotlib.collections.PathCollection at 0x116e91550>

In [460]:
heatmap, xedges, yedges = np.histogram2d(dfb.tcell, dfb.mhc, bins=10)
extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]

plt.clf()
plt.imshow(heatmap.T, extent=extent, origin='lower')
plt.colorbar()
plt.show()



In [461]:
plt.hexbin(dfb.tcell, dfb.mhc, gridsize=15)


Out[461]:
<matplotlib.collections.PolyCollection at 0x12a1d09d0>

In [393]:
x = iedb.load_tcell(min_count=5, hla_type = 1)
y = iedb.load_mhc(min_count=5, hla_type = 1)
df = pd.DataFrame({'x':x, 'y':y})
df.index.name = 'epitope'
both = ~(df.x.isnull() | df.y.isnull())
print "COUNT", both.sum()
dfb = df[both]


Class I MHC Entries 60202
Class II MHC Entries 95524
Human entries 133735
Human Class I MHCs 54432
Human Class II MHCs 65320
Dropping 3824 null sequences
Dropping 83 bad sequences
HLA A-2 count: 3709
Filtered sequences epitope sequences 54315
Class I MHC Entries 166713
Class II MHC Entries 72612
Human entries 239619
Human Class I MHCs 166708
Human Class II MHCs 72612
Dropping 294 null sequences
Dropping 45 bad sequences
HLA A-2 count: 2254
Filtered sequences epitope sequences 166669
COUNT 308

In [394]:
plt.scatter(dfb.x, dfb.y)


Out[394]:
<matplotlib.collections.PathCollection at 0x114cd6ed0>

In [395]:
plt.hexbin(dfb.x, dfb.y, gridsize=6)
plt.xlabel("t-cell response")
plt.ylabel("mhc binding")
plt.title("6+ samples, MHC I")
plt.colorbar()


Out[395]:
<matplotlib.colorbar.Colorbar instance at 0x10f7d27e8>

In [429]:
reload(iedb)
import iedb

In [430]:
tcell2 = iedb.load_tcell(min_count=2, key_by_allele=True)


Class I MHC Entries 60202
Class II MHC Entries 95524
Human entries 133735
Human Class I MHCs 54432
Human Class II MHCs 65320
Dropping 3824 null sequences
Dropping 83 bad sequences
Filtered sequences epitope sequences 132503

In [431]:
mhc2 = iedb.load_mhc(min_count=2, key_by_allele=True)


Class I MHC Entries 166713
Class II MHC Entries 72612
Human entries 239619
Human Class I MHCs 166708
Human Class II MHCs 72612
Dropping 294 null sequences
Dropping 45 bad sequences
Filtered sequences epitope sequences 239475

In [432]:
df_combined = pd.DataFrame({'mhc':mhc2, 'tcell':tcell2})
both = ~(df_combined.mhc.isnull() | df_combined.tcell.isnull())
print "COUNT", both.sum()
df_combined_filt = df_combined[both]


COUNT 369

In [433]:
#df_combined_filt.to_csv("mhc_vs_tcell_allele.csv")

In [434]:
df_combined_filt


Out[434]:
mhc tcell
Epitope Linear Sequence MHC Allele Name
AAGIGILTV HLA-A*02:01 0.900000 0.909091
AGFKGEQGPKGEP HLA-DR4 1.000000 1.000000
AIMDKNIIL HLA-A*02:01 1.000000 0.963636
ALFGIKLPAL HLA-A*02:01 1.000000 0.000000
ALMPLYACI HLA-A*02:01 1.000000 0.400000
ALNIALVAV HLA-A*02:01 1.000000 1.000000
ALPHIIDEV HLA-A*02:01 1.000000 0.636364
ALSTGLIHL HLA-A*02:01 1.000000 0.727273
ALWGFFPVL HLA-A*02:01 1.000000 1.000000
ALWGPDPAAA HLA-A*02:01 1.000000 0.909091
AMASTEGNV HLA-A*02:01 1.000000 1.000000
AMDSNTLEL HLA-A*02:01 1.000000 1.000000
AMPGVLSYV HLA-A*02:01 1.000000 1.000000
AMSTTDLEA HLA-A*02:01 0.750000 0.250000
APASSLLPAL HLA-B*07:02 1.000000 0.250000
ARKLLLDNL HLA-B*27:05 1.000000 1.000000
AVFDRKSDAK HLA-A*11:01 1.000000 0.866667
HLA-A11 1.000000 0.852941
AVYGNIKHK HLA-A*11:01 0.857143 1.000000
AVYNFATCGI HLA-A*02:01 1.000000 0.800000
CINGVCWTV HLA-A*02:01 0.600000 0.960000
CLFKDWEEL HLA-A*02:01 1.000000 0.750000
CLGGLLTMV HLA-A*02:01 1.000000 0.958333
HLA-A2 0.857143 0.913043
CPSQEPMSIYVY HLA-B*35:08 1.000000 1.000000
CTELKLSDY HLA-A*01:01 1.000000 0.800000
HLA-A1 1.000000 0.764706
CVNGVCWTV HLA-A*02:01 0.666667 0.863636
DLKPDNILL HLA-A*02:01 0.750000 0.666667
DLMGYIPLV HLA-A*02:01 1.000000 0.880952
DMWEHAFYL HLA-A*02:01 1.000000 0.888889
DSNIMNSINNVMDEIDFFEK HLA-DQA1*03:01/DQB1*03:02 1.000000 1.000000
DVMNILLQYVVKSFDRSTKV HLA-DRB1*04:01 1.000000 0.875000
EAAGIGILTV HLA-A*02:01 0.857143 1.000000
EENLLDFVRF HLA-B*44:05 1.000000 1.000000
EEVDMTPADALDDFD HLA-DQB1*03:02 1.000000 1.000000
ELAGIGILTV HLA-A*02:01 1.000000 0.981481
ELRSRYWAI HLA-B8 1.000000 0.925926
ENPVVHFFANIVTPR HLA-DRB1*15:01 1.000000 1.000000
ENPVVHFFKNIVTPR HLA-DR2 1.000000 0.971429
HLA-DRB1*15:01 1.000000 0.971429
EPLITKLIL HLA-B*07:02 0.666667 0.000000
EPLPQGQLTAY HLA-B*35:01 1.000000 0.923077
ETLLRAVESYLLAHS HLA-DRB1*01:01 1.000000 1.000000
HLA-DRB1*04:01 1.000000 1.000000
HLA-DRB1*15:01 1.000000 1.000000
EYLVSFGVW HLA-A*24:02 1.000000 0.538462
FAPGFFPYL HLA-A*02:01 1.000000 0.666667
FATGIGIITV HLA-A*02:01 1.000000 1.000000
FIDSYICQV HLA-A*02:01 1.000000 0.809524
FILGIIITV HLA-A*02:01 1.000000 0.900000
FIVVATAAV HLA-A*02:01 1.000000 0.250000
FLCKQYLNL HLA-A*02:01 1.000000 0.333333
FLFWFLKSGA HLA-A*02:01 1.000000 0.000000
FLIVSLCPT HLA-A*02:01 1.000000 0.750000
FLLLADARV HLA-A*02:01 1.000000 0.750000
FLLPLTSLV HLA-A*02:01 1.000000 0.400000
FLLPLTSLVI HLA-A*02:01 1.000000 0.200000
FLLSLGIHL HLA-A*02:01 1.000000 0.833333
FLLTRILTI HLA-A*02:01 1.000000 0.961538
... ...

369 rows × 2 columns


In [435]:
plt.hexbin(df_combined_filt.tcell, df_combined_filt.mhc, gridsize=6)


Out[435]:
<matplotlib.collections.PolyCollection at 0x1171d9710>

In [437]:
reload(iedb)
df_combined = iedb.load_tcell_vs_mhc(hla_type=1, min_count = 3, key_by_allele=True)
df_combined


Class I MHC Entries 166713
Class II MHC Entries 72612
Human entries 239619
Human Class I MHCs 166708
Human Class II MHCs 72612
Dropping 294 null sequences
Dropping 45 bad sequences
Filtered sequences epitope sequences 166669
Class I MHC Entries 60202
Class II MHC Entries 95524
Human entries 133735
Human Class I MHCs 54432
Human Class II MHCs 65320
Dropping 3824 null sequences
Dropping 83 bad sequences
Filtered sequences epitope sequences 54315
Out[437]:
mhc tcell
Epitope Linear Sequence MHC Allele Name
AAGIGILTV HLA-A*02:01 0.900000 0.909091
AIMDKNIIL HLA-A*02:01 1.000000 0.963636
ALMPLYACI HLA-A*02:01 1.000000 0.400000
ALNIALVAV HLA-A*02:01 1.000000 1.000000
ALSTGLIHL HLA-A*02:01 1.000000 0.727273
ALWGFFPVL HLA-A*02:01 1.000000 1.000000
ALWGPDPAAA HLA-A*02:01 1.000000 0.909091
AMASTEGNV HLA-A*02:01 1.000000 1.000000
AMPGVLSYV HLA-A*02:01 1.000000 1.000000
AMSTTDLEA HLA-A*02:01 0.750000 0.250000
AVYNFATCGI HLA-A*02:01 1.000000 0.800000
CINGVCWTV HLA-A*02:01 0.600000 0.960000
CLFKDWEEL HLA-A*02:01 1.000000 0.750000
CLGGLLTMV HLA-A*02:01 1.000000 0.958333
HLA-A2 0.857143 0.913043
CPSQEPMSIYVY HLA-B*35:08 1.000000 1.000000
CTELKLSDY HLA-A*01:01 1.000000 0.800000
HLA-A1 1.000000 0.764706
DLMGYIPLV HLA-A*02:01 1.000000 0.880952
EAAGIGILTV HLA-A*02:01 0.857143 1.000000
ELAGIGILTV HLA-A*02:01 1.000000 0.981481
ELRSRYWAI HLA-B8 1.000000 0.925926
EPLPQGQLTAY HLA-B*35:01 1.000000 0.923077
FIDSYICQV HLA-A*02:01 1.000000 0.809524
FILGIIITV HLA-A*02:01 1.000000 0.900000
FLIVSLCPT HLA-A*02:01 1.000000 0.750000
FLLLADARV HLA-A*02:01 1.000000 0.750000
FLLSLGIHL HLA-A*02:01 1.000000 0.833333
FLLTRILTI HLA-A*02:01 1.000000 0.961538
FLPIIFDAFL HLA-A*02:01 1.000000 0.250000
FLPSDFFPSI HLA-A*02:01 1.000000 0.769231
FLPSDFFPSV HLA-A*02:01 1.000000 0.915254
HLA-A*02:06 1.000000 0.500000
FLRGRAYGL HLA-B8 1.000000 0.990991
FLTSVINRV HLA-A*02:01 1.000000 0.888889
FLVIAINAM HLA-A*02:01 0.600000 0.250000
FMYSDFHFI HLA-A*02:01 1.000000 0.909091
FPTKDVAL HLA-B*35:08 1.000000 1.000000
FPYEGGKVF HLA-B*07:02 0.875000 0.750000
FTASLFLHL HLA-A*02:01 0.500000 0.428571
FVDTMSIYI HLA-A*02:01 1.000000 0.000000
FVDYNFTIV HLA-A*02:01 1.000000 1.000000
GILGFVFTL HLA-A*02:01 1.000000 0.978102
HLA-A2 0.833333 1.000000
GLCTLVAML HLA-A2 1.000000 0.978723
GLFDFVNFV HLA-A*02:01 1.000000 0.857143
GLLDRLYDL HLA-A*02:01 1.000000 0.250000
GLMWLSYFV HLA-A*02:01 0.857143 1.000000
GLNDYLHSV HLA-A*02:01 1.000000 0.888889
GLSPTVWLSV HLA-A*02:01 1.000000 0.875000
GLSRYVARL HLA-A*02:01 1.000000 0.857143
GLYSSTVPV HLA-A*02:01 1.000000 0.500000
GMSRIGMEV HLA-A*02:01 0.833333 1.000000
HLSLRGLPV HLA-A*02:01 0.600000 0.750000
HLVEALYLV HLA-A*02:01 1.000000 0.769231
HLYSHPIIL HLA-A*02:01 1.000000 0.800000
HMWNFISGI HLA-A*02:01 1.000000 0.750000
HVDGKILFV HLA-A*02:01 1.000000 0.800000
IIIPFIAYFV HLA-A*02:01 1.000000 1.000000
ILAGYGAGV HLA-A*02:01 1.000000 0.565217
... ...

160 rows × 2 columns


In [451]:
plt.hexbin(df_combined.tcell, df_combined.mhc, gridsize=10)


Out[451]:
<matplotlib.collections.PolyCollection at 0x12a11a1d0>

In [443]:
plt.hexbin?

In [ ]: