Using the Human Protein Atlas to annotate DX genes



In [1]:

    
import NotebookImport
import Protein_Atlas as PA









    




importing IPython notebook from Protein_Atlas



In [2]:

    
import DX_screen as Screen









    




importing IPython notebook from DX_screen






    




importing IPython notebook from Imports






    




importing IPython notebook from /cellar/users/agross/anaconda2/lib/python2.7/site-packages/MethylTools/Probe_Annotations



In [3]:

    
from Imports import *



In [4]:

    
# Per-gene 'Main location' annotation from PA.loc, re-keyed through
# PA.mapping, with missing annotations dropped and a single entry kept per key.
loc = PA.loc.set_index('Gene')['Main location']
loc.index = loc.index.map(lambda gene_id: PA.mapping[gene_id])
loc = loc.dropna().groupby(level=0).first()



In [5]:

    
# Shorthand for Screen.dx_rna.frac; passed as the second argument to anova()
# and violin_plot_pandas() in the cells that follow.
f = Screen.dx_rna.frac



In [6]:

    
# For every location label that occurs more than 5 times in `loc`, run anova()
# of `f` against the indicator "label appears in the gene's annotation".
# One result row per label; show the rows with the smallest 'p' first.
rr = pd.DataFrame({
    location: anova(loc.dropna().map(lambda annot: location in annot), f)
    for location in ti(loc.value_counts() > 5)
}).T
rr.sort('p').head()









    Out[6]:






  
    
      
      F
      p
    
  
  
    
      Endoplasmic reticulum
       22.92
       1.72e-06
    
    
      Vesicles
       17.69
       2.63e-05
    
    
      Nucleus
       16.01
       6.36e-05
    
    
      Plasma membrane
       14.92
       1.13e-04
    
    
      Nucleoli
       12.87
       3.36e-04



In [7]:

    
# Plot `f` split by whether 'Endoplasmic reticulum' appears in the gene's
# main-location annotation.
violin_plot_pandas(loc.map(lambda s: 'Endoplasmic reticulum' in s), f)



In [8]:

    
# Plot `f` split by whether 'Vesicles' appears in the gene's main-location
# annotation.
violin_plot_pandas(loc.map(lambda s: 'Vesicles' in s), f)

Differential Protein Expression



In [9]:

    
# Protein Atlas key for FOXM1: the first entry returned by ti() for the mask
# PA.mapping == 'FOXM1' (presumably ti() yields the index labels of the True
# entries -- TODO confirm).
g = ti(PA.mapping == 'FOXM1')[0]



In [10]:

    
# Bar chart of expression-level counts for gene `g`: value counts of `Level`
# over the normal-tissue rows next to the column sums of the tumor table.
o = ['Not detected','Low','Medium','High']
normal_df = PA.normals.query('Gene == "{}"'.format(g))
normal_exp = normal_df.Level.value_counts()
cancer_exp = PA.cancer.ix[g].sum()
d = pd.concat([normal_exp, cancer_exp], axis=1, keys=['normal','tumor'])
d.ix[o].plot(kind='bar')









    Out[10]:





<matplotlib.axes.AxesSubplot at 0x7f2ee32df290>



In [11]:

    
# Flatten PA.tn_map into a three-column frame: one row per (key, entry) pair,
# holding the key followed by the entry's first two fields.
tn_map = pd.DataFrame([
    [key, entry[0], entry[1]]
    for key, entries in PA.tn_map.iteritems()
    for entry in entries
])



In [12]:

    
# First row of tn_map. A later cell uses c[0] as the second key into PA.cancer
# and (c[1], c[2]) as the Tissue / Cell_type keys into the normal-tissue table.
c = tn_map.iloc[0]



In [13]:

    
# Ordinal code for each expression level, from weakest (0) to strongest (3).
cat_map = dict(
    (level, rank)
    for rank, level in enumerate(['Not detected', 'Low', 'Medium', 'High'])
)



In [14]:

    
# Pivot the normal-tissue table to one row per gene and one column per
# (Tissue, Cell type), holding the reported Level.
df = PA.normals.set_index(['Gene','Tissue','Cell type'])
df = df.Level.unstack('Gene').T
# Per gene, count how often each level occurs (levels never seen count as 0).
normal_counts = df.apply(pd.value_counts, axis=1).fillna(0)
# Per gene, add up the tumor table's level counts over all rows of that gene.
tumor_counts = PA.cancer.groupby(level='Gene').sum()



In [15]:

    
# Share of 'Not detected' calls per gene, in normal tissue (f1) and tumors (f2).
f1 = normal_counts['Not detected'] / normal_counts.sum(axis=1)
f2 = tumor_counts['Not detected'] / tumor_counts.sum(axis=1)
# Tumor-minus-normal difference, shifted from [-1, 1] onto [0, 1]. It is
# sorted before re-keying so that first() below picks from the sorted order.
f3 = (f2 - f1).order() / 2 + .5
f3.index = f3.index.map(lambda gene_id: PA.mapping[gene_id])
f3 = f3.groupby(level=0).first()
plot_regression(f3, Screen.dx_rna.frac, density=True, rad=.1)



In [127]:

    
# Side-by-side table of f3 (column 'a') and Screen.dx_rna.frac (column 'b'),
# sorted by 'a' with incomplete rows removed; show the first 10 rows
# where b < .2.
b = pd.concat([f3, Screen.dx_rna.frac], keys=['a','b'], axis=1)
b = b.sort('a').dropna()
b[b.b < .2].head(10)



In [128]:

    
# Share of 'High' calls per gene, in normal tissue (f1) and tumors (f2).
f1 = normal_counts['High'] / normal_counts.sum(axis=1)
f2 = tumor_counts['High'] / tumor_counts.sum(axis=1)
# Tumor-minus-normal difference, shifted from [-1, 1] onto [0, 1]. It is
# sorted before re-keying so that first() below picks from the sorted order.
f3 = (f2 - f1).order() / 2 + .5
f3.index = f3.index.map(lambda gene_id: PA.mapping[gene_id])
f3 = f3.groupby(level=0).first()
plot_regression(f3, Screen.dx_rna.frac, density=True, rad=.1)



In [131]:

    
# Side-by-side table of f3 (column 'a') and Screen.dx_rna.frac (column 'b'),
# sorted by 'a' with incomplete rows removed; show the last 10 rows
# where b > .9.
b = pd.concat([f3, Screen.dx_rna.frac], keys=['a','b'], axis=1)
b = b.sort('a').dropna()
b[b.b > .9].tail(10)



In [132]:

    
# Share of 'Medium' or 'High' calls per gene, in normal tissue (f1) and
# tumors (f2).
f1 = normal_counts[['Medium','High']].sum(axis=1) / normal_counts.sum(axis=1)
f2 = tumor_counts[['Medium','High']].sum(axis=1) / tumor_counts.sum(axis=1)
# Tumor-minus-normal difference, shifted from [-1, 1] onto [0, 1]. It is
# sorted before re-keying so that first() below picks from the sorted order.
f3 = (f2 - f1).order() / 2 + .5
f3.index = f3.index.map(lambda gene_id: PA.mapping[gene_id])
f3 = f3.groupby(level=0).first()
plot_regression(f3, Screen.dx_rna.frac, density=True, rad=.1)



In [137]:

    
# Side-by-side table of f3 (column 'a') and Screen.dx_rna.frac (column 'b'),
# sorted by 'a' with incomplete rows removed; show the first 10 rows
# where b < .2.
b = pd.concat([f3, Screen.dx_rna.frac], keys=['a','b'], axis=1)
b = b.sort('a').dropna()
b[b.b < .2].head(10)



In [135]:

    
# Side-by-side table of f3 (column 'a') and Screen.dx_rna.frac (column 'b'),
# sorted by 'a' with incomplete rows removed; show the last 10 rows
# where b > .8.
b = pd.concat([f3, Screen.dx_rna.frac], keys=['a','b'], axis=1)
b = b.sort('a').dropna()
b[b.b > .8].tail(10)



In [138]:

    
def un_vc(vc):
    """Undo a value-count: repeat each index label as many times as its count.

    Parameters
    ----------
    vc : pandas.Series
        Counts indexed by label. Counts may be stored as floats (for example
        after ``fillna(0)``); entries that are not greater than zero,
        including NaN, contribute nothing.

    Returns
    -------
    pandas.Series
        One element per counted observation, in the order of ``vc``'s index.
    """
    expanded = []
    # zip(index, values) yields the same (label, count) pairs as iteritems()
    # without depending on a method that newer pandas releases removed.
    for label, count in zip(vc.index, vc.values):
        if count > 0:
            # A list can only be repeated by an integer, so cast float counts.
            expanded.extend([label] * int(count))
    return pd.Series(expanded)



In [151]:

    
# Expand the per-gene level counts back into individual observations (one
# column per gene after the transpose) and recode the level names to the
# ordinal values in cat_map.
normal_us = normal_counts.apply(un_vc, axis=1).T.replace(cat_map)
tumor_us = tumor_counts.apply(un_vc, axis=1).T.replace(cat_map)



In [365]:

    
# Per-gene Mann-Whitney U test of the ordinal expression levels in normal
# tissue against those in tumors. Resulting columns:
#   0 -- True when the mean normal level is below the mean tumor level
#   1 -- p-value returned by stats.mannwhitneyu
mtest = {}
mtest_skipped = []  # genes for which the test could not be computed
for g in normal_us.columns:
    try:
        a,b = normal_us[g].dropna(), tumor_us[g].dropna()
        d = pd.concat([a,b], keys=[0,1])
        v = d.groupby(level=0).mean()
        sign = v[0] < v[1]
        r = stats.mannwhitneyu(a, b)
        mtest[g] = (sign, r[1])
    except Exception:
        # Best-effort screen: a gene that fails (for example because it is
        # missing from tumor_us) is skipped, but it is recorded rather than
        # silently dropped. Catching Exception instead of using a bare except
        # keeps Ctrl-C / kernel interrupts working during the loop.
        mtest_skipped.append(g)
mtest = pd.DataFrame(mtest).T
mtest = mtest.sort(1)



In [366]:

    
# Attach the gene symbol from PA.mapping, re-index by it, add the screen's
# `frac` column, drop incomplete rows and sort by column 1 (the p-value
# stored by the Mann-Whitney cell).
dd = mtest.join(PA.mapping).set_index('Gene')
dd = dd.join(Screen.dx_rna.frac).dropna().sort(1)
# Keep one row per gene symbol: the first in the sorted order.
dd = dd.groupby(level=0).first()



In [381]:

    
# Flag genes with corrected p-value (column 1) below .01, frac above .7, and
# column 0 set (mean normal level below mean tumor level), then count them.
# NOTE(review): bhCorrection is presumably a Benjamini-Hochberg adjustment --
# confirm against its definition.
v = (bhCorrection(dd[1]) < .01) & (dd.frac > .7) & (dd[0] == 1)
v.value_counts()









    Out[381]:





False    13965
True       208
dtype: int64



In [382]:

    
# Screen the flag vector `v` against the rows of Screen.gs2.T using
# chi2_cont_test and show the first five result rows.
rr = screen_feature(v, chi2_cont_test, Screen.gs2.T)
rr.head()









    Out[382]:






  
    
      
      chi2
      p
      dof
      q
    
  
  
    
      REACTOME_CELL_CYCLE_MITOTIC
       104.98
       1.23e-24
       1
       1.02e-21
    
    
      REACTOME_DNA_STRAND_ELONGATION
        91.75
       9.82e-22
       1
       4.08e-19
    
    
      KEGG_DNA_REPLICATION
        84.83
       3.26e-20
       1
       9.04e-18
    
    
      REACTOME_SYNTHESIS_OF_DNA
        79.28
       5.39e-19
       1
       1.02e-16
    
    
      REACTOME_S_PHASE
        79.02
       6.14e-19
       1
       1.02e-16



In [450]:

    
# frac values, sorted ascending, for genes that pass the corrected-p cutoff
# with column 0 set and that are also in REACTOME_TELOMERE_MAINTENANCE.
# NOTE(review): the cutoff literal 10e-7 equals 1e-6 -- confirm that is the
# intended threshold.
(dd.frac
   .ix[ti((bhCorrection(dd[1]) < 10e-7) & (dd[0] == 1))]
   .ix[ti(Screen.gs2['REACTOME_TELOMERE_MAINTENANCE']>0)]
   .dropna()
   .order())









    Out[450]:





HIST1H4C    0.59
HIST1H4D    0.66
HIST1H4I    0.68
HIST1H4H    0.69
HIST2H4A    0.69
HIST1H4A    0.72
HIST1H4E    0.73
HIST1H4J    0.77
LIG1        0.81
PCNA        0.81
FEN1        0.92
Name: frac, dtype: float64



In [454]:

    
# Gene symbols of the telomere-maintenance hits, as a plain list. The
# selection is recomputed here instead of being read back from the Out[...]
# history, whose numbering only exists in the session that produced it.
# NOTE(review): the cutoff literal 10e-7 equals 1e-6.
list(dd.frac.ix[ti((bhCorrection(dd[1]) < 10e-7) & (dd[0] == 1))]
     .ix[ti(Screen.gs2['REACTOME_TELOMERE_MAINTENANCE']>0)]
     .dropna().order().index)









    Out[454]:





['HIST1H4C',
 'HIST1H4D',
 'HIST1H4I',
 'HIST1H4H',
 'HIST2H4A',
 'HIST1H4A',
 'HIST1H4E',
 'HIST1H4J',
 'LIG1',
 'PCNA',
 'FEN1']



In [451]:

    
# Bar chart for HIST1H4J: share of each expression level among the
# normal-tissue rows next to its share among the summed tumor counts.
g = ti(PA.mapping == 'HIST1H4J')[0]
o = ['Not detected','Low','Medium','High']
normal_df = PA.normals.query('Gene == "{}"'.format(g))
normal_exp = normal_df.Level.value_counts()
cancer_exp = PA.cancer.ix[g].sum()
d = pd.concat([normal_exp, cancer_exp], axis=1, keys=['normal','tumor'])
# Normalise each column so that it sums to 1.
d = d / d.sum()
d.ix[o].plot(kind='bar')









    Out[451]:





<matplotlib.axes.AxesSubplot at 0x7f846e72bdd0>



In [439]:

    
# Plot frac by column 0 of dd (the normal-below-tumor flag), restricted to
# genes whose corrected p-value is below 10e-5 (i.e. 1e-4).
violin_plot_pandas(dd[0], dd.frac.ix[ti((bhCorrection(dd[1]) < 10e-5))])



In [432]:

    
# Plot frac by REACTOME_TELOMERE_MAINTENANCE membership, restricted to genes
# with corrected p-value below .01 whose column-0 flag is 0.
violin_plot_pandas(Screen.gs2['REACTOME_TELOMERE_MAINTENANCE'],
                   dd.frac.ix[ti((bhCorrection(dd[1]) < .01) & (dd[0] == 0))])



In [312]:

    
# Plot frac by REACTOME_TELOMERE_MAINTENANCE membership, restricted to genes
# with corrected p-value below .01 (either direction).
violin_plot_pandas(Screen.gs2['REACTOME_TELOMERE_MAINTENANCE'],
                   dd.frac.ix[ti((bhCorrection(dd[1]) < .01))])



In [402]:

    
# Flag genes with corrected p-value (column 1) below .01, frac below .25, and
# column 0 unset (mean normal level not below mean tumor level), then count
# them.
v = (bhCorrection(dd[1]) < .01) & (dd.frac < .25) & (dd[0] == 0)
v.value_counts()









    Out[402]:





False    13727
True       446
dtype: int64



In [403]:

    
# Screen the flag vector `v` (reduced to one entry per index label) against
# the rows of Screen.gs2.T using chi2_cont_test; show the first five rows.
rr = screen_feature(v.groupby(level=0).first(), chi2_cont_test, Screen.gs2.T)
rr.head()









    Out[403]:






  
    
      
      chi2
      p
      dof
      q
    
  
  
    
      KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION
       27.60
       1.49e-07
       1
       1.24e-04
    
    
      REACTOME_ETHANOL_OXIDATION
       20.74
       5.25e-06
       1
       2.19e-03
    
    
      REACTOME_PHASE_1_FUNCTIONALIZATION_OF_COMPOUNDS
       16.52
       4.82e-05
       1
       1.34e-02
    
    
      KEGG_TRYPTOPHAN_METABOLISM
       11.38
       7.42e-04
       1
       1.54e-01
    
    
      REACTOME_G_ALPHA_S_SIGNALLING_EVENTS
       10.50
       1.19e-03
       1
       1.79e-01



In [408]:

    
# Plot frac by REACTOME_ETHANOL_OXIDATION membership, restricted to genes
# with corrected p-value below .01 whose column-0 flag is 0.
violin_plot_pandas(Screen.gs2['REACTOME_ETHANOL_OXIDATION'],
                   dd.frac.ix[ti((bhCorrection(dd[1]) < .01) & (dd[0] == 0))])



In [316]:

    
# Genes whose frac lies more than .4 away from .5 (below .1 or above .9),
# sorted by column 1 (the p-value); show the top 20.
dd[(dd.frac - .5).abs() > .4].sort(1).head(20)



In [331]:

    
# Plot the ASPM row of Screen.matched_rna with paired_bp_tn_split.
# NOTE(review): `codes` is not defined in this notebook's visible cells;
# presumably it comes from `from Imports import *` -- confirm.
paired_bp_tn_split(Screen.matched_rna.ix['ASPM'], codes)



In [332]:

    
# Normal-tissue rows for the gene currently held in `g`, then only its lung
# entries. NOTE(review): `g` is whatever an earlier cell last assigned, so
# this result depends on execution order.
nn = PA.normals.query('Gene == "{}"'.format(g))
nn.query('Tissue == "lung"')









    Out[332]:






  
    
      
      Gene
      Tissue
      Cell type
      Level
      Reliability
    
  
  
    
      231020
       ENSG00000105976
       lung
       macrophages
       High
       Uncertain
    
    
      231021
       ENSG00000105976
       lung
       pneumocytes
        Low
       Uncertain



In [333]:

    
# Tumor level-count table (one row per tumor type) for the gene held in `g`.
PA.cancer.ix[g]









    Out[333]:






  
    
      Level
      High
      Low
      Medium
      Not detected
    
    
      Tumor
      
      
      
      
    
  
  
    
      breast cancer
       0
       0
       12
       0
    
    
      carcinoid
       0
       2
        2
       0
    
    
      cervical cancer
       0
       5
        6
       1
    
    
      colorectal cancer
       1
       0
       11
       0
    
    
      endometrial cancer
       0
       0
       12
       0
    
    
      glioma
       0
       5
        7
       0
    
    
      head and neck cancer
       0
       0
        4
       0
    
    
      liver cancer
       1
       0
       11
       0
    
    
      lung cancer
       1
       3
        5
       1
    
    
      lymphoma
       0
       2
       10
       0
    
    
      melanoma
       0
       2
       10
       0
    
    
      ovarian cancer
       0
       3
        8
       0
    
    
      pancreatic cancer
       0
       0
       12
       0
    
    
      prostate cancer
       1
       0
       10
       0
    
    
      renal cancer
       0
       4
        6
       1
    
    
      skin cancer
       0
       8
        3
       1
    
    
      stomach cancer
       1
       2
        9
       0
    
    
      testis cancer
       1
       2
        9
       0
    
    
      thyroid cancer
       0
       0
        4
       0
    
    
      urothelial cancer
       1
       1
       10
       0



In [334]:

    
# Bar chart for ASPM: share of each expression level among the normal-tissue
# rows next to its share among the summed tumor counts.
g = ti(PA.mapping == 'ASPM')[0]
o = ['Not detected','Low','Medium','High']
normal_df = PA.normals.query('Gene == "{}"'.format(g))
normal_exp = normal_df.Level.value_counts()
cancer_exp = PA.cancer.ix[g].sum()
d = pd.concat([normal_exp, cancer_exp], axis=1, keys=['normal','tumor'])
# Normalise each column so that it sums to 1.
d = d / d.sum()
d.ix[o].plot(kind='bar')









    Out[334]:





<matplotlib.axes.AxesSubplot at 0x7f846e7a2150>



In [317]:

    
# Plot the TGFBR3 row of Screen.matched_rna with paired_bp_tn_split.
# NOTE(review): `codes` is not defined in this notebook's visible cells;
# presumably it comes from `from Imports import *` -- confirm.
paired_bp_tn_split(Screen.matched_rna.ix['TGFBR3'], codes)



In [318]:

    
# Bar chart for TGFBR3: share of each expression level among the
# normal-tissue rows next to its share among the summed tumor counts.
g = ti(PA.mapping == 'TGFBR3')[0]
o = ['Not detected','Low','Medium','High']
normal_df = PA.normals.query('Gene == "{}"'.format(g))
normal_exp = normal_df.Level.value_counts()
cancer_exp = PA.cancer.ix[g].sum()
d = pd.concat([normal_exp, cancer_exp], axis=1, keys=['normal','tumor'])
# Normalise each column so that it sums to 1.
d = d / d.sum()
d.ix[o].plot(kind='bar')









    Out[318]:





<matplotlib.axes.AxesSubplot at 0x7f846e8a7c50>



In [37]:

    
# For every Protein Atlas gene that maps onto a gene in the DX screen, compare
# the normal expression level at the tissue / cell type named by `c`
# (c[1], c[2]) with the per-sample levels of tumor type c[0].
# Resulting columns of `rr`, indexed by mapped gene symbol:
#   0 -- normal level (ordinal code from cat_map)
#   1 -- number of tumor samples at the same level
#   2 -- number of tumor samples above the normal level
#   3 -- number of tumor samples below the normal level
#   f -- column 2 / (column 2 + column 3)
df = PA.normals.rename(columns=lambda s: s.replace(' ','_'))
df = df.set_index(['Gene','Tissue','Cell_type'])

rr = {}
for g in ti(PA.mapping.isin(Screen.dx_rna.index)):
    try:
        nn = df.ix[(g, c[1], c[2])]
        tt = PA.cancer.ix[(g, c[0])]
        # Expand the tumor level counts into one entry per sample, then
        # recode the level names to their ordinal values.
        tt = pd.Series([j for i,v in tt.iteritems() if v > 0 for j in [i]*v])
        tt = tt.map(cat_map)
        normal_level = cat_map[nn['Level']]
        v = (normal_level, (tt == normal_level).sum(), 
             (tt > normal_level).sum(), (tt < normal_level).sum())
        rr[PA.mapping[g]] = v
    except Exception:
        # Best-effort: genes without a usable normal or tumor entry for this
        # tissue are skipped. Catching Exception instead of using a bare
        # except keeps Ctrl-C / kernel interrupts working during the loop.
        continue
rr = pd.DataFrame(rr).T
rr['f'] = rr[2] / (rr[2] + rr[3])
rr = rr.replace(np.inf, 1)



In [38]:

    
# Plot rr.f (share of differing tumor samples that lie above the normal
# level) grouped by the normal level in column 0.
violin_plot_pandas(rr[0], rr.f)



In [39]:

    
# f1: more than one tumor sample differs from the normal level (above+below).
# f2: the normal level is 1 or 2 ('Low' or 'Medium' in cat_map).
# NOTE: this rebinds f1/f2, which earlier cells used for level fractions.
f1 = (rr[[2,3]].sum(1) > 1)
f2 = rr[0].isin([1,2])



In [ ]:



In [62]:

    
# Share of tumor samples above the normal level among all tumor samples
# (same + above + below), kept only for genes whose normal level is 0
# ('Not detected' in cat_map).
f3 = rr[2] / rr[[1,2,3]].sum(1)
f3 = f3[rr[0] == 0]



In [63]:

    
# Bin f3 into three groups and plot Screen.dx_rna.frac for each:
#   -1 when f3 <= .2, 0 when .2 < f3 < .8, +1 when f3 >= .8
# (the difference of the two indicator terms yields exactly these codes).
violin_plot_pandas(1.*(f3>.2) - 1.*(f3 < .8), Screen.dx_rna.frac, order=[-1,0,1])



In [40]:

    
# Histogram of rr.f for the genes that pass both masks f1 and f2.
v = rr[f1 & f2]['f']
v.hist()









    Out[40]:





<matplotlib.axes.AxesSubplot at 0x7f8415db36d0>

	a	b
LIFR	0.27	0.14
RASL11A	0.31	0.18
PKNOX2	0.35	0.13
CES1	0.35	0.18
ANAPC16	0.36	0.18
LRRN3	0.37	0.19
XPA	0.37	0.17
C14orf159	0.37	0.15
SCN2B	0.37	0.14
CRBN	0.37	0.15

	a	b
SPC25	0.54	0.91
FOXM1	0.57	0.95
ANLN	0.58	0.93
CDC20	0.58	0.90
AURKB	0.59	0.91
IQGAP3	0.59	0.95
EZH2	0.66	0.91
MKI67	0.66	0.92
FEN1	0.67	0.92
TOP2A	0.75	0.93

	a	b
CBX7	0.19	0.15
DNASE1L3	0.21	0.11
LRRN3	0.24	0.19
THRB	0.25	0.20
THRA	0.25	0.20
BEX1	0.27	0.19
RNF11	0.28	0.19
TACC1	0.29	0.18
TXNIP	0.31	0.17
PI16	0.31	0.09

	a	b
TYMS	0.69	0.85
PCNA	0.69	0.81
CDC20	0.69	0.90
EZH2	0.69	0.91
TCOF1	0.70	0.82
ANLN	0.70	0.93
ATAD2	0.71	0.88
CDK1	0.73	0.86
KIF4B	0.76	0.85
TOP2A	0.78	0.93

	0	1	frac
TOP2A	2117.5	3.96e-26	0.93
TGFBR3	4385.0	7.77e-20	0.08
GCOM1	4624.0	2.31e-18	0.09
CKAP2L	3300.5	8.01e-16	0.92
CDC20	3880.5	2.86e-14	0.90
EZH2	3918.0	2.60e-13	0.91
NCAPH	4060.0	3.21e-13	0.92
MKI67	4185.0	2.33e-12	0.92
NDRG2	4086.0	5.68e-12	0.09
ANLN	4422.0	3.91e-11	0.93
BIRC5	4811.5	8.87e-11	0.92
FEN1	4779.0	9.86e-10	0.92
PI16	4591.0	1.10e-09	0.09
SYNE1	4514.0	1.43e-09	0.09
SPC25	4870.5	2.44e-09	0.91
DLGAP5	4873.5	7.21e-08	0.92
CHRDL1	5534.0	1.64e-07	0.07
TPX2	5306.5	1.78e-07	0.94
ASPM	5106.5	3.70e-07	0.92
GPM6A	7292.5	1.49e-06	0.09

	F	p
Endoplasmic reticulum	22.92	1.72e-06
Vesicles	17.69	2.63e-05
Nucleus	16.01	6.36e-05
Plasma membrane	14.92	1.13e-04
Nucleoli	12.87	3.36e-04

	chi2	p	dof	q
REACTOME_CELL_CYCLE_MITOTIC	104.98	1.23e-24	1	1.02e-21
REACTOME_DNA_STRAND_ELONGATION	91.75	9.82e-22	1	4.08e-19
KEGG_DNA_REPLICATION	84.83	3.26e-20	1	9.04e-18
REACTOME_SYNTHESIS_OF_DNA	79.28	5.39e-19	1	1.02e-16
REACTOME_S_PHASE	79.02	6.14e-19	1	1.02e-16

	chi2	p	dof	q
KEGG_NEUROACTIVE_LIGAND_RECEPTOR_INTERACTION	27.60	1.49e-07	1	1.24e-04
REACTOME_ETHANOL_OXIDATION	20.74	5.25e-06	1	2.19e-03
REACTOME_PHASE_1_FUNCTIONALIZATION_OF_COMPOUNDS	16.52	4.82e-05	1	1.34e-02
KEGG_TRYPTOPHAN_METABOLISM	11.38	7.42e-04	1	1.54e-01
REACTOME_G_ALPHA_S_SIGNALLING_EVENTS	10.50	1.19e-03	1	1.79e-01

	Gene	Tissue	Cell type	Level	Reliability
231020	ENSG00000105976	lung	macrophages	High	Uncertain
231021	ENSG00000105976	lung	pneumocytes	Low	Uncertain

Level	High	Low	Medium	Not detected
Tumor
breast cancer	0	0	12	0
carcinoid	0	2	2	0
cervical cancer	0	5	6	1
colorectal cancer	1	0	11	0
endometrial cancer	0	0	12	0
glioma	0	5	7	0
head and neck cancer	0	0	4	0
liver cancer	1	0	11	0
lung cancer	1	3	5	1
lymphoma	0	2	10	0
melanoma	0	2	10	0
ovarian cancer	0	3	8	0
pancreatic cancer	0	0	12	0
prostate cancer	1	0	10	0
renal cancer	0	4	6	1
skin cancer	0	8	3	1
stomach cancer	1	2	9	0
testis cancer	1	2	9	0
thyroid cancer	0	0	4	0
urothelial cancer	1	1	10	0