In [1]:
import NotebookImport
import Protein_Atlas as PA
In [2]:
import DX_screen as Screen
In [3]:
from Imports import *
In [4]:
# Re-key the 'Main location' column of PA.loc through PA.mapping, drop missing
# locations, and keep the first entry for every mapped key.
gene_locations = PA.loc.set_index('Gene')['Main location']
gene_locations.index = gene_locations.index.map(lambda gene_id: PA.mapping[gene_id])
loc = gene_locations.dropna().groupby(level=0).first()
In [5]:
# Shorthand for Screen.dx_rna.frac; the cells below pass it as the second
# argument to anova() and violin_plot_pandas().
f = Screen.dx_rna.frac
In [6]:
# Run anova() of f against membership in each location label that occurs more
# than 5 times in `loc`, then show the rows with the smallest 'p'.
# (`ti` presumably returns the index labels where the mask is True -- confirm.)
frequent_labels = ti(loc.value_counts() > 5)
known_locations = loc.dropna()
anova_by_label = {}
for label in frequent_labels:
    has_label = known_locations.map(lambda text: label in text)
    anova_by_label[label] = anova(has_label, f)
rr = pd.DataFrame(anova_by_label).T
rr.sort('p').head()
Out[6]:
In [7]:
# Split genes by whether their location string mentions the endoplasmic reticulum.
in_er = loc.map(lambda location: 'Endoplasmic reticulum' in location)
violin_plot_pandas(in_er, f)
In [8]:
# Split genes by whether their location string mentions vesicles.
in_vesicles = loc.map(lambda location: 'Vesicles' in location)
violin_plot_pandas(in_vesicles, f)
In [9]:
# First PA.mapping key whose mapped value is 'FOXM1'.
foxm1_keys = ti(PA.mapping == 'FOXM1')
g = foxm1_keys[0]
In [10]:
# Bar plot of staining levels for gene `g` in normal tissue versus tumors.
# Tumor counts: PA.cancer rows for g summed per level column.
cancer_exp = PA.cancer.ix[g].sum()
# Normal counts: how often each Level value occurs for g in PA.normals.
normal_df = PA.normals.query('Gene == "{}"'.format(g))
normal_exp = normal_df.Level.value_counts()
d = pd.concat([normal_exp, cancer_exp], keys=['normal','tumor'], axis=1)
# Normalise each column to fractions, as the other per-gene bar-plot cells in
# this notebook do; the two columns have different totals, so raw counts are
# not comparable side by side.
d = d / d.sum()
# Fixed display order, from weakest to strongest staining.
o = ['Not detected','Low','Medium','High']
d.ix[o].plot(kind='bar')
Out[10]:
In [11]:
# Flatten PA.tn_map (key -> iterable of pairs) into one row per (key, pair).
rows = []
for key, pairs in PA.tn_map.iteritems():
    for pair in pairs:
        rows.append([key, pair[0], pair[1]])
tn_map = pd.DataFrame(rows)
In [12]:
# Work with the first row of tn_map only. A later cell uses c[0] as the second
# key into PA.cancer and c[1], c[2] as the Tissue / Cell_type keys into PA.normals.
c = tn_map.iloc[0]
In [13]:
# Ordinal code for each staining level, in increasing order of intensity.
level_order = ['Not detected', 'Low', 'Medium', 'High']
cat_map = dict((level, rank) for rank, level in enumerate(level_order))
In [14]:
# Gene-by-(Tissue, Cell type) table of the Level recorded in PA.normals.
level_by_sample = PA.normals.set_index(['Gene','Tissue','Cell type'])['Level']
df = level_by_sample.unstack('Gene').T
# Per gene: how many normal entries fall in each level (absent levels become 0).
normal_counts = df.apply(pd.value_counts, axis=1).fillna(0)
# Per gene: PA.cancer columns summed over all rows sharing the 'Gene' index level.
tumor_counts = PA.cancer.groupby(level='Gene').sum()
In [15]:
# Share of 'Not detected' calls per gene, in normals and in tumors.
normal_frac = normal_counts['Not detected'] / normal_counts.sum(axis=1)
tumor_frac = tumor_counts['Not detected'] / tumor_counts.sum(axis=1)
# Sorted tumor-minus-normal difference, shifted from [-1, 1] onto [0, 1].
delta = (tumor_frac - normal_frac).order()
f3 = delta / 2 + .5
# Re-key through PA.mapping; for duplicated keys keep the first (smallest) value.
f3.index = f3.index.map(lambda gene_id: PA.mapping[gene_id])
f3 = f3.groupby(level=0).first()
plot_regression(f3, Screen.dx_rna.frac, density=True, rad=.1)
In [127]:
# Side-by-side table: 'a' = f3, 'b' = Screen.dx_rna.frac, sorted by 'a'.
paired = pd.concat([f3, Screen.dx_rna.frac], axis=1, keys=['a','b'])
b = paired.sort('a').dropna()
# Rows with the smallest 'a' among those with 'b' below .2.
b[b['b'] < .2].head(10)
Out[127]:
In [128]:
# Share of 'High' calls per gene, in normals and in tumors.
normal_frac = normal_counts['High'] / normal_counts.sum(axis=1)
tumor_frac = tumor_counts['High'] / tumor_counts.sum(axis=1)
# Sorted tumor-minus-normal difference, shifted from [-1, 1] onto [0, 1].
delta = (tumor_frac - normal_frac).order()
f3 = delta / 2 + .5
# Re-key through PA.mapping; for duplicated keys keep the first (smallest) value.
f3.index = f3.index.map(lambda gene_id: PA.mapping[gene_id])
f3 = f3.groupby(level=0).first()
plot_regression(f3, Screen.dx_rna.frac, density=True, rad=.1)
In [131]:
# Side-by-side table: 'a' = f3, 'b' = Screen.dx_rna.frac, sorted by 'a'.
paired = pd.concat([f3, Screen.dx_rna.frac], axis=1, keys=['a','b'])
b = paired.sort('a').dropna()
# Rows with the largest 'a' among those with 'b' above .9.
b[b['b'] > .9].tail(10)
Out[131]:
In [132]:
# Share of 'Medium' or 'High' calls per gene, in normals and in tumors.
strong_levels = ['Medium','High']
normal_frac = normal_counts[strong_levels].sum(axis=1) / normal_counts.sum(axis=1)
tumor_frac = tumor_counts[strong_levels].sum(axis=1) / tumor_counts.sum(axis=1)
# Sorted tumor-minus-normal difference, shifted from [-1, 1] onto [0, 1].
delta = (tumor_frac - normal_frac).order()
f3 = delta / 2 + .5
# Re-key through PA.mapping; for duplicated keys keep the first (smallest) value.
f3.index = f3.index.map(lambda gene_id: PA.mapping[gene_id])
f3 = f3.groupby(level=0).first()
plot_regression(f3, Screen.dx_rna.frac, density=True, rad=.1)
In [137]:
# Side-by-side table: 'a' = f3, 'b' = Screen.dx_rna.frac, sorted by 'a'.
paired = pd.concat([f3, Screen.dx_rna.frac], axis=1, keys=['a','b'])
b = paired.sort('a').dropna()
# Rows with the smallest 'a' among those with 'b' below .2.
b[b['b'] < .2].head(10)
Out[137]:
In [135]:
# Side-by-side table: 'a' = f3, 'b' = Screen.dx_rna.frac, sorted by 'a'.
paired = pd.concat([f3, Screen.dx_rna.frac], axis=1, keys=['a','b'])
b = paired.sort('a').dropna()
# Rows with the largest 'a' among those with 'b' above .8.
b[b['b'] > .8].tail(10)
Out[135]:
In [138]:
def un_vc(vc):
    """Undo a value-counts: expand a label -> count Series into observations.

    Parameters
    ----------
    vc : pandas.Series
        Index holds the labels, values hold how often each label occurred.
        Counts that are not greater than zero (including NaN) are skipped.

    Returns
    -------
    pandas.Series
        Each label repeated `count` times, in the index order of `vc`,
        with a fresh 0..n-1 index.
    """
    expanded = []
    # zip(index, values) avoids Series.iteritems()/items(), whose availability
    # differs between pandas versions.
    for label, count in zip(vc.index, vc.values):
        if count > 0:
            # Counts arrive as floats after fillna(0); list repetition needs an
            # int, so cast explicitly (fractional counts would be truncated).
            expanded.extend([label] * int(count))
    return pd.Series(expanded)
In [151]:
# Expand the per-gene level counts back into individual observations (one
# column per row of the count tables) and encode the levels with cat_map.
normal_us = normal_counts.apply(un_vc, axis=1).T.replace(cat_map)
tumor_us = tumor_counts.apply(un_vc, axis=1).T.replace(cat_map)
In [365]:
# Per-column Mann-Whitney U test of normal versus tumor staining levels.
# Result: DataFrame indexed by column label of normal_us with
#   column 0 -> True when the tumor mean level exceeds the normal mean level
#   column 1 -> p-value reported by stats.mannwhitneyu
# sorted by p-value. Columns that cannot be tested are recorded in
# `mtest_skipped` (label -> reason) instead of being silently discarded.
mtest = {}
mtest_skipped = {}
for gene in normal_us.columns:
    try:
        normal_vals = normal_us[gene].dropna()
        tumor_vals = tumor_us[gene].dropna()
        if len(normal_vals) == 0 or len(tumor_vals) == 0:
            mtest_skipped[gene] = 'no observations in one of the groups'
            continue
        sign = normal_vals.mean() < tumor_vals.mean()
        p_value = stats.mannwhitneyu(normal_vals, tumor_vals)[1]
        mtest[gene] = (sign, p_value)
    except Exception as err:
        # Best-effort screen: keep going, but remember why this column was
        # dropped. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        mtest_skipped[gene] = err
mtest = pd.DataFrame(mtest).T
mtest = mtest.sort(1)
In [366]:
# Attach PA.mapping to the test results and re-index by the resulting 'Gene'
# column, add Screen.dx_rna.frac, drop incomplete rows, and for duplicated
# genes keep the row with the smallest value in column 1.
by_gene = mtest.join(PA.mapping).set_index('Gene')
with_frac = by_gene.join(Screen.dx_rna.frac).dropna()
dd = with_frac.sort(1).groupby(level=0).first()
In [381]:
# Flag genes whose corrected value of column 1 is below .01, whose frac is
# above .7 and whose sign flag (column 0) equals 1.
passes_correction = bhCorrection(dd[1]) < .01
high_frac = dd.frac > .7
sign_is_one = dd[0] == 1
v = passes_correction & high_frac & sign_is_one
v.value_counts()
Out[381]:
In [382]:
# Screen the flag vector `v` against the transposed Screen.gs2 table using
# chi2_cont_test, and preview the result.
gs2_transposed = Screen.gs2.T
rr = screen_feature(v, chi2_cont_test, gs2_transposed)
rr.head()
Out[382]:
In [450]:
# frac values, ascending, for genes that pass the corrected threshold
# (10e-7, i.e. 1e-6) with sign flag 1 and have a positive entry in the
# REACTOME_TELOMERE_MAINTENANCE column of Screen.gs2.
strongly_flagged = ti((bhCorrection(dd[1]) < 10e-7) & (dd[0] == 1))
in_telomere_set = ti(Screen.gs2['REACTOME_TELOMERE_MAINTENANCE'] > 0)
dd.frac.ix[strongly_flagged].ix[in_telomere_set].dropna().order()
Out[450]:
In [454]:
# List the gene labels of the telomere-maintenance selection shown in the
# previous cell. The selection is recomputed here under a name instead of being
# read from the IPython output cache (Out[450]), which only exists if that
# exact execution count was reached in the current kernel session.
strongly_flagged = ti((bhCorrection(dd[1]) < 10e-7) & (dd[0] == 1))
in_telomere_set = ti(Screen.gs2['REACTOME_TELOMERE_MAINTENANCE'] > 0)
telomere_hits = dd.frac.ix[strongly_flagged].ix[in_telomere_set].dropna().order()
list(telomere_hits.index)
Out[454]:
In [451]:
# Bar plot of the staining-level distribution for HIST1H4J: normal vs tumor.
g = ti(PA.mapping == 'HIST1H4J')[0]
# Tumor counts are summed over the PA.cancer rows for g; normal counts are
# tallied from the Level column of PA.normals.
tumor_level_counts = PA.cancer.ix[g].sum()
normal_level_counts = PA.normals.query('Gene == "{}"'.format(g)).Level.value_counts()
level_counts = pd.concat([normal_level_counts, tumor_level_counts],
                         keys=['normal','tumor'], axis=1)
# Convert each column to fractions so the two groups are comparable.
level_fracs = level_counts / level_counts.sum()
level_fracs.ix[['Not detected','Low','Medium','High']].plot(kind='bar')
Out[451]:
In [439]:
# frac by sign flag, restricted to genes below the corrected threshold
# 10e-5 (i.e. 1e-4).
selected_genes = ti(bhCorrection(dd[1]) < 10e-5)
violin_plot_pandas(dd[0], dd.frac.ix[selected_genes])
In [432]:
# frac by telomere-maintenance membership, for genes passing the corrected
# .01 threshold with sign flag 0.
telomere_membership = Screen.gs2['REACTOME_TELOMERE_MAINTENANCE']
selected_genes = ti((bhCorrection(dd[1]) < .01) & (dd[0] == 0))
violin_plot_pandas(telomere_membership, dd.frac.ix[selected_genes])
In [312]:
# frac by telomere-maintenance membership, for every gene passing the
# corrected .01 threshold (either sign).
telomere_membership = Screen.gs2['REACTOME_TELOMERE_MAINTENANCE']
selected_genes = ti(bhCorrection(dd[1]) < .01)
violin_plot_pandas(telomere_membership, dd.frac.ix[selected_genes])
In [402]:
# Flag genes whose corrected value of column 1 is below .01, whose frac is
# below .25 and whose sign flag (column 0) equals 0.
passes_correction = bhCorrection(dd[1]) < .01
low_frac = dd.frac < .25
sign_is_zero = dd[0] == 0
v = passes_correction & low_frac & sign_is_zero
v.value_counts()
Out[402]:
In [403]:
# Screen the flag vector (collapsed to one value per index label) against the
# transposed Screen.gs2 table using chi2_cont_test, and preview the result.
flag_per_gene = v.groupby(level=0).first()
gs2_transposed = Screen.gs2.T
rr = screen_feature(flag_per_gene, chi2_cont_test, gs2_transposed)
rr.head()
Out[403]:
In [408]:
# frac by REACTOME_ETHANOL_OXIDATION membership, for genes passing the
# corrected .01 threshold with sign flag 0.
ethanol_membership = Screen.gs2['REACTOME_ETHANOL_OXIDATION']
selected_genes = ti((bhCorrection(dd[1]) < .01) & (dd[0] == 0))
violin_plot_pandas(ethanol_membership, dd.frac.ix[selected_genes])
In [316]:
# Genes whose frac lies more than .4 away from .5, ordered by column 1.
is_extreme = (dd.frac - .5).abs() > .4
dd[is_extreme].sort(1).head(20)
Out[316]:
In [331]:
# Paired plot for the 'ASPM' row of Screen.matched_rna.
# (`codes` is not defined in this notebook's visible cells; presumably it
# comes from the star import -- confirm.)
aspm_rna = Screen.matched_rna.ix['ASPM']
paired_bp_tn_split(aspm_rna, codes)
In [332]:
# Rows of PA.normals for gene `g`, then only those whose Tissue is "lung".
# NOTE(review): `g` is not assigned in this cell; it holds whatever the most
# recently executed cell left in it. Reading top to bottom that is the
# HIST1H4J lookup, while the neighbouring cells concern ASPM -- confirm which
# gene is intended and set `g` explicitly.
nn = PA.normals.query('Gene == "{}"'.format(g))
nn.query('Tissue == "lung"')
Out[332]:
In [333]:
# Display the PA.cancer rows for gene `g`.
# NOTE(review): relies on `g` left over from an earlier cell -- confirm which
# gene is meant here.
PA.cancer.ix[g]
Out[333]:
In [334]:
# Bar plot of the staining-level distribution for ASPM: normal vs tumor.
g = ti(PA.mapping == 'ASPM')[0]
# Tumor counts are summed over the PA.cancer rows for g; normal counts are
# tallied from the Level column of PA.normals.
tumor_level_counts = PA.cancer.ix[g].sum()
normal_level_counts = PA.normals.query('Gene == "{}"'.format(g)).Level.value_counts()
level_counts = pd.concat([normal_level_counts, tumor_level_counts],
                         keys=['normal','tumor'], axis=1)
# Convert each column to fractions so the two groups are comparable.
level_fracs = level_counts / level_counts.sum()
level_fracs.ix[['Not detected','Low','Medium','High']].plot(kind='bar')
Out[334]:
In [317]:
# Paired plot for the 'TGFBR3' row of Screen.matched_rna.
# (`codes` is not defined in this notebook's visible cells; presumably it
# comes from the star import -- confirm.)
tgfbr3_rna = Screen.matched_rna.ix['TGFBR3']
paired_bp_tn_split(tgfbr3_rna, codes)
In [318]:
# Bar plot of the staining-level distribution for TGFBR3: normal vs tumor.
g = ti(PA.mapping == 'TGFBR3')[0]
# Tumor counts are summed over the PA.cancer rows for g; normal counts are
# tallied from the Level column of PA.normals.
tumor_level_counts = PA.cancer.ix[g].sum()
normal_level_counts = PA.normals.query('Gene == "{}"'.format(g)).Level.value_counts()
level_counts = pd.concat([normal_level_counts, tumor_level_counts],
                         keys=['normal','tumor'], axis=1)
# Convert each column to fractions so the two groups are comparable.
level_fracs = level_counts / level_counts.sum()
level_fracs.ix[['Not detected','Low','Medium','High']].plot(kind='bar')
Out[318]:
In [37]:
# For the single tissue mapping held in `c`, compare each gene's tumor staining
# levels with its level in the matched normal tissue / cell type.
# Result `rr` is indexed by PA.mapping[gene] with columns:
#   0   -> normal level (cat_map code)
#   1   -> number of tumor observations at the same level
#   2   -> number of tumor observations at a higher level
#   3   -> number of tumor observations at a lower level
#   'f' -> share of "higher" among the tumors that differ (NaN when none differ)
# Genes that cannot be evaluated are recorded in `rr_skipped` (gene -> error)
# rather than silently dropped.
df = PA.normals.rename(columns=lambda s: s.replace(' ','_'))
df = df.set_index(['Gene','Tissue','Cell_type'])
rr = {}
rr_skipped = {}
for gene_id in ti(PA.mapping.isin(Screen.dx_rna.index)):
    try:
        normal_row = df.ix[(gene_id, c[1], c[2])]
        tumor_row = PA.cancer.ix[(gene_id, c[0])]
        # Expand the per-level counts into one entry per observation with the
        # notebook's un_vc helper, then encode the levels as integers.
        tumor_levels = un_vc(tumor_row).map(cat_map)
        normal_level = cat_map[normal_row['Level']]
        rr[PA.mapping[gene_id]] = (
            normal_level,
            (tumor_levels == normal_level).sum(),
            (tumor_levels > normal_level).sum(),
            (tumor_levels < normal_level).sum(),
        )
    except Exception as err:
        # Best-effort screen: remember the failure and move on. Catching
        # Exception (not a bare except) lets KeyboardInterrupt propagate.
        rr_skipped[gene_id] = err
rr = pd.DataFrame(rr).T
rr['f'] = rr[2] / (rr[2] + rr[3])
rr = rr.replace(np.inf, 1)
In [38]:
# Distribution of rr['f'] for each value of rr[0].
group_by, values = rr[0], rr['f']
violin_plot_pandas(group_by, values)
In [39]:
# f1: columns 2 and 3 of rr sum to more than 1.
n_differing = rr[[2, 3]].sum(axis=1)
f1 = n_differing > 1
# f2: column 0 of rr is 1 or 2.
f2 = rr[0].isin([1, 2])
In [ ]:
In [62]:
# Column 2 of rr as a share of columns 1-3, kept only where column 0 is 0.
row_total = rr[[1, 2, 3]].sum(axis=1)
share_col2 = rr[2] / row_total
f3 = share_col2[rr[0] == 0]
In [63]:
# Three-way split of f3: +1 where f3 >= .8, -1 where f3 <= .2, 0 otherwise
# (the two indicator terms cancel for values strictly between .2 and .8).
above_low = (f3 > .2).astype(float)
below_high = (f3 < .8).astype(float)
violin_plot_pandas(above_low - below_high, Screen.dx_rna.frac, order=[-1,0,1])
In [40]:
# Histogram of rr['f'] over the rows selected by both f1 and f2.
selected_rows = f1 & f2
v = rr.loc[selected_rows, 'f']
v.hist()
Out[40]: