In [1]:
import NotebookImport
from Imports import *
In [2]:
matched_rna = pd.read_hdf(RNA_SUBREAD_STORE, 'matched_tn')
matched_rna = matched_rna.ix[ti((matched_rna == -3).sum(1) !=
len(matched_rna.columns))]
In [3]:
n = len(matched_rna.columns.levels[0])
dx_rna = binomial_test_screen(matched_rna, fc=1.)
dx_rna = dx_rna[dx_rna.num_dx > (n / 2.)]
In [4]:
ttest_all = matched_rna.apply(ttest_rel, 1)
In [5]:
from Stats.Scipy import _split_on_index
def wilcoxon_pandas(a, b=None):
"""
Wrapper to do a one way t-test on pandas matched samples
------------------------------------------------
a,b: matched measurements
OR
a: Series of matched measurements with assignment on second level
of multi-index.
"""
if isinstance(b, pd.Series):
a, b = _match_series(a, b)
elif b is None and isinstance(a.index, pd.MultiIndex):
a, b = _split_on_index(a, matched=True)
z, p = stats.wilcoxon(a, b, zero_method='zsplit')
return pd.Series({'T': z, 'p': p})
In [6]:
wilcoxon_all = matched_rna.apply(wilcoxon_pandas, 1)
wilcox_stat = wilcoxon_all['T'] * np.sign(dx_rna.frac[669] - .5)
In [12]:
rank_sum = 1.*sum(range(669+1))
ts = ((rank_sum - wilcoxon_all['T']) / rank_sum) - (wilcoxon_all['T'] / rank_sum)
ts = (ts * np.sign(frac[669] - .5)).dropna()
In [7]:
fig, ax = subplots(figsize=(4,4))
series_scatter(dx_rna.frac, ts, ax=ax, ann=None, s=10, alpha=.5,
rasterized=True)
ax.set_xbound(-.05,1.05)
ax.set_xlabel('Fraction overexpressed')
ax.set_ylabel('Paired t-statistic')
ax.set_xticks([0,.5,1.])
ax.set_xbound(-.02,1.02)
#ax.set_ybound(-45,45)
prettify_ax(ax)
In [ ]:
fig, ax = subplots(figsize=(4,4))
series_scatter(dx_rna.frac, ts[ts < -.7], ax=ax, ann=None, s=10, alpha=.5,
rasterized=True)
ax.set_xlabel('Fraction overexpressed')
ax.set_ylabel('Paired t-statistic')
prettify_ax(ax)
In [ ]:
pearson_pandas(dx_rna.frac, ts)['rho']
In [14]:
ax.annotate?
In [19]:
fig, ax = subplots(figsize=(4,4))
series_scatter(dx_rna.frac, ttest_all.t, ax=ax, ann=None, s=10, alpha=.5,
rasterized=True)
ax.set_xbound(-.05,1.05)
ax.set_xlabel('Fraction upregulated')
ax.set_ylabel('Paired t-statistic')
ax.set_xticks([0,.5,1.])
ax.set_xbound(-.02,1.02)
ax.set_ybound(-45,45)
ax.annotate('$R^2 = .97$', (.7, -40), fontsize=16)
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'S2_Fig.pdf')
In [11]:
pearson_pandas(dx_rna.frac, ttest_all.t) ** 2
Out[11]:
In [10]:
all_pts = matched_rna.columns.levels[0]
In [11]:
%%time
frac = {}
tstat = {}
pcc = {}
for n in [3,4,5,6,7,8,9,10,12,15,17,20,22,25,30,40,50,75,100,
150,200,250,300,350,400,450,500,550,600,len(all_pts)]:
pts = np.random.choice(all_pts, size=n, replace=True)
n = len(pts)
dx_rna = binomial_test_screen(matched_rna[pts], fc=1.)
dx_rna = dx_rna[dx_rna.num_dx > (n / 2.)]
ttest_all = matched_rna[pts].apply(ttest_rel, 1)
frac[n] = dx_rna.frac
tstat[n] = ttest_all.t
pcc[n] = pearson_pandas(dx_rna.frac, ttest_all.t)['rho']
pcc = pd.Series(pcc)
frac = pd.DataFrame(frac)
tstat = pd.DataFrame(tstat)
Correlation of t-statistic and fraction overexpressed
In [ ]:
fig, ax = subplots()
(1 - pcc).sort_index().plot()
plt.yscale('log')
Correlation of subsampled population with full data
In [ ]:
fig, ax = subplots()
(1 - frac.corr()[669].sort_index()[:-1]).plot(ls='', marker='o', ax=ax, label='$f_{up}$')
(1 - tstat.corr()[669].sort_index()[:-1]).plot(ls='', marker='o', ax=ax, label='$t$')
ax.set_yscale('log')
ax.set_xlabel('Number of Subjects')
ax.set_ylabel('1 - PCC')
ax.legend(numpoints=1, fancybox=True)
prettify_ax(ax)
In [ ]:
(frac[669].dropna().rank() - tstat[669].rank()).order().dropna().abs().tail(10)
In [ ]:
frac.ix['RAB25'][669], tstat.ix['RAB25'][669]
In [ ]:
wilcoxon_pandas(matched_rna.ix['RAB25'])
In [ ]:
fig, ax = subplots(figsize=(5,5))
mm = matched_rna.ix['RAB25'].unstack()[['01','11']].dropna().stack()
violin_plot_series(mm, ax=ax, order=['11','01'], ann=None)
for i,v in mm.unstack().iterrows():
ax.plot([0,1],[v['11'], v['01']], color='black', alpha=.05)
ax.set_xlabel('')
ax.set_xticklabels(['Normal','Tumor'])
prettify_ax(ax)