Comparing f_up verses a t-statistic


In [1]:
import NotebookImport
from Imports import *


importing IPython notebook from Imports
/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/rpy/__init__.py:11: FutureWarning: The pandas.rpy module is deprecated and will be removed in a future version. We refer to external packages like rpy2. 
See here for a guide on how to port your code to rpy2: http://pandas.pydata.org/pandas-docs/stable/r_interface.html
  FutureWarning)
importing IPython notebook from Global_Parameters

In [2]:
matched_rna = pd.read_hdf(RNA_SUBREAD_STORE, 'matched_tn')
matched_rna = matched_rna.ix[ti((matched_rna == -3).sum(1) != 
                                len(matched_rna.columns))]

In [3]:
n = len(matched_rna.columns.levels[0])
dx_rna = binomial_test_screen(matched_rna, fc=1.)
dx_rna = dx_rna[dx_rna.num_dx > (n / 2.)]

In [4]:
ttest_all = matched_rna.apply(ttest_rel, 1)

In [5]:
from Stats.Scipy import _split_on_index
def wilcoxon_pandas(a, b=None):
    """
    Wrapper to do a one way t-test on pandas matched samples
    ------------------------------------------------
    a,b: matched measurements

    OR

    a: Series of matched measurements with assignment on second level
       of multi-index.
    """
    if isinstance(b, pd.Series):
        a, b = _match_series(a, b)
    elif b is None and isinstance(a.index, pd.MultiIndex):
        a, b = _split_on_index(a, matched=True)
    z, p = stats.wilcoxon(a, b, zero_method='zsplit')
    return pd.Series({'T': z, 'p': p})

In [6]:
wilcoxon_all = matched_rna.apply(wilcoxon_pandas, 1)
wilcox_stat = wilcoxon_all['T'] * np.sign(dx_rna.frac[669] - .5)

In [12]:
rank_sum = 1.*sum(range(669+1))
ts = ((rank_sum - wilcoxon_all['T']) / rank_sum) - (wilcoxon_all['T'] / rank_sum)
ts = (ts * np.sign(frac[669] - .5)).dropna()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-fc7eb6cb0659> in <module>()
      1 rank_sum = 1.*sum(range(669+1))
      2 ts = ((rank_sum - wilcoxon_all['T']) / rank_sum) - (wilcoxon_all['T'] / rank_sum)
----> 3 ts = (ts * np.sign(frac[669] - .5)).dropna()

NameError: name 'frac' is not defined

In [7]:
fig, ax = subplots(figsize=(4,4))
series_scatter(dx_rna.frac, ts, ax=ax, ann=None, s=10, alpha=.5,
               rasterized=True)
ax.set_xbound(-.05,1.05)
ax.set_xlabel('Fraction overexpressed')
ax.set_ylabel('Paired t-statistic')
ax.set_xticks([0,.5,1.])
ax.set_xbound(-.02,1.02)
#ax.set_ybound(-45,45)
prettify_ax(ax)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-edfb1ae1eb7e> in <module>()
      1 fig, ax = subplots(figsize=(4,4))
----> 2 series_scatter(dx_rna.frac, ts, ax=ax, ann=None, s=10, alpha=.5,
      3                rasterized=True)
      4 ax.set_xbound(-.05,1.05)
      5 ax.set_xlabel('Fraction overexpressed')

NameError: name 'ts' is not defined

In [ ]:
fig, ax = subplots(figsize=(4,4))
series_scatter(dx_rna.frac, ts[ts < -.7], ax=ax, ann=None, s=10, alpha=.5,
               rasterized=True)
ax.set_xlabel('Fraction overexpressed')
ax.set_ylabel('Paired t-statistic')
prettify_ax(ax)

In [ ]:
pearson_pandas(dx_rna.frac, ts)['rho']

In [14]:
ax.annotate?

In [19]:
fig, ax = subplots(figsize=(4,4))
series_scatter(dx_rna.frac, ttest_all.t, ax=ax, ann=None, s=10, alpha=.5,
               rasterized=True)
ax.set_xbound(-.05,1.05)
ax.set_xlabel('Fraction upregulated')
ax.set_ylabel('Paired t-statistic')
ax.set_xticks([0,.5,1.])
ax.set_xbound(-.02,1.02)
ax.set_ybound(-45,45)
ax.annotate('$R^2 = .97$', (.7, -40), fontsize=16)
prettify_ax(ax)
fig.tight_layout()
fig.savefig(FIGDIR + 'S2_Fig.pdf')



In [11]:
pearson_pandas(dx_rna.frac, ttest_all.t) ** 2


Out[11]:
rho    0.97
p      0.00
dtype: float64

In [10]:
all_pts = matched_rna.columns.levels[0]

In [11]:
%%time
frac = {}
tstat = {}
pcc = {}
for n in [3,4,5,6,7,8,9,10,12,15,17,20,22,25,30,40,50,75,100,
          150,200,250,300,350,400,450,500,550,600,len(all_pts)]:
    pts = np.random.choice(all_pts, size=n, replace=True)
    n = len(pts)
    dx_rna = binomial_test_screen(matched_rna[pts], fc=1.)
    dx_rna = dx_rna[dx_rna.num_dx > (n / 2.)]
    ttest_all = matched_rna[pts].apply(ttest_rel, 1)
    frac[n] = dx_rna.frac
    tstat[n] = ttest_all.t
    pcc[n] = pearson_pandas(dx_rna.frac, ttest_all.t)['rho']
pcc = pd.Series(pcc)
frac = pd.DataFrame(frac)
tstat = pd.DataFrame(tstat)


---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-11-b013203e5538> in <module>()
----> 1 get_ipython().run_cell_magic(u'time', u'', u"frac = {}\ntstat = {}\npcc = {}\nfor n in [3,4,5,6,7,8,9,10,12,15,17,20,22,25,30,40,50,75,100,\n          150,200,250,300,350,400,450,500,550,600,len(all_pts)]:\n    pts = np.random.choice(all_pts, size=n, replace=True)\n    n = len(pts)\n    dx_rna = binomial_test_screen(matched_rna[pts], fc=1.)\n    dx_rna = dx_rna[dx_rna.num_dx > (n / 2.)]\n    ttest_all = matched_rna[pts].apply(ttest_rel, 1)\n    frac[n] = dx_rna.frac\n    tstat[n] = ttest_all.t\n    pcc[n] = pearson_pandas(dx_rna.frac, ttest_all.t)['rho']\npcc = pd.Series(pcc)\nfrac = pd.DataFrame(frac)\ntstat = pd.DataFrame(tstat)")

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_cell_magic(self, magic_name, line, cell)
   2291             magic_arg_s = self.var_expand(line, stack_depth)
   2292             with self.builtin_trap:
-> 2293                 result = fn(magic_arg_s, cell)
   2294             return result
   2295 

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in time(self, line, cell, local_ns)
   1165         else:
   1166             st = clock2()
-> 1167             exec(code, glob, local_ns)
   1168             end = clock2()
   1169             out = None

<timed exec> in <module>()

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1789         if isinstance(key, (Series, np.ndarray, Index, list)):
   1790             # either boolean or fancy integer index
-> 1791             return self._getitem_array(key)
   1792         elif isinstance(key, DataFrame):
   1793             return self._getitem_frame(key)

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_array(self, key)
   1833             return self.take(indexer, axis=0, convert=False)
   1834         else:
-> 1835             indexer = self.ix._convert_to_indexer(key, axis=1)
   1836             return self.take(indexer, axis=1, convert=True)
   1837 

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/core/indexing.pyc in _convert_to_indexer(self, obj, axis, is_setter)
   1088                         not isinstance(objarr[0], tuple)):
   1089                     level = 0
-> 1090                     _, indexer = labels.reindex(objarr, level=level)
   1091 
   1092                     # take all

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/core/index.pyc in reindex(self, target, method, level, limit)
   4917             target, indexer, _ = self._join_level(target, level, how='right',
   4918                                                   return_indexers=True,
-> 4919                                                   keep_order=False)
   4920         else:
   4921             if self.equals(target):

/cellar/users/agross/anaconda2/lib/python2.7/site-packages/pandas/core/index.pyc in _join_level(self, other, level, how, return_indexers, keep_order)
   2191 
   2192         if not right.is_unique:
-> 2193             raise NotImplementedError('Index._join_level on non-unique index '
   2194                                       'is not implemented')
   2195 

NotImplementedError: Index._join_level on non-unique index is not implemented

Correlation of t-statistic and fraction overexpressed


In [ ]:
fig, ax = subplots()
(1 - pcc).sort_index().plot()
plt.yscale('log')

Correlation of subsampled population with full data


In [ ]:
fig, ax = subplots()
(1 - frac.corr()[669].sort_index()[:-1]).plot(ls='', marker='o', ax=ax, label='$f_{up}$')
(1 - tstat.corr()[669].sort_index()[:-1]).plot(ls='', marker='o', ax=ax, label='$t$')
ax.set_yscale('log')
ax.set_xlabel('Number of Subjects')
ax.set_ylabel('1 - PCC')
ax.legend(numpoints=1, fancybox=True)
prettify_ax(ax)

Lets look at the genes that disagree between the two statistics


In [ ]:
(frac[669].dropna().rank() - tstat[669].rank()).order().dropna().abs().tail(10)

In [ ]:
frac.ix['RAB25'][669], tstat.ix['RAB25'][669]

In [ ]:
wilcoxon_pandas(matched_rna.ix['RAB25'])

In [ ]:
fig, ax = subplots(figsize=(5,5))
mm = matched_rna.ix['RAB25'].unstack()[['01','11']].dropna().stack()
violin_plot_series(mm, ax=ax, order=['11','01'], ann=None)

for i,v in mm.unstack().iterrows():
    ax.plot([0,1],[v['11'], v['01']], color='black', alpha=.05)
ax.set_xlabel('')
ax.set_xticklabels(['Normal','Tumor'])
prettify_ax(ax)