In [1]:
# Init
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
def mp(fp): return os.path.join('annot_cb', 'metrics_lcap', fp)

l_cond = list(config['annot_cb']['lcap_samples'].keys())
fp_atac = 'annot_cb/accessible_sites_cb.tsv'
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_atac.head()


Out[2]:
chrom concave_start concave_end mode wt_ya_rep1_score wt_ya_rep2_score min_rank start end wt_ya_globalIDR max_globalIDR
0 I 7954 8180 8048 218.772699 431.677619 6379.0 7973 8124 3.671405 3.671405
1 I 16609 16809 16695 1089.472352 752.060789 1192.0 16620 16771 4.686630 4.686630
2 I 17076 17279 17167 634.236105 392.992569 2525.0 17092 17243 3.956980 3.956980
3 I 21469 21679 21560 2845.210881 3857.913439 65.0 21485 21636 5.000000 5.000000
4 I 39778 40055 39915 1768.688064 7326.561438 95.0 39840 39991 5.000000 5.000000

In [3]:
# Jump test

# 2-condition, 2-replicate one-sided test
def deseq2x2(df_counts):
    fh_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
    fh_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
    df_counts.to_csv(fp_inp, sep='\t')
    !cat {fp_inp} | scripts/deseq2x2_greater.R > {fp_out}
    #!wc -l {fp_inp}
    #!wc -l {fp_out}
    #!tail -n 20 {fp_out}
    df_out = pd.read_csv(fp_out, sep='\s+')
    assert len(df_counts) == len(df_out)
    !rm {fp_inp}
    !rm {fp_out}
    return df_out

# Count reads & call DESeq
def jump_test(fp_rep1, fp_rep2, chrom, u_start, u_end, d_start, d_end, f):
    df_counts = pd.DataFrame()
    df_counts['ucount_rep1'] = list(map(f, read_regions(fp_rep1, chrom, u_start, u_end)))
    df_counts['ucount_rep2'] = list(map(f, read_regions(fp_rep2, chrom, u_start, u_end)))
    df_counts['dcount_rep1'] = list(map(f, read_regions(fp_rep1, chrom, d_start, d_end)))
    df_counts['dcount_rep2'] = list(map(f, read_regions(fp_rep2, chrom, d_start, d_end)))

    # Sample pseudoreplicates
    #np.random.seed(0)
    #df_counts['ucount_sum'] = df_counts[ ['ucount_rep1', 'ucount_rep2'] ].sum(axis=1)
    #df_counts['dcount_sum'] = df_counts[ ['dcount_rep1', 'dcount_rep2'] ].sum(axis=1)
    #df_counts['ucount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['ucount_sum']))
    #df_counts['dcount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['dcount_sum']))
    #df_counts['ucount_rep2'] = df_counts['ucount_sum'] - df_counts['ucount_rep1']
    #df_counts['dcount_rep2'] = df_counts['dcount_sum'] - df_counts['dcount_rep1']

    df_deseq = deseq2x2(df_counts[['ucount_rep1', 'ucount_rep2', 'dcount_rep1', 'dcount_rep2']])
    return pd.concat([df_counts, df_deseq], axis=1)

def itemRgb_(passed):
    if passed:
        return RED
    else:
        return BLUE

def jump_test_fwd(stage):
    fp_rep1 = pf('annot_cb_lcap_%(stage)s_rep1' % locals(), config['annot_cb']['lcap_firstbp_fwd'], '.bw', 'lcap')
    fp_rep2 = pf('annot_cb_lcap_%(stage)s_rep2' % locals(), config['annot_cb']['lcap_firstbp_fwd'], '.bw', 'lcap')

    chrom = df_atac['chrom'].tolist()
    u_start = (l_atac_peak_pos - 250).tolist()
    u_end = (l_atac_peak_pos - 75 + 1).tolist()
    d_start = (l_atac_peak_pos + 75).tolist()
    d_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_fwd_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_fwd.bed' % (stage,)),
        chrom = chrom,
        start = u_start,
        thickStart = np.array(u_end) - 1,
        thickEnd = np.array(d_start) + 1,
        end = d_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_fwd_passed' % (stage,)]
    )))
    return df_jump

df_jump_fwd = pd.concat(pmap(jump_test_fwd, l_cond, n_jobs=20), axis=1)
df_jump_fwd.to_csv(mp('lcap_all_fwd.tsv'), sep='\t', index=False)


estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
[Parallel(n_jobs=20)]: Done   1 out of   1 | elapsed:   17.1s finished

In [4]:
def jump_test_rev(stage):
    fp_rep1 = pf('annot_cb_lcap_%(stage)s_rep1' % locals(), config['annot_cb']['lcap_firstbp_rev'], '.bw', 'lcap')
    fp_rep2 = pf('annot_cb_lcap_%(stage)s_rep2' % locals(), config['annot_cb']['lcap_firstbp_rev'], '.bw', 'lcap')

    chrom = df_atac['chrom'].tolist()
    d_start = (l_atac_peak_pos - 250).tolist()
    d_end = (l_atac_peak_pos - 75 + 1).tolist()
    u_start = (l_atac_peak_pos + 75).tolist()
    u_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_rev_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_rev.bed' % (stage,)),
        chrom = chrom,
        start = d_start,
        thickStart = np.array(d_end) - 1,
        thickEnd = np.array(u_start) + 1,
        end = u_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_rev_passed' % (stage,)]
    )))
    return df_jump

df_jump_rev = pd.concat(pmap(jump_test_rev, l_cond, n_jobs=20), axis=1)
df_jump_rev.to_csv(mp('lcap_all_rev.tsv'), sep='\t', index=False)


estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
[Parallel(n_jobs=20)]: Done   1 out of   1 | elapsed:   17.2s finished

In [5]:
pd.concat([df_jump_fwd['lcap_%s_fwd_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)


Out[5]:
lcap_wt_ya_fwd_passed
False 6341
True 2204

In [6]:
pd.concat([df_jump_rev['lcap_%s_rev_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)


Out[6]:
lcap_wt_ya_rev_passed
False 6439
True 2106

In [7]:
u_start = (l_atac_peak_pos - 250).tolist()
u_end = (l_atac_peak_pos - 75 + 1).tolist()
d_start = (l_atac_peak_pos + 75).tolist()
d_end = (l_atac_peak_pos + 250 + 1).tolist()

l_passed_any = df_jump_fwd[['lcap_%s_fwd_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_fwd.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_fwd[['lcap_%s_fwd_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)

l_passed_any = df_jump_rev[['lcap_%s_rev_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_rev.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_rev[['lcap_%s_rev_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)