In [1]:
# Init & specify annotation target
%run ~/relmapping/annot/notebooks/annot__init__.ipynb

#annot_ = 'annot_ce10'
#annot_ = 'annot_ce10_eLife_full'
#annot_ = 'annot_mapq0_ce10'
annot_ = 'annot_ce10_tissues'
print('annotation target: %s' % (annot_,))

def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_lcap', fp)

if annot_ == 'annot_ce10':
    l_cond = config['stages']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'

elif annot_ == 'annot_ce10_eLife_full':
    l_cond = config['stages']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'

elif annot_ == 'annot_ce10_tissues':
    l_cond = config['stages'] + config['lcap823_tissues']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'

elif annot_ == 'annot_mapq0_ce10':
    l_cond = config['stages']
    fp_atac = os.path.join('annot_ce10', 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.startbp_rev'

else:
    assert False


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping
annotation target: annot_ce10_tissues

In [2]:
# Jump test

# 2-condition, 2-replicate one-sided test
def deseq2x2(df_counts):
    fh_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
    fh_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
    df_counts.to_csv(fp_inp, sep='\t')
    !cat {fp_inp} | scripts/deseq2x2_greater.R > {fp_out}
    #!wc -l {fp_inp}
    #!wc -l {fp_out}
    #!tail -n 20 {fp_out}
    df_out = pd.read_csv(fp_out, sep='\s+')
    assert len(df_counts) == len(df_out)
    !rm {fp_inp}
    !rm {fp_out}
    return df_out

# Count reads & call DESeq
def jump_test(fp_rep1, fp_rep2, chrom, u_start, u_end, d_start, d_end, f):
    df_counts = pd.DataFrame()
    df_counts['ucount_rep1'] = list(map(f, yp.read_regions(fp_rep1, chrom, u_start, u_end)))
    df_counts['ucount_rep2'] = list(map(f, yp.read_regions(fp_rep2, chrom, u_start, u_end)))
    df_counts['dcount_rep1'] = list(map(f, yp.read_regions(fp_rep1, chrom, d_start, d_end)))
    df_counts['dcount_rep2'] = list(map(f, yp.read_regions(fp_rep2, chrom, d_start, d_end)))

    # Sample pseudoreplicates
    #np.random.seed(0)
    #df_counts['ucount_sum'] = df_counts[ ['ucount_rep1', 'ucount_rep2'] ].sum(axis=1)
    #df_counts['dcount_sum'] = df_counts[ ['dcount_rep1', 'dcount_rep2'] ].sum(axis=1)
    #df_counts['ucount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['ucount_sum']))
    #df_counts['dcount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['dcount_sum']))
    #df_counts['ucount_rep2'] = df_counts['ucount_sum'] - df_counts['ucount_rep1']
    #df_counts['dcount_rep2'] = df_counts['dcount_sum'] - df_counts['dcount_rep1']

    df_deseq = deseq2x2(df_counts[['ucount_rep1', 'ucount_rep2', 'dcount_rep1', 'dcount_rep2']])
    return pd.concat([df_counts, df_deseq], axis=1)

def itemRgb_(passed):
    if passed:
        return yp.RED
    else:
        return yp.BLUE

def jump_test_fwd(stage, step_startbp_fwd=step_startbp_fwd):
    if stage in config['stages']:
        fp_rep1 = pf('lcap808_%(stage)s_rep1' % locals(), step_startbp_fwd, '.bw', 'lcap808')
        fp_rep2 = pf('lcap808_%(stage)s_rep2' % locals(), step_startbp_fwd, '.bw', 'lcap808')
    elif stage in config['lcap823_tissues']:
        fp_rep1 = pf('lcap823_%(stage)s_rep1' % locals(), step_startbp_fwd, '.bw', 'lcap823')
        fp_rep2 = pf('lcap823_%(stage)s_rep2' % locals(), step_startbp_fwd, '.bw', 'lcap823')
    else:
        assert False

    chrom = df_atac['chrom'].tolist()
    u_start = (l_atac_peak_pos - 250).tolist()
    u_end = (l_atac_peak_pos - 75 + 1).tolist()
    d_start = (l_atac_peak_pos + 75).tolist()
    d_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_fwd_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_fwd.bed' % (stage,)),
        chrom = chrom,
        start = u_start,
        thickStart = np.array(u_end) - 1,
        thickEnd = np.array(d_start) + 1,
        end = d_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_fwd_passed' % (stage,)]
    )))
    return df_jump

df_jump_fwd = pd.concat(pmap(jump_test_fwd, l_cond, n_jobs=20), axis=1)
df_jump_fwd.to_csv(mp('lcap_all_fwd.tsv'), sep='\t', index=False)


estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
estimating dispersions
gene-wise dispersion estimates
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
estimating size factors
estimating dispersions
final dispersion estimates
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
final dispersion estimates
final dispersion estimates
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
final dispersion estimates
final dispersion estimates
fitting model and testing
mean-dispersion relationship
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
final dispersion estimates
fitting model and testing
mean-dispersion relationship
fitting model and testing
mean-dispersion relationship
fitting model and testing
fitting model and testing
final dispersion estimates
final dispersion estimates
fitting model and testing
fitting model and testing
fitting model and testing
[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed:  1.6min finished

In [3]:
def jump_test_rev(stage, step_startbp_rev=step_startbp_rev):
    if stage in config['stages']:
        fp_rep1 = pf('lcap808_%(stage)s_rep1' % locals(), step_startbp_rev, '.bw', 'lcap808')
        fp_rep2 = pf('lcap808_%(stage)s_rep2' % locals(), step_startbp_rev, '.bw', 'lcap808')
    elif stage in config['lcap823_tissues']:
        fp_rep1 = pf('lcap823_%(stage)s_rep1' % locals(), step_startbp_rev, '.bw', 'lcap823')
        fp_rep2 = pf('lcap823_%(stage)s_rep2' % locals(), step_startbp_rev, '.bw', 'lcap823')
    else:
        assert False

    chrom = df_atac['chrom'].tolist()
    d_start = (l_atac_peak_pos - 250).tolist()
    d_end = (l_atac_peak_pos - 75 + 1).tolist()
    u_start = (l_atac_peak_pos + 75).tolist()
    u_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_rev_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_rev.bed' % (stage,)),
        chrom = chrom,
        start = d_start,
        thickStart = np.array(d_end) - 1,
        thickEnd = np.array(u_start) + 1,
        end = u_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_rev_passed' % (stage,)]
    )))
    return df_jump

df_jump_rev = pd.concat(pmap(jump_test_rev, l_cond, n_jobs=20), axis=1)
df_jump_rev.to_csv(mp('lcap_all_rev.tsv'), sep='\t', index=False)


estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
estimating dispersions
gene-wise dispersion estimates
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
fitting model and testing
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
fitting model and testing
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
fitting model and testing
fitting model and testing
final dispersion estimates
final dispersion estimates
final dispersion estimates
final dispersion estimates
final dispersion estimates
fitting model and testing
mean-dispersion relationship
final dispersion estimates
fitting model and testing
fitting model and testing
final dispersion estimates
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed:  1.6min finished

In [4]:
pd.concat([df_jump_fwd['lcap_%s_fwd_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)


Out[4]:
lcap_wt_emb_fwd_passed lcap_wt_l1_fwd_passed lcap_wt_l2_fwd_passed lcap_wt_l3_fwd_passed lcap_wt_l4_fwd_passed lcap_wt_ya_fwd_passed lcap_glp1_d1_fwd_passed lcap_glp1_d2_fwd_passed lcap_glp1_d6_fwd_passed lcap_glp1_d9_fwd_passed lcap_glp1_d13_fwd_passed lcap_neurons_fwd_passed lcap_gonad_fwd_passed lcap_muscle_fwd_passed lcap_intest_fwd_passed
False 36746 36343 35243 36445 36484 37006 38206 38824 37660 37862 37011 36932 37392 38471 37210
True 6128 6531 7631 6429 6390 5868 4668 4050 5214 5012 5863 5942 5482 4403 5664

In [5]:
pd.concat([df_jump_rev['lcap_%s_rev_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)


Out[5]:
lcap_wt_emb_rev_passed lcap_wt_l1_rev_passed lcap_wt_l2_rev_passed lcap_wt_l3_rev_passed lcap_wt_l4_rev_passed lcap_wt_ya_rev_passed lcap_glp1_d1_rev_passed lcap_glp1_d2_rev_passed lcap_glp1_d6_rev_passed lcap_glp1_d9_rev_passed lcap_glp1_d13_rev_passed lcap_neurons_rev_passed lcap_gonad_rev_passed lcap_muscle_rev_passed lcap_intest_rev_passed
False 36824 36325 35182 36199 36523 37039 38196 38736 37671 37850 37032 37086 37354 38297 37183
True 6050 6549 7692 6675 6351 5835 4678 4138 5203 5024 5842 5788 5520 4577 5691

In [6]:
u_start = (l_atac_peak_pos - 250).tolist()
u_end = (l_atac_peak_pos - 75 + 1).tolist()
d_start = (l_atac_peak_pos + 75).tolist()
d_end = (l_atac_peak_pos + 250 + 1).tolist()

l_passed_any = df_jump_fwd[['lcap_%s_fwd_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_fwd.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_fwd[['lcap_%s_fwd_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)

l_passed_any = df_jump_rev[['lcap_%s_rev_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_rev.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_rev[['lcap_%s_rev_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)