notebook.community

Edit and run



In [1]:

    
# Init & specify annotation target
%run ~/relmapping/annot/notebooks/annot__init__.ipynb

#annot_ = 'annot_ce10'
#annot_ = 'annot_ce10_eLife_full'
#annot_ = 'annot_mapq0_ce10'
annot_ = 'annot_ce10_tissues'
print('annotation target: %s' % (annot_,))

def mp(fp, annot_=annot_): return os.path.join(annot_, 'metrics_lcap', fp)

if annot_ == 'annot_ce10':
    l_cond = config['stages']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'

elif annot_ == 'annot_ce10_eLife_full':
    l_cond = config['stages']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'

elif annot_ == 'annot_ce10_tissues':
    l_cond = config['stages'] + config['lcap823_tissues']
    fp_atac = os.path.join(annot_, 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.startbp_rev'

elif annot_ == 'annot_mapq0_ce10':
    l_cond = config['stages']
    fp_atac = os.path.join('annot_ce10', 'accessible_sites.tsv')
    df_atac = pd.read_csv(fp_atac, sep='\t')
    l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
    step_startbp_fwd = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.startbp_fwd'
    step_startbp_rev = 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.startbp_rev'

else:
    assert False









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools






    



os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping
annotation target: annot_ce10_tissues



In [2]:

    
# Jump test

# 2-condition, 2-replicate one-sided test
def deseq2x2(df_counts):
    fh_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
    fh_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
    df_counts.to_csv(fp_inp, sep='\t')
    !cat {fp_inp} | scripts/deseq2x2_greater.R > {fp_out}
    #!wc -l {fp_inp}
    #!wc -l {fp_out}
    #!tail -n 20 {fp_out}
    df_out = pd.read_csv(fp_out, sep='\s+')
    assert len(df_counts) == len(df_out)
    !rm {fp_inp}
    !rm {fp_out}
    return df_out

# Count reads & call DESeq
def jump_test(fp_rep1, fp_rep2, chrom, u_start, u_end, d_start, d_end, f):
    df_counts = pd.DataFrame()
    df_counts['ucount_rep1'] = list(map(f, yp.read_regions(fp_rep1, chrom, u_start, u_end)))
    df_counts['ucount_rep2'] = list(map(f, yp.read_regions(fp_rep2, chrom, u_start, u_end)))
    df_counts['dcount_rep1'] = list(map(f, yp.read_regions(fp_rep1, chrom, d_start, d_end)))
    df_counts['dcount_rep2'] = list(map(f, yp.read_regions(fp_rep2, chrom, d_start, d_end)))

    # Sample pseudoreplicates
    #np.random.seed(0)
    #df_counts['ucount_sum'] = df_counts[ ['ucount_rep1', 'ucount_rep2'] ].sum(axis=1)
    #df_counts['dcount_sum'] = df_counts[ ['dcount_rep1', 'dcount_rep2'] ].sum(axis=1)
    #df_counts['ucount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['ucount_sum']))
    #df_counts['dcount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['dcount_sum']))
    #df_counts['ucount_rep2'] = df_counts['ucount_sum'] - df_counts['ucount_rep1']
    #df_counts['dcount_rep2'] = df_counts['dcount_sum'] - df_counts['dcount_rep1']

    df_deseq = deseq2x2(df_counts[['ucount_rep1', 'ucount_rep2', 'dcount_rep1', 'dcount_rep2']])
    return pd.concat([df_counts, df_deseq], axis=1)

def itemRgb_(passed):
    if passed:
        return yp.RED
    else:
        return yp.BLUE

def jump_test_fwd(stage, step_startbp_fwd=step_startbp_fwd):
    if stage in config['stages']:
        fp_rep1 = pf('lcap808_%(stage)s_rep1' % locals(), step_startbp_fwd, '.bw', 'lcap808')
        fp_rep2 = pf('lcap808_%(stage)s_rep2' % locals(), step_startbp_fwd, '.bw', 'lcap808')
    elif stage in config['lcap823_tissues']:
        fp_rep1 = pf('lcap823_%(stage)s_rep1' % locals(), step_startbp_fwd, '.bw', 'lcap823')
        fp_rep2 = pf('lcap823_%(stage)s_rep2' % locals(), step_startbp_fwd, '.bw', 'lcap823')
    else:
        assert False

    chrom = df_atac['chrom'].tolist()
    u_start = (l_atac_peak_pos - 250).tolist()
    u_end = (l_atac_peak_pos - 75 + 1).tolist()
    d_start = (l_atac_peak_pos + 75).tolist()
    d_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_fwd_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_fwd.bed' % (stage,)),
        chrom = chrom,
        start = u_start,
        thickStart = np.array(u_end) - 1,
        thickEnd = np.array(d_start) + 1,
        end = d_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_fwd_passed' % (stage,)]
    )))
    return df_jump

df_jump_fwd = pd.concat(pmap(jump_test_fwd, l_cond, n_jobs=20), axis=1)
df_jump_fwd.to_csv(mp('lcap_all_fwd.tsv'), sep='\t', index=False)









    



estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
estimating dispersions
gene-wise dispersion estimates
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
estimating size factors
estimating dispersions
final dispersion estimates
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
final dispersion estimates
final dispersion estimates
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
final dispersion estimates
final dispersion estimates
fitting model and testing
mean-dispersion relationship
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
final dispersion estimates
fitting model and testing
mean-dispersion relationship
fitting model and testing
mean-dispersion relationship
fitting model and testing
fitting model and testing
final dispersion estimates
final dispersion estimates
fitting model and testing
fitting model and testing
fitting model and testing






    



[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed:  1.6min finished



In [3]:

    
def jump_test_rev(stage, step_startbp_rev=step_startbp_rev):
    if stage in config['stages']:
        fp_rep1 = pf('lcap808_%(stage)s_rep1' % locals(), step_startbp_rev, '.bw', 'lcap808')
        fp_rep2 = pf('lcap808_%(stage)s_rep2' % locals(), step_startbp_rev, '.bw', 'lcap808')
    elif stage in config['lcap823_tissues']:
        fp_rep1 = pf('lcap823_%(stage)s_rep1' % locals(), step_startbp_rev, '.bw', 'lcap823')
        fp_rep2 = pf('lcap823_%(stage)s_rep2' % locals(), step_startbp_rev, '.bw', 'lcap823')
    else:
        assert False

    chrom = df_atac['chrom'].tolist()
    d_start = (l_atac_peak_pos - 250).tolist()
    d_end = (l_atac_peak_pos - 75 + 1).tolist()
    u_start = (l_atac_peak_pos + 75).tolist()
    u_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_rev_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_rev.bed' % (stage,)),
        chrom = chrom,
        start = d_start,
        thickStart = np.array(d_end) - 1,
        thickEnd = np.array(u_start) + 1,
        end = u_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_rev_passed' % (stage,)]
    )))
    return df_jump

df_jump_rev = pd.concat(pmap(jump_test_rev, l_cond, n_jobs=20), axis=1)
df_jump_rev.to_csv(mp('lcap_all_rev.tsv'), sep='\t', index=False)









    



estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
estimating dispersions
gene-wise dispersion estimates
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating size factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
fitting model and testing
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
fitting model and testing
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
final dispersion estimates
mean-dispersion relationship
mean-dispersion relationship
mean-dispersion relationship
final dispersion estimates
fitting model and testing
fitting model and testing
final dispersion estimates
final dispersion estimates
final dispersion estimates
final dispersion estimates
final dispersion estimates
fitting model and testing
mean-dispersion relationship
final dispersion estimates
fitting model and testing
fitting model and testing
final dispersion estimates
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing
fitting model and testing






    



[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed:  1.6min finished



In [4]:

    
pd.concat([df_jump_fwd['lcap_%s_fwd_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)









    Out[4]:







  
    
      
      lcap_wt_emb_fwd_passed
      lcap_wt_l1_fwd_passed
      lcap_wt_l2_fwd_passed
      lcap_wt_l3_fwd_passed
      lcap_wt_l4_fwd_passed
      lcap_wt_ya_fwd_passed
      lcap_glp1_d1_fwd_passed
      lcap_glp1_d2_fwd_passed
      lcap_glp1_d6_fwd_passed
      lcap_glp1_d9_fwd_passed
      lcap_glp1_d13_fwd_passed
      lcap_neurons_fwd_passed
      lcap_gonad_fwd_passed
      lcap_muscle_fwd_passed
      lcap_intest_fwd_passed
    
  
  
    
      False
      36746
      36343
      35243
      36445
      36484
      37006
      38206
      38824
      37660
      37862
      37011
      36932
      37392
      38471
      37210
    
    
      True
      6128
      6531
      7631
      6429
      6390
      5868
      4668
      4050
      5214
      5012
      5863
      5942
      5482
      4403
      5664



In [5]:

    
pd.concat([df_jump_rev['lcap_%s_rev_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)









    Out[5]:







  
    
      
      lcap_wt_emb_rev_passed
      lcap_wt_l1_rev_passed
      lcap_wt_l2_rev_passed
      lcap_wt_l3_rev_passed
      lcap_wt_l4_rev_passed
      lcap_wt_ya_rev_passed
      lcap_glp1_d1_rev_passed
      lcap_glp1_d2_rev_passed
      lcap_glp1_d6_rev_passed
      lcap_glp1_d9_rev_passed
      lcap_glp1_d13_rev_passed
      lcap_neurons_rev_passed
      lcap_gonad_rev_passed
      lcap_muscle_rev_passed
      lcap_intest_rev_passed
    
  
  
    
      False
      36824
      36325
      35182
      36199
      36523
      37039
      38196
      38736
      37671
      37850
      37032
      37086
      37354
      38297
      37183
    
    
      True
      6050
      6549
      7692
      6675
      6351
      5835
      4678
      4138
      5203
      5024
      5842
      5788
      5520
      4577
      5691



In [6]:

    
u_start = (l_atac_peak_pos - 250).tolist()
u_end = (l_atac_peak_pos - 75 + 1).tolist()
d_start = (l_atac_peak_pos + 75).tolist()
d_end = (l_atac_peak_pos + 250 + 1).tolist()

l_passed_any = df_jump_fwd[['lcap_%s_fwd_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_fwd.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_fwd[['lcap_%s_fwd_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)

l_passed_any = df_jump_rev[['lcap_%s_rev_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_rev.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_rev[['lcap_%s_rev_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)

	lcap_wt_emb_fwd_passed	lcap_wt_l1_fwd_passed	lcap_wt_l2_fwd_passed	lcap_wt_l3_fwd_passed	lcap_wt_l4_fwd_passed	lcap_wt_ya_fwd_passed	lcap_glp1_d1_fwd_passed	lcap_glp1_d2_fwd_passed	lcap_glp1_d6_fwd_passed	lcap_glp1_d9_fwd_passed	lcap_glp1_d13_fwd_passed	lcap_neurons_fwd_passed	lcap_gonad_fwd_passed	lcap_muscle_fwd_passed	lcap_intest_fwd_passed
False	36746	36343	35243	36445	36484	37006	38206	38824	37660	37862	37011	36932	37392	38471	37210
True	6128	6531	7631	6429	6390	5868	4668	4050	5214	5012	5863	5942	5482	4403	5664

	lcap_wt_emb_rev_passed	lcap_wt_l1_rev_passed	lcap_wt_l2_rev_passed	lcap_wt_l3_rev_passed	lcap_wt_l4_rev_passed	lcap_wt_ya_rev_passed	lcap_glp1_d1_rev_passed	lcap_glp1_d2_rev_passed	lcap_glp1_d6_rev_passed	lcap_glp1_d9_rev_passed	lcap_glp1_d13_rev_passed	lcap_neurons_rev_passed	lcap_gonad_rev_passed	lcap_muscle_rev_passed	lcap_intest_rev_passed
False	36824	36325	35182	36199	36523	37039	38196	38736	37671	37850	37032	37086	37354	38297	37183
True	6050	6549	7692	6675	6351	5835	4678	4138	5203	5024	5842	5788	5520	4577	5691