notebook.community

Edit and run



In [1]:

    
# Init
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb









    



/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools






    



os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping



In [2]:

    
def mp(fp): return os.path.join('annot_cb', 'metrics_lcap', fp)

l_cond = list(config['annot_cb']['lcap_samples'].keys())
fp_atac = 'annot_cb/accessible_sites_cb.tsv'
df_atac = pd.read_csv(fp_atac, sep='\t')
l_atac_peak_pos = df_atac[['start', 'end']].mean(axis=1).map(int)
df_atac.head()









    Out[2]:







  
    
      
      chrom
      concave_start
      concave_end
      mode
      wt_ya_rep1_score
      wt_ya_rep2_score
      min_rank
      start
      end
      wt_ya_globalIDR
      max_globalIDR
    
  
  
    
      0
      I
      7954
      8180
      8048
      218.772699
      431.677619
      6379.0
      7973
      8124
      3.671405
      3.671405
    
    
      1
      I
      16609
      16809
      16695
      1089.472352
      752.060789
      1192.0
      16620
      16771
      4.686630
      4.686630
    
    
      2
      I
      17076
      17279
      17167
      634.236105
      392.992569
      2525.0
      17092
      17243
      3.956980
      3.956980
    
    
      3
      I
      21469
      21679
      21560
      2845.210881
      3857.913439
      65.0
      21485
      21636
      5.000000
      5.000000
    
    
      4
      I
      39778
      40055
      39915
      1768.688064
      7326.561438
      95.0
      39840
      39991
      5.000000
      5.000000



In [3]:

    
# Jump test

# 2-condition, 2-replicate one-sided test
def deseq2x2(df_counts):
    fh_inp, fp_inp = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_inp_')
    fh_out, fp_out = tempfile.mkstemp(dir=os.getcwd(), prefix='deseq2x2_out_')
    df_counts.to_csv(fp_inp, sep='\t')
    !cat {fp_inp} | scripts/deseq2x2_greater.R > {fp_out}
    #!wc -l {fp_inp}
    #!wc -l {fp_out}
    #!tail -n 20 {fp_out}
    df_out = pd.read_csv(fp_out, sep='\s+')
    assert len(df_counts) == len(df_out)
    !rm {fp_inp}
    !rm {fp_out}
    return df_out

# Count reads & call DESeq
def jump_test(fp_rep1, fp_rep2, chrom, u_start, u_end, d_start, d_end, f):
    df_counts = pd.DataFrame()
    df_counts['ucount_rep1'] = list(map(f, read_regions(fp_rep1, chrom, u_start, u_end)))
    df_counts['ucount_rep2'] = list(map(f, read_regions(fp_rep2, chrom, u_start, u_end)))
    df_counts['dcount_rep1'] = list(map(f, read_regions(fp_rep1, chrom, d_start, d_end)))
    df_counts['dcount_rep2'] = list(map(f, read_regions(fp_rep2, chrom, d_start, d_end)))

    # Sample pseudoreplicates
    #np.random.seed(0)
    #df_counts['ucount_sum'] = df_counts[ ['ucount_rep1', 'ucount_rep2'] ].sum(axis=1)
    #df_counts['dcount_sum'] = df_counts[ ['dcount_rep1', 'dcount_rep2'] ].sum(axis=1)
    #df_counts['ucount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['ucount_sum']))
    #df_counts['dcount_rep1'] = list(map(lambda n: np.random.binomial(n, 0.5), df_counts['dcount_sum']))
    #df_counts['ucount_rep2'] = df_counts['ucount_sum'] - df_counts['ucount_rep1']
    #df_counts['dcount_rep2'] = df_counts['dcount_sum'] - df_counts['dcount_rep1']

    df_deseq = deseq2x2(df_counts[['ucount_rep1', 'ucount_rep2', 'dcount_rep1', 'dcount_rep2']])
    return pd.concat([df_counts, df_deseq], axis=1)

def itemRgb_(passed):
    if passed:
        return RED
    else:
        return BLUE

def jump_test_fwd(stage):
    fp_rep1 = pf('annot_cb_lcap_%(stage)s_rep1' % locals(), config['annot_cb']['lcap_firstbp_fwd'], '.bw', 'lcap')
    fp_rep2 = pf('annot_cb_lcap_%(stage)s_rep2' % locals(), config['annot_cb']['lcap_firstbp_fwd'], '.bw', 'lcap')

    chrom = df_atac['chrom'].tolist()
    u_start = (l_atac_peak_pos - 250).tolist()
    u_end = (l_atac_peak_pos - 75 + 1).tolist()
    d_start = (l_atac_peak_pos + 75).tolist()
    d_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_fwd_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_fwd.bed' % (stage,)),
        chrom = chrom,
        start = u_start,
        thickStart = np.array(u_end) - 1,
        thickEnd = np.array(d_start) + 1,
        end = d_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_fwd_passed' % (stage,)]
    )))
    return df_jump

df_jump_fwd = pd.concat(pmap(jump_test_fwd, l_cond, n_jobs=20), axis=1)
df_jump_fwd.to_csv(mp('lcap_all_fwd.tsv'), sep='\t', index=False)









    



estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
   function: y = a/x + b, and a local regression fit was automatically substituted.
   specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing






    



[Parallel(n_jobs=20)]: Done   1 out of   1 | elapsed:   17.1s finished



In [4]:

    
def jump_test_rev(stage):
    fp_rep1 = pf('annot_cb_lcap_%(stage)s_rep1' % locals(), config['annot_cb']['lcap_firstbp_rev'], '.bw', 'lcap')
    fp_rep2 = pf('annot_cb_lcap_%(stage)s_rep2' % locals(), config['annot_cb']['lcap_firstbp_rev'], '.bw', 'lcap')

    chrom = df_atac['chrom'].tolist()
    d_start = (l_atac_peak_pos - 250).tolist()
    d_end = (l_atac_peak_pos - 75 + 1).tolist()
    u_start = (l_atac_peak_pos + 75).tolist()
    u_end = (l_atac_peak_pos + 250 + 1).tolist()
    df_jump = jump_test(fp_rep1, fp_rep2, chrom=chrom, u_start=u_start, u_end=u_end,
                        d_start=d_start, d_end=d_end, f=lambda s: int(np.nan_to_num(np.nansum(s))))
    #df_jump['passed'] = (df_jump['log2FoldChange'] > 2) & (df_jump['padj'] < 0.05)
    df_jump['passed_jump'] = (df_jump['log2FoldChange'] > 1.5) & (df_jump['padj'] < 0.1)
    df_jump['passed_incr'] = (df_jump['ucount_rep1'] == 0) & (df_jump['ucount_rep2'] == 0) \
                           & (df_jump['dcount_rep1'] >= 1) & (df_jump['dcount_rep2'] >= 1) \
                           & ((df_jump['dcount_rep1'] + df_jump['dcount_rep2']) >= 3)
    df_jump['passed'] = df_jump['passed_jump'] | df_jump['passed_incr']
    df_jump['summary'] = ['%s / %.02f / %.3g' % (passed, lfc, padj) \
                          for passed, lfc, padj in zip(df_jump['passed'], df_jump['log2FoldChange'], df_jump['padj'])]
    df_jump.columns = ['lcap_%s_rev_%s' % (stage, col) for col in df_jump.columns]
    
    write_gffbed(mp('lcap_%s_rev.bed' % (stage,)),
        chrom = chrom,
        start = d_start,
        thickStart = np.array(d_end) - 1,
        thickEnd = np.array(u_start) + 1,
        end = u_end,
        attr = df_jump,
        itemRgb = list(map(itemRgb_, df_jump['lcap_%s_rev_passed' % (stage,)]
    )))
    return df_jump

df_jump_rev = pd.concat(pmap(jump_test_rev, l_cond, n_jobs=20), axis=1)
df_jump_rev.to_csv(mp('lcap_all_rev.tsv'), sep='\t', index=False)









    



estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing






    



[Parallel(n_jobs=20)]: Done   1 out of   1 | elapsed:   17.2s finished



In [5]:

    
pd.concat([df_jump_fwd['lcap_%s_fwd_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)









    Out[5]:







  
    
      
      lcap_wt_ya_fwd_passed
    
  
  
    
      False
      6341
    
    
      True
      2204



In [6]:

    
pd.concat([df_jump_rev['lcap_%s_rev_passed' % (stage,)].value_counts() for stage in l_cond], axis=1)









    Out[6]:







  
    
      
      lcap_wt_ya_rev_passed
    
  
  
    
      False
      6439
    
    
      True
      2106



In [7]:

    
u_start = (l_atac_peak_pos - 250).tolist()
u_end = (l_atac_peak_pos - 75 + 1).tolist()
d_start = (l_atac_peak_pos + 75).tolist()
d_end = (l_atac_peak_pos + 250 + 1).tolist()

l_passed_any = df_jump_fwd[['lcap_%s_fwd_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_fwd.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_fwd[['lcap_%s_fwd_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)

l_passed_any = df_jump_rev[['lcap_%s_rev_passed' % (stage,) for stage in l_cond]].any(axis=1)
write_gffbed(mp('lcap_all_rev.bed'),
    chrom = df_atac['chrom'],
    start = u_start,
    thickStart = np.array(u_end) - 1,
    thickEnd = np.array(d_start) + 1,
    end = d_end,
    attr = df_jump_rev[['lcap_%s_rev_summary' % (stage,) for stage in l_cond]],
    itemRgb = list(map(itemRgb_, l_passed_any)),
)

	chrom	concave_start	concave_end	mode	wt_ya_rep1_score	wt_ya_rep2_score	min_rank	start	end	wt_ya_globalIDR	max_globalIDR
0	I	7954	8180	8048	218.772699	431.677619	6379.0	7973	8124	3.671405	3.671405
1	I	16609	16809	16695	1089.472352	752.060789	1192.0	16620	16771	4.686630	4.686630
2	I	17076	17279	17167	634.236105	392.992569	2525.0	17092	17243	3.956980	3.956980
3	I	21469	21679	21560	2845.210881	3857.913439	65.0	21485	21636	5.000000	5.000000
4	I	39778	40055	39915	1768.688064	7326.561438	95.0	39840	39991	5.000000	5.000000