In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/b2/scratch/ahringer/jj374/lab/relmapping

In [2]:
df_regl = regl_addons()
df_ = pd.crosstab(\
    pd.Categorical(df_regl['annot_fwd']),\
    pd.Categorical(df_regl['annot_rev']))\
.loc[config['annot_strand'], config['annot_strand']]


7076 non-promoters outside of outron/gene body (=no gene_id)
/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:64: RuntimeWarning: Mean of empty slice
13195 of 42245 sites with CV values via promoter annotation
32525 of 42245 sites with CV values via "associated gene"

In [3]:
fp_chen = 'annot/FigC_TSS/bed_files/Chen2013_tss.bed'
fp_kruesi = 'annot/FigC_TSS/bed_files/Kruesi2013_tss.bed'
fp_saito = 'annot/FigC_TSS/bed_files/Saito2013_tss.bed'
fp_gu = 'annot/FigC_TSS/bed_files/Gu2012_tss.bed'

def df_closest_(df_a, df_b, b_prefix, **kwargs):
    df_a_sort = df_a
    df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
    fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
    names_ = list(itertools.chain(df_a.columns.values,
        ['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
        ['%s_distance' % (b_prefix)]
    ))
    df_ = pd.read_csv(fn_, names=names_, sep='\t')
    return df_#[names_[len(df_a.columns):]]

kwa_ = {'sep': '\t', 'names': yp.NAMES_BED6, 'comment': '#', 'usecols': yp.NAMES_BED6}
df_chen_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_chen, **kwa_).query('strand == "+"'), 'chen2013_fwd', D='ref', t='first')
df_chen_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_chen, **kwa_).query('strand == "-"'), 'chen2013_rev', D='ref', t='first')
kwa_ = {'sep': '\t', 'names': yp.NAMES_BED9, 'comment': '#', 'usecols': yp.NAMES_BED6}
df_kruesi_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_kruesi, **kwa_).query('strand == "+"'), 'kruesi2013_fwd', D='ref', t='first')
df_kruesi_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_kruesi, **kwa_).query('strand == "-"'), 'kruesi2013_rev', D='ref', t='first')
df_saito_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_saito, **kwa_).query('strand == "+"'), 'saito2013_fwd', D='ref', t='first')
df_saito_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_saito, **kwa_).query('strand == "-"'), 'saito2013_rev', D='ref', t='first')
df_gu_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_gu, **kwa_).query('strand == "+"'), 'gu2012_fwd', D='ref', t='first')
df_gu_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_gu, **kwa_).query('strand == "-"'), 'gu2012_rev', D='ref', t='first')

print(len(df_chen_fwd.query('abs(chen2013_fwd_distance) < 150')))
print(len(df_chen_rev.query('abs(chen2013_rev_distance) < 150')))
print(len(df_kruesi_fwd.query('abs(kruesi2013_fwd_distance) < 150')))
print(len(df_kruesi_rev.query('abs(kruesi2013_rev_distance) < 150')))
print(len(df_saito_fwd.query('abs(saito2013_fwd_distance) < 150')))
print(len(df_saito_rev.query('abs(saito2013_rev_distance) < 150')))
print(len(df_gu_fwd.query('abs(gu2012_fwd_distance) < 150')))
print(len(df_gu_rev.query('abs(gu2012_rev_distance) < 150')))


3292
3260
2828
2793
4023
3903
3682
3677

In [4]:
th_dist = 150
df_regl['tss_chen_fwd'] = df_chen_fwd['chen2013_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_chen_rev'] = df_chen_rev['chen2013_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_kruesi_fwd'] = df_kruesi_fwd['kruesi2013_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_kruesi_rev'] = df_kruesi_rev['kruesi2013_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_saito_fwd'] = df_saito_fwd['saito2013_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_saito_rev'] = df_saito_rev['saito2013_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_gu_fwd'] = df_gu_fwd['gu2012_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_gu_rev'] = df_gu_rev['gu2012_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_any_fwd'] = df_regl[['tss_chen_fwd', 'tss_kruesi_fwd', 'tss_saito_fwd', 'tss_gu_fwd']].sum(axis=1)
df_regl['tss_any_rev'] = df_regl[['tss_chen_rev', 'tss_kruesi_rev', 'tss_saito_rev', 'tss_gu_rev']].sum(axis=1)

In [5]:
df_fwd = pd.concat([
    df_regl.loc[df_regl['tss_chen_fwd']]['annot_fwd'].value_counts(),
    df_regl.loc[df_regl['tss_kruesi_fwd']]['annot_fwd'].value_counts(),
    df_regl.loc[df_regl['tss_saito_fwd']]['annot_fwd'].value_counts(),
    df_regl.loc[df_regl['tss_gu_fwd']]['annot_fwd'].value_counts(),
    df_regl.query('tss_any_fwd == 1')['annot_fwd'].value_counts(),
    df_regl.query('tss_any_fwd == 2')['annot_fwd'].value_counts(),
    df_regl.query('tss_any_fwd == 3')['annot_fwd'].value_counts(),
    df_regl.query('tss_any_fwd == 4')['annot_fwd'].value_counts(),
    df_regl.query('tss_any_fwd == 0')['annot_fwd'].value_counts(),
], axis=1).loc[config['annot_strand']].fillna(0)
df_fwd.columns = (
    'Chen2013_TSS',
    'Kruesi2013_TSS',
    'Saito2013_TSS',
    'Gu2012_TSS',
    'Any1_TSS', 
    'Any2_TSS',
    'Any3_TSS',
    'All4_TSS',
    'No_previously_known_TSS',
)
df_fwd


Out[5]:
Chen2013_TSS Kruesi2013_TSS Saito2013_TSS Gu2012_TSS Any1_TSS Any2_TSS Any3_TSS All4_TSS No_previously_known_TSS
coding_promoter 2763.0 2333 2969 2761 1555 1259 1127 843.0 3263
pseudogene_promoter 0.0 7 27 17 26 8 3 0.0 156
unknown_promoter 10.0 32 65 17 85 9 7 0.0 2499
transcription_initiation 414.0 424 742 591 1058 304 111 43.0 17209
non-coding_RNA 8.0 9 27 12 32 7 2 1.0 491
no_transcription 100.0 28 201 291 396 89 10 4.0 11648

In [6]:
df_rev = pd.concat([
    df_regl.loc[df_regl['tss_chen_rev']]['annot_rev'].value_counts(),
    df_regl.loc[df_regl['tss_kruesi_rev']]['annot_rev'].value_counts(),
    df_regl.loc[df_regl['tss_saito_rev']]['annot_rev'].value_counts(),
    df_regl.loc[df_regl['tss_gu_rev']]['annot_rev'].value_counts(),
    df_regl.query('tss_any_rev == 1')['annot_rev'].value_counts(),
    df_regl.query('tss_any_rev == 2')['annot_rev'].value_counts(),
    df_regl.query('tss_any_rev == 3')['annot_rev'].value_counts(),
    df_regl.query('tss_any_rev == 4')['annot_rev'].value_counts(),
    df_regl.query('tss_any_rev == 0')['annot_rev'].value_counts(),
], axis=1).loc[config['annot_strand']].fillna(0)
df_rev.columns = (
    'Chen2013_TSS',
    'Kruesi2013_TSS',
    'Saito2013_TSS',
    'Gu2012_TSS',
    'Any1_TSS', 
    'Any2_TSS',
    'Any3_TSS',
    'All4_TSS',
    'No_previously_known_TSS',
)
df_rev


Out[6]:
Chen2013_TSS Kruesi2013_TSS Saito2013_TSS Gu2012_TSS Any1_TSS Any2_TSS Any3_TSS All4_TSS No_previously_known_TSS
coding_promoter 2733.0 2281 2861 2748 1514 1228 1091 845.0 3193
pseudogene_promoter 0.0 13 32 13 36 8 2 0.0 169
unknown_promoter 8.0 26 66 11 73 13 4 0.0 2374
transcription_initiation 407.0 445 717 601 998 308 104 61.0 17356
non-coding_RNA 4.0 7 27 16 34 3 2 2.0 488
no_transcription 113.0 29 209 294 411 92 14 2.0 11820

In [7]:
df_fwd + df_rev


Out[7]:
Chen2013_TSS Kruesi2013_TSS Saito2013_TSS Gu2012_TSS Any1_TSS Any2_TSS Any3_TSS All4_TSS No_previously_known_TSS
coding_promoter 5496.0 4614 5830 5509 3069 2487 2218 1688.0 6456
pseudogene_promoter 0.0 20 59 30 62 16 5 0.0 325
unknown_promoter 18.0 58 131 28 158 22 11 0.0 4873
transcription_initiation 821.0 869 1459 1192 2056 612 215 104.0 34565
non-coding_RNA 12.0 16 54 28 66 10 4 3.0 979
no_transcription 213.0 57 410 585 807 181 24 6.0 23468

In [16]:
df_regl.query('(annot_fwd == "coding_promoter") & (tss_any_fwd == 0)')[yp.NAMES_BED3]\
    .to_csv('annot/FigC_TSS/TSS_not_previously_known_fwd.bed', sep='\t', index=False, header=False)

df_regl.query('(annot_rev == "coding_promoter") & (tss_any_rev == 0)')[yp.NAMES_BED3]\
    .to_csv('annot/FigC_TSS/TSS_not_previously_known_rev.bed', sep='\t', index=False, header=False)

!wc -l annot/FigC_TSS/TSS_not_previously_known_fwd.bed
!wc -l annot/FigC_TSS/TSS_not_previously_known_rev.bed


3263 annot/FigC_TSS/TSS_not_previously_known_fwd.bed
3193 annot/FigC_TSS/TSS_not_previously_known_rev.bed

In [8]:
#df_regl.query('(annot_fwd == "no_transcription") & (tss_chen_fwd != 0)')
#df_regl.query('(annot_fwd == "coding_promoter") & (tss_any_fwd == 0)')

In [9]:
def itemRgb_(flag):
    if flag:
        return RED
    else:
        return BLUE

strand = 'fwd'
write_gffbed(
    fp = 'annot/FigC_TSS/TSS_Chen_%(strand)s.bed' % locals(),
    chrom = df_regl['chrom'],
    start = df_regl['start'],
    end = df_regl['end'],
    attr = df_chen_fwd[['chen2013_fwd_distance', 'chen2013_fwd_name']],
    strand = '+',
    itemRgb = df_regl['tss_chen_fwd'].map(itemRgb_),
)

strand = 'rev'
write_gffbed(
    fp = 'annot/FigC_TSS/TSS_Chen_%(strand)s.bed' % locals(),
    chrom = df_regl['chrom'],
    start = df_regl['start'],
    end = df_regl['end'],
    attr = df_chen_rev[['chen2013_rev_distance', 'chen2013_rev_name']],
    strand = '-',
    itemRgb = df_regl['tss_chen_rev'].map(itemRgb_),
)