In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [2]:
df_regl = regl_addons()
df_ = pd.crosstab(\
pd.Categorical(df_regl['annot_fwd']),\
pd.Categorical(df_regl['annot_rev']))\
.loc[config['annot_strand'], config['annot_strand']]
In [3]:
fp_chen = 'annot/FigC_TSS/bed_files/Chen2013_tss.bed'
fp_kruesi = 'annot/FigC_TSS/bed_files/Kruesi2013_tss.bed'
fp_saito = 'annot/FigC_TSS/bed_files/Saito2013_tss.bed'
fp_gu = 'annot/FigC_TSS/bed_files/Gu2012_tss.bed'
def df_closest_(df_a, df_b, b_prefix, **kwargs):
df_a_sort = df_a
df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
names_ = list(itertools.chain(df_a.columns.values,
['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
['%s_distance' % (b_prefix)]
))
df_ = pd.read_csv(fn_, names=names_, sep='\t')
return df_#[names_[len(df_a.columns):]]
kwa_ = {'sep': '\t', 'names': yp.NAMES_BED6, 'comment': '#', 'usecols': yp.NAMES_BED6}
df_chen_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_chen, **kwa_).query('strand == "+"'), 'chen2013_fwd', D='ref', t='first')
df_chen_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_chen, **kwa_).query('strand == "-"'), 'chen2013_rev', D='ref', t='first')
kwa_ = {'sep': '\t', 'names': yp.NAMES_BED9, 'comment': '#', 'usecols': yp.NAMES_BED6}
df_kruesi_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_kruesi, **kwa_).query('strand == "+"'), 'kruesi2013_fwd', D='ref', t='first')
df_kruesi_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_kruesi, **kwa_).query('strand == "-"'), 'kruesi2013_rev', D='ref', t='first')
df_saito_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_saito, **kwa_).query('strand == "+"'), 'saito2013_fwd', D='ref', t='first')
df_saito_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_saito, **kwa_).query('strand == "-"'), 'saito2013_rev', D='ref', t='first')
df_gu_fwd = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_gu, **kwa_).query('strand == "+"'), 'gu2012_fwd', D='ref', t='first')
df_gu_rev = df_closest_(regl_mode()[yp.NAMES_BED3], pd.read_csv(fp_gu, **kwa_).query('strand == "-"'), 'gu2012_rev', D='ref', t='first')
print(len(df_chen_fwd.query('abs(chen2013_fwd_distance) < 150')))
print(len(df_chen_rev.query('abs(chen2013_rev_distance) < 150')))
print(len(df_kruesi_fwd.query('abs(kruesi2013_fwd_distance) < 150')))
print(len(df_kruesi_rev.query('abs(kruesi2013_rev_distance) < 150')))
print(len(df_saito_fwd.query('abs(saito2013_fwd_distance) < 150')))
print(len(df_saito_rev.query('abs(saito2013_rev_distance) < 150')))
print(len(df_gu_fwd.query('abs(gu2012_fwd_distance) < 150')))
print(len(df_gu_rev.query('abs(gu2012_rev_distance) < 150')))
In [4]:
th_dist = 150
df_regl['tss_chen_fwd'] = df_chen_fwd['chen2013_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_chen_rev'] = df_chen_rev['chen2013_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_kruesi_fwd'] = df_kruesi_fwd['kruesi2013_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_kruesi_rev'] = df_kruesi_rev['kruesi2013_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_saito_fwd'] = df_saito_fwd['saito2013_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_saito_rev'] = df_saito_rev['saito2013_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_gu_fwd'] = df_gu_fwd['gu2012_fwd_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_gu_rev'] = df_gu_rev['gu2012_rev_distance'].map(lambda dist: abs(dist) <= th_dist)
df_regl['tss_any_fwd'] = df_regl[['tss_chen_fwd', 'tss_kruesi_fwd', 'tss_saito_fwd', 'tss_gu_fwd']].sum(axis=1)
df_regl['tss_any_rev'] = df_regl[['tss_chen_rev', 'tss_kruesi_rev', 'tss_saito_rev', 'tss_gu_rev']].sum(axis=1)
In [5]:
df_fwd = pd.concat([
df_regl.loc[df_regl['tss_chen_fwd']]['annot_fwd'].value_counts(),
df_regl.loc[df_regl['tss_kruesi_fwd']]['annot_fwd'].value_counts(),
df_regl.loc[df_regl['tss_saito_fwd']]['annot_fwd'].value_counts(),
df_regl.loc[df_regl['tss_gu_fwd']]['annot_fwd'].value_counts(),
df_regl.query('tss_any_fwd == 1')['annot_fwd'].value_counts(),
df_regl.query('tss_any_fwd == 2')['annot_fwd'].value_counts(),
df_regl.query('tss_any_fwd == 3')['annot_fwd'].value_counts(),
df_regl.query('tss_any_fwd == 4')['annot_fwd'].value_counts(),
df_regl.query('tss_any_fwd == 0')['annot_fwd'].value_counts(),
], axis=1).loc[config['annot_strand']].fillna(0)
df_fwd.columns = (
'Chen2013_TSS',
'Kruesi2013_TSS',
'Saito2013_TSS',
'Gu2012_TSS',
'Any1_TSS',
'Any2_TSS',
'Any3_TSS',
'All4_TSS',
'No_previously_known_TSS',
)
df_fwd
Out[5]:
In [6]:
df_rev = pd.concat([
df_regl.loc[df_regl['tss_chen_rev']]['annot_rev'].value_counts(),
df_regl.loc[df_regl['tss_kruesi_rev']]['annot_rev'].value_counts(),
df_regl.loc[df_regl['tss_saito_rev']]['annot_rev'].value_counts(),
df_regl.loc[df_regl['tss_gu_rev']]['annot_rev'].value_counts(),
df_regl.query('tss_any_rev == 1')['annot_rev'].value_counts(),
df_regl.query('tss_any_rev == 2')['annot_rev'].value_counts(),
df_regl.query('tss_any_rev == 3')['annot_rev'].value_counts(),
df_regl.query('tss_any_rev == 4')['annot_rev'].value_counts(),
df_regl.query('tss_any_rev == 0')['annot_rev'].value_counts(),
], axis=1).loc[config['annot_strand']].fillna(0)
df_rev.columns = (
'Chen2013_TSS',
'Kruesi2013_TSS',
'Saito2013_TSS',
'Gu2012_TSS',
'Any1_TSS',
'Any2_TSS',
'Any3_TSS',
'All4_TSS',
'No_previously_known_TSS',
)
df_rev
Out[6]:
In [7]:
df_fwd + df_rev
Out[7]:
In [16]:
df_regl.query('(annot_fwd == "coding_promoter") & (tss_any_fwd == 0)')[yp.NAMES_BED3]\
.to_csv('annot/FigC_TSS/TSS_not_previously_known_fwd.bed', sep='\t', index=False, header=False)
df_regl.query('(annot_rev == "coding_promoter") & (tss_any_rev == 0)')[yp.NAMES_BED3]\
.to_csv('annot/FigC_TSS/TSS_not_previously_known_rev.bed', sep='\t', index=False, header=False)
!wc -l annot/FigC_TSS/TSS_not_previously_known_fwd.bed
!wc -l annot/FigC_TSS/TSS_not_previously_known_rev.bed
In [8]:
#df_regl.query('(annot_fwd == "no_transcription") & (tss_chen_fwd != 0)')
#df_regl.query('(annot_fwd == "coding_promoter") & (tss_any_fwd == 0)')
In [9]:
def itemRgb_(flag):
if flag:
return RED
else:
return BLUE
strand = 'fwd'
write_gffbed(
fp = 'annot/FigC_TSS/TSS_Chen_%(strand)s.bed' % locals(),
chrom = df_regl['chrom'],
start = df_regl['start'],
end = df_regl['end'],
attr = df_chen_fwd[['chen2013_fwd_distance', 'chen2013_fwd_name']],
strand = '+',
itemRgb = df_regl['tss_chen_fwd'].map(itemRgb_),
)
strand = 'rev'
write_gffbed(
fp = 'annot/FigC_TSS/TSS_Chen_%(strand)s.bed' % locals(),
chrom = df_regl['chrom'],
start = df_regl['start'],
end = df_regl['end'],
attr = df_chen_rev[['chen2013_rev_distance', 'chen2013_rev_name']],
strand = '-',
itemRgb = df_regl['tss_chen_rev'].map(itemRgb_),
)