In [2]:
# initialise and load WB annotations
%run ~/relmapping/annot/notebooks/__init__.ipynb
fp_ = 'WS260_ce10/WS260_ce10.transcripts.annot.gtf.gz'
df_annot = yp.df_gfftags_unpack(yp.read_wbgtf(fp_, parse_attr=False), name='attribute')
def vp(fp): return os.path.join('annot/Fig2D2_genomic_regions/', fp) # "verbose path"
In [3]:
# annot/S3_genomic_regions/df_exonic.bed: "union-of-exons" having the same gene_id
df_ = df_annot.query('feature == "exon"')[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]
df_['start'] = df_['start'] - 1 # gtf-to-bed
df_ = BedTool.from_dataframe(df_).cluster(s=True).to_dataframe()
df_.columns = ['chrom', 'start', 'end', 'gene_id', 'score', 'strand', 'cluster_id']
df_exonic = df_.groupby(['gene_id', 'cluster_id']).agg({
'chrom': lambda s: list(set(s))[0],
'start': np.min,
'end': np.max,
'score': lambda s: list(set(s))[0],
'strand': lambda s: list(set(s))[0],
}).reset_index()[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]\
.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
fp_ = vp('df_exonic.bed')
df_exonic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_exonic.head()
Out[3]:
In [4]:
# annot/S3_genomic_regions/df_gene_ends.bed: sequence flanking flank_ at the 3' end of the last exon of a gene
# (So alternative 3' ends are ignored by construction)
# df_ <- all exons
df_ = df_annot.query('feature == "exon"')[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]
df_['start'] = df_['start'] - 1 # gtf-to-bed
# df_ <- collapse to gene boundaries by gene_id
df_ = df_.groupby(['gene_id']).agg({
'chrom': lambda s: list(set(s))[0],
'start': np.min,
'end': np.max,
'gene_id': lambda s: ','.join(sorted(set(s))),
'score': lambda s: list(set(s))[0],
'strand': lambda s: list(set(s))[0],
})
def gene_end_(start, end, strand):
if strand == '+':
return end - 1
elif strand == '-':
return start
assert False
flank_ = 100
df_gene_ends = pd.DataFrame()
df_gene_ends['chrom'] = df_['chrom']
df_gene_ends['start'] = list(map(gene_end_, df_['start'], df_['end'], df_['strand']))
df_gene_ends['end'] = df_gene_ends['start'] + 1
df_gene_ends['start'] -= flank_
df_gene_ends['end'] += flank_
df_gene_ends['gene_id'] = df_['gene_id']
df_gene_ends['score'] = df_['score']
df_gene_ends['strand'] = df_['strand']
df_gene_ends = df_gene_ends.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
fp_ = vp('df_gene_ends.bed')
df_gene_ends.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_gene_ends.head()
Out[4]:
In [5]:
# annot/S3_genomic_regions/df_intronic.bed: "intron regions", derived from "union-of-exons'
def to_intronic(df_gene_id):
l_chrom = []
l_start = []
l_end = []
l_gene_id = []
l_score = []
l_strand = []
for ((i0, r0), (i1, r1)) in yp.pairwise(df_gene_id.iterrows()):
if r0['chrom'] != r1['chrom']:
continue
l_chrom.append(r0['chrom'])
l_start.append(r0['end'])
l_end.append(r1['start'])
l_gene_id.append(r0['gene_id'])
l_score.append(r0['score'])
l_strand.append(r0['strand'])
df_intronic = pd.DataFrame({'chrom': l_chrom, 'start': l_start, 'end': l_end,
'gene_id': l_gene_id, 'score': l_score, 'strand': l_strand})
df_intronic['start'] = df_intronic['start'].astype(int)
df_intronic['end'] = df_intronic['end'].astype(int)
return df_intronic[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]
df_intronic = pd.concat([to_intronic(df_gene_id_) for gene_id_, df_gene_id_ in df_exonic.groupby('gene_id')], axis=0)
df_intronic = df_intronic.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
fp_ = vp('df_intronic.bed')
df_intronic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_intronic.head()
Out[5]:
In [8]:
# annot/S3_genomic_regions/df_outronic.bed: TSS to closest downstream annotated transcript start
def S2_exon1_(df_regl): # Attaches exon1, exactly as in the regulatory annotation (should really be calculated once and stored)
#NAMES_EXON = ('chrom', 'start', 'end', 'transcript_id', 'score', 'strand', 'gene_id', 'gene_biotype', 'display_id')
df_exon1_fwd = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_fwd.bed')
df_exon1_rev = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_rev.bed')
# set boundaries to 5' end only
df_exon1_fwd['end'] = df_exon1_fwd['start'] + 1
df_exon1_rev['start'] = df_exon1_rev['end'] - 1
print(len(df_exon1_fwd), len(df_exon1_rev))
# Match hypersensitive sites to closest first/other exon
def df_closest(df_a, df_b, b_prefix, **kwargs):
df_a_sort = df_a
df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
names_ = list(itertools.chain(df_a.columns.values,
['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
['%s_distance' % (b_prefix)]
))
df_ = pd.read_csv(fn_, names=names_, sep='\t')
return df_[names_[len(df_a.columns):]]
# Closest exon1 -- 200 upstream from atac_mode towards downstream
closest_exon1_flank = 200
df_ = pd.concat([df_regl['chrom'], summit_pos(df_regl) - closest_exon1_flank, summit_pos(df_regl) - closest_exon1_flank + 1], axis=1).copy()
df_exon1_fwd_out_ = df_closest(df_, df_exon1_fwd, 'exon1', D='ref', t='first', iu=True)
df_ = pd.concat([df_regl['chrom'], summit_pos(df_regl) + closest_exon1_flank, summit_pos(df_regl) + closest_exon1_flank + 1], axis=1).copy()
df_exon1_rev_out_ = df_closest(df_, df_exon1_rev, 'exon1', D='ref', t='first', id=True)
return[df_exon1_fwd_out_, df_exon1_rev_out_]
df_regl = regl_Apr27()
print('%d regions loaded' % (len(df_regl),))
(df_exon1_fwd, df_exon1_rev) = S2_exon1_(df_regl)
df_regl['exon1_start_fwd'] = df_exon1_fwd['exon1_start']
df_regl['exon1_end_rev'] = df_exon1_rev['exon1_end']
# coding promoters, regions (this will intentionally leave out really short outrons,
# as these wouldn't "include" any accessible sites anyways)
q_fwd_ = '(annot_fwd == "coding_promoter") & (((exon1_start_fwd - 1) - end) > 10)'
q_rev_ = '(annot_rev == "coding_promoter") & ((start - (exon1_end_rev - 1)) > 10)'
df_outronic_fwd = df_regl.query(q_fwd_)[['chrom', 'end', 'exon1_start_fwd', 'promoter_gene_id_fwd']].reset_index(drop=True)
df_outronic_rev = df_regl.query(q_rev_)[['chrom', 'exon1_end_rev', 'start', 'promoter_gene_id_rev']].reset_index(drop=True)
df_outronic_fwd['exon1_start_fwd'] -= 1 # gtf-to-bed
df_outronic_fwd.columns = ['chrom', 'start', 'end', 'gene_id']
df_outronic_rev.columns = ['chrom', 'start', 'end', 'gene_id']
df_outronic_fwd['score'] = '.'
df_outronic_rev['score'] = '.'
df_outronic_fwd['strand'] = '+'
df_outronic_rev['strand'] = '-'
# Merge across strands
df_outronic = pd.concat([df_outronic_fwd, df_outronic_rev], axis=0)\
.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
fp_ = vp('df_outronic.bed')
df_outronic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_outronic.head()
Out[8]:
In [9]:
# annot/S3_genomic_regions/df_intergenic.bed: "all except exonic, intronic, outronic"
df_ = pd.concat([df_outronic, df_exonic, df_intronic, df_gene_ends], axis=0)\
.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
df_intergenic = BedTool.from_dataframe(df_).complement(g='shared/ce10.chroms').to_dataframe()
df_intergenic['gene_id'] = '.'
df_intergenic['score'] = '.'
df_intergenic['strand'] = '.'
fp_ = vp('df_intergenic.bed')
df_intergenic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_intergenic.head()
Out[9]:
In [10]:
# df_pooled <- exonic, intronic, intergenic, outronic
df_outronic['type'] = 'outronic'
df_exonic['type'] = 'exonic'
df_intronic['type'] = 'intronic'
df_gene_ends['type'] = 'gene_end'
df_intergenic['type'] = 'intergenic'
df_pooled = pd.concat([df_outronic, df_exonic, df_intronic, df_gene_ends, df_intergenic], axis=0)\
.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
print('%d pooled segments' % (len(df_pooled),))
In [12]:
# annot/S3_genomic_regions/S3a_genomic_regions_full.bed <- full regions, unmixed
d_type_colour = {
'outronic': yp.RED,
'exonic': yp.ORANGE,
'intronic': yp.YELLOW,
'gene_end': yp.PURPLE,
'intergenic': yp.SKYBLUE,
'mixed': yp.BLACK,
}
def name_(type_, gene_id_):
if type_ == 'intergenic' or type_ == 'mixed':
return type_
else:
return '%s: %s' % (type_, gene_id_)
write_gffbed(vp('Fig2D2_genomic_regions_pre-unique.bed'),
chrom = df_pooled['chrom'],
start = df_pooled['start'],
end = df_pooled['end'],
name = list(map(name_, df_pooled['type'], df_pooled['gene_id'])),
strand = df_pooled['strand'],
itemRgb = map(lambda type_: d_type_colour[type_], df_pooled['type']),
attr = df_pooled[['type', 'gene_id']]
)
In [13]:
# Build gas
gas = hts.GenomicArrayOfSets(chroms=yp.chroms_ce10, stranded=False)
for r in df_pooled.itertuples():
iv = hts.GenomicInterval(r.chrom, r.start, r.end)
try:
gas[iv] += r
except IndexError:
print(iv, r.type)
In [14]:
# annot/S3_genomic_regions/S3a_genomic_regions_unique.bed <- regions, mixed
# Note: leaves some "identical" neighbouring states at the moment
# last exon, by construction, overlaps with gene end
# => remove exonic if there's a gene_end with the same gene_id
# => exons can be short, therefore also ignore intronic
def collapse_at_gene_end(s_r_):
l_gene_end_gene_id = set([r.gene_id for r in s_r_ if r.type == 'gene_end'])
for r in s_r_:
if (r.type in ['exonic', 'intronic']) and (r.gene_id in l_gene_end_gene_id):
continue
yield r
def df_unique_(gas):
n_exonic_gene_end = 0
for chrom in gas.chrom_vectors.keys():
iv_chrom = gas.chrom_vectors[chrom]['.'].iv
for iv, s_r_ in itertools.islice(gas[iv_chrom].steps(), None):
s_r = list(collapse_at_gene_end(s_r_))
type_ = ','.join(sorted(set([r.type for r in s_r])))
gene_id = ','.join(sorted(set([r.gene_id for r in s_r])))
l_strand_ = sorted(set([r.strand for r in s_r]))
strand = '.' if len(l_strand_) > 1 else l_strand_[0]
# "Unique-ify based on type, e.g. sequence in exons of two different genes is still exonic"
if (len(set([r.type for r in s_r])) > 1):
type_ = 'mixed'
gene_id = ','.join(sorted(set([r.gene_id for r in s_r])))
l_strand_ = list(set([r.strand for r in s_r]))
strand = '.' if len(l_strand_) > 1 else l_strand_[0]
yield([iv.chrom, iv.start, iv.end, type_, gene_id, strand])
df_unique = pd.DataFrame.from_records(df_unique_(gas), columns=['chrom', 'start', 'end', 'type', 'gene_id', 'strand'])\
.sort_values(['chrom', 'start', 'end', 'strand'])
df_unique['score'] = '.'
assert (df_unique['end'] - df_unique['start']).sum() == sum(yp.chroms_ce10.values())
In [15]:
# df_unique['segment_id'] <- adjacent segments with identical chrom, type, gene_id
df_unique['segment_id'] = -1
df_unique.loc[0, 'segment_id'] = 0
for i in range(1, len(df_unique)):
if (df_unique.loc[i, 'chrom'] == df_unique.loc[i - 1, 'chrom']) and\
(df_unique.loc[i, 'type'] == df_unique.loc[i - 1, 'type']) and\
(df_unique.loc[i, 'gene_id'] == df_unique.loc[i - 1, 'gene_id']):
df_unique.loc[i, 'segment_id'] = df_unique.loc[i - 1, 'segment_id']
else:
df_unique.loc[i, 'segment_id'] = df_unique.loc[i - 1, 'segment_id'] + 1
In [16]:
df_unique_out = df_unique.groupby(['segment_id']).agg({
'chrom': lambda s: list(set(s))[0],
'start': np.min,
'end': np.max,
'type': lambda s: list(set(s))[0],
'gene_id': lambda s: list(set(s))[0],
'strand': lambda s: list(set(s))[0],
'score': lambda s: list(set(s))[0],
}).reset_index()[['chrom', 'start', 'end', 'type', 'gene_id', 'strand', 'score']]\
.sort_values(['chrom', 'start', 'end']).reset_index(drop=True)
print('%d raw, %d collapsed' % (len(df_unique), len(df_unique_out)))
In [17]:
write_gffbed(vp('Fig2D2_genomic_regions.bed'),
chrom = df_unique_out['chrom'],
start = df_unique_out['start'],
end = df_unique_out['end'],
name = list(map(name_, df_unique_out['type'], df_unique_out['gene_id'])),
strand = df_unique_out['strand'],
itemRgb = map(lambda type_: d_type_colour[type_], df_unique_out['type']),
attr = df_unique_out[['type', 'gene_id']]
)
In [18]:
fp_ = vp('Fig2D2_genomic_regions.tsv')
df_unique_out[['chrom', 'start', 'end', 'type', 'strand', 'gene_id']].to_csv(fp_, header=True, index=False, sep='\t')