In [2]:
# initialise and load WB annotations
%run ~/relmapping/annot/notebooks/__init__.ipynb
fp_ = 'WS260_ce10/WS260_ce10.transcripts.annot.gtf.gz'
df_annot = yp.df_gfftags_unpack(yp.read_wbgtf(fp_, parse_attr=False), name='attribute')
def vp(fp): return os.path.join('annot/Fig2D2_genomic_regions/', fp) # "verbose path"

In [3]:
# annot/S3_genomic_regions/df_exonic.bed: "union-of-exons" having the same gene_id
df_ = df_annot.query('feature == "exon"')[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]
df_['start'] = df_['start'] - 1 # gtf-to-bed
df_ = BedTool.from_dataframe(df_).cluster(s=True).to_dataframe()
df_.columns = ['chrom', 'start', 'end', 'gene_id', 'score', 'strand', 'cluster_id']
df_exonic = df_.groupby(['gene_id', 'cluster_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
        'score': lambda s: list(set(s))[0],
        'strand': lambda s: list(set(s))[0],
}).reset_index()[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]\
  .sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)

fp_ = vp('df_exonic.bed')
df_exonic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_exonic.head()


135091 annot/Fig2D2_genomic_regions/df_exonic.bed
Out[3]:
chrom start end gene_id score strand
0 chrI 3746 3909 WBGene00023193 . -
1 chrI 4115 4358 WBGene00022277 . -
2 chrI 5194 5296 WBGene00022277 . -
3 chrI 6036 6327 WBGene00022277 . -
4 chrI 9726 9846 WBGene00022277 . -

In [4]:
# annot/S3_genomic_regions/df_gene_ends.bed: sequence flanking flank_ at the 3' end of the last exon of a gene
# (So alternative 3' ends are ignored by construction)
# df_ <- all exons
df_ = df_annot.query('feature == "exon"')[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]
df_['start'] = df_['start'] - 1 # gtf-to-bed
# df_ <- collapse to gene boundaries by gene_id
df_ = df_.groupby(['gene_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
        'gene_id': lambda s: ','.join(sorted(set(s))),
        'score': lambda s: list(set(s))[0],
        'strand': lambda s: list(set(s))[0],
})

def gene_end_(start, end, strand):
    if strand == '+':
        return end - 1
    elif strand == '-':
        return start
    assert False

flank_ = 100
df_gene_ends = pd.DataFrame()
df_gene_ends['chrom'] = df_['chrom']
df_gene_ends['start'] = list(map(gene_end_, df_['start'], df_['end'], df_['strand']))
df_gene_ends['end'] = df_gene_ends['start'] + 1
df_gene_ends['start'] -= flank_
df_gene_ends['end'] += flank_
df_gene_ends['gene_id'] = df_['gene_id']
df_gene_ends['score'] = df_['score']
df_gene_ends['strand'] = df_['strand']
df_gene_ends = df_gene_ends.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)

fp_ = vp('df_gene_ends.bed')
df_gene_ends.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_gene_ends.head()


23155 annot/Fig2D2_genomic_regions/df_gene_ends.bed
Out[4]:
chrom start end gene_id score strand
0 chrI 3646 3847 WBGene00023193 . -
1 chrI 4015 4216 WBGene00022277 . -
2 chrI 16736 16937 WBGene00022276 . +
3 chrI 17386 17587 WBGene00022278 . -
4 chrI 22781 22982 WBGene00235381 . -

In [5]:
# annot/S3_genomic_regions/df_intronic.bed: "intron regions", derived from "union-of-exons'
def to_intronic(df_gene_id):
    l_chrom = []
    l_start = []
    l_end = []
    l_gene_id = []
    l_score = []
    l_strand = []
    for ((i0, r0), (i1, r1)) in yp.pairwise(df_gene_id.iterrows()):
        if r0['chrom'] != r1['chrom']:
            continue
        l_chrom.append(r0['chrom'])
        l_start.append(r0['end'])
        l_end.append(r1['start'])
        l_gene_id.append(r0['gene_id'])
        l_score.append(r0['score'])
        l_strand.append(r0['strand'])
    df_intronic = pd.DataFrame({'chrom': l_chrom, 'start': l_start, 'end': l_end, 
                                'gene_id': l_gene_id, 'score': l_score, 'strand': l_strand})
    df_intronic['start'] = df_intronic['start'].astype(int)
    df_intronic['end'] = df_intronic['end'].astype(int)
    return df_intronic[['chrom', 'start', 'end', 'gene_id', 'score', 'strand']]

df_intronic = pd.concat([to_intronic(df_gene_id_) for gene_id_, df_gene_id_ in df_exonic.groupby('gene_id')], axis=0) 
df_intronic = df_intronic.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)

fp_ = vp('df_intronic.bed')
df_intronic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_intronic.head()


111936 annot/Fig2D2_genomic_regions/df_intronic.bed
Out[5]:
chrom start end gene_id score strand
0 chrI 4358 5194 WBGene00022277 . -
1 chrI 5296 6036 WBGene00022277 . -
2 chrI 6327 9726 WBGene00022277 . -
3 chrI 9846 10094 WBGene00022277 . -
4 chrI 11561 11617 WBGene00022276 . +

In [8]:
# annot/S3_genomic_regions/df_outronic.bed: TSS to closest downstream annotated transcript start
def S2_exon1_(df_regl): # Attaches exon1, exactly as in the regulatory annotation (should really be calculated once and stored)
    #NAMES_EXON = ('chrom', 'start', 'end', 'transcript_id', 'score', 'strand', 'gene_id', 'gene_biotype', 'display_id')
    df_exon1_fwd = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_fwd.bed')
    df_exon1_rev = read_gffbed('WS260_ce10/WS260_ce10.transcripts.annot_pass1_exon1_rev.bed')

    # set boundaries to 5' end only
    df_exon1_fwd['end'] = df_exon1_fwd['start'] + 1
    df_exon1_rev['start'] = df_exon1_rev['end'] - 1

    print(len(df_exon1_fwd), len(df_exon1_rev))

    # Match hypersensitive sites to closest first/other exon
    def df_closest(df_a, df_b, b_prefix, **kwargs):
        df_a_sort = df_a
        df_b_sort = df_b.sort_values(list(df_b.columns[:3]))
        fn_ = BedTool.from_dataframe(df_a).closest(BedTool.from_dataframe(df_b_sort).fn, **kwargs).fn
        names_ = list(itertools.chain(df_a.columns.values,
            ['%s_%s' % (b_prefix, col) for col in df_b.columns.values],
            ['%s_distance' % (b_prefix)]
        ))
        df_ = pd.read_csv(fn_, names=names_, sep='\t')
        return df_[names_[len(df_a.columns):]]

    # Closest exon1 -- 200 upstream from atac_mode towards downstream
    closest_exon1_flank = 200
    df_ = pd.concat([df_regl['chrom'], summit_pos(df_regl) - closest_exon1_flank, summit_pos(df_regl) - closest_exon1_flank + 1], axis=1).copy()
    df_exon1_fwd_out_ = df_closest(df_, df_exon1_fwd, 'exon1', D='ref', t='first', iu=True)
    df_ = pd.concat([df_regl['chrom'], summit_pos(df_regl) + closest_exon1_flank, summit_pos(df_regl) + closest_exon1_flank + 1], axis=1).copy()
    df_exon1_rev_out_ = df_closest(df_, df_exon1_rev, 'exon1', D='ref', t='first', id=True)
    return[df_exon1_fwd_out_, df_exon1_rev_out_]

df_regl = regl_Apr27()
print('%d regions loaded' % (len(df_regl),))

(df_exon1_fwd, df_exon1_rev) = S2_exon1_(df_regl)
df_regl['exon1_start_fwd'] = df_exon1_fwd['exon1_start']
df_regl['exon1_end_rev'] = df_exon1_rev['exon1_end']

# coding promoters, regions (this will intentionally leave out really short outrons, 
# as these wouldn't "include" any accessible sites anyways)
q_fwd_ = '(annot_fwd == "coding_promoter") & (((exon1_start_fwd - 1) - end) > 10)'
q_rev_ = '(annot_rev == "coding_promoter") & ((start - (exon1_end_rev - 1)) > 10)'
df_outronic_fwd = df_regl.query(q_fwd_)[['chrom', 'end', 'exon1_start_fwd', 'promoter_gene_id_fwd']].reset_index(drop=True)
df_outronic_rev = df_regl.query(q_rev_)[['chrom', 'exon1_end_rev', 'start', 'promoter_gene_id_rev']].reset_index(drop=True)
df_outronic_fwd['exon1_start_fwd'] -= 1 # gtf-to-bed
df_outronic_fwd.columns = ['chrom', 'start', 'end', 'gene_id']
df_outronic_rev.columns = ['chrom', 'start', 'end', 'gene_id']
df_outronic_fwd['score'] = '.'
df_outronic_rev['score'] = '.'
df_outronic_fwd['strand'] = '+'
df_outronic_rev['strand'] = '-'

# Merge across strands
df_outronic = pd.concat([df_outronic_fwd, df_outronic_rev], axis=0)\
    .sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)
fp_ = vp('df_outronic.bed')
df_outronic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_outronic.head()


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/ipykernel_launcher.py:64: RuntimeWarning: Mean of empty slice
13054 of 42245 sites with CV values via promoter annotation
26764 of 42245 sites with CV values via "associated gene"
42245 regions loaded
/mnt/home1/ahringer/jj374/relmapping/scripts/yarp/yarp.py:400: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  df_name = df_name.convert_objects(convert_numeric=True)
17922 17074
12107 annot/Fig2D2_genomic_regions/df_outronic.bed
Out[8]:
chrom start end gene_id score strand
0 chrI 10230 11272 WBGene00022277 . -
1 chrI 11423 11493 WBGene00022276 . +
2 chrI 26781 26902 WBGene00022278 . -
3 chrI 32544 36018 WBGene00022279 . -
4 chrI 32544 42126 WBGene00022279 . -

In [9]:
# annot/S3_genomic_regions/df_intergenic.bed: "all except exonic, intronic, outronic"
df_ = pd.concat([df_outronic, df_exonic, df_intronic, df_gene_ends], axis=0)\
    .sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)

df_intergenic = BedTool.from_dataframe(df_).complement(g='shared/ce10.chroms').to_dataframe()
df_intergenic['gene_id'] = '.'
df_intergenic['score'] = '.'
df_intergenic['strand'] = '.'

fp_ = vp('df_intergenic.bed')
df_intergenic.to_csv(fp_, header=False, index=False, sep='\t')
!wc -l {fp_}
df_intergenic.head()


21057 annot/Fig2D2_genomic_regions/df_intergenic.bed
Out[9]:
chrom start end gene_id score strand
0 chrI 0 3646 . . .
1 chrI 3909 4015 . . .
2 chrI 11272 11423 . . .
3 chrI 11493 11494 . . .
4 chrI 16937 17386 . . .

In [10]:
# df_pooled <- exonic, intronic, intergenic, outronic
df_outronic['type'] = 'outronic'
df_exonic['type'] = 'exonic'
df_intronic['type'] = 'intronic'
df_gene_ends['type'] = 'gene_end'
df_intergenic['type'] = 'intergenic'

df_pooled = pd.concat([df_outronic, df_exonic, df_intronic, df_gene_ends, df_intergenic], axis=0)\
    .sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)

print('%d pooled segments' % (len(df_pooled),))


303346 pooled segments

In [12]:
# annot/S3_genomic_regions/S3a_genomic_regions_full.bed <- full regions, unmixed
d_type_colour = {
    'outronic': yp.RED,
    'exonic': yp.ORANGE,
    'intronic': yp.YELLOW,
    'gene_end': yp.PURPLE,
    'intergenic': yp.SKYBLUE,
    'mixed': yp.BLACK,
}

def name_(type_, gene_id_):
    if type_ == 'intergenic' or type_ == 'mixed':
        return type_
    else:
        return '%s: %s' % (type_, gene_id_)

write_gffbed(vp('Fig2D2_genomic_regions_pre-unique.bed'),
    chrom = df_pooled['chrom'],
    start = df_pooled['start'],
    end = df_pooled['end'],
    name = list(map(name_, df_pooled['type'], df_pooled['gene_id'])),
    strand = df_pooled['strand'],
    itemRgb = map(lambda type_: d_type_colour[type_], df_pooled['type']),
    attr = df_pooled[['type', 'gene_id']]
)

In [13]:
# Build gas 
gas = hts.GenomicArrayOfSets(chroms=yp.chroms_ce10, stranded=False)
for r in df_pooled.itertuples():
    iv = hts.GenomicInterval(r.chrom, r.start, r.end)
    try:
        gas[iv] += r
    except IndexError:
        print(iv, r.type)

In [14]:
# annot/S3_genomic_regions/S3a_genomic_regions_unique.bed <- regions, mixed
# Note: leaves some "identical" neighbouring states at the moment

# last exon, by construction, overlaps with gene end
# => remove exonic if there's a gene_end with the same gene_id
# => exons can be short, therefore also ignore intronic
def collapse_at_gene_end(s_r_):
    l_gene_end_gene_id = set([r.gene_id for r in s_r_ if r.type == 'gene_end'])
    for r in s_r_:
        if (r.type in ['exonic', 'intronic']) and (r.gene_id in l_gene_end_gene_id):
            continue
        yield r

def df_unique_(gas):
    n_exonic_gene_end = 0
    for chrom in gas.chrom_vectors.keys():
        iv_chrom = gas.chrom_vectors[chrom]['.'].iv
        for iv, s_r_ in itertools.islice(gas[iv_chrom].steps(), None):
            s_r = list(collapse_at_gene_end(s_r_))
            type_ = ','.join(sorted(set([r.type for r in s_r])))
            gene_id = ','.join(sorted(set([r.gene_id for r in s_r])))
            l_strand_ = sorted(set([r.strand for r in s_r]))
            strand = '.' if len(l_strand_) > 1 else l_strand_[0]

            # "Unique-ify based on type, e.g. sequence in exons of two different genes is still exonic"
            if (len(set([r.type for r in s_r])) > 1):
                type_ = 'mixed'
                gene_id = ','.join(sorted(set([r.gene_id for r in s_r])))
                l_strand_ = list(set([r.strand for r in s_r]))
                strand = '.' if len(l_strand_) > 1 else l_strand_[0]

            yield([iv.chrom, iv.start, iv.end, type_, gene_id, strand])

df_unique = pd.DataFrame.from_records(df_unique_(gas), columns=['chrom', 'start', 'end', 'type', 'gene_id', 'strand'])\
    .sort_values(['chrom', 'start', 'end', 'strand'])
df_unique['score'] = '.'

assert (df_unique['end'] - df_unique['start']).sum() == sum(yp.chroms_ce10.values())

In [15]:
# df_unique['segment_id'] <- adjacent segments with identical chrom, type, gene_id
df_unique['segment_id'] = -1
df_unique.loc[0, 'segment_id'] = 0
for i in range(1, len(df_unique)):
    if (df_unique.loc[i, 'chrom'] == df_unique.loc[i - 1, 'chrom']) and\
       (df_unique.loc[i, 'type'] == df_unique.loc[i - 1, 'type']) and\
       (df_unique.loc[i, 'gene_id'] == df_unique.loc[i - 1, 'gene_id']):
        df_unique.loc[i, 'segment_id'] = df_unique.loc[i - 1, 'segment_id']
    else:
        df_unique.loc[i, 'segment_id'] = df_unique.loc[i - 1, 'segment_id'] + 1

In [16]:
df_unique_out = df_unique.groupby(['segment_id']).agg({
        'chrom': lambda s: list(set(s))[0],
        'start': np.min,
        'end': np.max,
        'type': lambda s: list(set(s))[0],
        'gene_id': lambda s: list(set(s))[0],
        'strand': lambda s: list(set(s))[0],
        'score': lambda s: list(set(s))[0],
}).reset_index()[['chrom', 'start', 'end', 'type', 'gene_id', 'strand', 'score']]\
.sort_values(['chrom', 'start', 'end']).reset_index(drop=True)
print('%d raw, %d collapsed' % (len(df_unique), len(df_unique_out)))


332908 raw, 296393 collapsed

In [17]:
write_gffbed(vp('Fig2D2_genomic_regions.bed'),
    chrom = df_unique_out['chrom'],
    start = df_unique_out['start'],
    end = df_unique_out['end'],
    name = list(map(name_, df_unique_out['type'], df_unique_out['gene_id'])),
    strand = df_unique_out['strand'],
    itemRgb = map(lambda type_: d_type_colour[type_], df_unique_out['type']),
    attr = df_unique_out[['type', 'gene_id']]
)

In [18]:
fp_ = vp('Fig2D2_genomic_regions.tsv')
df_unique_out[['chrom', 'start', 'end', 'type', 'strand', 'gene_id']].to_csv(fp_, header=True, index=False, sep='\t')