In [ ]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb

In [2]:
kwargs_to_gtf = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t'}
fp_ = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_briggsae/PRJNA10731/c_briggsae.PRJNA10731.WS260.canonical_geneset.gtf.gz'
df_ = pd.read_csv(fp_, sep='\t', names=NAMES_GTF, comment='#')
print('%d raw records' % (len(df_),))
l_chroms = ['I', 'II', 'III', 'IV', 'V', 'X',]
df_ = df_.query('chrom in @l_chroms')
print('%d records from non-scaffolds' % (len(df_),))


467622 raw records
457497 records from non-scaffolds

In [3]:
kwargs_ = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t', 'compression': 'gzip'}
df_.query('feature=="gene"').to_csv('annot_cb/canonical_geneset/WS260_CB4.genes.gtf.gz', **kwargs_)
df_.query('feature!="gene"').to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.gtf.gz', **kwargs_)

In [12]:
df_genes_raw = df_.query('feature=="gene"').reset_index(drop=True)
df_genes = df_gfftags_unpack(df_genes_raw, name='attribute')
df_genes['gene_biotype'].value_counts()


Out[12]:
protein_coding    21653
tRNA                759
pseudogene          338
miRNA               133
ncRNA               117
snRNA               107
snoRNA              103
lincRNA              16
rRNA                  5
Name: gene_biotype, dtype: int64

In [14]:
kwargs_to_gtf = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t'}
for gene_biotype in df_genes['gene_biotype'].value_counts().index.tolist():#['tRNA', 'snoRNA', 'miRNA', 'lincRNA', 'snRNA', 'antisense', 'rRNA',]:
    fp_ = 'annot_cb/canonical_geneset/WS260_CB4.genes.%s.gtf.gz' % (gene_biotype,)
    print(fp_)
    df_ = df_genes_raw.loc[df_genes.query('gene_biotype == "%s"' % (gene_biotype)).index]
    df_.to_csv(fp_, compression='gzip', **kwargs_to_gtf)
    !gunzip -c {fp_} | wc -l


annot_cb/canonical_geneset/WS260_CB4.genes.protein_coding.gtf.gz
21653
annot_cb/canonical_geneset/WS260_CB4.genes.tRNA.gtf.gz
759
annot_cb/canonical_geneset/WS260_CB4.genes.pseudogene.gtf.gz
338
annot_cb/canonical_geneset/WS260_CB4.genes.miRNA.gtf.gz
133
annot_cb/canonical_geneset/WS260_CB4.genes.ncRNA.gtf.gz
117
annot_cb/canonical_geneset/WS260_CB4.genes.snRNA.gtf.gz
107
annot_cb/canonical_geneset/WS260_CB4.genes.snoRNA.gtf.gz
103
annot_cb/canonical_geneset/WS260_CB4.genes.lincRNA.gtf.gz
16
annot_cb/canonical_geneset/WS260_CB4.genes.rRNA.gtf.gz
5

In [5]:
df_transcripts_raw = read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.transcripts.gtf.gz', parse_attr=False)
print('%d total records (raw)' % (len(df_transcripts_raw),))
df_transcripts_parsed = read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.transcripts.gtf.gz', parse_attr=True)
#df_transcripts_parsed['locus_id'] = list(locus_id_from_gene_id(df_transcripts_parsed['gene_id']))
print('%d total records (parsed)' % (len(df_transcripts_parsed),))


434266 total records (raw)
434266 total records (parsed)

In [6]:
df_transcripts_parsed['transcript_biotype'].value_counts()


Out[6]:
protein_coding     430449
tRNA                 1556
pseudogene            884
tRNA_pseudogene       357
pre_miRNA             266
ncRNA                 260
snRNA                 214
snoRNA                206
lincRNA                64
rRNA                   10
Name: transcript_biotype, dtype: int64

In [7]:
# Load raw transcripts file; write file with suitable biotypes
l_annot_transcript_biotype = ['protein_coding', 'tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA', 'pseudogene']
l_annot_transcript_biotype_pass1 = ['protein_coding', 'pseudogene']
l_annot_transcript_biotype_pass2 = ['tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA']

m_ = df_transcripts_parsed['transcript_biotype'].isin(l_annot_transcript_biotype)
print('%d with transcript_biotype in %s' % (sum(m_), str(l_annot_transcript_biotype)))

m_pass1 = df_transcripts_parsed['transcript_biotype'].isin(l_annot_transcript_biotype_pass1)
print('%d with transcript_biotype in %s' % (sum(m_pass1), str(l_annot_transcript_biotype_pass1)))

m_pass2 = df_transcripts_parsed['transcript_biotype'].isin(l_annot_transcript_biotype_pass2)
print('%d with transcript_biotype in %s' % (sum(m_pass2), str(l_annot_transcript_biotype_pass2)))

kwargs_ = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t', 'compression': 'gzip'}

df_transcripts_raw[m_].to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot.gtf.gz', **kwargs_)
df_transcripts_raw[m_pass1].to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1.gtf.gz', **kwargs_)
df_transcripts_raw[m_pass2].to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2.gtf.gz', **kwargs_)


433319 with transcript_biotype in ['protein_coding', 'tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA', 'pseudogene']
431333 with transcript_biotype in ['protein_coding', 'pseudogene']
1986 with transcript_biotype in ['tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA']

In [8]:
# Exons -- annotate step -- pass1 or pass2
df_exon = df_transcripts_parsed.query('feature == "exon"').reset_index(drop=True)
df_exon['start'] = df_exon['start'] - 1 # coordinates .gtf-to-.bed
df_exon['is_pass1'] = df_exon['transcript_biotype'].map(lambda s: s in l_annot_transcript_biotype_pass1)
df_exon['is_pass2'] = df_exon['transcript_biotype'].map(lambda s: s in l_annot_transcript_biotype_pass2)
print(df_exon['is_pass1'].value_counts())
print(df_exon['is_pass2'].value_counts())

df_exon.rename(columns={'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
df_exon.head()


True     162339
False      1513
Name: is_pass1, dtype: int64
False    162840
True       1012
Name: is_pass2, dtype: int64
Out[8]:
chrom source feature exon_start exon_end score strand frame exon_id exon_number gene_biotype gene_id gene_source protein_id transcript_biotype transcript_id transcript_source is_pass1 is_pass2
0 IV WormBase exon 1232 1487 . + . CBG00323.e1 1.0 protein_coding WBGene00023731 WormBase NaN protein_coding CBG00323 WormBase True False
1 IV WormBase exon 1615 1735 . + . CBG00323.e2 2.0 protein_coding WBGene00023731 WormBase NaN protein_coding CBG00323 WormBase True False
2 IV WormBase exon 2125 2212 . + . CBG00323.e3 3.0 protein_coding WBGene00023731 WormBase NaN protein_coding CBG00323 WormBase True False
3 IV WormBase exon 2247 2367 . + . CBG00323.e4 4.0 protein_coding WBGene00023731 WormBase NaN protein_coding CBG00323 WormBase True False
4 IV WormBase exon 2780 2912 . + . CBG00323.e5 5.0 protein_coding WBGene00023731 WormBase NaN protein_coding CBG00323 WormBase True False

In [9]:
# aoe = 1st exon assignment, fivep = excluding exon-proximal artefacts
df_exon_fwd = df_exon.query('strand == "+"').reset_index(drop=True)
df_exon_rev = df_exon.query('strand == "-"').reset_index(drop=True)

df_exon_fwd['fivep_start'] = df_exon_fwd['exon_start'] 
df_exon_fwd['fivep_end'] = df_exon_fwd['exon_start'] + 1
df_exon_rev['fivep_start'] = df_exon_rev['exon_end'] - 1 
df_exon_rev['fivep_end'] = df_exon_rev['exon_end']

# allow ATAC-seq mode up to 100bp downstream of exon start (similar to TIC-to-TSS assignments in Chen et al., 2013)
downstream_extension_len = 100
df_exon_fwd['aoe_start'] = df_exon_fwd['exon_start']
df_exon_fwd['aoe_end'] = df_exon_fwd['exon_start'] + downstream_extension_len
df_exon_rev['aoe_start'] = df_exon_rev['exon_end'] - downstream_extension_len
df_exon_rev['aoe_end'] = df_exon_rev['exon_end']

In [10]:
df_exon_fwd_pass1_exon1 = df_exon_fwd.query('is_pass1 & exon_number == 1').reset_index(drop=True)
df_exon_rev_pass1_exon1 = df_exon_rev.query('is_pass1 & exon_number == 1').reset_index(drop=True)
df_exon_fwd_pass1_exon2 = df_exon_fwd.query('is_pass1 & exon_number > 1').reset_index(drop=True)
df_exon_rev_pass1_exon2 = df_exon_rev.query('is_pass1 & exon_number > 1').reset_index(drop=True)
l_attr = ['exon_id', 'gene_biotype', 'gene_id', #'locus_id', 'transcript_biotype', 'transcript_id', 
         'fivep_start', 'fivep_end', 'aoe_start', 'aoe_end', 'exon_start', 'exon_end']
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd.bed',
    chrom = df_exon_fwd_pass1_exon1['chrom'],
    start = df_exon_fwd_pass1_exon1['aoe_start'],
    end = df_exon_fwd_pass1_exon1['aoe_end'],
    name = df_exon_fwd_pass1_exon1['gene_id'],
    attr = df_exon_fwd_pass1_exon1[l_attr],
    strand = df_exon_fwd_pass1_exon1['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev.bed',
    chrom = df_exon_rev_pass1_exon1['chrom'],
    start = df_exon_rev_pass1_exon1['aoe_start'],
    end = df_exon_rev_pass1_exon1['aoe_end'],
    name = df_exon_rev_pass1_exon1['gene_id'],
    attr = df_exon_rev_pass1_exon1[l_attr],
    strand = df_exon_rev_pass1_exon1['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_fwd.bed',
    chrom = df_exon_fwd_pass1_exon2['chrom'],
    start = df_exon_fwd_pass1_exon2['fivep_start'],
    end = df_exon_fwd_pass1_exon2['fivep_end'],
    name = df_exon_fwd_pass1_exon2['gene_id'],
    attr = df_exon_fwd_pass1_exon2[l_attr],
    strand = df_exon_fwd_pass1_exon2['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_rev.bed',
    chrom = df_exon_rev_pass1_exon2['chrom'],
    start = df_exon_rev_pass1_exon2['fivep_start'],
    end = df_exon_rev_pass1_exon2['fivep_end'],
    name = df_exon_rev_pass1_exon2['gene_id'],
    attr = df_exon_rev_pass1_exon2[l_attr],
    strand = df_exon_rev_pass1_exon2['strand'],
)

!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_fwd.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_rev.bed

df_exon_fwd_pass2_exon1 = df_exon_fwd.query('is_pass2 & exon_number == 1').reset_index(drop=True)
df_exon_rev_pass2_exon1 = df_exon_rev.query('is_pass2 & exon_number == 1').reset_index(drop=True)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_fwd.bed',
    chrom = df_exon_fwd_pass2_exon1['chrom'],
    start = df_exon_fwd_pass2_exon1['aoe_start'],
    end = df_exon_fwd_pass2_exon1['aoe_end'],
    name = df_exon_fwd_pass2_exon1['gene_id'],
    attr = df_exon_fwd_pass2_exon1[l_attr],
    strand = df_exon_fwd_pass2_exon1['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_rev.bed',
    chrom = df_exon_rev_pass2_exon1['chrom'],
    start = df_exon_rev_pass2_exon1['aoe_start'],
    end = df_exon_rev_pass2_exon1['aoe_end'],
    name = df_exon_rev_pass2_exon1['gene_id'],
    attr = df_exon_rev_pass2_exon1[l_attr],
    strand = df_exon_rev_pass2_exon1['strand'],
)

!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_fwd.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_rev.bed


12861 annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd.bed
12772 annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev.bed
69068 annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_fwd.bed
67642 annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_rev.bed
478 annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_fwd.bed
498 annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_rev.bed

In [11]:
def to_exon1_utradj(df_transcript):
    ps_exon1 = df_transcript.query('(feature == "exon") & (exon_number == 1)').iloc[0]
    ps = pd.Series()
    ps['chrom'] = ps_exon1.chrom
    ps['strand'] = ps_exon1.strand
    ps['exon_id'] = ps_exon1.exon_id
    ps['gene_biotype'] = ps_exon1.gene_biotype
    ps['gene_id'] = ps_exon1.gene_id
    #ps['locus_id'] = ps_exon1.locus_id

    # Exon boundaries (& transform coordinates GTF=>BED)
    ps['exon_start'] = ps_exon1.start - 1
    ps['exon_end'] = ps_exon1.end
    ps['utr_start'] = df_transcript.query('feature == "five_prime_utr"')['start'].min(axis=0) - 1
    ps['utr_end'] = df_transcript.query('feature == "five_prime_utr"')['end'].max(axis=0)

    # AOE for identifying putative target gene: 1st exon, expanded by UTR in cases where UTR spans beyond the 1st exon
    ps['aoe_start'] = ps[['exon_start', 'utr_start']].min()
    ps['aoe_end'] = ps[['exon_end', 'utr_end']].max()

    # Cutoff position for transcription initiation mode at the putative promoter
    if ps_exon1['strand'] == '+':
        ps['init_cutoff_pos'] = max(ps['exon_start'], ps['utr_end'] - 1)
    else:
        assert ps_exon1['strand'] == '-'
        ps['init_cutoff_pos'] = min(ps['exon_end'] - 1, ps['utr_start'])

    return ps

df_fwd_ = df_transcripts_parsed.query('((feature == "exon") | (feature == "five_prime_utr")) & (strand == "+") & (transcript_biotype in @l_annot_transcript_biotype_pass1)')
df_rev_ = df_transcripts_parsed.query('((feature == "exon") | (feature == "five_prime_utr")) & (strand == "-") & (transcript_biotype in @l_annot_transcript_biotype_pass1)')
df_exon_fwd_pass1_exon1_adj = pd.concat([to_exon1_utradj(df_tr_id_) for tr_id_, df_tr_id_ in df_fwd_.groupby('transcript_id')], axis=1).transpose()
df_exon_rev_pass1_exon1_adj = pd.concat([to_exon1_utradj(df_tr_id_) for tr_id_, df_tr_id_ in df_rev_.groupby('transcript_id')], axis=1).transpose()

In [12]:
l_attr = ['exon_id', 'gene_biotype', 'gene_id', #'locus_id', #'transcript_biotype', 'transcript_id', 'fivep_start', 'fivep_end',
          'exon_start', 'exon_end', 'utr_start', 'utr_end', 'aoe_start', 'aoe_end', 'init_cutoff_pos']

write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd_utradj.bed',
    chrom = df_exon_fwd_pass1_exon1_adj['chrom'],
    start = df_exon_fwd_pass1_exon1_adj['aoe_start'],
    end = df_exon_fwd_pass1_exon1_adj['aoe_end'],
    name = df_exon_fwd_pass1_exon1_adj['gene_id'],
    attr = df_exon_fwd_pass1_exon1_adj[l_attr],
    strand = df_exon_fwd_pass1_exon1_adj['strand'],
)

write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev_utradj.bed',
    chrom = df_exon_rev_pass1_exon1_adj['chrom'],
    start = df_exon_rev_pass1_exon1_adj['aoe_start'],
    end = df_exon_rev_pass1_exon1_adj['aoe_end'],
    name = df_exon_rev_pass1_exon1_adj['gene_id'],
    attr = df_exon_rev_pass1_exon1_adj[l_attr],
    strand = df_exon_rev_pass1_exon1_adj['strand'],
)