In [ ]:
%run ~/relmapping/annot_cb/notebooks/annot__init__.ipynb
In [2]:
kwargs_to_gtf = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t'}
fp_ = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_briggsae/PRJNA10731/c_briggsae.PRJNA10731.WS260.canonical_geneset.gtf.gz'
df_ = pd.read_csv(fp_, sep='\t', names=NAMES_GTF, comment='#')
print('%d raw records' % (len(df_),))
l_chroms = ['I', 'II', 'III', 'IV', 'V', 'X',]
df_ = df_.query('chrom in @l_chroms')
print('%d records from non-scaffolds' % (len(df_),))
In [3]:
kwargs_ = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t', 'compression': 'gzip'}
df_.query('feature=="gene"').to_csv('annot_cb/canonical_geneset/WS260_CB4.genes.gtf.gz', **kwargs_)
df_.query('feature!="gene"').to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.gtf.gz', **kwargs_)
In [12]:
df_genes_raw = df_.query('feature=="gene"').reset_index(drop=True)
df_genes = df_gfftags_unpack(df_genes_raw, name='attribute')
df_genes['gene_biotype'].value_counts()
Out[12]:
In [14]:
kwargs_to_gtf = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t'}
for gene_biotype in df_genes['gene_biotype'].value_counts().index.tolist():#['tRNA', 'snoRNA', 'miRNA', 'lincRNA', 'snRNA', 'antisense', 'rRNA',]:
fp_ = 'annot_cb/canonical_geneset/WS260_CB4.genes.%s.gtf.gz' % (gene_biotype,)
print(fp_)
df_ = df_genes_raw.loc[df_genes.query('gene_biotype == "%s"' % (gene_biotype)).index]
df_.to_csv(fp_, compression='gzip', **kwargs_to_gtf)
!gunzip -c {fp_} | wc -l
In [5]:
df_transcripts_raw = read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.transcripts.gtf.gz', parse_attr=False)
print('%d total records (raw)' % (len(df_transcripts_raw),))
df_transcripts_parsed = read_wbgtf('annot_cb/canonical_geneset/WS260_CB4.transcripts.gtf.gz', parse_attr=True)
#df_transcripts_parsed['locus_id'] = list(locus_id_from_gene_id(df_transcripts_parsed['gene_id']))
print('%d total records (parsed)' % (len(df_transcripts_parsed),))
In [6]:
df_transcripts_parsed['transcript_biotype'].value_counts()
Out[6]:
In [7]:
# Load raw transcripts file; write file with suitable biotypes
l_annot_transcript_biotype = ['protein_coding', 'tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA', 'pseudogene']
l_annot_transcript_biotype_pass1 = ['protein_coding', 'pseudogene']
l_annot_transcript_biotype_pass2 = ['tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA']
m_ = df_transcripts_parsed['transcript_biotype'].isin(l_annot_transcript_biotype)
print('%d with transcript_biotype in %s' % (sum(m_), str(l_annot_transcript_biotype)))
m_pass1 = df_transcripts_parsed['transcript_biotype'].isin(l_annot_transcript_biotype_pass1)
print('%d with transcript_biotype in %s' % (sum(m_pass1), str(l_annot_transcript_biotype_pass1)))
m_pass2 = df_transcripts_parsed['transcript_biotype'].isin(l_annot_transcript_biotype_pass2)
print('%d with transcript_biotype in %s' % (sum(m_pass2), str(l_annot_transcript_biotype_pass2)))
kwargs_ = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t', 'compression': 'gzip'}
df_transcripts_raw[m_].to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot.gtf.gz', **kwargs_)
df_transcripts_raw[m_pass1].to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1.gtf.gz', **kwargs_)
df_transcripts_raw[m_pass2].to_csv('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2.gtf.gz', **kwargs_)
In [8]:
# Exons -- annotate step -- pass1 or pass2
df_exon = df_transcripts_parsed.query('feature == "exon"').reset_index(drop=True)
df_exon['start'] = df_exon['start'] - 1 # coordinates .gtf-to-.bed
df_exon['is_pass1'] = df_exon['transcript_biotype'].map(lambda s: s in l_annot_transcript_biotype_pass1)
df_exon['is_pass2'] = df_exon['transcript_biotype'].map(lambda s: s in l_annot_transcript_biotype_pass2)
print(df_exon['is_pass1'].value_counts())
print(df_exon['is_pass2'].value_counts())
df_exon.rename(columns={'start': 'exon_start', 'end': 'exon_end'}, inplace=True)
df_exon.head()
Out[8]:
In [9]:
# aoe = 1st exon assignment, fivep = excluding exon-proximal artefacts
df_exon_fwd = df_exon.query('strand == "+"').reset_index(drop=True)
df_exon_rev = df_exon.query('strand == "-"').reset_index(drop=True)
df_exon_fwd['fivep_start'] = df_exon_fwd['exon_start']
df_exon_fwd['fivep_end'] = df_exon_fwd['exon_start'] + 1
df_exon_rev['fivep_start'] = df_exon_rev['exon_end'] - 1
df_exon_rev['fivep_end'] = df_exon_rev['exon_end']
# allow ATAC-seq mode up to 100bp downstream of exon start (similar to TIC-to-TSS assignments in Chen et al., 2013)
downstream_extension_len = 100
df_exon_fwd['aoe_start'] = df_exon_fwd['exon_start']
df_exon_fwd['aoe_end'] = df_exon_fwd['exon_start'] + downstream_extension_len
df_exon_rev['aoe_start'] = df_exon_rev['exon_end'] - downstream_extension_len
df_exon_rev['aoe_end'] = df_exon_rev['exon_end']
In [10]:
df_exon_fwd_pass1_exon1 = df_exon_fwd.query('is_pass1 & exon_number == 1').reset_index(drop=True)
df_exon_rev_pass1_exon1 = df_exon_rev.query('is_pass1 & exon_number == 1').reset_index(drop=True)
df_exon_fwd_pass1_exon2 = df_exon_fwd.query('is_pass1 & exon_number > 1').reset_index(drop=True)
df_exon_rev_pass1_exon2 = df_exon_rev.query('is_pass1 & exon_number > 1').reset_index(drop=True)
l_attr = ['exon_id', 'gene_biotype', 'gene_id', #'locus_id', 'transcript_biotype', 'transcript_id',
'fivep_start', 'fivep_end', 'aoe_start', 'aoe_end', 'exon_start', 'exon_end']
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd.bed',
chrom = df_exon_fwd_pass1_exon1['chrom'],
start = df_exon_fwd_pass1_exon1['aoe_start'],
end = df_exon_fwd_pass1_exon1['aoe_end'],
name = df_exon_fwd_pass1_exon1['gene_id'],
attr = df_exon_fwd_pass1_exon1[l_attr],
strand = df_exon_fwd_pass1_exon1['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev.bed',
chrom = df_exon_rev_pass1_exon1['chrom'],
start = df_exon_rev_pass1_exon1['aoe_start'],
end = df_exon_rev_pass1_exon1['aoe_end'],
name = df_exon_rev_pass1_exon1['gene_id'],
attr = df_exon_rev_pass1_exon1[l_attr],
strand = df_exon_rev_pass1_exon1['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_fwd.bed',
chrom = df_exon_fwd_pass1_exon2['chrom'],
start = df_exon_fwd_pass1_exon2['fivep_start'],
end = df_exon_fwd_pass1_exon2['fivep_end'],
name = df_exon_fwd_pass1_exon2['gene_id'],
attr = df_exon_fwd_pass1_exon2[l_attr],
strand = df_exon_fwd_pass1_exon2['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_rev.bed',
chrom = df_exon_rev_pass1_exon2['chrom'],
start = df_exon_rev_pass1_exon2['fivep_start'],
end = df_exon_rev_pass1_exon2['fivep_end'],
name = df_exon_rev_pass1_exon2['gene_id'],
attr = df_exon_rev_pass1_exon2[l_attr],
strand = df_exon_rev_pass1_exon2['strand'],
)
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_fwd.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon2_rev.bed
df_exon_fwd_pass2_exon1 = df_exon_fwd.query('is_pass2 & exon_number == 1').reset_index(drop=True)
df_exon_rev_pass2_exon1 = df_exon_rev.query('is_pass2 & exon_number == 1').reset_index(drop=True)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_fwd.bed',
chrom = df_exon_fwd_pass2_exon1['chrom'],
start = df_exon_fwd_pass2_exon1['aoe_start'],
end = df_exon_fwd_pass2_exon1['aoe_end'],
name = df_exon_fwd_pass2_exon1['gene_id'],
attr = df_exon_fwd_pass2_exon1[l_attr],
strand = df_exon_fwd_pass2_exon1['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_rev.bed',
chrom = df_exon_rev_pass2_exon1['chrom'],
start = df_exon_rev_pass2_exon1['aoe_start'],
end = df_exon_rev_pass2_exon1['aoe_end'],
name = df_exon_rev_pass2_exon1['gene_id'],
attr = df_exon_rev_pass2_exon1[l_attr],
strand = df_exon_rev_pass2_exon1['strand'],
)
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_fwd.bed
!wc -l annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass2_exon1_rev.bed
In [11]:
def to_exon1_utradj(df_transcript):
ps_exon1 = df_transcript.query('(feature == "exon") & (exon_number == 1)').iloc[0]
ps = pd.Series()
ps['chrom'] = ps_exon1.chrom
ps['strand'] = ps_exon1.strand
ps['exon_id'] = ps_exon1.exon_id
ps['gene_biotype'] = ps_exon1.gene_biotype
ps['gene_id'] = ps_exon1.gene_id
#ps['locus_id'] = ps_exon1.locus_id
# Exon boundaries (& transform coordinates GTF=>BED)
ps['exon_start'] = ps_exon1.start - 1
ps['exon_end'] = ps_exon1.end
ps['utr_start'] = df_transcript.query('feature == "five_prime_utr"')['start'].min(axis=0) - 1
ps['utr_end'] = df_transcript.query('feature == "five_prime_utr"')['end'].max(axis=0)
# AOE for identifying putative target gene: 1st exon, expanded by UTR in cases where UTR spans beyond the 1st exon
ps['aoe_start'] = ps[['exon_start', 'utr_start']].min()
ps['aoe_end'] = ps[['exon_end', 'utr_end']].max()
# Cutoff position for transcription initiation mode at the putative promoter
if ps_exon1['strand'] == '+':
ps['init_cutoff_pos'] = max(ps['exon_start'], ps['utr_end'] - 1)
else:
assert ps_exon1['strand'] == '-'
ps['init_cutoff_pos'] = min(ps['exon_end'] - 1, ps['utr_start'])
return ps
df_fwd_ = df_transcripts_parsed.query('((feature == "exon") | (feature == "five_prime_utr")) & (strand == "+") & (transcript_biotype in @l_annot_transcript_biotype_pass1)')
df_rev_ = df_transcripts_parsed.query('((feature == "exon") | (feature == "five_prime_utr")) & (strand == "-") & (transcript_biotype in @l_annot_transcript_biotype_pass1)')
df_exon_fwd_pass1_exon1_adj = pd.concat([to_exon1_utradj(df_tr_id_) for tr_id_, df_tr_id_ in df_fwd_.groupby('transcript_id')], axis=1).transpose()
df_exon_rev_pass1_exon1_adj = pd.concat([to_exon1_utradj(df_tr_id_) for tr_id_, df_tr_id_ in df_rev_.groupby('transcript_id')], axis=1).transpose()
In [12]:
l_attr = ['exon_id', 'gene_biotype', 'gene_id', #'locus_id', #'transcript_biotype', 'transcript_id', 'fivep_start', 'fivep_end',
'exon_start', 'exon_end', 'utr_start', 'utr_end', 'aoe_start', 'aoe_end', 'init_cutoff_pos']
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_fwd_utradj.bed',
chrom = df_exon_fwd_pass1_exon1_adj['chrom'],
start = df_exon_fwd_pass1_exon1_adj['aoe_start'],
end = df_exon_fwd_pass1_exon1_adj['aoe_end'],
name = df_exon_fwd_pass1_exon1_adj['gene_id'],
attr = df_exon_fwd_pass1_exon1_adj[l_attr],
strand = df_exon_fwd_pass1_exon1_adj['strand'],
)
write_gffbed('annot_cb/canonical_geneset/WS260_CB4.transcripts.annot_pass1_exon1_rev_utradj.bed',
chrom = df_exon_rev_pass1_exon1_adj['chrom'],
start = df_exon_rev_pass1_exon1_adj['aoe_start'],
end = df_exon_rev_pass1_exon1_adj['aoe_end'],
name = df_exon_rev_pass1_exon1_adj['gene_id'],
attr = df_exon_rev_pass1_exon1_adj[l_attr],
strand = df_exon_rev_pass1_exon1_adj['strand'],
)