In [1]:
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
annot_ = 'annot_ce11'


def mp(fp, annot_=annot_):
    """Return the path of `fp` inside `<annot_>/canonical_geneset`.

    Parameters
    ----------
    fp : str
        File name (or relative path) to place in the canonical geneset directory.
    annot_ : str, optional
        Annotation root directory; defaults to the value of the module-level
        `annot_` at the time this function was defined.

    Returns
    -------
    str
        `annot_`, 'canonical_geneset' and `fp` joined with `os.path.join`.
    """
    geneset_dir = os.path.join(annot_, 'canonical_geneset')
    return os.path.join(geneset_dir, fp)


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# Load the WS260 transcript annotation (gzipped GTF) into `df_` and preview it.
# `parse_attr` / `coords_adj` are options of the project helper `yp.read_wbgtf`;
# presumably they expand the GTF attribute field into columns and adjust the
# start/end coordinates -- TODO confirm against the helper's definition.
fp_ = 'annot_ce11/canonical_geneset/WS260_ce11.transcripts.gtf.gz'
df_ = yp.read_wbgtf(
    fp_,
    parse_attr=True,
    coords_adj=True,
)
# Show only the first rows rather than the full table
df_.head()


Out[2]:
chrom source feature start end score strand frame exon_id exon_number gene_biotype gene_id gene_source locus_id protein_id transcript_biotype transcript_id transcript_source
0 chrI WormBase transcript 3746 3909 . - . NaN NaN snoRNA WBGene00023193 WormBase Y74C9A.6 NaN snoRNA Y74C9A.6 WormBase
1 chrI WormBase exon 3746 3909 . - . Y74C9A.6.e1 1.0 snoRNA WBGene00023193 WormBase Y74C9A.6 NaN snoRNA Y74C9A.6 WormBase
2 chrI WormBase three_prime_utr 4115 4220 . - . NaN NaN protein_coding WBGene00022277 WormBase homt-1 NaN protein_coding Y74C9A.3 WormBase
3 chrI WormBase exon 4115 4358 . - . Y74C9A.3.e5 5.0 protein_coding WBGene00022277 WormBase homt-1 NaN protein_coding Y74C9A.3 WormBase
4 chrI WormBase transcript 4115 10230 . - . NaN NaN protein_coding WBGene00022277 WormBase homt-1 NaN protein_coding Y74C9A.3 WormBase

In [3]:
# .bed-file for masking nuisance RNAs in ce11 tracks
l_ = ['tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA']
c_ = ['chrom', 'start', 'end', 'gene_id', 'score', 'strand', 'transcript_biotype']
q_ = '(feature == "transcript") & (transcript_biotype in @l_)'
fp_ = mp('WS260_ce11.transcripts.non_coding.bed')
df_.query(q_)[c_].reset_index(drop=True).to_csv(fp_, **yp.TO_GTF_KWARGS)
!wc -l {fp_}


1561 annot_ce11/canonical_geneset/WS260_ce11.transcripts.non_coding.bed