In [1]:
%run ~/relmapping/annot/notebooks/annot__init__.ipynb
annot_ = 'annot_ce10'
def mp(fp, annot_=annot_): return os.path.join(annot_, 'canonical_geneset', fp)


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# canonical_geneset.gtf.gz: WS260 canonical geneset in ce10 coordinates

# Download WS260 annotations:
# !cd .wget; wget -m --no-parent ftp://ftp.wormbase.org/pub/wormbase/releases/WS260/

def liftover_WS235_WS220_gtf(chroms, starts, ends, strands): # Liftover wrapper
    """
    # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4512556/
    # note that while annotations changed, the N2 genome sequence remained identical from WS215 through WS234
    Test: 
    list(liftover_WS230_WS220(
    chroms=['chrI', 'chrX', 'chrIII', 'chrV', 'chrIV', 'chrII'],
    starts=[1,1,4,1,1, 1],
    ends=[12507921,13713648,471811,15465174,11718826, 11718826],
    strands=['+', '-', '+', '-', '+']))
    """
    fp_inp = mp('liftover_fp_inp.tsv')
    fp_out = mp('liftover_fp_out.tsv')
    with open(fp_inp, 'w') as fh_inp:
        for (chrom, start, end, strand) in zip(chroms, starts, ends, strands):
            print('\t'.join(map(str, (chrom, start, end, strand))), file=fh_inp)
            #print('%s\t%s' % (chrom, start), file=fh_inp)

    liftover_bin = '/mnt/home1/ahringer/jj374/lab/raw_data/datastore/Kenneth_Evans/ConvertWormAssembly-1-0-0/ConvertWormAssembly.1.0.0.pl'
    liftover_dir = '/mnt/home1/ahringer/jj374/lab/raw_data/datastore/Kenneth_Evans/ConvertWormAssembly-1-0-0/CHROMOSOME-DIFFERENCES-to-WS244'
    !{liftover_bin} \
        --input={fp_inp} \
        --output={fp_out} \
        --backwards \
        --release1=WS235 \
        --release2=WS220 \
        --chrfield=1 \
        --posfield=2 \
        --posfield=3 \
        --postype=1 \
        --postype=1 \
        --strandfield=4 \
        --refdir={liftover_dir}
    
    df_out = pd.read_csv(fp_out, sep='\t', names=['chrom', 'start', 'end', 'strand'])
    for i, r in df_out.iterrows():
        yield((r['chrom'], r['start'], r['end'], r['strand']))
    
    !rm {fp_inp}
    !rm {fp_out}

def to_ce10(df_inp): # Use liftover wrapper on a gtf-DataFrame
    df_out = df_inp.query('(chrom != "CHROMOSOME_MtDNA") & (chrom != "MtDNA")').reset_index(drop=True).copy()
    df_WS220 = pd.DataFrame.from_records(liftover_WS235_WS220_gtf(df_out['chrom'], df_out['start'], df_out['end'], df_out['strand']), columns=['chrom', 'start', 'end', 'strand'])
    df_out['chrom'] = [*map(lambda chrom: 'chr' + chrom.lstrip('CHROMOSOME_'), df_WS220['chrom'])]
    df_out['start'] = df_WS220['start'].copy()
    df_out['end'] = df_WS220['end'].copy()
    df_out['strand'] = df_WS220['strand'].copy()
    return df_out.sort_values(['chrom', 'start', 'end', 'strand']).reset_index(drop=True)

# Genes & transcripts from the canonical geneset
kwargs_to_gtf = {'index': False, 'header': False, 'quoting': csv.QUOTE_NONE, 'sep': '\t'}
fp_ = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS260.canonical_geneset.gtf.gz'
df_ = pd.read_csv(fp_, sep='\t', names=yp.NAMES_GTF, comment='#')
df_ = to_ce10(df_)
df_.head()


Input file  = annot_ce10/canonical_geneset/liftover_fp_inp.tsv 
Output file = annot_ce10/canonical_geneset/liftover_fp_out.tsv 
Lines processed from input file              = 726282
Comment / blank lines written to output file = 0
Data lines written to output file            = 726282
Number of records changed                    = 699702
Out[2]:
chrom source feature start end score strand frame attribute
0 chrI WormBase gene 3747 3909 . - . gene_id "WBGene00023193"; gene_source "WormBas...
1 chrI WormBase transcript 3747 3909 . - . gene_id "WBGene00023193"; transcript_id "Y74C9...
2 chrI WormBase exon 3747 3909 . - . gene_id "WBGene00023193"; transcript_id "Y74C9...
3 chrI WormBase three_prime_utr 4116 4220 . - . gene_id "WBGene00022277"; transcript_id "Y74C9...
4 chrI WormBase exon 4116 4358 . - . gene_id "WBGene00022277"; transcript_id "Y74C9...

In [3]:
# locus_id: intended to roughly match labels is shown on the default IGV RefSeq track
def locus_id_from_gene_id(l_gene_id):
    # Attach display_id
    def locus_id_(locus, sequence_id, gene_id):
        if locus == locus:
            return locus
        elif sequence_id == sequence_id:
            return sequence_id
        else:
            return gene_id

    fp_geneIDs = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_elegans/PRJNA13758/annotation/c_elegans.PRJNA13758.WS260.geneIDs.txt.gz'
    l_ = ['gene_id', 'locus', 'sequence_id', 'status']
    df_geneIDs = pd.read_csv(fp_geneIDs, sep=',', names=('na', 'gene_id', 'locus', 'sequence_id', 'status'))[l_]
    df_geneIDs['display_id'] = list(map(locus_id_,df_geneIDs['locus'], df_geneIDs['sequence_id'], df_geneIDs['gene_id']))
    df_geneIDs = df_geneIDs.set_index('gene_id')
    for gene_id in l_gene_id:
        yield df_geneIDs.loc[gene_id]['display_id']

# Parse attribute string
df_attr_ = pd.DataFrame(df_['attribute'].apply(hts.parse_GFF_attribute_string).tolist())

# Define locus_id
se_locus_id = [*locus_id_from_gene_id(df_attr_['gene_id'])]

# Attach locus_id at the end of the unparsed attr field
def fmt_(attribute, locus_id): return '%s locus_id "%s";' % (attribute, locus_id)
df_['attribute'] = list(map(fmt_, df_['attribute'], se_locus_id))

df_.head()


Out[3]:
chrom source feature start end score strand frame attribute
0 chrI WormBase gene 3747 3909 . - . gene_id "WBGene00023193"; gene_source "WormBas...
1 chrI WormBase transcript 3747 3909 . - . gene_id "WBGene00023193"; transcript_id "Y74C9...
2 chrI WormBase exon 3747 3909 . - . gene_id "WBGene00023193"; transcript_id "Y74C9...
3 chrI WormBase three_prime_utr 4116 4220 . - . gene_id "WBGene00022277"; transcript_id "Y74C9...
4 chrI WormBase exon 4116 4358 . - . gene_id "WBGene00022277"; transcript_id "Y74C9...

In [4]:
df_['feature'].value_counts()


Out[4]:
exon               269868
CDS                222825
transcript          61073
gene                46742
stop_codon          33423
start_codon         33400
five_prime_utr      30958
three_prime_utr     27993
Name: feature, dtype: int64

In [5]:
# WS260_ce10.canonical_geneset.gtf.gz: entire canonical geneset
yp.to_csv_gz(mp('WS260_ce10.canonical_geneset.gtf.gz'), df_, **yp.TO_GTF_KWARGS)
!gunzip -c {mp('WS260_ce10.canonical_geneset.gtf.gz')} | wc -l


726283

In [6]:
# WS260_ce10.genes.gtf.gz: all gene records
yp.to_csv_gz(mp('WS260_ce10.genes.gtf.gz'), df_.query('feature=="gene"'), **yp.TO_GTF_KWARGS)
!gunzip -c {mp('WS260_ce10.genes.gtf.gz')} | wc -l


46743

In [7]:
# WS260_ce10.transcripts.gtf.gz: all transcript records
yp.to_csv_gz(mp('WS260_ce10.transcripts.gtf.gz'), df_.query('feature!="gene"'), **yp.TO_GTF_KWARGS)
!gunzip -c {mp('WS260_ce10.transcripts.gtf.gz')} | wc -l


679541

In [8]:
df_attr_['transcript_biotype'].value_counts()


Out[8]:
protein_coding     614785
piRNA               30728
ncRNA               20679
pseudogene           8443
tRNA                 1253
miRNA                 908
snoRNA                690
lincRNA               529
pre_miRNA             514
tRNA_pseudogene       420
antisense             289
snRNA                 260
rRNA                   40
rRNA_pseudogene         2
Name: transcript_biotype, dtype: int64

In [9]:
# WS260_ce10.annot.gtf.gz: all transcripts used in the annotation
l_annot_ = ['protein_coding', 'pseudogene', 'tRNA', 'snoRNA', 'miRNA', 'snRNA', 'rRNA']
m_annot_ = df_attr_['transcript_biotype'].isin(l_annot_)
fp_ = os.path.join(annot_, 'WS260_ce10.annot.gtf.gz')
yp.to_csv_gz(fp_, df_[m_annot_].query('feature!="gene"'), **yp.TO_GTF_KWARGS)
!gunzip -c {fp_} | wc -l


626380