In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/b2/scratch/ahringer/jj374/lab/relmapping

In [2]:
# Load full WS260 annotation set & extract operons
fp_ = 'wget/ftp.wormbase.org/pub/wormbase/releases/WS260/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS260.annotations.gff2.gz'
df_WS260 = pd.read_csv(fp_, sep='\t', names=yp.NAMES_GTF, comment='#')
print('%d annotation records' % (len(df_WS260),))

df_WS260_operon = df_WS260.query('source == "operon"')
print('%d operon records before liftover' % (len(df_WS260_operon),))


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
33021595 annotation records
1388 operon records before liftover

In [3]:
# Liftover to ce10 using an in-house utility
def liftover_WS235_WS220_gtf(chroms, starts, ends, strands): # Liftover wrapper
    """
    # http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4512556/
    # note that while annotations changed, the N2 genome sequence remained identical from WS215 through WS234
    Test: 
    list(liftover_WS230_WS220(
    chroms=['chrI', 'chrX', 'chrIII', 'chrV', 'chrIV', 'chrII'],
    starts=[1,1,4,1,1, 1],
    ends=[12507921,13713648,471811,15465174,11718826, 11718826],
    strands=['+', '-', '+', '-', '+']))
    """
    fp_inp = '/mnt/home1/ahringer/jj374/relmapping/tmp/liftover_WS235_WS220_gtf_fp_inp.tsv'
    fp_out = '/mnt/home1/ahringer/jj374/relmapping/tmp/liftover_WS235_WS220_gtf_fp_out.tsv'
    with open(fp_inp, 'w') as fh_inp:
        for (chrom, start, end, strand) in zip(chroms, starts, ends, strands):
            fh_inp.write('\t'.join(map(str, (chrom, start, end, strand))) + '\n')

    liftover_bin = os.path.expanduser('~/lab/raw_data/datastore/Kenneth_Evans/ConvertWormAssembly-1-0-0/ConvertWormAssembly.1.0.0.pl')
    liftover_dir = os.path.expanduser('~/lab/raw_data/datastore/Kenneth_Evans/ConvertWormAssembly-1-0-0/CHROMOSOME-DIFFERENCES-to-WS244')
    !{liftover_bin} \
        --input={fp_inp} \
        --output={fp_out} \
        --backwards \
        --release1=WS235 \
        --release2=WS220 \
        --chrfield=1 \
        --posfield=2 \
        --posfield=3 \
        --postype=1 \
        --postype=1 \
        --strandfield=4 \
        --refdir={liftover_dir}
    
    df_out = pd.read_csv(fp_out, sep='\t', names=['chrom', 'start', 'end', 'strand'])
    for i, r in df_out.iterrows():
        yield((r['chrom'], r['start'], r['end'], r['strand']))

def to_ce10(df_inp): # Use liftover wrapper on a gtf-DataFrame
    df_out = df_inp.query('(chrom != "CHROMOSOME_MtDNA") & (chrom != "MtDNA")').reset_index(drop=True).copy()
    df_WS220 = pd.DataFrame.from_records(liftover_WS235_WS220_gtf(df_out['chrom'], df_out['start'], df_out['end'], df_out['strand']), columns=['chrom', 'start', 'end', 'strand'])
    df_out['chrom'] = list(map(lambda chrom: 'chr' + chrom.lstrip('CHROMOSOME_'), df_WS220['chrom']))
    df_out['start'] = df_WS220['start'].copy()
    df_out['end'] = df_WS220['end'].copy()
    df_out['strand'] = df_WS220['strand'].copy()
    return df_out.sort_values(['chrom', 'start', 'end', 'strand'])

df_WS260_operon_ce10 = to_ce10(df_WS260_operon)
print('%d operon records after liftover' % (len(df_WS260_operon_ce10),))


Input file  = /mnt/home1/ahringer/jj374/relmapping/tmp/liftover_WS235_WS220_gtf_fp_inp.tsv 
Output file = /mnt/home1/ahringer/jj374/relmapping/tmp/liftover_WS235_WS220_gtf_fp_out.tsv 
Lines processed from input file              = 1388
Comment / blank lines written to output file = 0
Data lines written to output file            = 1388
Number of records changed                    = 1349
1388 operon records after liftover

In [4]:
# Write output to WS260_ce10/WS260_ce10.operon.gtf
df_WS260_operon_ce10.to_csv('WS260_ce10/WS260_ce10.operon.gtf', **yp.TO_GTF_KWARGS)
!wc -l WS260_ce10/WS260_ce10.operon.gtf


1388 WS260_ce10/WS260_ce10.operon.gtf