Data table generator for ID Mapping Service

Introduction

This is a Python script that generates the data mapping tables used by this Go service.

To use this with the Docker Compose command, export this notebook as a standard Python script.
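For example, the notebook can be exported from the command line with jupyter nbconvert --to script <notebook>.ipynb (substitute the actual notebook file name); in the classic notebook UI, File > Download as > Python (.py) does the same thing.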


In [1]:
# Source data set builder for in-memory data table used in ID Mapper.
import pandas as pd
import urllib.request

# Location of original data sets
NCBI_FTP = "ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/"
UNI_FTP = "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/"

# From NCBI Gene
NCBI_SOURCES = {
    "HUMAN": NCBI_FTP + "Mammalia/Homo_sapiens.gene_info.gz",
    "YEAST": NCBI_FTP + "Fungi/Saccharomyces_cerevisiae.gene_info.gz",
    "MOUSE": NCBI_FTP + "Mammalia/Mus_musculus.gene_info.gz",
    "FLY": NCBI_FTP + "Invertebrates/Drosophila_melanogaster.gene_info.gz",
    "ARATH": NCBI_FTP + "Plants/Arabidopsis_thaliana.gene_info.gz"
}

# Uniprot ID Mapping
UNIPROT_SOURCES = {
    "HUMAN": UNI_FTP + "HUMAN_9606_idmapping_selected.tab.gz",
    "MOUSE": UNI_FTP + "MOUSE_10090_idmapping_selected.tab.gz",
    "YEAST": UNI_FTP + "YEAST_559292_idmapping_selected.tab.gz",
    "FLY": UNI_FTP + "DROME_7227_idmapping_selected.tab.gz",
    "ARATH": UNI_FTP + "ARATH_3702_idmapping_selected.tab.gz"
}

# Get all data sets from the FTP server
NCBI_COLUMNS = ["tax_id", "GeneID", "Symbol", "LocusTag", "Synonyms", "dbXrefs", "chromosome", "map_location",
                "description", "type_of_gene", "Symbol_from_nomenclature_authority", "Full_name_from_nomenclature_authority",
                "Nomenclature_status", "Other_designations", "Modification_date"]

UNIPROT_COLUMNS = ["UniProtKB-AC", "UniProtKB-ID", "GeneID", "RefSeq", "GI", "PDB", "GO",
                   "UniRef100", "UniRef90", "UniRef50", "UniParc", "PIR", "NCBI-taxon",
                   "MIM", "UniGene", "PubMed", "EMBL", "EMBL-CDS", "Ensembl",
                   "Ensembl_TRS", "Ensembl_PRO", "Additional PubMed"]

## Load NCBI data first.

ncbi_map = {}

for key in NCBI_SOURCES:
    print("Downloading from NCBI FTP server: " + key + "...")
    local_filename, headers = urllib.request.urlretrieve(NCBI_SOURCES[key])
    # The header line of gene_info files starts with '#', so comment='#' skips it
    # and the column names are supplied explicitly instead.
    ncbi_map[key] = pd.read_csv(local_filename, sep='\t', low_memory=False,
                                names=NCBI_COLUMNS, comment='#', compression="gzip")


Downloading from NCBI FTP server: FLY...
Downloading from NCBI FTP server: HUMAN...
Downloading from NCBI FTP server: ARATH...
Downloading from NCBI FTP server: MOUSE...
Downloading from NCBI FTP server: YEAST...
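
As an optional sanity check (not part of the original pipeline), the loaded tables can be inspected to confirm that each species parsed with the expected shape:

# Optional check: report the number of rows and columns loaded per species.
for key, df in ncbi_map.items():
    print(key, df.shape)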

In [2]:
# Load UNIPROT data next...
uniprot_map = {}

for key in UNIPROT_SOURCES:
    print("Downloading from Uniprot FTP server: " + key + "...")
    local_filename, headers = urllib.request.urlretrieve(UNIPROT_SOURCES[key])
    # idmapping_selected.tab files have no header row, so supply the column names explicitly.
    uniprot_map[key] = pd.read_csv(local_filename, sep="\t", names=UNIPROT_COLUMNS,
                                   low_memory=False, compression="gzip")


Downloading from Uniprot FTP server: FLY...
Downloading from Uniprot FTP server: HUMAN...
Downloading from Uniprot FTP server: ARATH...
Downloading from Uniprot FTP server: MOUSE...
Downloading from Uniprot FTP server: YEAST...
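
Note that in the idmapping_selected.tab files a single UniProt accession can map to more than one Entrez Gene ID; in that case the GeneID field holds a semicolon-separated list. An optional check (not part of the original pipeline) for such rows:

# Optional check: count UniProt rows whose GeneID field lists multiple IDs.
for key, df in uniprot_map.items():
    multi = df["GeneID"].astype(str).str.contains(";").sum()
    print(key, "rows with multiple GeneIDs:", multi)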

In [3]:
for key in ncbi_map:
    ncbi_gene_info = ncbi_map[key]
    # Keep only the NCBI columns needed for the mapping table; cast to str so the
    # GeneID merge key is type-consistent with the UniProt table.
    ncbi_subset = ncbi_gene_info[["GeneID", "Symbol", "LocusTag", "Synonyms", "chromosome", "map_location",
                                  "description", "Full_name_from_nomenclature_authority"]].astype(str)

    # Merge the two sources on GeneID and create the new table
    merged = pd.merge(uniprot_map[key], ncbi_subset, on="GeneID", how="outer")
    # Drop unnecessary columns (use columns=; the positional axis argument
    # to drop() was deprecated and later removed in pandas 2.x)
    df_final = merged.drop(columns=["Additional PubMed", "NCBI-taxon", "PubMed"])

    # Create one mapping file per species
    df_final.to_csv("./idmapping_" + key.lower() + ".tsv", sep='\t', index=False)
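
As a final optional check (assuming the script was run from the directory where the TSV files were written), one of the generated mapping tables can be reloaded to verify its contents:

# Optional check: reload one generated file and inspect its shape and columns.
check = pd.read_csv("./idmapping_yeast.tsv", sep="\t", low_memory=False)
print(check.shape)
print(check.columns.tolist())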