In [ ]:
# Source data set builder for in-memory data table used in ID Mapper.

import pandas as pd
import json

# Remote locations of the original data sets.
# NOTE(review): the cells visible below read local, already-decompressed
# copies from the working directory rather than fetching these URLs —
# presumably the files are downloaded by hand; confirm.

# NCBI Gene: per-organism gene_info dumps
_NCBI_GENE_INFO_DIR = "ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/"
NCBI_HUMAN = _NCBI_GENE_INFO_DIR + "Mammalia/Homo_sapiens.gene_info.gz"
NCBI_YEAST = _NCBI_GENE_INFO_DIR + "Fungi/Saccharomyces_cerevisiae.gene_info.gz"
NCBI_MOUSE = _NCBI_GENE_INFO_DIR + "Mammalia/Mus_musculus.gene_info.gz"
NCBI_FLY = _NCBI_GENE_INFO_DIR + "Invertebrates/Drosophila_melanogaster.gene_info.gz"

# UniProt: per-organism "idmapping_selected" tables
_UNIPROT_BY_ORGANISM_DIR = (
    "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/"
    "knowledgebase/idmapping/by_organism/"
)
UNIPROT_HUMAN = _UNIPROT_BY_ORGANISM_DIR + "HUMAN_9606_idmapping_selected.tab.gz"
UNIPROT_MOUSE = _UNIPROT_BY_ORGANISM_DIR + "MOUSE_10090_idmapping_selected.tab.gz"
UNIPROT_YEAST = _UNIPROT_BY_ORGANISM_DIR + "YEAST_559292_idmapping_selected.tab.gz"
UNIPROT_FLY = _UNIPROT_BY_ORGANISM_DIR + "DROME_7227_idmapping_selected.tab.gz"

In [ ]:
# Base ID mapping table from UniProt, read as three tab-separated columns
# (accession, ID type, ID). Column names are supplied here, so the first
# line of the file is treated as data.
#
# Every column is read as str. Without an explicit dtype (and with
# low_memory left at its default) pandas infers the type chunk by chunk,
# which for a column mixing numeric-looking and alphanumeric identifiers
# yields a DtypeWarning and a mix of int and str values in one column.
# NOTE(review): assumes the ID column does mix such identifiers, as in
# UniProt's idmapping.dat layout — confirm against the local file.
human1 = pd.read_csv(
    './HUMAN_9606_idmapping.dat.txt',
    sep="\t",
    names=["UniProtKB-AC", "ID_type", "ID"],
    dtype=str,
)

In [ ]:
# Distinct identifier types present in the base mapping table, in order of
# first appearance.
id_types = human1["ID_type"].unique()

# One-column table of those types, sorted by name; the index keeps each
# type's position in id_types.
types = pd.DataFrame({"ID_TYPE": id_types}).sort_values("ID_TYPE")
types.head()

In [ ]:
# Column names for the "idmapping_selected" table, in file order. They are
# passed to read_csv via names=, so the first line of the file is treated
# as data rather than as a header.
idmapping_selected_columns = [
    'UniProtKB-AC',
    'UniProtKB-ID',
    'GeneID',
    'RefSeq',
    'GI',
    'PDB',
    'GO',
    'UniRef100',
    'UniRef90',
    'UniRef50',
    'UniParc',
    'PIR',
    'NCBI-taxon',
    'MIM',
    'UniGene',
    'PubMed',
    'EMBL',
    'EMBL-CDS',
    'Ensembl',
    'Ensembl_TRS',
    'Ensembl_PRO',
    'Additional PubMed']

# GeneID is the key used to join this table with the NCBI gene_info subset,
# whose GeneID is explicitly cast to str. Pin it to str here as well instead
# of relying on dtype inference: if every GeneID cell in an input file
# happened to be a bare number, pandas would infer a numeric column and the
# join against the string keys would fail or silently match nothing.
human2 = pd.read_csv(
    './HUMAN_9606_idmapping_selected.tab.txt',
    sep="\t",
    names=idmapping_selected_columns,
    dtype={"GeneID": str},
    low_memory=False,
)

In [ ]:
# Preview the first rows of human2 to check that the supplied column names
# line up with the data.
human2.head()

In [ ]:
# Column names applied to the NCBI gene_info table, in file order.
column_names = [
    "tax_id",
    "GeneID",
    "Symbol",
    "LocusTag",
    "Synonyms",
    "dbXrefs",
    "chromosome",
    "map_location",
    "description",
    "type_of_gene",
    "Symbol_from_nomenclature_authority",
    "Full_name_from_nomenclature_authority",
    "Nomenclature_status",
    "Other_designations",
    "Modification_date",
]

# comment='#' makes pandas skip lines that start with '#'.
# NOTE(review): presumably this is here to drop a '#'-prefixed header line.
# The same option also discards the remainder of any data line after a '#'
# character, so confirm that no field in the file contains '#'.
# NOTE(review): confirm the local file has exactly these 15 columns; a file
# with extra columns would not line up with the names above.
ncbi_gene_info = pd.read_csv(
    './Homo_sapiens.gene_info.txt',
    names=column_names,
    sep='\t',
    comment='#',
    low_memory=False,
)

In [ ]:
# Preview the first rows of the NCBI gene_info table.
ncbi_gene_info.head()

In [ ]:
# Keep only the gene ID, symbol and full name, all cast to str so that
# GeneID can be used as a string join key.
ncbi_subset = ncbi_gene_info.loc[
    :, ["GeneID", "Symbol", "Full_name_from_nomenclature_authority"]
].astype(str)
ncbi_subset.head(20)

In [ ]:
# Outer join on GeneID: rows from either table that have no partner in the
# other are kept, with the missing side filled with NaN.
merged = human2.merge(ncbi_subset, on="GeneID", how="outer")

In [ ]:
# Preview the first rows of the joined table.
merged.head()

In [ ]:
# Report the (rows, columns) size of the joined table, then list its column
# names as the cell output.
print(merged.shape)

merged.columns.tolist()

In [ ]:
# Drop the columns that are not wanted in the exported table.
# The axis is passed by keyword: the positional form drop(label, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
# All three labels are removed in one call, so no intermediate frames are
# created.
df_final = merged.drop(['Additional PubMed', 'NCBI-taxon', 'PubMed'], axis=1)

In [ ]:
# Preview the first rows of the table that will be exported.
df_final.head()

In [ ]:
# Write the final table as a tab-separated file in the working directory;
# index=False leaves the DataFrame index out of the output.
df_final.to_csv('./idmapping.tsv', sep='\t', index=False)