In [ ]:
# Source data set builder for in-memory data table used in ID Mapper.
import pandas as pd
import json
# Remote locations of the original data sets.
# Every URL is built from the directory shared by its provider, so only the
# organism-specific tail differs between entries.
_NCBI_GENE_INFO_DIR = "ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO"
_UNIPROT_BY_ORGANISM_DIR = (
    "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release"
    "/knowledgebase/idmapping/by_organism"
)

# NCBI Gene: gene_info dumps, one per organism
NCBI_HUMAN = _NCBI_GENE_INFO_DIR + "/Mammalia/Homo_sapiens.gene_info.gz"
NCBI_YEAST = _NCBI_GENE_INFO_DIR + "/Fungi/Saccharomyces_cerevisiae.gene_info.gz"
NCBI_MOUSE = _NCBI_GENE_INFO_DIR + "/Mammalia/Mus_musculus.gene_info.gz"
NCBI_FLY = _NCBI_GENE_INFO_DIR + "/Invertebrates/Drosophila_melanogaster.gene_info.gz"

# UniProt: "idmapping_selected" tables, one per organism
UNIPROT_HUMAN = _UNIPROT_BY_ORGANISM_DIR + "/HUMAN_9606_idmapping_selected.tab.gz"
UNIPROT_MOUSE = _UNIPROT_BY_ORGANISM_DIR + "/MOUSE_10090_idmapping_selected.tab.gz"
UNIPROT_YEAST = _UNIPROT_BY_ORGANISM_DIR + "/YEAST_559292_idmapping_selected.tab.gz"
UNIPROT_FLY = _UNIPROT_BY_ORGANISM_DIR + "/DROME_7227_idmapping_selected.tab.gz"
In [ ]:
# Base ID mapping table from UniProt, read from a local tab-separated copy.
# The file is parsed without a header row; the three column names are
# supplied here instead.
human1 = pd.read_csv(
    './HUMAN_9606_idmapping.dat.txt',
    sep="\t",
    header=None,
    names=["UniProtKB-AC", "ID_type", "ID"],
)
In [ ]:
# Collect the distinct ID_type values of the base table into a one-column
# frame, sorted alphabetically, and preview the first rows.
id_types = human1["ID_type"].unique()
types = pd.DataFrame({"ID_TYPE": id_types}).sort_values("ID_TYPE")
types.head()
In [ ]:
# Column names assigned to the idmapping_selected table when it is loaded.
# The order is significant: names are matched to file columns by position.
idmapping_selected_columns = (
    ['UniProtKB-AC', 'UniProtKB-ID', 'GeneID']
    + ['RefSeq', 'GI', 'PDB', 'GO']
    + ['UniRef100', 'UniRef90', 'UniRef50', 'UniParc']
    + ['PIR', 'NCBI-taxon', 'MIM', 'UniGene', 'PubMed']
    + ['EMBL', 'EMBL-CDS']
    + ['Ensembl', 'Ensembl_TRS', 'Ensembl_PRO']
    + ['Additional PubMed']
)
# Load the UniProt "selected" ID mapping table from its local tab-separated
# copy, naming the columns by position.
# GeneID is pinned to str rather than left to dtype inference: it is later
# used as the merge key against the NCBI table, whose GeneID is cast to str.
# An inferred numeric dtype would make that merge fail on mismatched key types.
human2 = pd.read_csv(
    './HUMAN_9606_idmapping_selected.tab.txt',
    sep="\t",
    names=idmapping_selected_columns,
    dtype={'GeneID': str},
    low_memory=False,
)
In [ ]:
# Preview the first five rows of the UniProt selected-mapping table.
human2.head(n=5)
In [ ]:
# Column names applied, by position, to the NCBI gene_info table.
column_names = [
    "tax_id",
    "GeneID",
    "Symbol",
    "LocusTag",
    "Synonyms",
    "dbXrefs",
    "chromosome",
    "map_location",
    "description",
    "type_of_gene",
    "Symbol_from_nomenclature_authority",
    "Full_name_from_nomenclature_authority",
    "Nomenclature_status",
    "Other_designations",
    "Modification_date",
]
# Load the local NCBI gene_info table, naming the columns by position.
# The leading '#'-prefixed header line is skipped explicitly instead of via
# comment='#': the comment option also cuts off the remainder of any data row
# that contains '#' inside a field, silently blanking the trailing columns.
# NOTE(review): newer NCBI gene_info releases may carry more columns than the
# 15 named here - confirm against the local file, since pandas treats surplus
# leading columns as the index.
gene_info_path = './Homo_sapiens.gene_info.txt'
with open(gene_info_path, 'rb') as gene_info_file:
    # Read as bytes so that sniffing the first line needs no text decoding.
    has_header = gene_info_file.readline().startswith(b'#')
ncbi_gene_info = pd.read_csv(
    gene_info_path,
    sep='\t',
    names=column_names,
    skiprows=1 if has_header else 0,
    low_memory=False,
)
In [ ]:
# Preview the first five rows of the NCBI gene_info table.
ncbi_gene_info.head(n=5)
In [ ]:
# Keep only the columns needed for the mapping table.
# Only the merge key (GeneID) is cast to str. Casting every column would turn
# missing Symbol / Full_name values into the literal text "nan", which would
# then be written to the output file as if it were real data.
ncbi_subset = ncbi_gene_info[
    ["GeneID", "Symbol", "Full_name_from_nomenclature_authority"]
].astype({"GeneID": str})
ncbi_subset.head(20)
In [ ]:
# Join the UniProt table with the NCBI subset on GeneID. The outer join keeps
# rows from both sides even when a GeneID has no match in the other table.
merged = human2.merge(ncbi_subset, on="GeneID", how="outer")
In [ ]:
# Preview the first five rows of the merged table.
merged.head(n=5)
In [ ]:
# Report the merged table's (rows, columns) shape, then list its column names.
print(merged.shape)
merged.columns.tolist()
In [ ]:
# Drop unnecessary columns in a single call.
# axis is passed by keyword: the positional form drop(label, 1) was deprecated
# in pandas 1.3 and raises TypeError on pandas 2.x.
df_final = merged.drop(['Additional PubMed', 'NCBI-taxon', 'PubMed'], axis=1)
In [ ]:
# Preview the first five rows of the final table.
df_final.head(n=5)
In [ ]:
# Write the final mapping table as tab-separated text, without the row index.
df_final.to_csv(
    './idmapping.tsv',
    sep='\t',
    index=False,
)