In [1]:
import os
import gzip
import re
import json

import pandas

In [2]:
# Download step (disabled). Uncomment the lines below to fetch
# Homo_sapiens.gene_info.gz into download/, which is the location the
# next cell reads from.
# # Download human entrez gene information
# url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
# ! wget --timestamping --directory-prefix download/ $url

In [3]:
# Read the Entrez gene_info dataset into a DataFrame
path = os.path.join('download', 'Homo_sapiens.gene_info.gz')

with gzip.open(path, 'rt') as read_file:
    # The first line is a header comment naming the columns. Consume it here
    # so that read_table below only sees data rows.
    header = next(read_file, '')
    matches = re.match(r'#Format: (.+) \(', header)
    if matches:
        # Header of the form "#Format: col1 col2 ... (explanatory text"
        columns = matches.group(1).split(' ')
    elif header.startswith('#') and '\t' in header:
        # NOTE(review): newer NCBI releases appear to ship a plain
        # tab-separated header ("#tax_id<TAB>GeneID<TAB>...") -- confirm
        # against the file actually downloaded.
        columns = header[1:].rstrip('\r\n').split('\t')
    else:
        # Fail loudly instead of with an AttributeError on a None match
        raise ValueError(
            'Unrecognized gene_info header line: {!r}'.format(header))
    # Only '-' marks a missing value in this file. Disable pandas' default
    # NA strings so that fields spelled like "NA", "NaN" or "null" are kept
    # as text instead of silently becoming missing.
    gene_df = pandas.read_table(
        read_file,
        names=columns,
        na_values=['-'],
        keep_default_na=False,
    )

# Restrict to homo sapiens
gene_df = gene_df.query('tax_id == 9606')

len(gene_df)


Out[3]:
56352

In [4]:
# Build long-format tables of gene names and cross-references.
# symbol_rows holds (GeneID, 'symbol' | 'synonym', name) records and
# xref_rows holds (GeneID, resource, identifier) records.
symbol_rows = []
xref_rows = []

gene_fields = zip(
    gene_df['GeneID'],
    gene_df['Symbol'],
    gene_df['Synonyms'],
    gene_df['dbXrefs'],
)
for gene_id, approved, alias_field, xref_field in gene_fields:
    # The approved symbol comes first, followed by this gene's synonyms
    if pandas.notnull(approved):
        symbol_rows.append((gene_id, 'symbol', approved))

    # Synonyms are pipe-delimited within a single field
    if pandas.notnull(alias_field):
        symbol_rows.extend(
            (gene_id, 'synonym', alias) for alias in alias_field.split('|'))

    # Cross-references are pipe-delimited "resource:identifier" entries.
    # Split on the first colon only, since identifiers may contain colons.
    if pandas.notnull(xref_field):
        for entry in xref_field.split('|'):
            resource, identifier = entry.split(':', 1)
            xref_rows.append((gene_id, resource, identifier))

symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])
xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])

xref_df.to_csv('data/xrefs-human.tsv', sep='\t', index=False)
symbol_df.to_csv('data/symbols-human.tsv', sep='\t', index=False)

In [5]:
# Preview the first rows of the cross-reference table
xref_df.head()


Out[5]:
GeneID resource identifier
0 1 MIM 138670
1 1 HGNC HGNC:5
2 1 Ensembl ENSG00000121410
3 1 HPRD 00726
4 1 Vega OTTHUMG00000183507

In [6]:
# Preview the first rows of the symbol/synonym table
symbol_df.head()


Out[6]:
GeneID type symbol
0 1 symbol A1BG
1 1 synonym A1B
2 1 synonym ABG
3 1 synonym GAB
4 1 synonym HYST2477

In [7]:
# Export a reduced gene table holding only the core descriptive fields
columns = [
    'tax_id',
    'GeneID',
    'Symbol',
    'chromosome',
    'map_location',
    'type_of_gene',
    'description',
]
select_df = gene_df.loc[:, columns]
select_df.to_csv('data/genes-human.tsv', index=False, sep='\t')
select_df.head()


Out[7]:
tax_id GeneID Symbol chromosome map_location type_of_gene description
0 9606 1 A1BG 19 19q13.4 protein-coding alpha-1-B glycoprotein
1 9606 2 A2M 12 12p13.31 protein-coding alpha-2-macroglobulin
2 9606 3 A2MP1 12 12p13.31 pseudo alpha-2-macroglobulin pseudogene 1
3 9606 9 NAT1 8 8p22 protein-coding N-acetyltransferase 1 (arylamine N-acetyltrans...
4 9606 10 NAT2 8 8p22 protein-coding N-acetyltransferase 2 (arylamine N-acetyltrans...

In [8]:
# Compute dictionaries of symbols to GeneIDs and save as JSON files.
# Read every field literally: pandas' default NA parsing would turn real
# symbols/synonyms spelled like "NA", "NaN" or "null" into missing values,
# and they would then be dropped from the mappings.
symbol_df = pandas.read_table(
    'data/symbols-human.tsv',
    dtype={'symbol': str},
    keep_default_na=False,
)

symbol_to_id = dict()    # approved symbol -> GeneID (a later row overwrites an earlier one)
synonym_to_ids = dict()  # synonym -> list of every GeneID listing that synonym

# .tolist() yields built-in int/str values, which json.dump can serialize
records = zip(
    symbol_df['GeneID'].tolist(),
    symbol_df['type'].tolist(),
    symbol_df['symbol'].tolist(),
)
for gene_id, symbol_type, symbol in records:
    if not symbol:
        # An empty field carries no name to map
        continue
    if symbol_type == 'symbol':
        symbol_to_id[symbol] = gene_id
    elif symbol_type == 'synonym':
        synonym_to_ids.setdefault(symbol, list()).append(gene_id)

with open('data/symbols-human.json', 'w') as write_file:
    json.dump(symbol_to_id, write_file, indent=2, sort_keys=True)
with open('data/synonyms-human.json', 'w') as write_file:
    json.dump(synonym_to_ids, write_file, indent=2, sort_keys=True)

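Create a symbol-to-GeneID mapping that includes both approved symbols and synonyms. Only synonyms that refer to exactly one gene are included, and an approved symbol takes precedence over a synonym with the same name.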


In [9]:
# Keep only the synonyms that point to exactly one GeneID
unique_synonyms = dict()
for synonym, gene_ids in synonym_to_ids.items():
    if len(gene_ids) != 1:
        continue
    unique_synonyms[synonym] = gene_ids[0]

# Approved symbols come last in the merge, so they override any synonym
# that shares the same name
symbol_map = {**unique_synonyms, **symbol_to_id}

with open('data/symbol-map.json', 'w') as write_file:
    json.dump(symbol_map, write_file, indent=2, sort_keys=True)