notebook.community

Edit and run



In [1]:

    
import os
import gzip
import re
import json

import pandas



In [2]:

    
# Download human Entrez Gene information.
# This cell is disabled: uncomment the two lines below to fetch the file into
# download/, where the next cell expects download/Homo_sapiens.gene_info.gz.
# url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
# ! wget --timestamping --directory-prefix download/ $url



In [3]:

    
# Read the Entrez Gene info dataset into `gene_df`.
path = os.path.join('download', 'Homo_sapiens.gene_info.gz')

with gzip.open(path, 'rt') as read_file:
    # The first line carries the column names. An empty file yields '' here
    # and falls through to the explicit error below instead of StopIteration.
    header = next(read_file, '')

    # Legacy layout: '#Format: col1 col2 ... (explanatory note)'.
    matches = re.match(r'#Format: (.+) \(', header)
    if matches:
        columns = matches.group(1).split(' ')
    elif header.startswith('#') and '\t' in header:
        # Tab-separated layout: '#col1<TAB>col2<TAB>...'.
        columns = header.lstrip('#').rstrip('\r\n').split('\t')
    else:
        raise ValueError(
            'Unrecognized header line in {}: {!r}'.format(path, header)
        )

    # Only '-' (the missing-value marker this notebook already relied on) and
    # empty fields are treated as missing. pandas' default sentinels are
    # disabled so that a field whose entire text is e.g. 'NA' or 'NULL' is
    # kept as a string rather than silently turned into NaN.
    gene_df = pandas.read_table(
        read_file,
        names=columns,
        keep_default_na=False,
        na_values=['-', ''],
    )

# Restrict to homo sapiens
gene_df = gene_df.query('tax_id == 9606')

len(gene_df)









    Out[3]:





56352



In [4]:

    
# Extract symbols, synonyms and cross-references from `gene_df` into two
# long-format tables and write each one as a TSV under data/.
xref_rows = list()
symbol_rows = list()

# Walk the four needed columns in parallel. This avoids building a Series per
# row (as iterrows does) and yields plain Python values.
gene_records = zip(
    gene_df['GeneID'].tolist(),
    gene_df['Symbol'].tolist(),
    gene_df['Synonyms'].tolist(),
    gene_df['dbXrefs'].tolist(),
)

for gene_id, symbol, synonyms, dbXrefs in gene_records:
    # symbols
    if pandas.notnull(symbol):
        symbol_rows.append((gene_id, 'symbol', symbol))

    # synonyms are pipe-delimited within a single field
    if pandas.notnull(synonyms):
        for synonym in synonyms.split('|'):
            symbol_rows.append((gene_id, 'synonym', synonym))

    # xrefs are pipe-delimited 'resource:identifier' pairs; split on the first
    # colon only, because identifiers may themselves contain colons
    # (e.g. 'HGNC:HGNC:5' -> resource 'HGNC', identifier 'HGNC:5').
    if pandas.notnull(dbXrefs):
        for xref in dbXrefs.split('|'):
            db, ref = xref.split(':', 1)
            xref_rows.append((gene_id, db, ref))

# Make sure the output directory exists so the writes below do not fail on a
# fresh checkout.
os.makedirs('data', exist_ok=True)

xref_df = pandas.DataFrame(xref_rows, columns=['GeneID', 'resource', 'identifier'])
xref_df.to_csv('data/xrefs-human.tsv', sep='\t', index=False)

symbol_df = pandas.DataFrame(symbol_rows, columns=['GeneID', 'type', 'symbol'])
symbol_df.to_csv('data/symbols-human.tsv', sep='\t', index=False)



In [5]:

    
# Preview the first five cross-reference rows
xref_df.head(n=5)









    Out[5]:






  
    
      
      GeneID
      resource
      identifier
    
  
  
    
      0
      1
      MIM
      138670
    
    
      1
      1
      HGNC
      HGNC:5
    
    
      2
      1
      Ensembl
      ENSG00000121410
    
    
      3
      1
      HPRD
      00726
    
    
      4
      1
      Vega
      OTTHUMG00000183507



In [6]:

    
# Preview the first five symbol/synonym rows
symbol_df.head(n=5)









    Out[6]:






  
    
      
      GeneID
      type
      symbol
    
  
  
    
      0
      1
      symbol
      A1BG
    
    
      1
      1
      synonym
      A1B
    
    
      2
      1
      synonym
      ABG
    
    
      3
      1
      synonym
      GAB
    
    
      4
      1
      synonym
      HYST2477



In [7]:

    
# Write a curated subset of the gene table to disk, then preview it
columns = [
    'tax_id',
    'GeneID',
    'Symbol',
    'chromosome',
    'map_location',
    'type_of_gene',
    'description',
]
select_df = gene_df.loc[:, columns]
select_df.to_csv(
    'data/genes-human.tsv',
    index=False,
    sep='\t',
)
select_df.head(n=5)









    Out[7]:






  
    
      
      tax_id
      GeneID
      Symbol
      chromosome
      map_location
      type_of_gene
      description
    
  
  
    
      0
      9606
      1
      A1BG
      19
      19q13.4
      protein-coding
      alpha-1-B glycoprotein
    
    
      1
      9606
      2
      A2M
      12
      12p13.31
      protein-coding
      alpha-2-macroglobulin
    
    
      2
      9606
      3
      A2MP1
      12
      12p13.31
      pseudo
      alpha-2-macroglobulin pseudogene 1
    
    
      3
      9606
      9
      NAT1
      8
      8p22
      protein-coding
      N-acetyltransferase 1 (arylamine N-acetyltrans...
    
    
      4
      9606
      10
      NAT2
      8
      8p22
      protein-coding
      N-acetyltransferase 2 (arylamine N-acetyltrans...



In [8]:

    
# Compute dictionaries of symbols to GeneIDs and save them as JSON files.
#
# The table is re-read from disk with explicit dtypes and with pandas' default
# NA sentinels disabled. Otherwise a symbol or synonym whose text is literally
# 'NA', 'NULL', 'nan', ... would be parsed as a missing value and silently
# dropped from the mappings.
symbol_df = pandas.read_table(
    'data/symbols-human.tsv',
    dtype={'GeneID': int, 'type': str, 'symbol': str},
    keep_default_na=False,
)

symbol_to_id = dict()    # approved symbol -> GeneID (last occurrence wins)
synonym_to_ids = dict()  # synonym -> list of every GeneID that uses it

symbol_records = zip(
    symbol_df['GeneID'].tolist(),
    symbol_df['type'].tolist(),
    symbol_df['symbol'].tolist(),
)
for gene_id, symbol_type, symbol in symbol_records:
    # With NA parsing disabled a blank field arrives as '', so skip empties
    # here (this replaces the former null check).
    if not symbol:
        continue
    gene_id = int(gene_id)  # guarantee a JSON-serializable built-in int
    if symbol_type == 'symbol':
        symbol_to_id[symbol] = gene_id
    elif symbol_type == 'synonym':
        synonym_to_ids.setdefault(symbol, list()).append(gene_id)

with open('data/symbols-human.json', 'w') as write_file:
    json.dump(symbol_to_id, write_file, indent=2, sort_keys=True)
with open('data/synonyms-human.json', 'w') as write_file:
    json.dump(synonym_to_ids, write_file, indent=2, sort_keys=True)

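Create a symbol-to-GeneID mapping that includes both approved symbols and synonyms. A synonym is included only when it maps to exactly one GeneID, and an approved symbol takes precedence over a synonym with the same text.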



In [9]:

    
# Seed the map with every synonym that resolves to exactly one GeneID
symbol_map = dict(
    (synonym, gene_ids[0])
    for synonym, gene_ids in synonym_to_ids.items()
    if len(gene_ids) == 1
)

# Approved symbols win over any synonym with the same text
symbol_map.update(symbol_to_id)

with open('data/symbol-map.json', 'w') as write_file:
    serialized = json.dumps(symbol_map, indent=2, sort_keys=True)
    write_file.write(serialized)

	GeneID	resource	identifier
0	1	MIM	138670
1	1	HGNC	HGNC:5
2	1	Ensembl	ENSG00000121410
3	1	HPRD	00726
4	1	Vega	OTTHUMG00000183507

	GeneID	type	symbol
0	1	symbol	A1BG
1	1	synonym	A1B
2	1	synonym	ABG
3	1	synonym	GAB
4	1	synonym	HYST2477

	tax_id	GeneID	Symbol	chromosome	map_location	type_of_gene	description
0	9606	1	A1BG	19	19q13.4	protein-coding	alpha-1-B glycoprotein
1	9606	2	A2M	12	12p13.31	protein-coding	alpha-2-macroglobulin
2	9606	3	A2MP1	12	12p13.31	pseudo	alpha-2-macroglobulin pseudogene 1
3	9606	9	NAT1	8	8p22	protein-coding	N-acetyltransferase 1 (arylamine N-acetyltrans...
4	9606	10	NAT2	8	8p22	protein-coding	N-acetyltransferase 2 (arylamine N-acetyltrans...