In [6]:
import json
import csv

In [22]:
# Build a lookup from column 0 to column 1 of the tab-separated human export
# (going by the map's name: Ensembl gene ID -> Entrez gene ID).
# Rows with an empty column 1 are skipped; when a column-0 value occurs on
# several rows, the last row with a non-empty column 1 wins.
# NOTE(review): if the export starts with a header row, that row is stored as
# an ordinary entry -- confirm whether that matters downstream.
hum_ensembl_entrez_map = {}
with open('mart_export_hum.txt', 'r') as mar_export_hum_file:
    mar_export_hum = csv.reader(mar_export_hum_file, delimiter='\t')
    for row in mar_export_hum:
        # csv.reader yields [] for a blank line and a short list for a
        # truncated one, so check the length before touching row[1] to avoid
        # an IndexError.
        if len(row) > 1 and row[1]:
            hum_ensembl_entrez_map[row[0]] = row[1]

In [34]:
# Build two lookups from the tab-separated mouse export. Both are keyed by the
# accession found in column 1 and/or column 2 of a row (the original debug
# note called these "NM" and "NR"):
#   mus_refseq_entrez_map:  accession -> column 0
#   mus_refseq_ensembl_map: accession -> column 3
# Going by the map names, column 0 is presumably the Entrez gene ID and
# column 3 the Ensembl gene ID -- TODO confirm against the export's columns.
# Empty cells never create or overwrite an entry; on repeated accessions the
# last row with a non-empty value wins.
mus_refseq_entrez_map = {}
mus_refseq_ensembl_map = {}
with open('mart_export_mus.txt', 'r') as mar_export_mus_file:
    mar_export_mus = csv.reader(mar_export_mus_file, delimiter='\t')
    for row in mar_export_mus:
        # csv.reader yields [] for blank lines and short lists for truncated
        # ones; pad to four columns so missing trailing cells are treated as
        # empty instead of raising IndexError.
        if len(row) < 4:
            row = row + [''] * (4 - len(row))
        entrez_id = row[0]
        ensembl_id = row[3]
        # Column 1 is handled before column 2, matching the original order.
        for refseq_id in (row[1], row[2]):
            if not refseq_id:
                continue
            if entrez_id:
                mus_refseq_entrez_map[refseq_id] = entrez_id
            if ensembl_id:
                mus_refseq_ensembl_map[refseq_id] = ensembl_id

In [80]:
# Load both gene lists. Each file is parsed as JSON, including the mouse file
# despite its .txt extension.
with open('genes_list.json') as hum_fh:
    hum_data = json.load(hum_fh)

with open('genes_list_GRCm38.txt') as mus_fh:
    mus_data = json.load(mus_fh)

# Sanity check: show the first human Ensembl ID with everything from the first
# '.' onwards removed, then the first mouse record as loaded.
first_hum_id = hum_data[0]['ensembl_id']
print(first_hum_id.split('.')[0])
print(mus_data[0])


ENSG00000261122
{u'ensembl_id': u'NM_001282945', u'name': u'Adora1'}

In [77]:
# Scratch check of str.upper(); a later cell applies it to human gene names.
'hello'.upper()


Out[77]:
'HELLO'

In [78]:
# Scratch check: split('.')[0] keeps only the text before the first '.'.
# This notebook uses that on human 'ensembl_id' values before map lookups.
'3333.333'.split('.')[0]


Out[78]:
'3333'

In [82]:
# Annotate every human gene record in place:
#   * 'name' is upper-cased;
#   * 'entrez_id' is looked up in hum_ensembl_entrez_map using the record's
#     'ensembl_id' with everything from the first '.' onwards removed, and is
#     set to "" when the map has no entry for it.
for record in hum_data:
    record['name'] = record['name'].upper()
    unversioned_id = record['ensembl_id'].split('.')[0]
    record['entrez_id'] = hum_ensembl_entrez_map.get(unversioned_id, "")

In [83]:
# Serialise the annotated human records and write them to a new file, leaving
# the input file untouched.
hum_json_text = json.dumps(hum_data)
with open('genes_list.json2', 'w') as hum_out:
    hum_out.write(hum_json_text)

In [84]:
# Annotate every mouse gene record in place. The record's 'ensembl_id' field
# is used as the key into both RefSeq-keyed maps (the sample record printed
# earlier holds an NM_ accession there):
#   * 'entrez_id'       <- mus_refseq_entrez_map, or "" when absent;
#   * 'ensembl_id_real' <- mus_refseq_ensembl_map, or "" when absent.
for record in mus_data:
    refseq_key = record['ensembl_id']
    record['entrez_id'] = mus_refseq_entrez_map.get(refseq_key, "")
    record['ensembl_id_real'] = mus_refseq_ensembl_map.get(refseq_key, "")

In [85]:
# Serialise the annotated mouse records and write them to a new file, leaving
# the input file untouched.
mus_json_text = json.dumps(mus_data)
with open('genes_list_GRCm38.txt2', 'w') as mus_out:
    mus_out.write(mus_json_text)

In [86]:
# Spot-check the first annotated record of each list (mouse first, then human).
for sample_record in (mus_data[0], hum_data[0]):
    print(sample_record)


{'ensembl_id_real': 'ENSMUSG00000042429', u'ensembl_id': u'NM_001282945', 'entrez_id': '11539', u'name': u'Adora1'}
{u'ensembl_id': u'ENSG00000261122.2', 'entrez_id': '400533', u'name': u'5S_RRNA', u'description': u''}

In [ ]: