In [13]:
from thermo import *
from thermo.identifiers import ChemicalMetadataDB
from numpy.testing import assert_allclose
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
# Alternative input file (its name suggests a 2017-10-30 snapshot), kept for reference:
# db = ChemicalMetadataDB(elements=False, main_db=('cation_db_20171030.tsv'), user_dbs=[])

# Build the metadata database from the cation TSV only: no elements and no
# user databases are requested.
db = ChemicalMetadataDB(
    elements=False,
    main_db='Cation db.tsv',
    user_dbs=[],
)
def formula_to_charge(formula):
    """Return the charge encoded after the first '+' of a cation formula.

    Parameters
    ----------
    formula : str
        Formula string such as ``'Na+'``, ``'Ca+2'`` or ``'Fe+++'``.

    Returns
    -------
    int
        ``1`` when the formula has no ``'+'`` or a single trailing ``'+'``;
        the number of plus signs when the formula ends in a run of them
        (``'Fe+++'`` -> 3); otherwise the integer that follows the first
        ``'+'`` (``'Ca+2'`` -> 2).

    Raises
    ------
    ValueError
        If the text following the first ``'+'`` is non-empty and is not an
        integer.
    """
    parts = formula.split('+')
    if len(parts) == 1:
        # No '+' at all: treated as singly charged.
        return 1
    suffix = parts[1:]
    if suffix[0] == '':
        if any(suffix):
            # Something other than '+' follows a doubled '+', e.g. 'X++2';
            # keep the historical answer of 1 for such malformed input.
            return 1
        # The formula ends in a run of plus signs: each one is a unit charge.
        return len(suffix)
    return int(suffix[0])
# [(i.formula, formula_to_charge(i.formula)) for i in db.CAS_index.values()]
def formula_variations_ion(formula, charge):
    """List five spellings of an ion formula with its charge.

    Everything from the first ``'+'`` onward in ``formula`` is discarded and
    the remaining base is combined with ``charge`` in these forms, in order:
    ``X++``, ``X+2``, ``X(+2)``, ``X(2+)`` and ``X(++)`` (shown for a base
    ``X`` and a charge of 2).

    Parameters
    ----------
    formula : str
        Ion formula, with or without a charge suffix.
    charge : int
        Charge of the ion. Its absolute value is used everywhere except in
        the ``X+<charge>`` form, which embeds ``str(charge)`` unchanged.

    Returns
    -------
    list of str
        The five variations, in the order given above.
    """
    base = formula.split('+')[0]
    magnitude = abs(charge)
    plus_run = '+' * magnitude
    return [
        base + plus_run,
        '{}+{}'.format(base, charge),
        '{}(+{})'.format(base, magnitude),
        '{}({}+)'.format(base, magnitude),
        '{}({})'.format(base, plus_run),
    ]
In [8]:
# Load the hand-curated ion metadata into ``data``, keyed by CAS number.
# Despite the ``.csv`` extension the rows are split on tabs.  Each row has
# five columns (name, second name, CAS, formula, charge) and optionally a
# sixth holding the molecular weight.
data = {}
with open('Original metadata.csv') as f:
    f.readline()  # the first line is skipped, not parsed
    for line in f:
        if not line.strip():
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            continue
        fields = line.split('\t')
        if len(fields) == 6:
            name, name2, CAS, formula, charge, MW = fields
        else:
            name, name2, CAS, formula, charge = fields
            # No MW column: use an empty string so the shared handling below
            # works (the previous ``MW = 0`` crashed on ``MW.strip()``).
            MW = ''
        MW = MW.strip()
        # A missing or empty MW is stored as 0.0; later cells treat 0 as "unknown".
        data[CAS] = {'Name': name, 'Name2': name2, 'formula': formula,
                     'charge': int(charge), 'MW': float(MW) if MW else 0.0}
In [4]:
#db.CAS_index.keys()
# db.CAS_index[12099995000000]
# db.CAS_index
In [12]:
# Build ``good_syns``: CAS -> {'synonyms': [...]}.
# Every CAS from the curated metadata gets its second name (only when an MW
# was recorded for it) followed by the charge-notation variations of its formula.
good_syns = {CAS: {'synonyms': []} for CAS in data}
for CAS, d in data.items():
    if d['MW']:
        good_syns[CAS]['synonyms'].append(d['Name2'])
    good_syns[CAS]['synonyms'].extend(formula_variations_ion(d['formula'], d['charge']))

# Make sure every ion in the database has an entry, even if an empty one.
# Entries that already exist are left untouched: merging formula-derived
# variations into them was disabled in the original notebook, so those
# variations are no longer computed here only to be thrown away.
for d in db.CAS_index.values():
    CAS = d.CASs
    good_syns.setdefault(CAS, {'synonyms': []})
# Hand-picked extra names for two specific CAS numbers.
good_syns['14464-47-2']['synonyms'].append('deuterium(1+)')
good_syns['2099995000-00-0']['synonyms'].append('Yttrium dihydroxide ion')

# good_syns['14878-41-2']['synonyms'].extend(['Co(en)3+3', '[Co(en)3]+3'])
# For each bracketed formula below, register the charge-notation variations
# both with and without the square brackets.
complex_ions = (
    ('[Co(en)3]+3', '14878-41-2'),
    ('[Co2(trien)3]+6', '747348-22-7'),
    ('[Ni2(trien)3]+4', '12312-87-7'),
)
for f, CAS in complex_ions:
    charge = formula_to_charge(f)
    unbracketed = f.replace('[', '').replace(']', '')
    variations = formula_variations_ion(f, charge)
    variations += formula_variations_ion(unbracketed, charge)
    good_syns[CAS]['synonyms'].extend(variations)
# Two ions described by hand: formula, molecular weight computed from the
# parsed formula, and a SMILES string.
InOHp3 = {
    'formula': 'InOH+3',
    'MW': molecular_weight(nested_formula_parser('OHIn+3')),
    'smiles': '[OH-].[In+4]',
}
MgOHp2 = {
    'formula': 'MgOH+2',
    'MW': molecular_weight(nested_formula_parser('MgOH+2')),
    'smiles': '[OH-].[Mg+3]',
}
custom_ions = {'1313606-05-1': InOHp3, '289651-09-8': MgOHp2}

# Merge the custom fields into an existing entry, or install the custom dict
# itself as the entry.  Note that an entry installed this way has no
# 'synonyms' key.
for CAS, d in custom_ions.items():
    if CAS not in good_syns:
        good_syns[CAS] = d
    else:
        good_syns[CAS].update(d)
import json

# Write the synonym table as pretty-printed JSON with sorted keys.  The
# ``with`` block guarantees the file is closed (and its buffer flushed) even
# if serialisation fails part-way through.
with open('Good synoynms by CAS2.json', 'w') as f:
    json.dump(good_syns, f, indent=2, separators=(',', ': '), sort_keys=True)
print('hi4.1')
In [ ]:
In [ ]:
In [12]:
# Only 25 synonyms to go through, plus to remove the dups in there - not bad.
from collections import Counter

# NOTE(review): `a` is not defined in any visible cell - presumably a
# ChemicalMetadataDB-like object loaded elsewhere; confirm before re-running.
# Names are de-duplicated per chemical first, so any count above 1 below
# means the same name is attached to more than one chemical.
ns = [name for i in a.CAS_index.values() for name in set(i.all_names)]
# assert len(i.all_names) == len(set(i.all_names))
# len(ns), len(set(ns))
Counter(ns).most_common(20)
Out[12]:
In [13]:
# None of the charges are wrong?
# Compare the formal charge RDKit derives from each chemical's SMILES with
# the charge recorded in the curated metadata, printing one 'F:' line per
# mismatch.
# NOTE(review): `a` is not defined in any visible cell - confirm where it
# comes from before re-running.
for CAS, d in data.items():
    chem = a.search_CAS(CAS)
    if not chem:
        continue
    mol = Chem.MolFromSmiles(chem.smiles)
    if mol is None:
        # RDKit returns None for SMILES it cannot parse; report instead of crashing.
        print('F:', 'invalid SMILES', chem.smiles, CAS)
        continue
    # print(Chem.MolToSmiles(mol))
    charge = Chem.GetFormalCharge(mol)
    # Explicit comparison: an `assert` inside a bare `except` is skipped
    # entirely under `python -O` and would hide unrelated errors.
    if charge != d['charge']:
        print('F:', charge, d['charge'], CAS)
    # else: print('PASS', charge, d['charge'])
In [4]:
# Chem.GetFormalCharge(Chem.MolFromSmiles('[SbH6+3]'))
In [33]:
len(a.pubchem_index), len(a.CAS_index)
Out[33]:
In [6]:
# mol = Chem.MolFromMolFile('mol/14695-95-5.mol')
# mol = Chem.MolFromMolFile('/tmp/399316.mol')
# # # mol = Chem.MolFromSmiles('[Sb+3]')
# # # When read, 1 atom
# # Chem.MolToSmiles(mol, allHsExplicit=True)
# # mol.GetNumAtoms()
# mw = Descriptors.MolWt(mol)
# formula = CalcMolFormula(mol)
# mw, formula
In [14]:
# Most of the MW ones fail due to having added extra hydrogens???? OR MW?
# Compare each chemical's molecular weight with the curated value (absolute
# tolerance 0.25), skipping entries with no database match or no recorded MW.
# NOTE(review): `a` is not defined in any visible cell - confirm where it
# comes from before re-running.
for CAS, d in data.items():
    chem = a.search_CAS(CAS)
    if not chem or d['MW'] == 0:
        continue
    try:
        assert_allclose(chem.MW, d['MW'], atol=0.25)
    except Exception:
        # Any ordinary failure (mismatch or bad value) is reported the same
        # way as before; unlike a bare `except:`, KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print('F:', CAS, chem.MW, d['MW'], chem)
# 4 plutonium, 3 americium, 1 curium, 1 promethium, 1 deuterium
In [ ]:
In [ ]:
In [ ]: