In [16]:
from thermo import *
from thermo.identifiers import ChemicalMetadataDB
from numpy.testing import assert_allclose
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
db = ChemicalMetadataDB(elements=False, main_db=('anion_db_20171029.tsv'),
user_dbs=[]) # user_dbs=['/home/caleb/Documents/University/CHE3123/thermo/thermo/Identifiers/Cation db.tsv']
def formula_to_charge(formula):
splits = formula.split('-')
if len(splits) == 1 or splits[1] == '':
return -1
else:
return -1*int(splits[1])
# [(i.formula, formula_to_charge(i.formula)) for i in db.CAS_index.values()]
def formula_variations_ion(formula, charge):
formula = formula.split('-')[0]
formulas = [formula+'-'*abs(charge),
formula+str(charge),
formula+'('+'-'+ str(abs(charge)) + ')',
formula+'('+ str(abs(charge))+'-' + ')',
formula+'('+ '-'*abs(charge) + ')']
return formulas
# formula_variations_ion('AgH2O2-', -1)
# 259 in that book vs 339 here.
len(db.CAS_index)
Out[16]:
In [17]:
data = {}
with open('Original metadata.csv') as f:
f.readline()
for line in f.readlines():
if len(line.split('\t')) == 6:
name, name2, CAS, formula, charge, MW = line.split('\t')
else:
name, name2, CAS, formula, charge = line.split('\t')
MW = 0
data[CAS] = {'Name': name, 'Name2': name2, 'formula': formula, 'charge':int(charge), 'MW': float(MW)}
# data
In [ ]:
In [18]:
good_syns = {CAS:{'synonyms': []} for CAS, d in data.items()}
for CAS, d in data.items():
if d['MW']:
good_syns[CAS]['synonyms'].append(d['Name2'])
for CAS, d in data.items():
good_syns[CAS]['synonyms'].extend(formula_variations_ion(d['formula'], d['charge']))
for CAS, d in db.CAS_index.items():
CAS = d.CASs
syns = formula_variations_ion(d.formula, formula_to_charge(d.formula))
if CAS in good_syns:
syns = [i for i in syns if i not in good_syns[CAS]['synonyms']]
good_syns[CAS]['synonyms'].extend(syns)
else:
good_syns[CAS] = {}
good_syns[CAS]['synonyms'] = syns
good_syns['338-70-5']['synonyms'].append('oxalate')
# good_syns['18500-32-8']['synonyms'].append('H2NNH3+')
# 'H2P2O7-2 is in there as P2O7H2-2 - look into unique searching by formula? Should be possible.
# 34175-11-6 'HS2O4-', as HO4S2-
# 14102-45-5 is 'H2AsO3-',
# HP2O7-3 as well
# 44030-61-7 is 'HPO3F-', here's a good case to the search
# OCN NCO- 661-20-1
# 920-52-5 'C2O4H-' already done
['CH3COO-', 'CHOO-']
ClO2m = {'formula': 'ClO2-', 'MW': 67.448, 'pubchem': 197148, 'smiles': '[O-]Cl=O',
'inchikey': 'QBWCMBCROVPCKQ-UHFFFAOYSA-M', 'inchi': 'InChI=1S/ClHO2/c2-1-3/h(H,2,3)/p-1'}
# Get the rest from the api
# Key problem - if there's no mol downloaded, there's a no go.
ClO3m = {'formula': 'ClO3-', 'pubchem': 104770}
Br3m = {'pubchem': 77881}
BrO3m = {'pubchem': 84979}
AlOm = {'smiles': '[Al-].[O]', 'MW': 42.981, 'formula': 'AlO-'} # No inchi
BrO4m = {'pubchem': 5460630}
IO4m = {'pubchem': 167232}
B4O7m = {'pubchem': 91932232}
HF2m = {'pubchem': 21864337}
SiF6m = {'pubchem': 28117}
B4O7m = {'pubchem': 91932232}
N2O2m = {'pubchem': 4686309}
SbF6m = {'pubchem': 3868826}
IO3m = {'pubchem': 84927}
AgO2H2m = {'smiles': '[Ag+1].[OH-].[OH-]', 'formula': 'H2AgO2-1', 'MW': 124.8752}
CrO4H4m = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Cr+3]', 'formula': 'CrH4O4-', 'MW': 120.024} # read wrong no pubchem
TiO5H5 = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Ti+4]', 'MW': 132.902, 'formula': 'TiH5O5-'} # read wrong no pubchem
FeO4H4m3 = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Fe+1]', 'formula': 'FeH4O4-3', 'MW': 123.873} # read wrong no pubchem
FeO4H4m = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Fe+3]', 'formula': 'FeH4O4-', 'MW': 123.873} # read wrong no pubchem
FeO4H4m2 = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Fe+2]', 'formula': 'FeH4O4-2', 'MW': 123.873} # read wrong no pubchem
FeO3H3m = {'smiles': '[OH-].[OH-].[OH-].[Fe+2]', 'formula': 'FeH3O3-', 'MW': 106.866}
FeOH2m = {'formula': 'FeH2O-', 'MW': 73.86028, 'smiles': '[H-].[OH-].[Fe+1]'}
HB4O7m = {'formula': 'HB4O7-', 'MW': 156.24774} # no smiles available?
H2P2m = {'formula': 'H2P2-', 'MW': molecular_weight(nested_formula_parser('H2P2-'))}
custom_ions = {'14998-27-7': ClO2m, '14866-68-3': ClO3m, '14522-80-6': Br3m, '15541-45-4': BrO3m,
'12758-12-2': AlOm, '16474-32-1': BrO4m, '15056-35-6': IO4m, '12258-53-6': B4O7m,
'18130-74-0': HF2m, '17084-08-1': SiF6m, '12258-53-6': B4O7m, '15435-66-2': N2O2m,
'17111-95-4': SbF6m, '15454-31-6': IO3m, '12258-16-1': AgO2H2m, '97775-49-0':CrO4H4m,
'119046-04-7': TiO5H5, '29145-79-7': FeO4H4m, '150393-25-2': FeO4H4m3,
'73128-36-6': FeO4H4m2, '70756-39-7': FeO3H3m, '150381-43-4':FeOH2m,
'12447-33-5': HB4O7m, '107596-48-5': H2P2m }
# 15454-31-6
for CAS, d in custom_ions.items():
if CAS in good_syns:
good_syns[CAS].update(d)
else:
good_syns[CAS] = d
import json
f = open('Good synoynms by CAS2.json', 'w')
json.dump(good_syns, f, indent=2, separators=(',', ': '), sort_keys=True)
f.close()
In [ ]:
In [ ]:
In [12]:
from collections import Counter
ns = []
for i in db.CAS_index.values():
ns.extend(list(set(i.all_names)))
Counter(ns).most_common(20)
# No dup names :)
# len(ns)
# len(a.CAS_index), len(a.pubchem_index)
# TODO oxalate goes to the one without Hs, 71081 338-70-5 C2O4-2
Out[12]:
In [ ]:
In [4]:
# None of the charges are wrong?
for CAS, d in data.items():
chem = db.search_CAS(CAS)
if not chem:
continue
print('NOTINDB', CAS)
continue
mol = Chem.MolFromSmiles(chem.smiles)
if mol is None:
continue
print('CANTREADMOL', CAS)
continue
# print(Chem.MolToSmiles(mol))
charge = Chem.GetFormalCharge(mol)
try:
assert charge == d['charge']
# print('PASS', charge, d['charge'])
except:
print('F:', charge, d['charge'], CAS)
In [33]:
# Chem.GetFormalCharge(Chem.MolFromSmiles('[SbH6+3]'))
# a.search_CAS('16971-29-2').InChI, a.search_CAS('16971-29-2').formula, a.search_CAS('16971-29-2').smiles
In [ ]:
In [6]:
# mol = Chem.MolFromMolFile('mol/14695-95-5.mol')
# mol = Chem.MolFromMolFile('/tmp/399316.mol')
# # # mol = Chem.MolFromSmiles('[Sb+3]')
# # # When read, 1 atom
# # Chem.MolToSmiles(mol, allHsExplicit=True)
# # mol.GetNumAtoms()
# mw = Descriptors.MolWt(mol)
# formula = CalcMolFormula(mol)
# mw, formula
In [5]:
# Most of the MW ones fail due to having added extra hydrogens???? OR MW?
for CAS, d in data.items():
chem = db.search_CAS(CAS)
if not chem or d['MW'] == 0:
continue
try:
assert_allclose(chem.MW, d['MW'], atol=0.3)
except:
print('F:', CAS, chem.MW, d['MW'], chem)
# Plenty of outstanding work here and with the charges.
# 20561-39-1 astatine mos stable.
# 16518-47-1 Arsenate (AsO43-), dihydrogen (8CI,9CI), must be a typo - clearly weight is higher.
# 14493-01-7 is fine, confirmed.
# 19469-81-9 H4Al is simply wrong.
# 14897-04-2 is fine
# 34786-97-5 is fine, Vanadate (V(OH)2O21-), their error
# 14333-20-1 Pertechnetate is fine
# 16844-87-4 Arsenate (AsO43-), monohydrogen is fine
# 26450-38-4 their typo for sure
# 15390-83-7 is fine for sure, their bad
In [ ]:
In [ ]:
In [ ]: