In [16]:
from thermo import *
from thermo.identifiers import ChemicalMetadataDB
from numpy.testing import assert_allclose
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

db = ChemicalMetadataDB(elements=False, main_db=('anion_db_20171029.tsv'), 
                       user_dbs=[]) #  user_dbs=['/home/caleb/Documents/University/CHE3123/thermo/thermo/Identifiers/Cation db.tsv']
def formula_to_charge(formula):
    splits = formula.split('-')
    if len(splits) == 1 or splits[1] == '':
        return -1
    else:
        return -1*int(splits[1])
    
# [(i.formula, formula_to_charge(i.formula)) for i in db.CAS_index.values()]
def formula_variations_ion(formula, charge):
    formula = formula.split('-')[0]
    formulas = [formula+'-'*abs(charge), 
                formula+str(charge),
                formula+'('+'-'+ str(abs(charge)) + ')',
                formula+'('+ str(abs(charge))+'-' + ')',
                formula+'('+ '-'*abs(charge) + ')']
    return formulas
# formula_variations_ion('AgH2O2-', -1)
# 259 in that book vs 339 here.
len(db.CAS_index)


Out[16]:
200

In [17]:
data = {}
with open('Original metadata.csv') as f:
    f.readline()
    for line in f.readlines():
        if len(line.split('\t')) == 6:
            name, name2, CAS, formula, charge, MW = line.split('\t')
        else:
            name, name2, CAS, formula, charge = line.split('\t')
            MW = 0
        data[CAS] = {'Name': name, 'Name2': name2, 'formula': formula, 'charge':int(charge), 'MW': float(MW)}
# data

In [ ]:


In [18]:
good_syns = {CAS:{'synonyms': []} for CAS, d in data.items()}
                  
for CAS, d in data.items():
    if d['MW']:
        good_syns[CAS]['synonyms'].append(d['Name2'])

for CAS, d in data.items():
    good_syns[CAS]['synonyms'].extend(formula_variations_ion(d['formula'], d['charge']))
for CAS, d in db.CAS_index.items():
    CAS = d.CASs
    syns = formula_variations_ion(d.formula, formula_to_charge(d.formula))
    if CAS in good_syns:
        syns = [i for i in syns if i not in good_syns[CAS]['synonyms']]
        good_syns[CAS]['synonyms'].extend(syns)
    else:
        good_syns[CAS] = {}
        good_syns[CAS]['synonyms'] = syns
    
good_syns['338-70-5']['synonyms'].append('oxalate')
# good_syns['18500-32-8']['synonyms'].append('H2NNH3+')


# 'H2P2O7-2 is in there as P2O7H2-2 - look into unique searching by formula? Should be possible.
# 34175-11-6 'HS2O4-', as HO4S2-
# 14102-45-5 is 'H2AsO3-',
# HP2O7-3 as well
# 44030-61-7 is 'HPO3F-', here's a good case to the search
# OCN NCO- 661-20-1
# 920-52-5 'C2O4H-' already done
['CH3COO-', 'CHOO-']

ClO2m = {'formula': 'ClO2-', 'MW': 67.448, 'pubchem': 197148, 'smiles': '[O-]Cl=O',
         'inchikey': 'QBWCMBCROVPCKQ-UHFFFAOYSA-M', 'inchi': 'InChI=1S/ClHO2/c2-1-3/h(H,2,3)/p-1'}
# Get the rest from the api
# Key problem - if there's no mol downloaded, there's a no go.

ClO3m = {'formula': 'ClO3-', 'pubchem': 104770} 
Br3m = {'pubchem': 77881}
BrO3m = {'pubchem': 84979}
AlOm = {'smiles': '[Al-].[O]', 'MW': 42.981, 'formula': 'AlO-'} # No inchi
BrO4m = {'pubchem': 5460630}
IO4m = {'pubchem': 167232}
B4O7m = {'pubchem': 91932232}
HF2m = {'pubchem': 21864337}
SiF6m = {'pubchem': 28117}
B4O7m = {'pubchem': 91932232}
N2O2m = {'pubchem': 4686309}
SbF6m = {'pubchem': 3868826}
IO3m =  {'pubchem': 84927}
AgO2H2m = {'smiles': '[Ag+1].[OH-].[OH-]', 'formula': 'H2AgO2-1', 'MW': 124.8752}
CrO4H4m = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Cr+3]', 'formula': 'CrH4O4-', 'MW': 120.024} # read wrong no pubchem
TiO5H5 = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Ti+4]', 'MW': 132.902, 'formula': 'TiH5O5-'} # read wrong no pubchem
FeO4H4m3 = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Fe+1]', 'formula': 'FeH4O4-3', 'MW': 123.873} # read wrong no pubchem
FeO4H4m = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Fe+3]', 'formula': 'FeH4O4-', 'MW': 123.873} # read wrong no pubchem
FeO4H4m2 = {'smiles': '[OH-].[OH-].[OH-].[OH-].[Fe+2]', 'formula': 'FeH4O4-2', 'MW': 123.873} # read wrong no pubchem
FeO3H3m = {'smiles': '[OH-].[OH-].[OH-].[Fe+2]', 'formula': 'FeH3O3-', 'MW': 106.866}
FeOH2m = {'formula': 'FeH2O-', 'MW': 73.86028, 'smiles': '[H-].[OH-].[Fe+1]'}
HB4O7m = {'formula': 'HB4O7-', 'MW': 156.24774} # no smiles available?
H2P2m = {'formula': 'H2P2-', 'MW': molecular_weight(nested_formula_parser('H2P2-'))}


custom_ions = {'14998-27-7': ClO2m, '14866-68-3': ClO3m, '14522-80-6': Br3m, '15541-45-4': BrO3m,
              '12758-12-2': AlOm, '16474-32-1': BrO4m, '15056-35-6': IO4m, '12258-53-6': B4O7m,
              '18130-74-0': HF2m, '17084-08-1': SiF6m, '12258-53-6': B4O7m, '15435-66-2': N2O2m,
              '17111-95-4': SbF6m, '15454-31-6': IO3m, '12258-16-1': AgO2H2m, '97775-49-0':CrO4H4m,
              '119046-04-7': TiO5H5, '29145-79-7': FeO4H4m, '150393-25-2': FeO4H4m3,
              '73128-36-6': FeO4H4m2, '70756-39-7': FeO3H3m, '150381-43-4':FeOH2m,
              '12447-33-5': HB4O7m, '107596-48-5': H2P2m }

# 15454-31-6
for CAS, d in custom_ions.items():
    if CAS in good_syns:
        good_syns[CAS].update(d)
    else:
        good_syns[CAS] = d

import json
f = open('Good synoynms by CAS2.json', 'w')
json.dump(good_syns, f, indent=2, separators=(',', ': '), sort_keys=True)
f.close()

In [ ]:


In [ ]:


In [12]:
from collections import Counter
ns = []
for i in db.CAS_index.values():
    ns.extend(list(set(i.all_names)))
Counter(ns).most_common(20)
# No dup names :)
# len(ns)
# len(a.CAS_index), len(a.pubchem_index)
# TODO oxalate goes to the one without Hs, 71081	338-70-5	C2O4-2


Out[12]:
[('1395-EP2284155A2', 1),
 ('S-2', 1),
 ('PdCl6(-2)', 1),
 ('BF4(1-)', 1),
 ('1385048-46-3', 1),
 ('H2O4P(1-)', 1),
 ('CHEBI:33019', 1),
 ('9177-EP2298732A1', 1),
 ('[PF6](-)', 1),
 ('HMDB01061', 1),
 ('3230-EP2298764A1', 1),
 ('HMDB00257', 1),
 ('HMDB00250', 1),
 ('S--', 1),
 ('Bromate ion (BrO31-)', 1),
 ('hydridotrioxophosphate(2-)', 1),
 ('Superoxide O2-', 1),
 ('mercurate (2-), tetrakis(cyano-C)', 1),
 ('NO2', 1),
 ('NO3', 1)]

In [ ]:


In [4]:
# None of the charges are wrong?
for CAS, d in data.items():
    chem = db.search_CAS(CAS)
    if not chem:
        continue
        print('NOTINDB', CAS)
        continue
    mol = Chem.MolFromSmiles(chem.smiles)
    if mol is None:
        continue
        print('CANTREADMOL', CAS)
        continue
#     print(Chem.MolToSmiles(mol))
    charge = Chem.GetFormalCharge(mol)
    try:
        assert charge == d['charge']
#         print('PASS', charge, d['charge'])
    except:
        print('F:', charge, d['charge'], CAS)


('F:', 0, -1, '119046-04-7')

In [33]:
# Chem.GetFormalCharge(Chem.MolFromSmiles('[SbH6+3]'))
# a.search_CAS('16971-29-2').InChI, a.search_CAS('16971-29-2').formula, a.search_CAS('16971-29-2').smiles

In [ ]:


In [6]:
# mol = Chem.MolFromMolFile('mol/14695-95-5.mol')
# mol = Chem.MolFromMolFile('/tmp/399316.mol')

# # # mol = Chem.MolFromSmiles('[Sb+3]')
# # # When read, 1 atom
# # Chem.MolToSmiles(mol, allHsExplicit=True)
# # mol.GetNumAtoms()
# mw = Descriptors.MolWt(mol)
# formula = CalcMolFormula(mol)
# mw, formula

In [5]:
# Most of the MW ones fail due to having added extra hydrogens???? OR MW?
for CAS, d in data.items():
    chem = db.search_CAS(CAS)
    if not chem or d['MW'] == 0:
        continue
    try:
        assert_allclose(chem.MW, d['MW'], atol=0.3)
    except:
        print('F:', CAS, chem.MW, d['MW'],  chem)

# Plenty of outstanding work here and with the charges.
# 20561-39-1 astatine mos stable.
# 16518-47-1 Arsenate (AsO43-), dihydrogen (8CI,9CI), must be a typo - clearly weight is higher.
# 14493-01-7 is fine, confirmed.
# 19469-81-9 H4Al is simply wrong.
# 14897-04-2 is fine
# 34786-97-5 is fine, Vanadate (V(OH)2O21-), their error
# 14333-20-1 Pertechnetate is fine
# 16844-87-4 Arsenate (AsO43-), monohydrogen is fine
# 26450-38-4 their typo for sure
# 15390-83-7 is fine for sure, their bad


('F:', '20561-39-1', 210.0, 211.0, <ChemicalMetadata, name=Astatine ion (At1-), formula=At-, smiles=[At-], MW=210>)
('F:', '16518-47-1', 140.934, 116.95, <ChemicalMetadata, name=Arsenate (AsO43-), dihydrogen (8CI,9CI), formula=AsH2O4-, smiles=O=[As]([O-])(O)O, MW=140.934>)
('F:', '14493-01-7', 514.694, 515.71, <ChemicalMetadata, name=Platinate(2-), tetrabromo-, (SP-4-1)-, formula=Br4Pt-2, smiles=[Br-].[Br-].[Br-].[Br-].[Pt+2], MW=514.694>)
('F:', '19469-81-9', 31.014, 78.84, <ChemicalMetadata, name=Aluminate(1-), tetrahydro- (8CI), formula=AlH4-, smiles=[Al-].[H].[H].[H].[H], MW=31.014>)
('F:', '14897-04-2', 215.041, 216.11, <ChemicalMetadata, name=Cobaltate(3-), hexakis(cyano-κC)-, (OC-6-11)-, formula=C6CoN6-3, smiles=[C-]#N.[C-]#N.[C-]#N.[C-]#N.[C-]#N.[C-]#N.[Co+3], MW=215.041>)
('F:', '34786-97-5', 116.954, 139.94, <ChemicalMetadata, name=Vanadate (V(OH)2O21-), (T-4)- (9CI), formula=H2O4V-, smiles=[O-].[OH-].[OH-].[O].[V+2], MW=116.954>)
('F:', '14333-20-1', 161.996, 163.01, <ChemicalMetadata, name=Pertechnetate (TcO41-) (8CI), formula=O4Tc-, smiles=[O].[O].[O].[O].[Tc-], MW=161.996>)
('F:', '16844-87-4', 139.926, 115.95, <ChemicalMetadata, name=Arsenate (AsO43-), monohydrogen (8CI,9CI), formula=AsHO4-2, smiles=O=[As]([O-])([O-])O, MW=139.926>)
('F:', '26450-38-4', 115.946, 138.93, <ChemicalMetadata, name=Vanadate (V(OH)O32-), (T-4)- (9CI), formula=HO4V-2, smiles=[O-].[O-].[OH-].[O].[V+], MW=115.946>)
('F:', '15390-83-7', 78.84, 31.01, <ChemicalMetadata, name=Borate(1-), tetrahydroxy- (9CI), formula=BH4O4-, smiles=O[B-](O)(O)O, MW=78.84>)

In [ ]:


In [ ]:


In [ ]: