In [11]:
from thermo import *
from thermo.identifiers import ChemicalMetadataDB
from numpy.testing import assert_allclose
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula

# Load the cation metadata table. An alternative snapshot that was used
# previously: main_db='cation_db_20171030.tsv'
db = ChemicalMetadataDB(
    elements=False,
    main_db='Cation db.tsv',
    user_dbs=[],
)
def formula_to_charge(formula):
    """Return the positive charge written at the end of a cation formula.

    Parameters
    ----------
    formula : str
        Formula with an optional charge suffix, e.g. ``'Na+'``, ``'Fe+3'``
        or ``'Ca++'``.

    Returns
    -------
    int
        * 1 when the formula contains no ``'+'`` at all, or a single
          trailing ``'+'``;
        * the number of ``'+'`` signs when the suffix is a run of them
          (``'Ca++'`` -> 2);
        * the integer following the first ``'+'`` otherwise (``'Fe+3'`` -> 3).

    Raises
    ------
    ValueError
        If the text directly after the first ``'+'`` is non-empty and is not
        an integer (for example ``'Fe(3+)'``).
    """
    after_plus = formula.split('+')[1:]
    if not after_plus:
        # No '+' anywhere: treated as a singly charged cation.
        return 1
    if not any(after_plus):
        # Only empty pieces follow, i.e. the suffix is '+', '++', '+++', ...
        # Each '+' contributes exactly one piece, so the piece count is the charge.
        return len(after_plus)
    if after_plus[0] == '':
        # Mixed forms such as 'X++3' are ambiguous; keep reporting 1 for them.
        return 1
    return int(after_plus[0])
    
# [(i.formula, formula_to_charge(i.formula)) for i in db.CAS_index.values()]
def formula_variations_ion(formula, charge):
    """Build the alternative spellings of a cation formula.

    Anything from the first ``'+'`` onwards in `formula` is discarded and
    five charge suffixes are attached to what remains.  For ``('Fe+3', 3)``
    the result is ``['Fe+++', 'Fe+3', 'Fe(+3)', 'Fe(3+)', 'Fe(+++)']``.

    Parameters
    ----------
    formula : str
        Formula, with or without a charge suffix.
    charge : int
        Charge of the ion.  Its absolute value is used everywhere except in
        the second spelling, which embeds ``str(charge)`` as given.

    Returns
    -------
    list of str
        The five spellings, always in the same order.
    """
    stem = formula.split('+')[0]
    magnitude = str(abs(charge))
    plus_run = '+' * abs(charge)
    suffixes = (
        plus_run,
        '+' + str(charge),
        '(+' + magnitude + ')',
        '(' + magnitude + '+)',
        '(' + plus_run + ')',
    )
    return [stem + suffix for suffix in suffixes]


Out[11]:
'2099995000-00-0'

In [8]:
# Read the tab-separated metadata file into a dict keyed by CAS number.
# Every row holds name, second name, CAS, formula and charge; a sixth column
# with the molecular weight is optional.  A missing or blank molecular weight
# is stored as 0.0.
data = {}
with open('Original metadata.csv') as f:
    next(f, None)  # skip the header row
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) == 6:
            name, name2, CAS, formula, charge, MW = fields
        else:
            # Any other column count than 5 raises ValueError on unpacking.
            name, name2, CAS, formula, charge = fields
            MW = ''  # keep a string so the .strip() below is valid
        MW = MW.strip()
        data[CAS] = {'Name': name, 'Name2': name2, 'formula': formula,
                     'charge': int(charge), 'MW': float(MW) if MW else 0.0}

In [4]:
#db.CAS_index.keys()

# db.CAS_index[12099995000000]
# db.CAS_index

In [12]:
# Build the synonym table, keyed by CAS number.  Each value is a dict with a
# 'synonyms' list.
good_syns = {}
for CAS, d in data.items():
    synonyms = []
    # The second name is only taken over for rows with a non-zero molecular weight.
    if d['MW']:
        synonyms.append(d['Name2'])
    synonyms.extend(formula_variations_ion(d['formula'], d['charge']))
    good_syns[CAS] = {'synonyms': synonyms}

# Entries known to the database but absent from the metadata file start with
# an empty synonym list.  Formula-derived synonyms are not merged in for
# database entries (that step was previously commented out), so they are no
# longer computed here either.
for chem_meta in db.CAS_index.values():
    good_syns.setdefault(chem_meta.CASs, {'synonyms': []})

    
# Names added by hand for two specific entries.
good_syns['14464-47-2']['synonyms'] += ['deuterium(1+)']
good_syns['2099995000-00-0']['synonyms'] += ['Yttrium dihydroxide ion']

# Bracketed complex ions: register the formula variations both with and
# without the square brackets.
complex_ions = [
    ('[Co(en)3]+3', '14878-41-2'),
    ('[Co2(trien)3]+6', '747348-22-7'),
    ('[Ni2(trien)3]+4', '12312-87-7'),
]
for f, CAS in complex_ions:
    charge = formula_to_charge(f)
    unbracketed = f.replace('[', '').replace(']', '')
    variations = formula_variations_ion(f, charge)
    variations = variations + formula_variations_ion(unbracketed, charge)
    good_syns[CAS]['synonyms'].extend(variations)

# Hand-written records (formula, molecular weight, SMILES) for two hydroxide
# ions; the molecular weight is computed from the parsed formula string.
InOHp3 = {
    'formula': 'InOH+3',
    'MW': molecular_weight(nested_formula_parser('OHIn+3')),
    'smiles': '[OH-].[In+4]',
}
MgOHp2 = {
    'formula': 'MgOH+2',
    'MW': molecular_weight(nested_formula_parser('MgOH+2')),
    'smiles': '[OH-].[Mg+3]',
}

custom_ions = {'1313606-05-1': InOHp3, '289651-09-8': MgOHp2}

# Merge the records into the synonym table: existing entries gain the extra
# keys, unknown CAS numbers receive the record itself.
for CAS, d in custom_ions.items():
    if CAS not in good_syns:
        good_syns[CAS] = d
        continue
    good_syns[CAS].update(d)


import json

# Write the synonym table with sorted keys and a fixed layout.  The `with`
# block closes the file even if serialisation fails.
with open('Good synoynms by CAS2.json', 'w') as f:
    json.dump(good_syns, f, indent=2, separators=(',', ': '), sort_keys=True)
print('hi4.1')


hi4.1

In [ ]:


In [ ]:


In [12]:
# Only 25 synonyms to go through, plus the duplicates in there to remove - not bad.
# NOTE(review): `a` is not defined in any cell shown here - presumably a
# ChemicalMetadataDB instance; confirm before running on a fresh kernel.
from collections import Counter

# Names are de-duplicated per entry first, so a count above 1 means the same
# name is attached to more than one entry.
ns = [syn for meta in a.CAS_index.values() for syn in set(meta.all_names)]
Counter(ns).most_common(20)


Out[12]:
[('', 3),
 ('CID3028194', 1),
 ('Hydroxylammonium', 1),
 ('In(OH)2(+)', 1),
 ('Mn(II)', 1),
 ('STL483864', 1),
 ('propanaminium, N,N,N-tripropyl', 1),
 ('AC1MTWXQ', 1),
 ('strontium(II) cation', 1),
 ('CHEBI:45825', 1),
 ('PLATINUM(2+)  TETRAAMMINE-  DICHLORIDE  (SP-4-1)-  REACTION PRODUCTS WITH AMMONIA',
  1),
 ('37466_RIEDEL', 1),
 ('Trihydroxytellanium', 1),
 ('Dy+3', 1),
 ('DTXSID30169912', 1),
 ('ZrO++', 1),
 ('Carbolith Capsules 600mg', 1),
 ('Pe4N+', 1),
 ('I14-115545', 1),
 ('In(OH)2(1+)', 1)]

In [13]:
# Check that none of the charges are wrong: compare the formal charge derived
# from each entry's SMILES with the charge listed in the metadata file and
# print every mismatch.
# NOTE(review): `a` is not defined in any cell shown here - presumably a
# ChemicalMetadataDB instance; confirm before running on a fresh kernel.
for CAS, d in data.items():
    chem = a.search_CAS(CAS)
    if not chem:
        continue
    mol = Chem.MolFromSmiles(chem.smiles)
    charge = Chem.GetFormalCharge(mol)
    # A plain comparison instead of assert + bare except: it still works when
    # asserts are disabled (python -O) and does not swallow unrelated errors.
    if charge != d['charge']:
        print('F:', charge, d['charge'], CAS)


('F:', 0, 3, '1313606-05-1')
('F:', 0, 2, '289651-09-8')

In [4]:
# Chem.GetFormalCharge(Chem.MolFromSmiles('[SbH6+3]'))

In [33]:
# Sizes of the two lookup tables on `a`.
# NOTE(review): `a` is not defined in any cell shown here - confirm where it comes from.
len(a.pubchem_index), len(a.CAS_index)


Out[33]:
(117, 182)

In [6]:
# mol = Chem.MolFromMolFile('mol/14695-95-5.mol')
# mol = Chem.MolFromMolFile('/tmp/399316.mol')

# # # mol = Chem.MolFromSmiles('[Sb+3]')
# # # When read, 1 atom
# # Chem.MolToSmiles(mol, allHsExplicit=True)
# # mol.GetNumAtoms()
# mw = Descriptors.MolWt(mol)
# formula = CalcMolFormula(mol)
# mw, formula

In [14]:
# Compare each entry's molecular weight with the value from the metadata file
# (rows without a molecular weight are skipped) and print every mismatch.
# Open question: do most of the failures come from extra hydrogens having
# been added, or from the molecular weights themselves?
# NOTE(review): `a` is not defined in any cell shown here - presumably a
# ChemicalMetadataDB instance; confirm before running on a fresh kernel.
for CAS, d in data.items():
    chem = a.search_CAS(CAS)
    if not chem or d['MW'] == 0:
        continue
    try:
        assert_allclose(chem.MW, d['MW'], atol=0.25)
    except AssertionError:
        # Only a tolerance failure counts as a mismatch; any other error
        # (or Ctrl-C) is allowed to surface instead of being reported as 'F:'.
        print('F:', CAS, chem.MW, d['MW'],  chem)

# 4 plutonium, 3 americium, 1 curium, 1 promethium, 1 deuterium


('F:', '22541-46-4', 243.0, 241.0, <ChemicalMetadata, name=Americium (Am3+), formula=Am+3, smiles=[Am+3], MW=243>)
('F:', '22878-02-0', 274.998, 273.0, <ChemicalMetadata, name=Americium oxide (AmO2), ion(1+), formula=AmO2+, smiles=[Am+].[O].[O], MW=274.998>)
('F:', '22853-00-5', 275.998, 271.0, <ChemicalMetadata, name=Dioxoplutonium ion(2+), formula=O2Pu+2, smiles=[O].[O].[Pu+2], MW=275.998>)
('F:', '22967-56-2', 275.998, 271.0, <ChemicalMetadata, name=Dioxoplutonium ion(1+), formula=O2Pu+, smiles=[O].[O].[Pu+], MW=275.998>)
('F:', '12323-66-9', 274.998, 273.0, <ChemicalMetadata, name=Americium dioxide ion(2+), formula=AmO2+2, smiles=[Am+2].[O].[O], MW=274.998>)
('F:', '22541-16-8', 145.0, 147.0, <ChemicalMetadata, name=Pm3+, formula=Pm+3, smiles=[Pm+3], MW=145>)
('F:', '22541-44-2', 244.0, 239.0, <ChemicalMetadata, name=Plutonium ion(4+), formula=Pu+4, smiles=[Pu+4], MW=244>)
('F:', '22541-70-4', 244.0, 239.0, <ChemicalMetadata, name=Plutonium ion (3+), formula=Pu+3, smiles=[Pu+3], MW=244>)
('F:', '22541-42-0', 247.0, 244.0, <ChemicalMetadata, name=Cm3+, formula=Cm+3, smiles=[Cm+3], MW=247>)

In [ ]:


In [ ]:


In [ ]: