In [12]:
from thermo import *
# Tab-separated identifier table to clean. The corrected copy is written next
# to it, under the same name with '2' appended.
file_name = '/home/caleb/Documents/University/CHE3123/thermo/thermo/Identifiers/chemical identifiers.tsv'

# NOTE(review): fix_line is defined in the cell below this one; on a fresh
# kernel that cell has to be executed first or this loop raises NameError.
# The context manager closes both files even when fix_line or write() raises
# part-way through, so the handles are never leaked.
with open(file_name) as f, open(file_name + '2', 'w') as f2:
    for line in f:
        # fix_line is effectively the documentation of this file's column layout.
        f2.write(fix_line(line))
In [11]:
from rdkit.Chem import *
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
def fix_line(line):
    """Return `line` with its formula column recomputed for isotope-labelled molecules.

    `line` is one row of a tab-separated table whose first nine columns are,
    in order: pubchemid, CAS, formula, MW, smiles, InChI, InChI_key,
    iupac_name, common_name. Any further columns are carried through as-is.

    The SMILES column is parsed with RDKit. If any atom of the parsed molecule
    reports a non-zero isotope, the formula column is replaced by the result of
    ``CalcMolFormula(m, True, True)`` and the row is rebuilt and terminated by
    a newline. In every other case the input is returned unchanged:

    * the row has fewer than nine columns (e.g. a blank line),
    * the SMILES cannot be parsed,
    * no atom carries an isotope label,
    * RDKit raises while inspecting the molecule or computing the formula.

    Parameters
    ----------
    line : str
        One raw line of the file, normally including its trailing newline.

    Returns
    -------
    str
        The corrected line, or `line` itself when nothing had to change.
    """
    values = line.rstrip('\n').split('\t')
    if len(values) < 9:
        # Too short to hold the nine named columns; pass it through rather
        # than aborting the whole rewrite on a blank or truncated line.
        return line

    smiles = values[4]
    try:
        m = Chem.MolFromSmiles(smiles)
        if m is None:
            # RDKit signals an unparsable SMILES by returning None.
            return line
        if not any(atom.GetIsotope() for atom in m.GetAtoms()):
            return line
        # Positional flags are separateIsotopes and abbreviateHIsotopes
        # in RDKit's CalcMolFormula signature.
        values[2] = CalcMolFormula(m, True, True)
    except Exception:
        # Best effort: a failure on one molecule must not stop the rewrite.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        return line

    # Joining the full field list avoids the stray trailing tab that was
    # produced when the row had exactly nine columns.
    return '\t'.join(values) + '\n'
In [ ]: