In [1]:
import re
import pubchempy as pcp
Enable debug logging to make it easier to see what is going on:
In [2]:
import logging
# Raise the 'pubchempy' logger to DEBUG so its activity shows up in the output.
pubchempy_logger = logging.getLogger('pubchempy')
pubchempy_logger.setLevel(logging.DEBUG)
A function to get the CAS registry numbers for compounds with a particular SMILES substructure:
In [3]:
def get_substructure_cas(smiles):
    """Return CAS registry numbers found among the synonyms of substructure matches.

    Runs a substructure search for ``smiles`` through ``pcp.get_synonyms`` and
    scans every synonym of every returned record for a CAS-style number
    (2-7 digits, 2 digits, 1 digit, separated by hyphens) at the start of the
    synonym.

    Args:
        smiles: SMILES string describing the substructure to search for.

    Returns:
        A list of CAS registry number strings in the order they were
        encountered. One entry is appended per matching synonym, so the list
        may contain duplicates.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions. The trailing (?!\d) keeps a
    # longer digit run such as '1234-56-7890' from being reported as the
    # truncated number '1234-56-7'.
    cas_pattern = re.compile(r'(\d{2,7}-\d\d-\d)(?!\d)')
    cas_rns = []
    results = pcp.get_synonyms(smiles, 'smiles', searchtype='substructure')
    for result in results:
        # Records without a 'Synonym' key simply contribute nothing.
        for syn in result.get('Synonym', []):
            match = cas_pattern.match(syn)
            if match:
                cas_rns.append(match.group(1))
    return cas_rns
Test some inputs:
In [4]:
# Lead ('[Pb]'): show how many CAS numbers were collected, then the first ten.
cas_rns = get_substructure_cas('[Pb]')
print(len(cas_rns), cas_rns[:10], sep='\n')
In [5]:
# Selenium ('[Se]'): show how many CAS numbers were collected, then the first ten.
cas_rns = get_substructure_cas('[Se]')
print(len(cas_rns), cas_rns[:10], sep='\n')
In [6]:
# Titanium ('[Ti]'): show how many CAS numbers were collected, then the first ten.
cas_rns = get_substructure_cas('[Ti]')
print(len(cas_rns), cas_rns[:10], sep='\n')
In [7]:
# Palladium ('[Pd]'): show how many CAS numbers were collected, then the first ten.
cas_rns = get_substructure_cas('[Pd]')
print(len(cas_rns), cas_rns[:10], sep='\n')
We could potentially get a TimeoutError if there are too many results. In this case, it might be better to perform the substructure search and then get the synonyms separately:
In [8]:
# Substructure search on its own: collect the matching CIDs without requesting
# synonyms in the same call.
cids = pcp.get_cids(
    identifier='[Pd]',
    namespace='smiles',
    searchtype='substructure',
)
Then you can call `pcp.get_synonyms(cids)` with the list of CIDs.