In [1]:
import os
import pandas as pd
import pybel as pb
import re
First, download the data from https://www.dropbox.com/sh/gde0vn6rrmgvlrn/AACLO7XahY3qItNNWINyNQ_7a?dl=0 and unzip the archive.
Then change the working directory to the resulting 'results' directory:
In [2]:
# Switch the working directory to 'results', a sibling of the directory the
# notebook is currently running in. The relative paths used further down
# ('.' for the scan root, '../molecule_data.txt' for the output) are resolved
# against this directory.
os.chdir('../results')
In [3]:
# Root of the directory tree to scan, and the list that will receive one
# list of file paths per directory visited.
rootdir = '.'
fnames = []

# Accumulator lists filled while the molecule files are processed; each one
# later becomes a column of the output data frame.
molecule_string, casrn = [], []
test_type, route, organism = [], [], []
dose_amount, dose_units = [], []

# NOTE(review): 'dose' and 'source' are initialised here but are not
# referenced by any of the visible cells.
dose, source = [], []
The following loop walks the directory tree below the current directory and, for each directory visited, appends the list of its file paths to fnames:
In [4]:
# Walk the tree rooted at rootdir and append one list of file paths per
# directory to fnames (the list is empty for directories without files).
for current_dir, child_dirs, file_names in os.walk(rootdir):
    # Drop '_cas' from the sub-directory list in place so that os.walk does not
    # descend into it; the files there do not appear to contain valid information.
    if '_cas' in child_dirs:
        child_dirs.remove('_cas')
    paths_here = []
    for file_name in file_names:
        # .DS_Store files only show up on systems running OSX; leave them out.
        if file_name == '.DS_Store':
            continue
        paths_here.append(os.path.join(current_dir, file_name))
    fnames.append(paths_here)
Read each file in the list as a pybel molecule object and extract the relevant information from it (for the whole data bank this takes a while, about 5 minutes):
In [7]:
# Read the first molecule of every collected file and append its fields to the
# accumulator lists (molecule_string, casrn, test_type, dose_amount,
# dose_units, route, organism).
#
# All fields of a molecule are gathered first and appended together at the end
# of the iteration. Appending molecule_string before the toxicology lookup
# would leave it one entry longer than the other lists every time a molecule
# is skipped, and the data frame built from the lists would then pair
# molecules with the wrong rows.

# A dose token is a numeric amount (digits and dots) immediately followed by
# its units (letters and '/'), matched case-insensitively.
dose_pattern = re.compile(r"([0-9.]+)([a-z/]+)", re.I)

for first_level in fnames:
    if len(first_level):
        # Label for the progress message: the first path component below
        # rootdir. relpath + os.sep avoid assuming that '/' is the separator.
        top_level = os.path.relpath(first_level[0], rootdir).split(os.sep)[0]
        print("Processing directory " + top_level)
    for f in first_level:
        try:
            current_mol = next(pb.readfile('sdf', f))
        except StopIteration:  # Some files are empty, in that case just skip and continue
            print("%s  --- file empty ---" % f)
            continue

        try:
            tox_lines = current_mol.data['chemid_tox_info'].split('\n')
        except KeyError:
            print("%s  --- no toxicology information ---" % f)
            continue

        # Tokens 1-4 of the second line are stored as test type, dose, route
        # and organism; skip the record if that line is missing or too short.
        tox_data = tox_lines[1].split() if len(tox_lines) > 1 else []
        if len(tox_data) < 5:
            print("%s  --- malformed toxicology information ---" % f)
            continue

        dose_components = dose_pattern.match(tox_data[2])
        if dose_components:
            amount, units = dose_components.groups()
        else:
            amount, units = 'NA', 'NA'

        # Looked up before any append so that a failure here cannot leave the
        # lists with different lengths.
        mol_casrn = current_mol.data['casrn']

        molecule_string.append(str(current_mol).strip())
        casrn.append(mol_casrn)
        test_type.append(tox_data[1])
        dose_amount.append(amount)
        dose_units.append(units)
        route.append(tox_data[3])
        organism.append(tox_data[4])
Convert the lists with the information to pandas Series and construct a data frame from them:
In [8]:
# Pair every output column name with the list accumulated for it, wrap each
# list in a pandas Series, and assemble the Series into one data frame.
column_values = [
    ('molecule_string', molecule_string),
    ('casrn', casrn),
    ('test_type', test_type),
    ('dose_amount', dose_amount),
    ('dose_units', dose_units),
    ('route', route),
    ('organism', organism),
]
d = {}
for column_name, values in column_values:
    d[column_name] = pd.Series(values)
df = pd.DataFrame(d)
Finally, write the data frame to disk (change the parameters if necessary):
In [9]:
# Write the data frame as a '|'-separated text file, without the index column,
# one directory above the current working directory.
df.to_csv('../molecule_data.txt', sep='|', index=False)
In [ ]: