In [1]:
import os
import pandas as pd
import pybel as pb
import re

First, download the data from here: https://www.dropbox.com/sh/gde0vn6rrmgvlrn/AACLO7XahY3qItNNWINyNQ_7a?dl=0

After unzipping the downloaded file, change the working directory to the extracted 'results' directory:


In [2]:
# Every later cell uses paths relative to this directory. The path itself is
# relative, so running this cell a second time moves one level further again.
results_dir = '../results'
os.chdir(results_dir)

In [3]:
# Parallel per-record columns: the parsing loop appends to these lists in
# lock-step. `dose` and `source` are created here but no cell visible in this
# notebook fills them.
molecule_string, casrn, test_type = [], [], []
dose, dose_amount, dose_units = [], [], []
route, organism, source = [], [], []

# Root of the directory walk, and the list that receives the discovered paths.
rootdir = '.'
fnames = []

The following loop walks the current directory tree and collects the file paths in `fnames`, as one list of paths per directory:


In [4]:
# Collect the path of every data file below rootdir into fnames, as one list
# of paths per visited directory.

# Start from an empty list so that re-running this cell does not append every
# path a second time. Cleared in place to keep the same list object.
del fnames[:]

for dirpath, subdirlist, filelist in os.walk(rootdir):
    # Remove the _cas directory, as the files there do not appear to contain valid information.
    # The removal has to happen in place: os.walk only skips directories that
    # are taken out of the very list it handed us.
    if '_cas' in subdirlist:
        subdirlist.remove('_cas')
    # The order in which os.walk reports entries is arbitrary; sorting both
    # the directories (in place) and the files gives the same order everywhere.
    subdirlist.sort()
    # .DS_Store appears only on systems running OSX
    fnames.append([os.path.join(dirpath, name)
                   for name in sorted(filelist) if name != '.DS_Store'])

Read each file in the list as a pybel molecule object and extract the relevant information from it (this takes a while for the whole data bank, ~5min)


In [7]:
# Parse every collected SDF file and append exactly one entry per record to
# each of the column lists (molecule_string, casrn, test_type, dose_amount,
# dose_units, route, organism). Files that are empty or lack usable toxicology
# information are reported and skipped without touching any list, so the
# lists always keep the same length and stay row-aligned.

# Empty the columns first so that re-running this cell does not duplicate rows.
for column in (molecule_string, casrn, test_type, dose_amount,
               dose_units, route, organism):
    del column[:]

for first_level in fnames:
    if len(first_level):
        # Paths are built as <rootdir>/<directory>/<file>, so the second path
        # component names the directory being processed.
        print("Processing directory " + first_level[0].split(os.sep)[1])
    for f in first_level:
        try:
            # Only the first molecule of each file is used.
            current_mol = next(pb.readfile('sdf', f))
        except StopIteration:  # Some files are empty, in that case just skip and continue
            print("%s  --- file empty ---" % f)
            continue

        try:
            tox_lines = current_mol.data['chemid_tox_info'].split('\n')
        except KeyError:
            print("%s  --- no toxicology information ---" % f)
            continue

        # The record of interest is the second line of the field; its
        # whitespace-separated tokens 1..4 are test type, dose, route, organism.
        tox_data = tox_lines[1].split() if len(tox_lines) > 1 else []
        if len(tox_data) < 5:
            print("%s  --- malformed toxicology information ---" % f)
            continue

        # Look up everything that can fail *before* appending anything, so a
        # failure can never leave the lists with different lengths.
        # A missing 'casrn' field still raises KeyError, as before.
        record_casrn = current_mol.data['casrn']

        # Split a dose such as "100mg/kg" into its number and its unit.
        dose_components = re.match(r"([0-9.]+)([a-z/]+)", tox_data[2], re.I)
        if dose_components:
            amount, units = dose_components.groups()
        else:
            amount, units = 'NA', 'NA'

        molecule_string.append(str(current_mol).strip())
        casrn.append(record_casrn)
        test_type.append(tox_data[1])
        dose_amount.append(amount)
        dose_units.append(units)
        route.append(tox_data[3])
        organism.append(tox_data[4])

Convert the lists to pandas Series and build a data frame from them:


In [8]:
# Pair every output column label with the list collected for it, wrap each
# list in a pandas Series and assemble the Series into one data frame.
# NOTE(review): Series of unequal length are aligned on their integer index
# and padded with missing values rather than rejected, so the lists should
# all have the same length here -- confirm.
column_sources = [
    ('molecule_string', molecule_string),
    ('casrn', casrn),
    ('test_type', test_type),
    ('dose_amount', dose_amount),
    ('dose_units', dose_units),
    ('route', route),
    ('organism', organism),
]
d = dict((label, pd.Series(values)) for label, values in column_sources)
df = pd.DataFrame(d)

Finally, write the data frame to disk (change the parameters if necessary):


In [9]:
# Pipe-separated text file without the row index. The path is relative to the
# current working directory, so the file is written one level above it.
output_path = '../molecule_data.txt'
df.to_csv(output_path, sep='|', index=False)

In [ ]: