Setting up imports


In [1]:
%load_ext autoreload
%autoreload 2
%pdb off
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

__author__ = "Joseph Gomes"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "LGPL"

import os
import unittest
import tempfile
import shutil

import numpy as np
import numpy.random

from deepchem import metrics
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import FeaturizedSamples
from deepchem.hyperparameters import HyperparamOpt
from deepchem.metrics import Metric
from deepchem.models import Model
from deepchem.models.sklearn_models import SklearnModel
from deepchem.transformers import NormalizationTransformer
from deepchem.utils.evaluate import Evaluator
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge


Automatic pdb calling has been turned OFF

Creating temporary directories


In [2]:
feature_dir = tempfile.mkdtemp()
samples_dir = tempfile.mkdtemp()
train_dir = tempfile.mkdtemp()
valid_dir = tempfile.mkdtemp()
test_dir = tempfile.mkdtemp()
model_dir = tempfile.mkdtemp()

Setting up model variables


In [3]:
from deepchem.featurizers.coulomb_matrices import CoulombMatrixEig
compound_featurizers = [CoulombMatrixEig(23, remove_hydrogens=False)]
complex_featurizers = []
tasks = ["atomization_energy"]
task_type = "regression"
task_types = {task: task_type for task in tasks}
input_file = "../datasets/gdb1k.sdf"
smiles_field = "smiles"
mol_field = "mol"

Load featurized data


In [4]:
featurizers = compound_featurizers + complex_featurizers
featurizer = DataFeaturizer(tasks=tasks,
                            smiles_field=smiles_field,
                            mol_field=mol_field,
                            compound_featurizers=compound_featurizers,
                            complex_featurizers=complex_featurizers, verbosity="high")

In [5]:
featurized_samples = featurizer.featurize(input_file, feature_dir, samples_dir)


Loading raw samples now.
Reading structures from ../datasets/gdb1k.sdf.
Loaded raw data frame from file.
About to preprocess samples.
Sharding and standardizing into shard-1 / 1 shards
Currently featurizing feature_type: CoulombMatrixEig
Featurizing sample 0
Saving compounds to disk

Perform Train, Validation, and Testing Split


In [6]:
from deepchem.splits import RandomSplitter
random_splitter = RandomSplitter()
train_samples, valid_samples, test_samples = random_splitter.train_valid_test_split(featurized_samples,
    train_dir, valid_dir, test_dir)

Creating datasets


In [7]:
train_dataset = Dataset(data_dir=train_dir, samples=train_samples, 
                        featurizers=featurizers, tasks=tasks)
valid_dataset = Dataset(data_dir=valid_dir, samples=valid_samples, 
                        featurizers=featurizers, tasks=tasks)
test_dataset = Dataset(data_dir=test_dir, samples=test_samples, 
                       featurizers=featurizers, tasks=tasks)


/home/joegomes/deepchem/deepchem/datasets/__init__.py:402: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if features[feature_ind] == "":
/home/joegomes/deepchem/deepchem/datasets/__init__.py:411: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
  if y[ind, task] == "":

Transforming datasets


In [8]:
input_transformers = [NormalizationTransformer(transform_X=True, dataset=train_dataset)]
output_transformers = [NormalizationTransformer(transform_y=True, dataset=train_dataset)]
transformers = input_transformers + output_transformers
for transformer in transformers:
    transformer.transform(train_dataset)
for transformer in transformers:
    transformer.transform(valid_dataset)
for transformer in transformers:
    transformer.transform(test_dataset)

Fit Random Forest with hyperparameter search


In [13]:
def rf_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Builds random forests given hyperparameters.
    
    """
    n_estimators = params_dict["n_estimators"]
    max_features = params_dict["max_features"]
    return SklearnModel(
        tasks, task_types, params_dict, model_dir,
        mode="regression",
        model_instance=RandomForestRegressor(n_estimators=n_estimators,
                                             max_features=max_features))

params_dict = {
    "n_estimators": [10, 100],
    "data_shape": [train_dataset.get_data_shape()],
    "max_features": ["auto"],
    }

metric = Metric(metrics.mean_absolute_error)
optimizer = HyperparamOpt(rf_model_builder, tasks, task_types, verbosity="low")       
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(              
    params_dict, train_dataset, valid_dataset, output_transformers,                     
    metric, use_max="False", logdir=None)


Model 1/2, Metric mean_absolute_error, Validation set 0: 27.577397
	best_validation_score so far: 27.577397
Model 2/2, Metric mean_absolute_error, Validation set 1: 24.751563
	best_validation_score so far: 27.577397
Best hyperparameters: (10, (23,), u'auto')
train_score: 10.962967
validation_score: 27.577397

In [14]:
def kr_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Builds random forests given hyperparameters.

    """
    kernel = params_dict["kernel"]
    alpha = params_dict["alpha"]
    gamma = params_dict["gamma"]
    return SklearnModel(
        tasks, task_types, params_dict, model_dir,
        mode="regression",
        model_instance=KernelRidge(alpha=alpha,kernel=kernel,gamma=gamma))

params_dict = {
    "kernel": ["laplacian"],
    "alpha": [0.0001],
    "gamma": [0.0001]
    }

metric = Metric(metrics.mean_absolute_error)
optimizer = HyperparamOpt(kr_model_builder, tasks, task_types, verbosity="low")       
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(              
    params_dict, train_dataset, valid_dataset, output_transformers,                     
    metric, use_max="False", logdir=None)


Model 1/1, Metric mean_absolute_error, Validation set 0: 11.888263
	best_validation_score so far: 11.888263
Best hyperparameters: (u'laplacian', 0.0001, 0.0001)
train_score: 8.300859
validation_score: 11.888263

In [ ]: