Setting up imports
In [1]:
%load_ext autoreload
%autoreload 2
%pdb off
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals
__author__ = "Joseph Gomes"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "LGPL"
import os
import unittest
import tempfile
import shutil
import numpy as np
import numpy.random
from deepchem import metrics
from deepchem.datasets import Dataset
from deepchem.featurizers.featurize import DataFeaturizer
from deepchem.featurizers.featurize import FeaturizedSamples
from deepchem.hyperparameters import HyperparamOpt
from deepchem.metrics import Metric
from deepchem.models import Model
from deepchem.models.sklearn_models import SklearnModel
from deepchem.transformers import NormalizationTransformer
from deepchem.utils.evaluate import Evaluator
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
Creating temporary directories
In [2]:
feature_dir = tempfile.mkdtemp()
samples_dir = tempfile.mkdtemp()
train_dir = tempfile.mkdtemp()
valid_dir = tempfile.mkdtemp()
test_dir = tempfile.mkdtemp()
model_dir = tempfile.mkdtemp()
Setting up model variables
In [3]:
from deepchem.featurizers.coulomb_matrices import CoulombMatrixEig
compound_featurizers = [CoulombMatrixEig(23, remove_hydrogens=False)]  # 23 = max atoms per molecule
complex_featurizers = []
tasks = ["atomization_energy"]
task_type = "regression"
task_types = {task: task_type for task in tasks}
input_file = "../datasets/gdb1k.sdf"
smiles_field = "smiles"
mol_field = "mol"
Featurizing data
In [4]:
featurizers = compound_featurizers + complex_featurizers
featurizer = DataFeaturizer(tasks=tasks,
                            smiles_field=smiles_field,
                            mol_field=mol_field,
                            compound_featurizers=compound_featurizers,
                            complex_featurizers=complex_featurizers,
                            verbosity="high")
In [5]:
featurized_samples = featurizer.featurize(input_file, feature_dir, samples_dir)
Performing train/validation/test split
In [6]:
from deepchem.splits import RandomSplitter
random_splitter = RandomSplitter()
train_samples, valid_samples, test_samples = random_splitter.train_valid_test_split(
    featurized_samples, train_dir, valid_dir, test_dir)
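By default the splitter holds out part of the samples for validation and testing. As a sketch, assuming this version of RandomSplitter accepts frac_train/frac_valid/frac_test keyword arguments, an explicit 80/10/10 split would look like:
In [ ]:
# Sketch only: the frac_* keyword arguments are assumed to exist on
# train_valid_test_split in this deepchem version.
train_samples, valid_samples, test_samples = random_splitter.train_valid_test_split(
    featurized_samples, train_dir, valid_dir, test_dir,
    frac_train=0.8, frac_valid=0.1, frac_test=0.1)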
Creating datasets
In [7]:
train_dataset = Dataset(data_dir=train_dir, samples=train_samples,
                        featurizers=featurizers, tasks=tasks)
valid_dataset = Dataset(data_dir=valid_dir, samples=valid_samples,
                        featurizers=featurizers, tasks=tasks)
test_dataset = Dataset(data_dir=test_dir, samples=test_samples,
                       featurizers=featurizers, tasks=tasks)
Transforming datasets
Normalization statistics are computed from the training set only and then applied to all three splits, which keeps validation and test information from leaking into the transformers.
In [8]:
input_transformers = [NormalizationTransformer(transform_X=True, dataset=train_dataset)]
output_transformers = [NormalizationTransformer(transform_y=True, dataset=train_dataset)]
transformers = input_transformers + output_transformers
# Apply every transformer, fit on the training set, to each split.
for dataset in [train_dataset, valid_dataset, test_dataset]:
    for transformer in transformers:
        transformer.transform(dataset)
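Because the targets are normalized, model predictions come back in transformed units. A minimal sketch of mapping values back to the original scale, assuming NormalizationTransformer exposes an untransform method in this version of deepchem:
In [ ]:
# Sketch: undo output normalization on some hypothetical predictions.
y_normalized = np.array([0.5, -1.2, 0.3])  # placeholder model outputs
y_original = y_normalized
for transformer in reversed(output_transformers):
    y_original = transformer.untransform(y_original)  # assumed API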
Fit Random Forest with hyperparameter search
In [13]:
def rf_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Builds random forests given hyperparameters."""
    n_estimators = params_dict["n_estimators"]
    max_features = params_dict["max_features"]
    return SklearnModel(
        tasks, task_types, params_dict, model_dir,
        mode="regression",
        model_instance=RandomForestRegressor(n_estimators=n_estimators,
                                             max_features=max_features))

params_dict = {
    "n_estimators": [10, 100],
    "data_shape": [train_dataset.get_data_shape()],
    "max_features": ["auto"],
}
metric = Metric(metrics.mean_absolute_error)
optimizer = HyperparamOpt(rf_model_builder, tasks, task_types, verbosity="low")
# Mean absolute error should be minimized, so use_max must be the boolean
# False; the string "False" is truthy and would maximize the metric instead.
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, output_transformers,
    metric, use_max=False, logdir=None)
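The search returns the best model, its hyperparameter setting, and a record of all evaluated settings, which can be printed for inspection:
In [ ]:
print(best_hyperparams)
print(all_results)
Fit Kernel Ridge Regression with hyperparameter search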
In [14]:
def kr_model_builder(tasks, task_types, params_dict, model_dir, verbosity=None):
    """Builds kernel ridge regression models given hyperparameters."""
    kernel = params_dict["kernel"]
    alpha = params_dict["alpha"]
    gamma = params_dict["gamma"]
    return SklearnModel(
        tasks, task_types, params_dict, model_dir,
        mode="regression",
        model_instance=KernelRidge(alpha=alpha, kernel=kernel, gamma=gamma))

params_dict = {
    "kernel": ["laplacian"],
    "alpha": [0.0001],
    "gamma": [0.0001]
}
metric = Metric(metrics.mean_absolute_error)
optimizer = HyperparamOpt(kr_model_builder, tasks, task_types, verbosity="low")
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict, train_dataset, valid_dataset, output_transformers,
    metric, use_max=False, logdir=None)
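Finally, the winning model can be scored on the held-out test set. A minimal sketch using the Evaluator imported above, assuming its compute_model_performance method takes a list of metrics in this version of deepchem:
In [ ]:
# Sketch: evaluate the most recent best_model (here, the kernel ridge
# model) on the test split; compute_model_performance is assumed API.
test_evaluator = Evaluator(best_model, test_dataset, output_transformers)
test_scores = test_evaluator.compute_model_performance([metric])
print(test_scores)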
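Cleaning up temporary directories
Once the run is finished, the scratch directories created earlier can be removed; shutil is already imported for this.
In [ ]:
for directory in [feature_dir, samples_dir, train_dir, valid_dir,
                  test_dir, model_dir]:
    shutil.rmtree(directory)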