In [1]:
import lasagne
import numpy as np
import pandas as pd
import pickle
import sklearn

from sklearn import model_selection, preprocessing
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from nolearn.lasagne import NeuralNet
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, BayesianRidge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
pd.set_option('precision', 3)
TARGET_COLUMN = 'Activity_Score'


/home/pphilip/miniconda2/envs/my-rdkit-env/lib/python2.7/site-packages/theano/tensor/signal/downsample.py:6: UserWarning: downsample module has been moved to the theano.tensor.signal.pool module.
  "downsample module has been moved to the theano.tensor.signal.pool module.")
/home/pphilip/miniconda2/envs/my-rdkit-env/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
def change_nan_infinite(dataframe):
    """
    Replace NaN and infinite values in the dataframe with zeros.
    :param dataframe: dataframe that may contain NaN or infinite values
    :return data: dataframe with those values replaced by zeros
    """

    dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
    data = dataframe.fillna(0)

    return data
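
# A quick sanity check of the helper above (hypothetical values, shown as
# doctest-style comments):
# >>> change_nan_infinite(pd.DataFrame({'a': [1.0, np.inf], 'b': [np.nan, 2.0]}))
#      a    b
# 0  1.0  0.0
# 1  0.0  2.0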

def choose_features(x_train, y_train, x_test, column_names):
    """
    Selecting the features of high importance to reduce feature space.
    :param x_train: Training set of features.
    :param x_test: Test set of features.
    :param y_train: Training target values
    :param column_names: Names of columns in x
    """

    # Rank features by random forest importance
    clf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=20, max_depth=10)
    # Fixed random_state so the ranking is reproducible
    clf.fit(x_train, y_train)
    scores_table = pd.DataFrame({'feature': column_names,
                                 'scores': clf.feature_importances_}
                                ).sort_values(by=['scores'], ascending=False)
    ranked_features = scores_table['feature'].tolist()

    # Wrap the scaled numpy arrays back into dataframes so that columns can be
    # selected by name
    x_train = pd.DataFrame(x_train, columns=column_names)
    x_test = pd.DataFrame(x_test, columns=column_names)
    for n in [25, 50, 75, 100, 150, 200, 250, 300]:
        selected_features = ranked_features[:n]
        x_train[selected_features].to_csv('./data/select_x_train_postprocessing_rfr_%d.csv' % n)
        x_test[selected_features].to_csv('./data/select_x_test_postprocessing_rfr_%d.csv' % n)
    pd.DataFrame(scores_table['scores'].tolist()).to_csv('./data/select_feature_scores_rfr.csv')

    return
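
# Not part of the original pipeline: a small sketch for inspecting how quickly
# the saved random-forest importances saturate, as a sanity check on the
# n_features grid above (assumes choose_features has already written the CSV).
def plot_cumulative_importance():
    scores = pd.read_csv('./data/select_feature_scores_rfr.csv', index_col=0)
    plt.plot(np.cumsum(scores.values))  # importances are already sorted descending
    plt.xlabel('Number of top-ranked features')
    plt.ylabel('Cumulative importance')
    plt.show()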

def run_models(x_train, y_train, x_test, y_test, n_features):
    """
    Driving all machine learning models as parallel processes.
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    model_choice = int(input("Type your choice of model to be run:" + "\n" +
                             "1 for Linear Regression" + "\n" +
                             "2 for Neural Network" + "\n" +
                             "3 for Support Vector Machine" + "\n" +
                             "4 for Decision Tree" + "\n" +
                             "5 for Ridge Regression" + "\n" +
                             "6 for Bayesian Ridge Regression" + "\n" +
                             "7 for Lasso:" + "\n" +
                             "8 for Random Forest Regressor:" + "\n"
                             ))
    if model_choice == 1:
        build_linear(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 2:
        build_nn(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 3:
        build_svm(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 4:
        build_tree(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 5:
        build_ridge(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 6:
        build_bayesian_rr(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 7:
        build_lasso(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 8:
        build_forest(x_train, y_train, x_test, y_test, n_features)
    else:
        print("Please choose from list of available models only")

    return


def build_linear(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a linear regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: number of selected features (used in the output filename)
    :return: None
    """
    clf = LinearRegression(n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/select_lr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_nn(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a regression neural network model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    net = NeuralNet(layers=[('input', InputLayer),
                            ('hidden0', DenseLayer),
                            ('hidden1', DenseLayer),
                            ('output', DenseLayer)],
                    input_shape=(None, x_train.shape[1]),  # Number of input nodes = number of feature columns
                    hidden0_num_units=15,
                    hidden0_nonlinearity=lasagne.nonlinearities.softmax,
                    hidden1_num_units=17,
                    hidden1_nonlinearity=lasagne.nonlinearities.softmax,
                    output_num_units=1,  # Single output node for the regression target
                    output_nonlinearity=lasagne.nonlinearities.softmax,
                    max_epochs=100,
                    update_learning_rate=0.01,
                    regression=True,
                    verbose=0)

    # Finding the optimal set of params for each variable in the training of the neural network
    param_dist = {'hidden0_num_units':sp_randint(3, 30), 'hidden1_num_units':sp_randint(3, 30)}
    clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/select_nn_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(net, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
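
# Note on the configuration above: softmax applied to a single output unit is
# identically 1.0, which is consistent with the constant neural-network metrics
# in the result tables below. A more conventional regression setup (a sketch,
# assuming the same NeuralNet interface) would be:
#
#     hidden0_nonlinearity=lasagne.nonlinearities.rectify,
#     hidden1_nonlinearity=lasagne.nonlinearities.rectify,
#     output_nonlinearity=lasagne.nonlinearities.linear,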


def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a support vector regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """

    clf = LinearSVR(random_state=1, dual=False, epsilon=0,
                    loss='squared_epsilon_insensitive')
    # Fixed random_state for reproducible results
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/select_svm_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_tree(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a decision trees regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    model = DecisionTreeRegressor()
    param_dist = {'max_depth': sp_randint(1, 15),
                  'min_samples_split': sp_randint(2, 15)}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    print(clf.best_params_, clf.best_score_)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/select_dt_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_ridge(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = Ridge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/select_rr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
    # Estimated precision of the noise (alpha_) from the Bayesian fit
    ridge_alpha = clf.alpha_

    with open('./trained_networks/select_brr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(ridge_alpha, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_lasso(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Lasso linear model with cross validation from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """

    model = Lasso(random_state=1, tol=0.001)
    # Fixed random_state for reproducible results
    param_dist = {'alpha': np.arange(0.0001, 1, 0.001).tolist()}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=20, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    print(clf.best_params_, clf.best_score_)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/select_lasso_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_forest(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a random forest regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    model = RandomForestRegressor()
    param_dist = {'max_depth': sp_randint(1, 15),
                  'min_samples_split': sp_randint(2, 15)}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/select_rfr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
    return

MODEL_TAGS = [('lr', 'Linear Regression'),
              ('nn', 'Neural Network'),
              ('svm', 'Linear SVR'),
              ('dt', 'Decision Tree'),
              ('rr', 'Ridge Regression'),
              ('brr', 'Bayesian Ridge Regression'),
              ('lasso', 'Lasso'),
              ('rfr', 'Random Forest regression')]

def load_results(prefix):
    """
    Collect the pickled error metrics for every model and feature-count pair.
    :param prefix: 'select' for the active-molecule runs, 'all' for all molecules
    :return: dataframes of mean absolute error, mean squared error, median
             absolute error, R^2 and explained variance score
    """
    df_mean_abs = pd.DataFrame()
    df_mean_sq = pd.DataFrame()
    df_median_abs = pd.DataFrame()
    df_r2 = pd.DataFrame()
    df_exp_var_score = pd.DataFrame()

    for n_features in [25, 50, 75, 100, 150, 200, 250, 300]:
        for tag, label in MODEL_TAGS:
            with open('./trained_networks/%s_%s_%d_data.pkl'
                      % (prefix, tag, n_features), 'rb') as result:
                # The objects must be read back in the order they were dumped
                clf = pickle.load(result)
                if tag == 'nn':
                    # build_nn also pickles the bare network before the metrics
                    net = pickle.load(result)
                mean_abs = pickle.load(result)
                mean_sq = pickle.load(result)
                median_abs = pickle.load(result)
                r2 = pickle.load(result)
                exp_var_score = pickle.load(result)
                if tag == 'brr':
                    # build_bayesian_rr pickles ridge_alpha before y_pred
                    ridge_alpha = pickle.load(result)
                y_pred = pickle.load(result)

            # set_value is deprecated in newer pandas; df.at[...] = ... is the
            # modern equivalent
            col = '%d' % n_features
            df_mean_abs.set_value(label, col, mean_abs)
            df_mean_sq.set_value(label, col, mean_sq)
            df_median_abs.set_value(label, col, median_abs)
            df_r2.set_value(label, col, r2)
            df_exp_var_score.set_value(label, col, exp_var_score)

    return df_mean_abs, df_mean_sq, df_median_abs, df_r2, df_exp_var_score

def select_results():
    return load_results('select')

def all_results():
    return load_results('all')

In [ ]:
df = pd.read_csv('https://s3-us-west-2.amazonaws.com/'
                 'pphilip-usp-inhibition/data/df_preprocessing.csv')
df.drop(df.columns[0], axis=1, inplace=True)

In [ ]:
select_df = df.loc[df[TARGET_COLUMN] > 0]
select_df.reset_index(drop=True, inplace=True)

# Copying column names to use after np array manipulation
all_headers = list(select_df.columns.values)
x_headers = list(select_df.columns.values)[:-1]

# Train, validation and test split
df_train, df_test = model_selection.train_test_split(select_df, test_size=0.25)
# Reassign column name and index after randomized split
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
df_train = pd.DataFrame(df_train, columns=all_headers)
df_test = pd.DataFrame(df_test, columns=all_headers)

# Remove the target column from the dataframe
x_train = df_train.drop(TARGET_COLUMN, axis=1)
x_test = df_test.drop(TARGET_COLUMN, axis=1)
y_train = df_train[TARGET_COLUMN]
y_test = df_test[TARGET_COLUMN]

# Checking dataframe for NaN and infinite values
x_train = change_nan_infinite(x_train)
y_train = change_nan_infinite(y_train)
x_test = change_nan_infinite(x_test)
y_test = change_nan_infinite(y_test)

y_train = pd.DataFrame(y_train, columns=[TARGET_COLUMN])
y_test = pd.DataFrame(y_test, columns=[TARGET_COLUMN])

y_train.to_csv('./data/select_y_train_postprocessing.csv')
y_test.to_csv('./data/select_y_test_postprocessing.csv')

# Transform all column values to mean 0 and unit variance
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
y_train = np.array(y_train).ravel()  # 1-D target avoids a DataConversionWarning in fit

# Feature selection using random forest importances
choose_features(x_train, y_train, x_test, x_headers)

In [ ]:
n_features = int(input("Choose the number of features to be used in the model" + "\n" +
                       "Pick from 25, 50, 75, 100, 150, 200, 250, 300" + "\n"))
x_train = pd.read_csv('./data/select_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/select_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/select_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/select_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)

print("Generating models")
run_models(np.array(x_train), np.array(y_train).ravel(), np.array(x_test), np.array(y_test).ravel(), n_features)

In [3]:
df_all_mean_abs, df_all_mean_sq, df_all_median_abs, df_all_r2, df_all_exp_var_score = all_results()
df_select_mean_abs, df_select_mean_sq, df_select_median_abs, df_select_r2, df_select_exp_var_score = select_results()

In [8]:
df_all_mean_abs


Out[8]:
25 50 75 100 150 200 250 300
Linear Regression 1.464 1.463 1.464 1.462 1.465 1.466 1.470 1.473
Neural Network 1.678 1.678 1.678 1.678 1.678 1.678 1.678 1.678
Linear SVR 1.464 1.463 1.464 1.462 1.465 1.466 1.470 1.473
Decision Tree 1.451 1.470 1.470 1.470 1.470 1.456 1.456 1.448
Ridge Regression 1.464 1.463 1.464 1.462 1.465 1.466 1.470 1.473
Bayesian Ridge Regression 1.464 1.463 1.463 1.462 1.463 1.464 1.465 1.466
Lasso 1.471 1.463 1.464 1.468 1.462 1.463 1.461 1.464
Random Forest regression 1.413 1.398 1.420 1.416 1.431 1.420 1.416 1.420

In [9]:
df_all_mean_sq


Out[9]:
25 50 75 100 150 200 250 300
Linear Regression 15.164 15.130 15.115 15.081 15.042 15.018 14.980 14.964
Neural Network 15.349 15.349 15.349 15.349 15.349 15.349 15.349 15.349
Linear SVR 15.164 15.130 15.115 15.081 15.042 15.018 14.980 14.961
Decision Tree 15.075 15.241 15.241 15.241 15.241 15.176 15.176 15.202
Ridge Regression 15.164 15.130 15.115 15.081 15.042 15.018 14.980 14.962
Bayesian Ridge Regression 15.164 15.130 15.115 15.084 15.050 15.022 14.994 14.972
Lasso 15.211 15.142 15.153 15.174 15.125 15.130 15.118 15.144
Random Forest regression 14.608 14.504 14.783 14.692 14.752 14.631 14.689 14.728

In [10]:
df_all_median_abs


Out[10]:
25 50 75 100 150 200 250 300
Linear Regression 0.726 0.720 0.720 0.713 0.709 0.706 0.711 0.711
Neural Network 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Linear SVR 0.726 0.720 0.720 0.713 0.709 0.706 0.711 0.711
Decision Tree 0.490 0.547 0.547 0.547 0.547 0.490 0.490 0.632
Ridge Regression 0.726 0.720 0.720 0.713 0.709 0.706 0.711 0.711
Bayesian Ridge Regression 0.726 0.721 0.722 0.713 0.709 0.708 0.707 0.709
Lasso 0.737 0.726 0.727 0.734 0.719 0.719 0.717 0.723
Random Forest regression 0.553 0.527 0.546 0.552 0.581 0.542 0.542 0.556

In [11]:
df_all_r2


Out[11]:
25 50 75 100 150 200 250 300
Linear Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Neural Network -0.003 -0.003 -0.003 -0.003 -0.003 -0.003 -0.003 -0.003
Linear SVR 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Decision Tree 0.020 0.009 0.009 0.009 0.009 0.013 0.013 0.012
Ridge Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Bayesian Ridge Regression 0.014 0.016 0.017 0.019 0.021 0.023 0.025 0.027
Lasso 0.011 0.016 0.015 0.013 0.017 0.016 0.017 0.015
Random Forest regression 0.050 0.057 0.039 0.045 0.041 0.049 0.045 0.042

In [12]:
df_all_exp_var_score


Out[12]:
25 50 75 100 150 200 250 300
Linear Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Neural Network 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
Linear SVR 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Decision Tree 0.020 0.009 0.009 0.009 0.009 0.013 0.013 0.012
Ridge Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Bayesian Ridge Regression 0.014 0.016 0.017 0.019 0.021 0.023 0.025 0.027
Lasso 0.011 0.016 0.015 0.013 0.017 0.016 0.017 0.015
Random Forest regression 0.050 0.057 0.039 0.045 0.041 0.049 0.045 0.042

In [13]:
df_select_mean_abs


Out[13]:
25 50 75 100 150 200 250 300
Linear Regression 6.405 6.384 6.357 6.337 6.276 6.268 6.253 6.262
Neural Network 14.275 14.275 14.275 14.275 14.275 14.275 14.275 14.275
Linear SVR 6.405 6.384 6.356 6.337 6.276 6.267 6.247 6.256
Decision Tree 6.463 6.489 6.463 6.465 6.465 6.463 6.463 6.465
Ridge Regression 6.405 6.384 6.356 6.337 6.276 6.266 6.247 6.256
Bayesian Ridge Regression 6.406 6.384 6.355 6.345 6.304 6.288 6.273 6.269
Lasso 6.413 6.413 6.378 6.345 6.278 6.268 6.281 6.346
Random Forest regression 6.334 6.294 6.312 6.344 6.367 6.345 6.374 6.374

In [14]:
df_select_mean_sq


Out[14]:
25 50 75 100 150 200 250 300
Linear Regression 81.188 80.801 80.604 80.264 79.336 79.221 79.143 79.341
Neural Network 287.633 287.633 287.633 287.633 287.633 287.633 287.633 287.633
Linear SVR 81.188 80.801 80.603 80.262 79.338 79.200 79.077 79.188
Decision Tree 82.855 82.251 82.855 82.384 82.384 82.855 82.855 82.384
Ridge Regression 81.188 80.801 80.603 80.262 79.340 79.198 79.072 79.190
Bayesian Ridge Regression 81.126 80.629 80.306 80.174 79.538 79.328 79.102 79.059
Lasso 81.201 80.956 80.385 80.212 79.348 79.195 79.193 79.776
Random Forest regression 76.645 75.487 76.521 78.174 78.872 76.120 77.208 79.215

In [15]:
df_select_median_abs


Out[15]:
25 50 75 100 150 200 250 300
Linear Regression 5.184 5.257 5.243 5.275 5.186 5.150 5.155 5.153
Neural Network 9.000 9.000 9.000 9.000 9.000 9.000 9.000 9.000
Linear SVR 5.184 5.256 5.243 5.274 5.177 5.150 5.131 5.156
Decision Tree 4.650 5.423 4.650 4.409 4.409 4.650 4.650 4.409
Ridge Regression 5.184 5.256 5.246 5.276 5.180 5.155 5.115 5.147
Bayesian Ridge Regression 5.189 5.261 5.258 5.299 5.220 5.203 5.148 5.150
Lasso 5.170 5.184 5.242 5.300 5.191 5.186 5.179 5.244
Random Forest regression 4.655 4.651 4.667 4.770 4.787 4.716 4.695 4.748

In [16]:
df_select_r2


Out[16]:
25 50 75 100 150 200 250 300
Linear Regression 0.032 0.036 0.039 0.043 0.054 0.055 0.056 0.054
Neural Network -2.430 -2.430 -2.430 -2.430 -2.430 -2.430 -2.430 -2.430
Linear SVR 0.032 0.036 0.039 0.043 0.054 0.056 0.057 0.056
Decision Tree 0.012 0.019 0.012 0.018 0.018 0.012 0.012 0.018
Ridge Regression 0.032 0.036 0.039 0.043 0.054 0.056 0.057 0.056
Bayesian Ridge Regression 0.033 0.038 0.042 0.044 0.051 0.054 0.057 0.057
Lasso 0.032 0.035 0.041 0.043 0.054 0.056 0.056 0.049
Random Forest regression 0.086 0.100 0.087 0.068 0.059 0.092 0.079 0.055

In [17]:
df_select_exp_var_score


Out[17]:
25 50 75 100 150 200 250 300
Linear Regression 0.032 0.037 0.039 0.043 0.054 0.055 0.056 0.054
Neural Network 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
Linear SVR 0.032 0.037 0.039 0.043 0.054 0.056 0.057 0.056
Decision Tree 0.012 0.019 0.012 0.018 0.018 0.012 0.012 0.018
Ridge Regression 0.032 0.037 0.039 0.043 0.054 0.056 0.057 0.056
Bayesian Ridge Regression 0.033 0.039 0.042 0.044 0.052 0.054 0.057 0.057
Lasso 0.032 0.035 0.041 0.044 0.054 0.056 0.056 0.049
Random Forest regression 0.086 0.100 0.088 0.068 0.060 0.093 0.080 0.056

In [31]:
N = 8
select_means = df_select_mean_abs.mean(axis=1).tolist()
select_std = df_select_mean_abs.std(axis=1).tolist()
ind = np.arange(N)  # the x locations for the groups
width = 0.35       # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_mean_abs.mean(axis=1).tolist()
all_std = df_all_mean_abs.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)

# add some text for labels, title and axes ticks
ax.set_ylabel('Mean absolute error')
ax.set_title('MAE by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/mae.png', bbox_inches='tight')
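
In [ ]:
# The five bar-chart cells in this section repeat the same boilerplate; a small
# helper (a sketch, not part of the original notebook) would collapse them:
def grouped_metric_bar(df_select, df_all, ylabel, title, fname):
    ind = np.arange(len(df_select.index))  # one group of bars per model
    width = 0.35
    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, df_select.mean(axis=1), width, color='r',
                    yerr=df_select.std(axis=1))
    rects2 = ax.bar(ind + width, df_all.mean(axis=1), width, color='b',
                    yerr=df_all.std(axis=1))
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
    ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
    plt.savefig(fname, bbox_inches='tight')

# e.g. grouped_metric_bar(df_select_mean_sq, df_all_mean_sq, 'Mean square error',
#                         'MSE by dataset and learning algorithm', './plots/mse.png')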

In [32]:
N = 8
select_means = df_select_mean_sq.mean(axis=1).tolist()
select_std = df_select_mean_sq.std(axis=1).tolist()
ind = np.arange(N)  # the x locations for the groups
width = 0.35       # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_mean_sq.mean(axis=1).tolist()
all_std = df_all_mean_sq.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)

# add some text for labels, title and axes ticks
ax.set_ylabel('Mean square error')
ax.set_title('MSE by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/mse.png', bbox_inches='tight')

In [33]:
N = 8
select_means = df_select_median_abs.mean(axis=1).tolist()
select_std = df_select_median_abs.std(axis=1).tolist()
ind = np.arange(N)  # the x locations for the groups
width = 0.35       # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_median_abs.mean(axis=1).tolist()
all_std = df_all_median_abs.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)

# add some text for labels, title and axes ticks
ax.set_ylabel('Median absolute error')
ax.set_title('MedAE by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/medae.png', bbox_inches='tight')

In [34]:
N = 8
select_means = df_select_r2.mean(axis=1).tolist()
select_std = df_select_r2.std(axis=1).tolist()
ind = np.arange(N)  # the x locations for the groups
width = 0.35       # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_r2.mean(axis=1).tolist()
all_std = df_all_r2.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)

# add some text for labels, title and axes ticks
ax.set_ylabel('R2 score')
ax.set_title('R2 score by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/r2.png', bbox_inches='tight')

In [35]:
N = 8
select_means = df_select_exp_var_score.mean(axis=1).tolist()
select_std = df_select_exp_var_score.std(axis=1).tolist()
ind = np.arange(N)  # the x locations for the groups
width = 0.35       # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_exp_var_score.mean(axis=1).tolist()
all_std = df_all_exp_var_score.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)

# add some text for labels, title and axes ticks
ax.set_ylabel('Explained variance score')
ax.set_title('EVS by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/evs.png', bbox_inches='tight')

In [ ]:
import plots  # project-local plotting module (assumed importable alongside this notebook)

n_features = 300
x_train = pd.read_csv('./data/select_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/select_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/select_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/select_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_features(x_train, y_train, x_test, y_test)

In [ ]:
y_train = pd.read_csv('./data/select_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/select_y_test_postprocessing.csv')
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_y_dist(y_train, y_test)

In [ ]:
import genalgo  # project-local genetic-algorithm module (assumed importable alongside this notebook)

genalgo.main()