In [1]:
import pandas as pd
import numpy as np
import pickle
import lasagne
import sklearn
from sklearn import model_selection
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from nolearn.lasagne import NeuralNet
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, BayesianRidge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
try:
    import urllib.request as urllib2
except ImportError:
    import urllib2
import random
pd.set_option('precision', 3)
TARGET_COLUMN = 'Activity_Score'


/home/pphilip/miniconda2/envs/my-rdkit-env/lib/python2.7/site-packages/theano/tensor/signal/downsample.py:6: UserWarning: downsample module has been moved to the theano.tensor.signal.pool module.
  "downsample module has been moved to the theano.tensor.signal.pool module.")
/home/pphilip/miniconda2/envs/my-rdkit-env/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
def choose_features(x_train, y_train, x_test, column_names):
    """
    Selecting the features of high importance to reduce feature space.
    :param x_train: Training set of features.
    :param x_test: Test set of features.
    :param y_train: Training target values
    :param column_names: Names of columns in x
    """

    # Random forest feature importance
    clf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=20, max_depth=10)
    # Random state has int value for non-random sampling
    clf.fit(x_train, y_train)
    feature_importance = clf.feature_importances_
    scores_table = pd.DataFrame({'feature': column_names, 'scores':
                                 feature_importance}).sort_values(by=['scores'], ascending=False)
    scores = scores_table['scores'].tolist()
    n_features = [25, 50, 75, 100, 150, 200, 250, 300]
    for n in n_features:
        feature_scores = scores_table['feature'].tolist()
        selected_features = feature_scores[:n]
        x_train = pd.DataFrame(x_train, columns=column_names)
        desired_x_train = x_train[selected_features]
        x_test = pd.DataFrame(x_test, columns=column_names)
        desired_x_test = x_test[selected_features]

        desired_x_train.to_csv('./data/all_x_train_postprocessing_rfr_%d.csv' % n)
        desired_x_test.to_csv('./data/all_x_test_postprocessing_rfr_%d.csv' % n)
    pd.DataFrame(scores).to_csv('./data/all_feature_scores_rfr.csv')

    return


def change_nan_infinite(dataframe):
    """
    Replacing NaN and infinite values from the dataframe with zeros.
    :param dataframe: Dataframe containing NaN and infinite values.
    :return data: Data with no NaN or infinite values.
    """

    dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
    data = dataframe.fillna(0)

    return data


def run_models(x_train, y_train, x_test, y_test, n_features):
    """
    Driving all machine learning models as parallel processes.
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    model_choice = int(input("Type your choice of model to be run:" + "\n" +
                             "1 for Linear Regression" + "\n" +
                             "2 for Neural Network" + "\n" +
                             "3 for Support Vector Machine" + "\n" +
                             "4 for Decision Tree" + "\n" +
                             "5 for Ridge Regression" + "\n" +
                             "6 for Bayesian Ridge Regression" + "\n" +
                             "7 for Lasso:" + "\n" +
                             "8 for Random Forest Regressor:" + "\n"
                             ))
    if model_choice == 1:
        build_linear(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 2:
        build_nn(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 3:
        build_svm(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 4:
        build_tree(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 5:
        build_ridge(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 6:
        build_bayesian_rr(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 7:
        build_lasso(x_train, y_train, x_test, y_test, n_features)
    elif model_choice == 8:
        build_forest(x_train, y_train, x_test, y_test, n_features)
    else:
        print("Please choose from list of available models only")

    return


def build_linear(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a decision trees regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = LinearRegression(n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_lr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_nn(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a regression neural network model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    net = NeuralNet(layers=[('input', InputLayer),
                            ('hidden0', DenseLayer),
                            ('hidden1', DenseLayer),
                            ('output', DenseLayer)],
                    input_shape=(None, x_train.shape[1]),  # Number of i/p nodes = number of columns in x
                    hidden0_num_units=15,
                    hidden0_nonlinearity=lasagne.nonlinearities.softmax,
                    hidden1_num_units=17,
                    hidden1_nonlinearity=lasagne.nonlinearities.softmax,
                    output_num_units=1,  # Number of o/p nodes = number of columns in y
                    output_nonlinearity=lasagne.nonlinearities.softmax,
                    max_epochs=100,
                    update_learning_rate=0.01,
                    regression=True,
                    verbose=1)

    # Finding the optimal set of params for each variable in the training of the neural network
    param_dist = {'hidden0_num_units':sp_randint(3, 30), 'hidden1_num_units':sp_randint(3, 30)}
    clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_nn_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(net, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_svm(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a support vector regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """

    clf = LinearSVR(random_state=1, dual=False, epsilon=0,
                    loss='squared_epsilon_insensitive')
    # Random state has int value for non-random sampling
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_svm_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_tree(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a decision trees regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    model = DecisionTreeRegressor()
    param_dist = {'max_depth': sp_randint(1, 15),
                  'min_samples_split': sp_randint(2, 15)}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    print(clf.best_params_, clf.best_score_)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_dt_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_ridge(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = Ridge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_rr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_brr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_lasso(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Lasso linear model with cross validation from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """

    model = Lasso(random_state=1)
    # Random state has int value for non-random sampling
    param_dist = {'alpha': np.arange( 0.0001, 1, 0.001 ).tolist()}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=20, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_lasso_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return


def build_forest(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a random forest regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    model = RandomForestRegressor()
    param_dist = {'max_depth': sp_randint(1, 15),
                  'min_samples_split': sp_randint(2, 15)}
    clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('./trained_networks/all_rfr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
    print(r2)
    return

def results():
    df_mean_abs = pd.DataFrame()
    df_mean_sq = pd.DataFrame()
    df_median_abs = pd.DataFrame()
    df_r2 = pd.DataFrame()
    df_exp_var_score = pd.DataFrame()
    lists = [25, 50, 75, 100, 150, 200, 250, 300]
    
    for n_features in lists:
        with open('./trained_networks/all_lr_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred = pickle.load(result)
            
            df_mean_abs.set_value('Linear Regression', '%d' % n_features,  mean_abs)
            df_mean_sq.set_value('Linear Regression', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Linear Regression', '%d' % n_features, median_abs)
            df_r2.set_value('Linear Regression', '%d' % n_features, r2)
            df_exp_var_score.set_value('Linear Regression', '%d' % n_features, exp_var_score)

        with open('./trained_networks/all_nn_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            net = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred_nn = pickle.load(result)
            
            df_mean_abs.set_value('Neural Network', '%d' % n_features, mean_abs)
            df_mean_sq.set_value('Neural Network', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Neural Network', '%d' % n_features, median_abs)
            df_r2.set_value('Neural Network', '%d' % n_features, r2)
            df_exp_var_score.set_value('Neural Network', '%d' % n_features, exp_var_score)

        with open('./trained_networks/all_svm_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred_svm = pickle.load(result)
            
            df_mean_abs.set_value('Linear SVR', '%d' % n_features, mean_abs)
            df_mean_sq.set_value('Linear SVR', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Linear SVR', '%d' % n_features, median_abs)
            df_r2.set_value('Linear SVR', '%d' % n_features, r2)
            df_exp_var_score.set_value('Linear SVR', '%d' % n_features, exp_var_score)

        with open('./trained_networks/all_dt_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred_dt = pickle.load(result)
            
            df_mean_abs.set_value('Decision Tree', '%d' % n_features, mean_abs)
            df_mean_sq.set_value('Decision Tree', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Decision Tree', '%d' % n_features, median_abs)
            df_r2.set_value('Decision Tree', '%d' % n_features, r2)
            df_exp_var_score.set_value('Decision Tree', '%d' % n_features, exp_var_score)

        with open('./trained_networks/all_rr_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred_rr = pickle.load(result)
            
            df_mean_abs.set_value('Ridge Regression', '%d' % n_features, mean_abs)
            df_mean_sq.set_value('Ridge Regression', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Ridge Regression', '%d' % n_features, median_abs)
            df_r2.set_value('Ridge Regression', '%d' % n_features, r2)
            df_exp_var_score.set_value('Ridge Regression', '%d' % n_features, exp_var_score)

        with open('./trained_networks/all_brr_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred_brr = pickle.load(result)
            
            df_mean_abs.set_value('Bayesian Ridge Regression', '%d' % n_features, mean_abs)
            df_mean_sq.set_value('Bayesian Ridge Regression', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Bayesian Ridge Regression', '%d' % n_features, median_abs)
            df_r2.set_value('Bayesian Ridge Regression', '%d' % n_features, r2)
            df_exp_var_score.set_value('Bayesian Ridge Regression', '%d' % n_features, exp_var_score)
            
        with open('./trained_networks/all_lasso_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred_lasso = pickle.load(result)
            
            df_mean_abs.set_value('Lasso', '%d' % n_features, mean_abs)
            df_mean_sq.set_value('Lasso', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Lasso', '%d' % n_features, median_abs)
            df_r2.set_value('Lasso', '%d' % n_features, r2)
            df_exp_var_score.set_value('Lasso', '%d' % n_features, exp_var_score)

        with open('./trained_networks/all_rfr_%d_data.pkl' % n_features, 'rb') as result:
            clf = pickle.load(result)
            mean_abs = pickle.load(result)
            mean_sq = pickle.load(result)
            median_abs = pickle.load(result)
            r2 = pickle.load(result)
            exp_var_score = pickle.load(result)
            y_pred_lasso = pickle.load(result)
            
            df_mean_abs.set_value('Random Forest regression', '%d' % n_features, mean_abs)
            df_mean_sq.set_value('Random Forest regression', '%d' % n_features, mean_sq)
            df_median_abs.set_value('Random Forest regression', '%d' % n_features, median_abs)
            df_r2.set_value('Random Forest regression', '%d' % n_features, r2)
            df_exp_var_score.set_value('Random Forest regression', '%d' % n_features, exp_var_score)

    return df_mean_abs, df_mean_sq, df_median_abs, df_r2, df_exp_var_score

In [ ]:
df = pd.read_csv('https://s3-us-west-2.amazonaws.com/'
                 'pphilip-usp-inhibition/data/df_preprocessing.csv')
df.drop(df.columns[0], axis=1, inplace=True)

# Copying column names to use after np array manipulation
all_headers = list(df.columns.values)
x_headers = list(df.columns.values)[:-1]

# Train, validation and test split
df_train, df_test = model_selection.train_test_split(df, test_size=0.25)
# Reassign column name and index after randomized split
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
df_train = pd.DataFrame(df_train, columns=all_headers)
df_test = pd.DataFrame(df_test, columns=all_headers)

# Remove the classification column from the dataframe
x_train = df_train.drop(TARGET_COLUMN, axis=1)
x_test = df_test.drop(TARGET_COLUMN, axis=1)
y_train = df_train[TARGET_COLUMN]
y_test = df_test[TARGET_COLUMN]

# Checking dataframe for NaN and infinite values
x_train = change_nan_infinite(x_train)
y_train = change_nan_infinite(y_train)
x_test = change_nan_infinite(x_test)
y_test = change_nan_infinite(y_test)

y_train = pd.DataFrame(y_train, columns=[TARGET_COLUMN])
y_test = pd.DataFrame(y_test, columns=[TARGET_COLUMN])

y_train.to_csv('./data/all_y_train_postprocessing.csv')
y_test.to_csv('./data/all_y_test_postprocessing.csv')

# Transform all column values to mean 0 and unit variance
clf = sklearn.preprocessing.StandardScaler().fit(x_train)
x_train = clf.transform(x_train)
x_test = clf.transform(x_test)
y_train = np.array(y_train)

# Feature selection and feature importance plot
choose_features(x_train, y_train, x_test, x_headers)

In [ ]:
n_features = int(input("Choose the number of features to be used in the model" + "\n" +
                       "Pick from 25, 50, 75, 100, 150, 200, 250, 300" + "\n"))
x_train = pd.read_csv('./data/all_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/all_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/all_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/all_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)

print("Generating models")
run_models(np.array(x_train), np.array(y_train).ravel(), np.array(x_test), np.array(y_test).ravel(), n_features)

In [3]:
df_mean_abs, df_mean_sq, df_median_abs, df_r2, df_exp_var_score = results()

In [4]:
df_mean_abs


Out[4]:
25 50 75 100 150 200 250 300
Linear Regression 1.464 1.463 1.464 1.462 1.465 1.466 1.470 1.473
Neural Network 1.678 1.678 1.678 1.678 1.678 1.678 1.678 1.678
Linear SVR 1.464 1.463 1.464 1.462 1.465 1.466 1.470 1.473
Decision Tree 1.451 1.470 1.470 1.470 1.470 1.456 1.456 1.448
Ridge Regression 1.464 1.463 1.464 1.462 1.465 1.466 1.470 1.473
Bayesian Ridge Regression 1.464 1.463 1.463 1.462 1.463 1.464 1.465 1.466
Lasso 1.471 1.463 1.464 1.468 1.462 1.463 1.461 1.464
Random Forest regression 1.413 1.398 1.420 1.416 1.431 1.420 1.416 1.420

In [5]:
df_mean_sq


Out[5]:
25 50 75 100 150 200 250 300
Linear Regression 15.164 15.130 15.115 15.081 15.042 15.018 14.980 14.964
Neural Network 15.349 15.349 15.349 15.349 15.349 15.349 15.349 15.349
Linear SVR 15.164 15.130 15.115 15.081 15.042 15.018 14.980 14.961
Decision Tree 15.075 15.241 15.241 15.241 15.241 15.176 15.176 15.202
Ridge Regression 15.164 15.130 15.115 15.081 15.042 15.018 14.980 14.962
Bayesian Ridge Regression 15.164 15.130 15.115 15.084 15.050 15.022 14.994 14.972
Lasso 15.211 15.142 15.153 15.174 15.125 15.130 15.118 15.144
Random Forest regression 14.608 14.504 14.783 14.692 14.752 14.631 14.689 14.728

In [6]:
df_median_abs


Out[6]:
25 50 75 100 150 200 250 300
Linear Regression 0.726 0.720 0.720 0.713 0.709 0.706 0.711 0.711
Neural Network 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Linear SVR 0.726 0.720 0.720 0.713 0.709 0.706 0.711 0.711
Decision Tree 0.490 0.547 0.547 0.547 0.547 0.490 0.490 0.632
Ridge Regression 0.726 0.720 0.720 0.713 0.709 0.706 0.711 0.711
Bayesian Ridge Regression 0.726 0.721 0.722 0.713 0.709 0.708 0.707 0.709
Lasso 0.737 0.726 0.727 0.734 0.719 0.719 0.717 0.723
Random Forest regression 0.553 0.527 0.546 0.552 0.581 0.542 0.542 0.556

In [7]:
df_r2


Out[7]:
25 50 75 100 150 200 250 300
Linear Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Neural Network -0.003 -0.003 -0.003 -0.003 -0.003 -0.003 -0.003 -0.003
Linear SVR 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Decision Tree 0.020 0.009 0.009 0.009 0.009 0.013 0.013 0.012
Ridge Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Bayesian Ridge Regression 0.014 0.016 0.017 0.019 0.021 0.023 0.025 0.027
Lasso 0.011 0.016 0.015 0.013 0.017 0.016 0.017 0.015
Random Forest regression 0.050 0.057 0.039 0.045 0.041 0.049 0.045 0.042

In [8]:
df_exp_var_score


Out[8]:
25 50 75 100 150 200 250 300
Linear Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Neural Network 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
Linear SVR 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Decision Tree 0.020 0.009 0.009 0.009 0.009 0.013 0.013 0.012
Ridge Regression 0.014 0.016 0.017 0.019 0.022 0.024 0.026 0.027
Bayesian Ridge Regression 0.014 0.016 0.017 0.019 0.021 0.023 0.025 0.027
Lasso 0.011 0.016 0.015 0.013 0.017 0.016 0.017 0.015
Random Forest regression 0.050 0.057 0.039 0.045 0.041 0.049 0.045 0.042

In [ ]:
n_features = 300
x_train = pd.read_csv('./data/all_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/all_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/all_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/all_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_features(x_train, y_train, x_test, y_test)

In [ ]:
y_train = pd.read_csv('./data/all_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/all_y_test_postprocessing.csv')
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_y_dist(y_train, y_test)

In [ ]:
genalgo.main()