In [1]:
import lasagne
import numpy as np
import pandas as pd
import pickle
import sklearn
from sklearn import model_selection, preprocessing
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from nolearn.lasagne import NeuralNet
from scipy.stats import randint as sp_randint
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, BayesianRidge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
import plots  # project-local plotting module, used in the final cells
import genalgo  # project-local genetic-algorithm module, used in the last cell
pd.set_option('display.precision', 3)
TARGET_COLUMN = 'Activity_Score'
In [2]:
def change_nan_infinite(dataframe):
"""
    Replace NaN and infinite values in the dataframe with zeros.
    :param dataframe: Dataframe that may contain NaN and infinite values.
    :return: Dataframe with NaN and infinite values replaced by zeros.
"""
dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
data = dataframe.fillna(0)
return data
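# A minimal sanity check of change_nan_infinite (hypothetical toy frame, not part
# of the pipeline): +/-inf are first mapped to NaN, then every NaN becomes 0.
_demo = pd.DataFrame({'a': [1.0, np.inf], 'b': [-np.inf, np.nan]})
assert change_nan_infinite(_demo).values.tolist() == [[1.0, 0.0], [0.0, 0.0]]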
def choose_features(x_train, y_train, x_test, column_names):
"""
Selecting the features of high importance to reduce feature space.
:param x_train: Training set of features.
:param x_test: Test set of features.
:param y_train: Training target values
:param column_names: Names of columns in x
"""
# Random forest feature importance
clf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=20, max_depth=10)
    # A fixed integer random_state makes the fit reproducible
clf.fit(x_train, y_train)
feature_importance = clf.feature_importances_
    scores_table = pd.DataFrame({'feature': column_names,
                                 'scores': feature_importance}).sort_values(by=['scores'], ascending=False)
scores = scores_table['scores'].tolist()
n_features = [25, 50, 75, 100, 150, 200, 250, 300]
for n in n_features:
feature_scores = scores_table['feature'].tolist()
selected_features = feature_scores[:n]
x_train = pd.DataFrame(x_train, columns=column_names)
desired_x_train = x_train[selected_features]
x_test = pd.DataFrame(x_test, columns=column_names)
desired_x_test = x_test[selected_features]
desired_x_train.to_csv('./data/select_x_train_postprocessing_rfr_%d.csv' % n)
desired_x_test.to_csv('./data/select_x_test_postprocessing_rfr_%d.csv' % n)
pd.DataFrame(scores).to_csv('./data/select_feature_scores_rfr.csv')
return
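# choose_features leaves one CSV pair per subset size on disk, e.g.
# ./data/select_x_train_postprocessing_rfr_100.csv holds the 100 top-ranked
# columns; the cells further down read these files back by n_features.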
def run_models(x_train, y_train, x_test, y_test, n_features):
"""
Driving all machine learning models as parallel processes.
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
model_choice = int(input("Type your choice of model to be run:" + "\n" +
"1 for Linear Regression" + "\n" +
"2 for Neural Network" + "\n" +
"3 for Support Vector Machine" + "\n" +
"4 for Decision Tree" + "\n" +
"5 for Ridge Regression" + "\n" +
"6 for Bayesian Ridge Regression" + "\n" +
"7 for Lasso:" + "\n" +
"8 for Random Forest Regressor:" + "\n"
))
if model_choice == 1:
build_linear(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 2:
build_nn(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 3:
build_svm(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 4:
build_tree(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 5:
build_ridge(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 6:
build_bayesian_rr(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 7:
build_lasso(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 8:
build_forest(x_train, y_train, x_test, y_test, n_features)
else:
print("Please choose from list of available models only")
return
def build_linear(x_train, y_train, x_test, y_test, n_features):
"""
    Constructing a linear regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = LinearRegression(n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/select_lr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
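# Read-back must mirror the dump order above, e.g. (hypothetical file name):
# with open('./trained_networks/select_lr_300_data.pkl', 'rb') as f:
#     clf = pickle.load(f)        # first object dumped
#     mean_abs = pickle.load(f)   # then each metric, in dump order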
def build_nn(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a regression neural network model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
net = NeuralNet(layers=[('input', InputLayer),
('hidden0', DenseLayer),
('hidden1', DenseLayer),
('output', DenseLayer)],
input_shape=(None, x_train.shape[1]), # Number of i/p nodes = number of columns in x
hidden0_num_units=15,
hidden0_nonlinearity=lasagne.nonlinearities.softmax,
hidden1_num_units=17,
hidden1_nonlinearity=lasagne.nonlinearities.softmax,
                    output_num_units=1,  # Number of o/p nodes = number of columns in y
                    output_nonlinearity=lasagne.nonlinearities.linear,  # softmax over a single output unit is constant 1, so regression needs a linear output
max_epochs=100,
update_learning_rate=0.01,
regression=True,
verbose=0)
    # Randomized search over the hidden-layer sizes to find a good architecture
param_dist = {'hidden0_num_units':sp_randint(3, 30), 'hidden1_num_units':sp_randint(3, 30)}
clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist,
n_iter=15, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/select_nn_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(net, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_svm(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a support vector regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = LinearSVR(random_state=1, dual=False, epsilon=0,
loss='squared_epsilon_insensitive')
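    # With epsilon=0 and the squared epsilon-insensitive loss, this objective reduces
    # to L2-regularised least squares, i.e. a ridge-style fit trained with liblinear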
    # A fixed integer random_state makes the fit reproducible
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/select_svm_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_tree(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a decision trees regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
model = DecisionTreeRegressor()
param_dist = {'max_depth': sp_randint(1, 15),
'min_samples_split': sp_randint(2, 15)}
clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
n_iter=15, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.best_params_, clf.best_score_)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/select_dt_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_ridge(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a ridge regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = Ridge()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/select_rr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a Bayesian ridge regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = BayesianRidge()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
    # alpha_ is the noise precision estimated by evidence maximisation (not a CV-tuned penalty)
ridge_alpha = clf.alpha_
with open('./trained_networks/select_brr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(ridge_alpha, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_lasso(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a Lasso linear model with cross validation from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
    model = Lasso(random_state=1, tol=0.001)
    # A fixed integer random_state makes the fit reproducible
    param_dist = {'alpha': np.arange(0.0001, 1, 0.001).tolist()}
clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
n_iter=20, n_jobs=-1)
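    # A log-spaced alpha grid, e.g. np.logspace(-4, 0, 50).tolist(), is a common
    # alternative that samples small penalties more densely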
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.best_params_, clf.best_score_)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/select_lasso_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_forest(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a random forest regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
model = RandomForestRegressor()
param_dist = {'max_depth': sp_randint(1, 15),
'min_samples_split': sp_randint(2, 15)}
clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
n_iter=15, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/select_rfr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def _metric_frames(prefix):
    """
    Load the pickled performance metrics for every model and feature-count pair.
    :param prefix: 'select' for models trained on active molecules, 'all' for models trained on all molecules
    :return: dataframes of MAE, MSE, median absolute error, R^2 and explained variance scores
    """
    df_mean_abs = pd.DataFrame()
    df_mean_sq = pd.DataFrame()
    df_median_abs = pd.DataFrame()
    df_r2 = pd.DataFrame()
    df_exp_var_score = pd.DataFrame()
    models = [('lr', 'Linear Regression'), ('nn', 'Neural Network'),
              ('svm', 'Linear SVR'), ('dt', 'Decision Tree'),
              ('rr', 'Ridge Regression'), ('brr', 'Bayesian Ridge Regression'),
              ('lasso', 'Lasso'), ('rfr', 'Random Forest Regression')]
    for n_features in [25, 50, 75, 100, 150, 200, 250, 300]:
        for tag, label in models:
            with open('./trained_networks/%s_%s_%d_data.pkl'
                      % (prefix, tag, n_features), 'rb') as result:
                # Objects must be unpickled in the exact order they were dumped
                clf = pickle.load(result)
                if tag == 'nn':
                    net = pickle.load(result)  # build_nn stores the NeuralNet before the metrics
                mean_abs = pickle.load(result)
                mean_sq = pickle.load(result)
                median_abs = pickle.load(result)
                r2 = pickle.load(result)
                exp_var_score = pickle.load(result)
                if tag == 'brr':
                    ridge_alpha = pickle.load(result)  # build_bayesian_rr stores alpha_ before y_pred
                y_pred = pickle.load(result)
            # set_value grows the frame in place (deprecated in newer pandas; use .loc there)
            df_mean_abs.set_value(label, '%d' % n_features, mean_abs)
            df_mean_sq.set_value(label, '%d' % n_features, mean_sq)
            df_median_abs.set_value(label, '%d' % n_features, median_abs)
            df_r2.set_value(label, '%d' % n_features, r2)
            df_exp_var_score.set_value(label, '%d' % n_features, exp_var_score)
    return df_mean_abs, df_mean_sq, df_median_abs, df_r2, df_exp_var_score
def select_results():
    """Metric dataframes for the models trained on active molecules only."""
    return _metric_frames('select')
def all_results():
    """Metric dataframes for the models trained on all molecules."""
    return _metric_frames('all')
In [ ]:
df = pd.read_csv('https://s3-us-west-2.amazonaws.com/'
'pphilip-usp-inhibition/data/df_preprocessing.csv')
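# The first, unnamed CSV column is the saved pandas index; drop it before use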
df.drop(df.columns[0], axis=1, inplace=True)
In [ ]:
select_df = df.loc[df[TARGET_COLUMN] > 0]
select_df.reset_index(drop=True, inplace=True)
# Copying column names to use after np array manipulation
all_headers = list(select_df.columns.values)
x_headers = list(select_df.columns.values)[:-1]
# Train and test split
df_train, df_test = model_selection.train_test_split(select_df, test_size=0.25)
# Reassign column name and index after randomized split
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
df_train = pd.DataFrame(df_train, columns=all_headers)
df_test = pd.DataFrame(df_test, columns=all_headers)
# Remove the classification column from the dataframe
x_train = df_train.drop(TARGET_COLUMN, axis=1)
x_test = df_test.drop(TARGET_COLUMN, axis=1)
y_train = df_train[TARGET_COLUMN]
y_test = df_test[TARGET_COLUMN]
# Replace NaN and infinite values in the dataframes with zeros
x_train = change_nan_infinite(x_train)
y_train = change_nan_infinite(y_train)
x_test = change_nan_infinite(x_test)
y_test = change_nan_infinite(y_test)
y_train = pd.DataFrame(y_train, columns=[TARGET_COLUMN])
y_test = pd.DataFrame(y_test, columns=[TARGET_COLUMN])
y_train.to_csv('./data/select_y_train_postprocessing.csv')
y_test.to_csv('./data/select_y_test_postprocessing.csv')
# Transform all column values to mean 0 and unit variance
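# Fit the scaler on the training split only, then apply the same transform to the
# test split, so test-set statistics never leak into training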
scaler = preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
y_train = np.array(y_train)
# Feature selection using random forest feature importances
choose_features(x_train, y_train, x_test, x_headers)
In [ ]:
n_features = int(input("Choose the number of features to be used in the model" + "\n" +
"Pick from 25, 50, 75, 100, 150, 200, 250, 300" + "\n"))
x_train = pd.read_csv('./data/select_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/select_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/select_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/select_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
print("Generating models")
run_models(np.array(x_train), np.array(y_train).ravel(), np.array(x_test), np.array(y_test).ravel(), n_features)
In [3]:
df_all_mean_abs, df_all_mean_sq, df_all_median_abs, df_all_r2, df_all_exp_var_score = all_results()
df_select_mean_abs, df_select_mean_sq, df_select_median_abs, df_select_r2, df_select_exp_var_score = select_results()
In [8]:
df_all_mean_abs
Out[8]:
In [9]:
df_all_mean_sq
Out[9]:
In [10]:
df_all_median_abs
Out[10]:
In [11]:
df_all_r2
Out[11]:
In [12]:
df_all_exp_var_score
Out[12]:
In [13]:
df_select_mean_abs
Out[13]:
In [14]:
df_select_mean_sq
Out[14]:
In [15]:
df_select_median_abs
Out[15]:
In [16]:
df_select_r2
Out[16]:
In [17]:
df_select_exp_var_score
Out[17]:
In [31]:
N = 8
select_means = df_select_mean_abs.mean(axis=1).tolist()
select_std = df_select_mean_abs.std(axis=1).tolist()
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_mean_abs.mean(axis=1).tolist()
all_std = df_all_mean_abs.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)
# add some text for labels, title and axes ticks
ax.set_ylabel('Mean absolute error')
ax.set_title('MAE by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/mae.png', bbox_inches='tight')
In [32]:
N = 8
select_means = df_select_mean_sq.mean(axis=1).tolist()
select_std = df_select_mean_sq.std(axis=1).tolist()
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_mean_sq.mean(axis=1).tolist()
all_std = df_all_mean_sq.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)
# add some text for labels, title and axes ticks
ax.set_ylabel('Mean square error')
ax.set_title('MSE by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/mse.png', bbox_inches='tight')
In [33]:
N = 8
select_means = df_select_median_abs.mean(axis=1).tolist()
select_std = df_select_median_abs.std(axis=1).tolist()
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_median_abs.mean(axis=1).tolist()
all_std = df_all_median_abs.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)
# add some text for labels, title and axes ticks
ax.set_ylabel('Median absolute error')
ax.set_title('MedAE by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/medae.png', bbox_inches='tight')
In [34]:
N = 8
select_means = df_select_r2.mean(axis=1).tolist()
select_std = df_select_r2.std(axis=1).tolist()
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_r2.mean(axis=1).tolist()
all_std = df_all_r2.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)
# add some text for labels, title and axes ticks
ax.set_ylabel('R2 score')
ax.set_title('R2 score by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/r2.png', bbox_inches='tight')
In [35]:
N = 8
select_means = df_select_exp_var_score.mean(axis=1).tolist()
select_std = df_select_exp_var_score.std(axis=1).tolist()
ind = np.arange(N) # the x locations for the groups
width = 0.35 # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, select_means, width, color='r', yerr=select_std)
all_means = df_all_exp_var_score.mean(axis=1).tolist()
all_std = df_all_exp_var_score.std(axis=1).tolist()
rects2 = ax.bar(ind + width, all_means, width, color='b', yerr=all_std)
# add some text for labels, title and axes ticks
ax.set_ylabel('Explained variance score')
ax.set_title('EVS by dataset and learning algorithm')
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
plt.savefig('./plots/evs.png', bbox_inches='tight')
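In [ ]:
# The five chart cells above repeat one grouped-bar pattern. A helper like this
# (a hedged sketch, not part of the original pipeline) would collapse them; it
# assumes the metric frames' row order matches the tick labels, as the cells above do.
def plot_metric(df_select, df_all, ylabel, title, filename):
    ind = np.arange(len(df_select.index))  # the x locations for the groups
    width = 0.35  # the width of the bars
    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, df_select.mean(axis=1), width, color='r',
                    yerr=df_select.std(axis=1))
    rects2 = ax.bar(ind + width, df_all.mean(axis=1), width, color='b',
                    yerr=df_all.std(axis=1))
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('LR', 'NN', 'SVR', 'DT', 'RR', 'BRR', 'Lasso', 'RFR'))
    ax.legend((rects1[0], rects2[0]), ('Active molecules', 'All molecules'))
    plt.savefig(filename, bbox_inches='tight')
# Usage, e.g.:
# plot_metric(df_select_mean_abs, df_all_mean_abs, 'Mean absolute error',
#             'MAE by dataset and learning algorithm', './plots/mae.png')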
In [ ]:
n_features = 300
x_train = pd.read_csv('./data/select_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/select_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/select_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/select_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_features(x_train, y_train, x_test, y_test)
In [ ]:
y_train = pd.read_csv('./data/select_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/select_y_test_postprocessing.csv')
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_y_dist(y_train, y_test)
In [ ]:
genalgo.main()