In [1]:
import pandas as pd
import numpy as np
import pickle
import lasagne
import sklearn
import sklearn.metrics
import sklearn.preprocessing
from sklearn import model_selection
from lasagne.layers import DenseLayer
from lasagne.layers import InputLayer
from nolearn.lasagne import NeuralNet
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, BayesianRidge, Lasso
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
try:
import urllib.request as urllib2
except ImportError:
import urllib2
import random
import plots  # local module from this project, used for the plots near the end
import genalgo  # local module from this project, used in the final cell
pd.set_option('precision', 3)
TARGET_COLUMN = 'Activity_Score'
In [2]:
def choose_features(x_train, y_train, x_test, column_names):
    """
    Selecting the features of high importance to reduce the feature space,
    saving the top-n train and test subsets to CSV for each n.
    :param x_train: Training set of features.
    :param y_train: Training target values.
    :param x_test: Test set of features.
    :param column_names: Names of columns in x.
    """
    # Rank features by random forest importance
    clf = RandomForestRegressor(n_jobs=-1, random_state=1, n_estimators=20, max_depth=10)
    # Fixed random_state so that the ranking is reproducible
    clf.fit(x_train, y_train)
    feature_importance = clf.feature_importances_
    scores_table = pd.DataFrame({'feature': column_names,
                                 'scores': feature_importance}).sort_values(by=['scores'], ascending=False)
    scores = scores_table['scores'].tolist()
    ranked_features = scores_table['feature'].tolist()
    # Wrap the arrays in dataframes once so that columns can be selected by name
    x_train = pd.DataFrame(x_train, columns=column_names)
    x_test = pd.DataFrame(x_test, columns=column_names)
    n_features = [25, 50, 75, 100, 150, 200, 250, 300]
    for n in n_features:
        selected_features = ranked_features[:n]
        desired_x_train = x_train[selected_features]
        desired_x_test = x_test[selected_features]
        desired_x_train.to_csv('./data/all_x_train_postprocessing_rfr_%d.csv' % n)
        desired_x_test.to_csv('./data/all_x_test_postprocessing_rfr_%d.csv' % n)
    pd.DataFrame(scores).to_csv('./data/all_feature_scores_rfr.csv')
    return
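# A minimal sketch of the ranking choose_features relies on (synthetic data,
# illustrative names only): feature_importances_ sums to 1, and sorting by it
# gives the order in which columns are kept.
#
#   rng = np.random.RandomState(0)
#   demo_x = rng.rand(100, 3)
#   demo_y = 2 * demo_x[:, 0] + 0.1 * rng.rand(100)
#   demo_clf = RandomForestRegressor(n_estimators=10, random_state=1).fit(demo_x, demo_y)
#   pd.DataFrame({'feature': ['f0', 'f1', 'f2'],
#                 'scores': demo_clf.feature_importances_}
#                ).sort_values(by='scores', ascending=False)  # 'f0' ranks first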
def change_nan_infinite(dataframe):
"""
    Replacing NaN and infinite values in the dataframe with zeros.
    :param dataframe: Dataframe that may contain NaN and infinite values.
:return data: Data with no NaN or infinite values.
"""
dataframe.replace([np.inf, -np.inf], np.nan, inplace=True)
data = dataframe.fillna(0)
return data
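# A quick sanity check for change_nan_infinite (hypothetical toy frame):
#
#   change_nan_infinite(pd.DataFrame({'a': [1.0, np.inf], 'b': [-np.inf, np.nan]}))
#   # returns columns a = [1.0, 0.0] and b = [0.0, 0.0]
#
# Note the replace step mutates the input frame in place before fillna
# returns the zero-filled copy.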
def run_models(x_train, y_train, x_test, y_test, n_features):
    """
    Prompting for a model choice and training the selected model.
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: number of features, used to name the output files
    :return: None
    """
    model_choice = int(input("Type your choice of model to be run:" + "\n" +
                             "1 for Linear Regression" + "\n" +
                             "2 for Neural Network" + "\n" +
                             "3 for Support Vector Machine" + "\n" +
                             "4 for Decision Tree" + "\n" +
                             "5 for Ridge Regression" + "\n" +
                             "6 for Bayesian Ridge Regression" + "\n" +
                             "7 for Lasso" + "\n" +
                             "8 for Random Forest Regressor" + "\n"
                             ))
if model_choice == 1:
build_linear(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 2:
build_nn(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 3:
build_svm(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 4:
build_tree(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 5:
build_ridge(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 6:
build_bayesian_rr(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 7:
build_lasso(x_train, y_train, x_test, y_test, n_features)
elif model_choice == 8:
build_forest(x_train, y_train, x_test, y_test, n_features)
else:
print("Please choose from list of available models only")
return
def build_linear(x_train, y_train, x_test, y_test, n_features):
"""
    Constructing a linear regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = LinearRegression(n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_lr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
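# pickle streams are read back in exactly the order they were written, so a
# results file saved above can be unpacked like this (a sketch, assuming the
# 25-feature linear model has already been trained):
#
#   with open('./trained_networks/all_lr_25_data.pkl', 'rb') as f:
#       clf = pickle.load(f)        # fitted estimator first
#       mean_abs = pickle.load(f)   # then the five metrics, in dump order
#       mean_sq = pickle.load(f)
#       median_abs = pickle.load(f)
#       r2 = pickle.load(f)
#       exp_var_score = pickle.load(f)
#       y_pred = pickle.load(f)     # predictions last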
def build_nn(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a regression neural network model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
    net = NeuralNet(layers=[('input', InputLayer),
                            ('hidden0', DenseLayer),
                            ('hidden1', DenseLayer),
                            ('output', DenseLayer)],
                    input_shape=(None, x_train.shape[1]),  # Number of input nodes = number of columns in x
                    hidden0_num_units=15,
                    hidden0_nonlinearity=lasagne.nonlinearities.rectify,
                    hidden1_num_units=17,
                    hidden1_nonlinearity=lasagne.nonlinearities.rectify,
                    output_num_units=1,  # Single output node for the regression target
                    # Regression needs a linear output; softmax over a single
                    # output unit is constant 1 and would learn nothing
                    output_nonlinearity=lasagne.nonlinearities.linear,
                    max_epochs=100,
                    update_learning_rate=0.01,
                    regression=True,
                    verbose=1)
    # Searching for the best hidden-layer sizes for the neural network
    param_dist = {'hidden0_num_units': sp_randint(3, 30), 'hidden1_num_units': sp_randint(3, 30)}
clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist,
n_iter=15, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_nn_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(net, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
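# sp_randint gives RandomizedSearchCV a distribution to draw integer layer
# sizes from; each of the n_iter=15 candidates samples both parameters.
# A sketch of what one set of draws looks like:
#
#   sp_randint(3, 30).rvs(size=5, random_state=0)  # five integers in [3, 30)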
def build_svm(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a support vector regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = LinearSVR(random_state=1, dual=False, epsilon=0,
loss='squared_epsilon_insensitive')
    # Fixed random_state for reproducible results
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_svm_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_tree(x_train, y_train, x_test, y_test, n_features):
"""
    Constructing a decision tree regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
model = DecisionTreeRegressor()
param_dist = {'max_depth': sp_randint(1, 15),
'min_samples_split': sp_randint(2, 15)}
clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
n_iter=15, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(clf.best_params_, clf.best_score_)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_dt_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_ridge(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a ridge regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = Ridge()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_rr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a Bayesian ridge regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
clf = BayesianRidge()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_brr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
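# Unlike the other linear models here, BayesianRidge can also report
# per-sample predictive uncertainty; a sketch if that were wanted:
#
#   y_pred, y_std = clf.predict(x_test, return_std=True)
#   # y_std is the standard deviation of the predictive distribution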
def build_lasso(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a Lasso linear model with cross validation from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
    model = Lasso(random_state=1)
    # Fixed random_state for reproducible coordinate descent
    param_dist = {'alpha': np.arange(0.0001, 1, 0.001).tolist()}
clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
n_iter=20, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_lasso_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
return
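# The alpha grid above is a discrete list; an equivalent continuous choice
# for RandomizedSearchCV would be a scipy distribution (a sketch, not used
# here):
#
#   from scipy.stats import uniform
#   param_dist = {'alpha': uniform(loc=0.0001, scale=1)}  # alpha in [0.0001, 1.0001)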
def build_forest(x_train, y_train, x_test, y_test, n_features):
"""
Constructing a random forest regression model from input dataframe
:param x_train: features dataframe for model training
:param y_train: target dataframe for model training
:param x_test: features dataframe for model testing
:param y_test: target dataframe for model testing
:return: None
"""
model = RandomForestRegressor()
param_dist = {'max_depth': sp_randint(1, 15),
'min_samples_split': sp_randint(2, 15)}
clf = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
n_iter=15, n_jobs=-1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# Mean absolute error regression loss
mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
# Mean squared error regression loss
mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
# Median absolute error regression loss
median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
# R^2 (coefficient of determination) regression score function
r2 = sklearn.metrics.r2_score(y_test, y_pred)
# Explained variance regression score function
exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
with open('./trained_networks/all_rfr_%d_data.pkl' % n_features, 'wb') as results:
pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
print(r2)
return
def results():
    """
    Collecting the pickled error metrics of every trained model into summary
    dataframes, one row per model and one column per feature count.
    :return: dataframes of mean absolute error, mean squared error,
             median absolute error, r2 score and explained variance score
    """
    df_mean_abs = pd.DataFrame()
    df_mean_sq = pd.DataFrame()
    df_median_abs = pd.DataFrame()
    df_r2 = pd.DataFrame()
    df_exp_var_score = pd.DataFrame()
    # (file tag, row label) pairs matching the pickle names written above
    models = [('lr', 'Linear Regression'),
              ('nn', 'Neural Network'),
              ('svm', 'Linear SVR'),
              ('dt', 'Decision Tree'),
              ('rr', 'Ridge Regression'),
              ('brr', 'Bayesian Ridge Regression'),
              ('lasso', 'Lasso'),
              ('rfr', 'Random Forest Regression')]
    n_features_list = [25, 50, 75, 100, 150, 200, 250, 300]
    for n_features in n_features_list:
        for tag, label in models:
            with open('./trained_networks/all_%s_%d_data.pkl' % (tag, n_features), 'rb') as result:
                clf = pickle.load(result)
                if tag == 'nn':
                    net = pickle.load(result)  # the neural network pickle also stores the bare net
                mean_abs = pickle.load(result)
                mean_sq = pickle.load(result)
                median_abs = pickle.load(result)
                r2 = pickle.load(result)
                exp_var_score = pickle.load(result)
                y_pred = pickle.load(result)
            # .loc assignment with enlargement replaces the deprecated DataFrame.set_value
            column = '%d' % n_features
            df_mean_abs.loc[label, column] = mean_abs
            df_mean_sq.loc[label, column] = mean_sq
            df_median_abs.loc[label, column] = median_abs
            df_r2.loc[label, column] = r2
            df_exp_var_score.loc[label, column] = exp_var_score
    return df_mean_abs, df_mean_sq, df_median_abs, df_r2, df_exp_var_score
In [ ]:
df = pd.read_csv('https://s3-us-west-2.amazonaws.com/'
'pphilip-usp-inhibition/data/df_preprocessing.csv')
df.drop(df.columns[0], axis=1, inplace=True)
# Copying column names to use after np array manipulation
all_headers = list(df.columns.values)
x_headers = list(df.columns.values)[:-1]
# Train and test split
df_train, df_test = model_selection.train_test_split(df, test_size=0.25)
# Reassign column names and index after the randomized split
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
df_train = pd.DataFrame(df_train, columns=all_headers)
df_test = pd.DataFrame(df_test, columns=all_headers)
# Remove the target column from the dataframe
x_train = df_train.drop(TARGET_COLUMN, axis=1)
x_test = df_test.drop(TARGET_COLUMN, axis=1)
y_train = df_train[TARGET_COLUMN]
y_test = df_test[TARGET_COLUMN]
# Checking dataframe for NaN and infinite values
x_train = change_nan_infinite(x_train)
y_train = change_nan_infinite(y_train)
x_test = change_nan_infinite(x_test)
y_test = change_nan_infinite(y_test)
y_train = pd.DataFrame(y_train, columns=[TARGET_COLUMN])
y_test = pd.DataFrame(y_test, columns=[TARGET_COLUMN])
y_train.to_csv('./data/all_y_train_postprocessing.csv')
y_test.to_csv('./data/all_y_test_postprocessing.csv')
# Transform all column values to mean 0 and unit variance
scaler = sklearn.preprocessing.StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
y_train = np.array(y_train).ravel()  # 1-D target array avoids a shape warning during fitting
# Feature selection by random forest importance
choose_features(x_train, y_train, x_test, x_headers)
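The scaler fitted on the training set is needed again whenever new samples must be scored on the same scale, so it can be persisted next to the other artifacts. A minimal sketch, assuming the ./data directory used above (the file name all_scaler.pkl is illustrative, not part of the project):
In [ ]:
# Persist the fitted scaler so later sessions can reuse it
with open('./data/all_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f, pickle.HIGHEST_PROTOCOL)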
In [ ]:
n_features = int(input("Choose the number of features to be used in the model" + "\n" +
"Pick from 25, 50, 75, 100, 150, 200, 250, 300" + "\n"))
x_train = pd.read_csv('./data/all_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/all_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/all_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/all_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
print("Generating models")
run_models(np.array(x_train), np.array(y_train).ravel(), np.array(x_test), np.array(y_test).ravel(), n_features)
In [3]:
df_mean_abs, df_mean_sq, df_median_abs, df_r2, df_exp_var_score = results()
In [4]:
df_mean_abs
Out[4]:
In [5]:
df_mean_sq
Out[5]:
In [6]:
df_median_abs
Out[6]:
In [7]:
df_r2
Out[7]:
In [8]:
df_exp_var_score
Out[8]:
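To see at a glance which model performs best at each feature count, the score tables above can be reduced directly; a small sketch using only the df_r2 computed above:
In [ ]:
# Model with the highest r2 score for each feature-set size
df_r2.idxmax()
# ...and the best score itself
df_r2.max()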
In [ ]:
n_features = 300
x_train = pd.read_csv('./data/all_x_train_postprocessing_rfr_%d.csv' % n_features)
x_test = pd.read_csv('./data/all_x_test_postprocessing_rfr_%d.csv' % n_features)
y_train = pd.read_csv('./data/all_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/all_y_test_postprocessing.csv')
x_train.drop(x_train.columns[0], axis=1, inplace=True)
x_test.drop(x_test.columns[0], axis=1, inplace=True)
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_features(x_train, y_train, x_test, y_test)
In [ ]:
y_train = pd.read_csv('./data/all_y_train_postprocessing.csv')
y_test = pd.read_csv('./data/all_y_test_postprocessing.csv')
y_train.drop(y_train.columns[0], axis=1, inplace=True)
y_test.drop(y_test.columns[0], axis=1, inplace=True)
plots.plot_y_dist(y_train, y_test)
In [ ]:
genalgo.main()