In [1]:
# CoEPrA Example
import os
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
In [3]:
source_dataset_path = os.path.join("book_code", "Section 4", "CoEPrA.csv")
with open(source_dataset_path) as raw_data:
    data = np.loadtxt(raw_data, delimiter=",")
print("---> Data Shape: {}".format(data.shape))
In [5]:
# Separate the independent variables (the first 5,787 columns) from the dependent variable
X = data[:, 0:5787]
y = data[:, 5787]
In [7]:
# Split data into train and test
print("---> Data Split for training and testing")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\tX_training shape\t'{}'".format(X_train.shape))
print("\tX_test shape\t\t'{}'".format(X_test.shape))
print("\ty_training shape\t'{}'".format(y_train.shape))
print("\ty_test shape\t\t'{}'".format(y_test.shape))
In [8]:
# First attempt: plain linear regression without regularization
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
Out[8]:
In [9]:
# Make predictions using the training set and calculate the mean squared error
y_train_pred = regr.predict(X_train)
print("---> Mean squared error on the training data: {:0.2f}".format(mean_squared_error(y_train, y_train_pred)))
In [10]:
# A training error this low on 5,787 features strongly suggests overfitting
# Let's check with a k-fold cross-validation
# Note: 'neg_mean_squared_error' returns negated MSE values, so flip the sign
scores = cross_val_score(regr, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print("---> Running a k-fold cross validation")
print("\tMSE per fold: '{}'".format(-scores))
print("\tMean MSE: '{:.2f}'".format(-np.mean(scores)))
In [11]:
# Now let's make predictions on the test set
y_test_pred = regr.predict(X_test)
print("---> Mean squared error on the test data: {:0.2f}".format(mean_squared_error(y_test, y_test_pred)))
In [12]:
# The mean squared error on the test data is very high
# Let's try again, this time with L1 (Lasso) regularization
regr_lasso = linear_model.Lasso(alpha=0.3, max_iter=1000000)
regr_lasso.fit(X_train, y_train)
Out[12]:
In [13]:
print("---> Lasso L1 model")
print("\tSample weights: '{}'".format(regr_lasso.coef_[:20]))
In [16]:
lasso_nonzero_coef_indexes = np.nonzero(regr_lasso.coef_)
print("\tIndices of all nonzero coefficients:\n'{}'".format(lasso_nonzero_coef_indexes))
In [17]:
# Create a new feature matrix keeping only the features with a nonzero Lasso coefficient
X_train_filter = X_train[:, lasso_nonzero_coef_indexes[0]]
print("\tLasso nonzero filtered feature matrix shape: '{}'".format(X_train_filter.shape))
In [18]:
# Make predictions on the training set
y_lasso_train_pred = regr_lasso.predict(X_train)
print("\tLasso - Mean squared error on the training data: {:.2f}".format(mean_squared_error(y_train, y_lasso_train_pred)))
In [20]:
# Make predictions on the test set
y_lasso_test_pred = regr_lasso.predict(X_test)
print("\tLasso - Mean squared error on the test data: {:.2f}".format(mean_squared_error(y_test, y_lasso_test_pred)))
In [21]:
# The training error is very small while the test error remains high,
# so the Lasso model still looks like it is overfitting
# Confirm with a k-fold cross-validation (negated MSE, so flip the sign)
scores_lasso = cross_val_score(regr_lasso, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print("\tLasso - Mean cross-validation MSE: {:.2f}".format(-np.mean(scores_lasso)))
In [22]:
# Now let's try L2 (Ridge) regularization
regr_ridge = linear_model.Ridge(alpha=0.8, max_iter=1000000)
regr_ridge.fit(X_train, y_train)
y_ridge_train_pred = regr_ridge.predict(X_train)
print("---> L2 Ridge model")
print("\tMean squared error on the training data: {:.2f}".format(mean_squared_error(y_train, y_ridge_train_pred)))
In [23]:
# Cross-validation score (again negated MSE, so flip the sign)
scores_ridge = cross_val_score(regr_ridge, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print("\tRidge - Mean cross-validation MSE: {:.2f}".format(-np.mean(scores_ridge)))
In [24]:
# Mean squared error on the test data
y_ridge_test_pred = regr_ridge.predict(X_test)
print("\tRidge - Mean squared error on the test data: {:.2f}".format(mean_squared_error(y_test, y_ridge_test_pred)))
In [ ]:
# Ridge is not improving much over Lasso here
# The book repeats this analysis on the filtered training and test data
# (the Lasso-selected features); a sketch of that step follows
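In [ ]:
# A minimal sketch of that step (the book's exact model choice is not shown
# here, so this assumes plain least squares on the Lasso-selected features)
X_test_filter = X_test[:, lasso_nonzero_coef_indexes[0]]
regr_filter = linear_model.LinearRegression()
regr_filter.fit(X_train_filter, y_train)
y_filter_test_pred = regr_filter.predict(X_test_filter)
print("\tFiltered features - Mean squared error on the test data: {:.2f}".format(mean_squared_error(y_test, y_filter_test_pred)))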