In [1]:
# CoEPrA example (Comparative Evaluation of Prediction Algorithms)
import os
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [3]:
source_dataset_path = os.path.join("book_code", "Section 4", "CoEPrA.csv")
with open(source_dataset_path) as raw_data:
    data = np.loadtxt(raw_data, delimiter=",")
print("---> Data Shape: {}".format(data.shape))


---> Data Shape: (89, 5788)
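
In [ ]:
# Not from the book: a quick sanity check on the loaded matrix, assuming
# `data` from the cell above. loadtxt returns floats, so np.isnan applies.
print("---> Any NaNs: {}".format(np.isnan(data).any()))
print("---> Target range: [{:.2f}, {:.2f}]".format(data[:, -1].min(), data[:, -1].max()))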

In [5]:
# Separate independent and dependent variables
X = data[:,0:5787]
y = data[:,5787]

In [7]:
# Split data into train and test
print("---> Data Split for training and testing")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\tX_training shape\t'{}'".format(X_train.shape))
print("\tX_test shape\t\t'{}'".format(X_test.shape))
print("\ty_training shape\t'{}'".format(y_train.shape))
print("\ty_test shape\t\t'{}'".format(y_test.shape))


---> Data Split for training and testing
	X_training shape	'(71, 5787)'
	X_test shape		'(18, 5787)'
	y_training shape	'(71,)'
	y_test shape		'(18,)'
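
In [ ]:
# Not from the book: the L1/L2 penalties used later are sensitive to feature
# scale, so standardization is a common preprocessing step. A minimal sketch
# (left unused below, since the notebook follows the book and fits on raw features):
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)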

In [8]:
# Trying the linear regression approach without regularization
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)


/Users/mbernal/clasificador/private_data/src/code_repos/machine_learning/ml-playground/python/lib/python3.7/site-packages/sklearn/linear_model/base.py:509: RuntimeWarning: internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.
  linalg.lstsq(X, y)
Out[8]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [9]:
# Make predictions using the training set and calculate the mean squared error
y_train_pred = regr.predict(X_train)
print("---> Mean squared error on the training data: {:0.2f}".format(mean_squared_error(y_train, y_train_pred)))


---> Mean squared error on the training data: 0.08

In [10]:
# A training MSE this low, with 5787 features and only 71 training samples, suggests overfitting
# Let's run a k-fold cross validation to check
scores = cross_val_score(regr, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
print("---> Running a k-fold cross validation")
print("\tScores sample '{}'".format(scores[:10]))
print("\tMean score: '{}'".format(np.mean(scores)))


---> Running a k-fold cross validation
	Scores per fold: '[-1.58031398e+24 -6.28451759e+23 -5.89220228e+23 -1.03515026e+23
 -6.16618077e+23]'
	Mean score: '-7.036238136104907e+23'
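
In [ ]:
# Not from the book: scikit-learn negates the MSE so that "greater is better"
# holds for every scorer. A minimal sketch, reusing `scores` from the cell
# above, to express the folds as RMSEs on the target's own scale:
rmse_per_fold = np.sqrt(-scores)
print("\tRMSE per fold: '{}'".format(rmse_per_fold))
print("\tMean RMSE: '{:.2e}'".format(np.mean(rmse_per_fold)))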

In [11]:
# Now let's try to make predictions using the testing set
y_testing_pred = regr.predict(X_test)
print("---> Mean squared error on the test data: {:0.2f}".format(mean_squared_error(y_test, y_testing_pred)))


---> Mean squared error on the test data: 3583363366497778925568000.00

In [12]:
# The mean squared error on the test data is astronomically high: with far more
# features (5787) than training samples (71), unregularized least squares fits the noise
# Let's try again using L1 (Lasso) regularization
regr_lasso = linear_model.Lasso(alpha=0.3, max_iter=1000000)
regr_lasso.fit(X_train, y_train)


Out[12]:
Lasso(alpha=0.3, copy_X=True, fit_intercept=True, max_iter=1000000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
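
In [ ]:
# Not from the book: alpha=0.3 above is an ad-hoc choice. A minimal sketch
# using scikit-learn's LassoCV to pick alpha by cross-validation instead
# (this can be slow on 5787 features):
from sklearn.linear_model import LassoCV
regr_lasso_cv = LassoCV(cv=5, max_iter=1000000)
regr_lasso_cv.fit(X_train, y_train)
print("\tLassoCV selected alpha: {:.4f}".format(regr_lasso_cv.alpha_))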

In [13]:
print("---> Lasso L1 model")
print("\tSample weights: '{}'".format(regr_lasso.coef_[:20]))


---> Lasso L1 model
	Sample weights: '[-0.  0. -0.  0.  0.  0. -0.  0. -0.  0.  0. -0.  0. -0. -0.  0. -0. -0.
  0. -0.]'

In [16]:
lasso_nonzero_coef_indexes = np.nonzero(regr_lasso.coef_)
print("\tIndex of all non-zero coefficients:\n'{}'".format(lasso_nonzero_coef_indexes))


	Index of all non-zero coefficients:
'(array([  64,  136,  445,  451,  653,  715,  760,  787,  858, 1236, 1358,
       1422, 1430, 1732, 1737, 1874, 1879, 2065, 2247, 2374, 2380, 2581,
       2644, 2689, 2708, 2890, 3224, 3351, 3666, 3931, 3994, 4002, 4221,
       4303, 4510, 4573, 4574, 4637, 4645, 4819, 4952, 5153, 5154, 5280,
       5589, 5595, 5648, 5732]),)'
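
In [ ]:
# Not from the book: the L1 penalty drives most weights to exactly zero,
# acting as implicit feature selection. Counting the survivors:
print("\tNon-zero coefficients: {} of {}".format(
    np.count_nonzero(regr_lasso.coef_), regr_lasso.coef_.size))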

In [17]:
# Create a new feature matrix keeping only the features with a nonzero Lasso coefficient
X_train_filter = X_train[:, lasso_nonzero_coef_indexes[0]]
print("\tLasso nonzero filtered feature matrix shape: '{}'".format(X_train_filter.shape))


	Lasso nonzero filtered feature matrix shape: '(71, 48)'

In [18]:
# Make predictions using the training set
y_lasso_train_pred = regr_lasso.predict(X_train)
print("\tMean squared error Lasso on the training dataset: {:.2f}".format(mean_squared_error(y_train, y_lasso_train_pred)))


	Lasso - Mean squared error on the training data: 0.05

In [20]:
# Make predictions on the testing dataset
y_lasso_test_pred = regr_lasso.predict(X_test)
print("\tMean squared error Lasson on the test dataset: {:.2f}".format(mean_squared_error(y_test, y_lasso_test_pred)))


	Lasso - Mean squared error on the test data: 0.69

In [21]:
# The training error is much lower than the test error, which again points to some overfitting
# K-fold cross validation
scores_lasso = cross_val_score(regr_lasso, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print("\tLasso - Mean cross validation score: {:.2f}".format(np.mean(scores_lasso)))


	Lasso - Mean cross validation score: -1.16

In [22]:
# Let's try L2 Ridge Regularization now
regr_ridge = linear_model.Ridge(alpha=0.8, max_iter=1000000)
regr_ridge.fit(X_train, y_train)
y_ridge_pred = regr_ridge.predict(X_train)
print("---> L2 Ridge model")
print("\tMean squared error on train data: {:.2f}".format(mean_squared_error(y_train, y_ridge_pred)))


---> L2 Ridge model
	Mean squared error on train data: 0.00

In [23]:
# Cross validation score
scores_ridge = cross_val_score(regr_ridge, X_train, y_train, scoring="neg_mean_squared_error", cv=5)
print("\tRidge - Cross validation mean score {:.2f}".format(np.mean(scores_ridge)))


	Ridge - Cross validation mean score -2.52
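
In [ ]:
# Not from the book: as with Lasso, alpha=0.8 is an ad-hoc choice. A minimal
# sketch using RidgeCV to search a log-spaced alpha grid by cross-validation:
from sklearn.linear_model import RidgeCV
regr_ridge_cv = RidgeCV(alphas=np.logspace(-3, 3, 13))
regr_ridge_cv.fit(X_train, y_train)
print("\tRidgeCV selected alpha: {:.3f}".format(regr_ridge_cv.alpha_))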

In [24]:
# Mean squared error on test data
y_ridge_pred_test = regr_ridge.predict(X_test)
print("\tRidge - Mean squared error on test data: {:.2f}".format(mean_squared_error(y_test, y_ridge_pred_test)))


	Ridge - Mean squared error on test data: 2.46

In [ ]:
# Ridge is not improving things much: its cross-validation and test errors are worse than Lasso's
# The book repeats this analysis on the filtered training and test data (the Lasso-selected features); a sketch follows below
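
In [ ]:
# Not from the book's listing, but a minimal sketch of that step, assuming the
# Lasso-selected columns from above: refit a plain linear regression on the 48
# surviving features and evaluate it on the correspondingly filtered test set.
X_test_filter = X_test[:, lasso_nonzero_coef_indexes[0]]
regr_filtered = linear_model.LinearRegression()
regr_filtered.fit(X_train_filter, y_train)
y_filtered_test_pred = regr_filtered.predict(X_test_filter)
print("\tFiltered model - Mean squared error on test data: {:.2f}".format(
    mean_squared_error(y_test, y_filtered_test_pred)))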