This page briefly reviews the regression metrics available in scikit-learn. Each metric is first computed by hand with NumPy and then with the corresponding higher-level function from sklearn.metrics.
In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
import matplotlib.pyplot as plt
%matplotlib inline
#Generate data
regression_data, regression_values = make_regression(n_samples=100, n_features=1, n_informative=1, noise=10)
#Set X, y_true (shift into quadrant 1 so all values are positive; needed for the log-based MSLE metric later)
X = regression_data[:, 0].reshape(-1, 1) + 200
y_true = regression_values.reshape(-1, 1) + 200
#Fit data
lr_model = LinearRegression()
lr_model.fit(X, y_true)
#Make predictions
y_pred = lr_model.predict(X)
#Plot data
plt.style.use('seaborn')  #renamed to 'seaborn-v0_8' in newer matplotlib releases
plt.scatter(X, y_true)
plt.plot(X, y_pred, 'g-');
In [2]:
from sklearn.metrics import (mean_squared_error, mean_squared_log_error,
                             mean_absolute_error, median_absolute_error,
                             explained_variance_score, r2_score)
import numpy as np
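The mean squared error (MSE) is the average of the squared residuals, $\text{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$. Because the errors are squared, large errors dominate the total, making MSE sensitive to outliers.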
In [3]:
MSE = np.mean((y_true - y_pred)**2)
#or use sklearn
MSE_sklearn = mean_squared_error(y_true, y_pred)
#compare with a tolerance rather than exact float equality
if np.isclose(MSE, MSE_sklearn):
    print("Mean squared error: {}".format(MSE))
In [4]:
RMSE = np.sqrt(MSE)
#no sklearn function available as of v0.19.0
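#later scikit-learn releases add mean_squared_error(..., squared=False), and 1.4+ adds root_mean_squared_error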
print("Root mean squared error: {}".format(RMSE))
In [5]:
residuals_sum_of_squares = np.sum((y_true - y_pred)**2)
total_sum_of_squares = np.sum((y_true - np.mean(y_true))**2)
r2 = 1 - residuals_sum_of_squares/total_sum_of_squares
#Sklearn convenience method
r2_sklearn = r2_score(y_true, y_pred)
if np.isclose(r2, r2_sklearn):
    print("R^2 Score: {}".format(r2))
In [6]:
y_error = y_true - y_pred
numerator = np.sum((y_error - np.mean(y_error))**2)
explained_var = 1 - numerator/total_sum_of_squares
#sklearn convenience method
explained_var_sklearn = explained_variance_score(y_true, y_pred)
if np.isclose(explained_var, explained_var_sklearn):
    print("Explained variance score: {}".format(explained_var))
The mean absolute error (MAE) is the mean of the absolute differences between the predicted and true values of y. Because it is a mean, it is still affected by outliers, though less severely than MSE since the errors are not squared. One advantage is that it is easily interpretable: its units are the same as y's.
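In symbols: $\text{MAE} = \frac{1}{n}\sum_{i=1}^{n}\lvert y_i - \hat{y}_i\rvert$.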
In [7]:
MAE = np.mean(np.abs(y_true - y_pred))
#or use sklearn
MAE_sklearn = mean_absolute_error(y_true, y_pred)
if np.isclose(MAE, MAE_sklearn):
    print("MAE: {}".format(MAE))
In [8]:
MedAE = np.median(np.abs(y_true - y_pred))
#or use sklearn
MedAE_sklearn = median_absolute_error(y_true, y_pred)
if np.isclose(MedAE, MedAE_sklearn):
    print("MedAE: {}".format(MedAE))
In [9]:
#np.log1p(x) computes log(1 + x), matching sklearn's log1p-based implementation
MSLE = np.mean((np.log1p(y_true) - np.log1p(y_pred))**2)
#or use sklearn
MSLE_sklearn = mean_squared_log_error(y_true, y_pred)
if np.isclose(MSLE, MSLE_sklearn):
    print("Mean squared log error: {}".format(MSLE))