In [16]:
# TO DO
# - Nothing for the moment
In [ ]:
import os
# Locate the data directory in an OS-independent way.
# The data directory is one level up relative to this notebook's directory.
# Using os.sep keeps the path comparison valid on both Windows and POSIX,
# since os.path.normpath produces platform-specific separators.
curr_dir = os.path.normpath(os.getcwd())
untracked_suffix = os.sep + os.path.join('Untracked', 'Notebooks')
if untracked_suffix in curr_dir:
    data_dir = curr_dir.replace(untracked_suffix, os.sep + 'Data')
else:
    # Notebook is tracked in the repo
    data_dir = curr_dir.replace(os.sep + 'Notebooks', os.sep + 'Data')
#print(data_dir)
In [17]:
# PENALTY FUNCTIONS - SOME EXAMPLES
# multiplier is a positive number that determines the slope (or curvature)

# Linear penalty function
def linearPenalty(x, multiplier=1):
    return x * multiplier

# Flipped/inverse linear penalty function
def invLinearPenalty(x, multiplier=1):
    return -x * multiplier

# Linear for negative x, zero for positive x
def leftLinearPenalty(x, multiplier=1):
    if x < 0: return -x * multiplier
    else: return 0

# Linear for positive x, zero for negative x
def rightLinearPenalty(x, multiplier=1):
    if x < 0: return 0
    else: return x * multiplier

# V-shaped penalty
def VPenalty(x, multiplier=1):
    if x < 0: return -x * multiplier
    else: return x * multiplier

# Inverted-V-shaped penalty
def invertedVPenalty(x, multiplier=1):
    if x < 0: return x * multiplier
    else: return -x * multiplier

# Positive parabola penalty
def squaredPenalty(x, multiplier=1):
    return (x**2) * multiplier

# Inverted parabola penalty
def invertedSquaredPenalty(x, multiplier=1):
    return -(x**2) * multiplier

# Non-linear (cubic polynomial) penalty
def nonLinearPenalty(x, multiplier=1):
    return (x + x**2 + x**3) * multiplier
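In [ ]:
# Quick sanity check (not part of the original notebook): evaluate a few of the
# penalty functions at sample error values to confirm their shapes behave as
# the comments above describe. The chosen inputs are arbitrary.
for fn in (linearPenalty, VPenalty, squaredPenalty, nonLinearPenalty):
    print(fn.__name__, [fn(x, multiplier=2) for x in (-2, 0, 2)])
# Expected: VPenalty is symmetric (4, 0, 4) and squaredPenalty is (8, 0, 8)
# with multiplier=2, while linearPenalty changes sign across zero.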
In [18]:
penaltyFunctions = {linearPenalty: "Linear Penalty",
                    invLinearPenalty: "Inverse Linear Penalty",
                    leftLinearPenalty: "Left-Linear Penalty",
                    rightLinearPenalty: "Right-Linear Penalty",
                    VPenalty: "V Penalty",
                    invertedVPenalty: "Inverted-V Penalty",
                    squaredPenalty: "Squared Penalty",
                    invertedSquaredPenalty: "Inverted Squared Penalty",
                    nonLinearPenalty: "Non-Linear Penalty"
                    }
In [19]:
# Given a list of error values, plot the penalty function
# Error = Predicted value - Actual value. The error is always along a single
# dimension because the output is always a single column in a dataset.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns

# Plot a given penalty function over the range spanned by a list of error values
def penaltyPlot(errorList, penaltyFunction):
    # Set up the x-axis
    num_points = 200
    x = np.linspace(min(errorList), max(errorList), num_points)
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.set(xlabel='Predicted Value - Actual Value')
    ax.set(ylabel='Penalty')
    ax.axvline(x=0, color='black')
    ax.axhline(y=0, color='black')
    ax.set(title=penaltyFunctions[penaltyFunction])
    ax.plot(x, list(map(penaltyFunction, x)))
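In [ ]:
# Usage sketch (not in the original): plot every registered penalty function
# over a symmetric error range. The range [-3, 3] is an assumption chosen
# purely for illustration; penaltyPlot only uses the min and max of the list.
sample_errors = [-3, 3]
for fn in penaltyFunctions:
    penaltyPlot(sample_errors, fn)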
In [20]:
# Load up the packages to investigate the data
import numpy as np
import pandas as pd
In [21]:
# Add a column of ones as the first column of a dataframe's values
# and turn the result into a matrix
def df_addOnes(dataFrame):
    vals = dataFrame.values
    feature_matrix = np.c_[np.ones(len(dataFrame)), vals]
    return feature_matrix
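In [ ]:
# Quick check (not in the original): df_addOnes on a tiny two-feature frame
# should prepend a column of ones, giving a 3 x 3 array here. The frame and
# its values are arbitrary.
demo_df = pd.DataFrame({'x1': [1.0, 2.0, 3.0], 'x2': [4.0, 5.0, 6.0]})
print(df_addOnes(demo_df))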
In [22]:
# Making it easy to calculate the total penalty over the entire dataset
def penalty(df_features, df_output, parameter_value_list, penalty_function):
    # df_features is a dataframe of the features (no column of ones added)
    # df_output is a dataframe of the output column (target variable)
    # parameter_value_list is a list of w0, w1, ..., wn where n is the number
    # of features, i.e., the number of columns in df_features
    # penalty_function is the cost of being wrong, calculated over the entire dataset
    #### Turn the function inputs into matrices ####
    # Add a first column of 1s to X to enable the matrix multiplication:
    # X becomes an m x (n+1) matrix and the parameters an (n+1) x 1 matrix
    feature_matrix = df_addOnes(df_features)
    # reshape so the output is always an m x 1 column
    output_matrix = np.matrix(df_output.values).reshape(-1, 1)
    parameter_matrix = np.matrix(parameter_value_list).T
    #print(feature_matrix.shape, parameter_matrix.shape, output_matrix.shape)
    # Difference between the predicted and the actual value
    error = (feature_matrix * parameter_matrix) - output_matrix
    #print(error.shape)
    # Apply the penalty function to the error from each row of the dataset;
    # penaltyPerOutput holds one penalty per row for this particular value of W
    penaltyPerOutput = list(map(penalty_function, np.ravel(error)))
    # totalPenalty is the sum of the penalties over all rows of the dataset
    totalPenalty = np.sum(penaltyPerOutput)
    # Normalize: the penalty of getting it wrong is 1/2m of totalPenalty,
    # where m is the number of rows in df_features
    totalPenaltyNorm = totalPenalty / (2 * len(df_features))
    return totalPenaltyNorm
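In [ ]:
# Illustrative check (not in the original): on a dataset that exactly follows
# y = 1 + 2x, the squared penalty at the true parameters [1, 2] should be 0,
# and it should grow for any other parameter values. Data is synthetic.
demo_X = pd.DataFrame({'x': [0.0, 1.0, 2.0, 3.0]})
demo_y = pd.DataFrame({'y': [1.0, 3.0, 5.0, 7.0]})
print(penalty(demo_X, demo_y, [1.0, 2.0], squaredPenalty))   # expect 0.0
print(penalty(demo_X, demo_y, [0.0, 0.0], squaredPenalty))   # expect 10.5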
In [24]:
# Implement gradient descent
# **NOTE: The update rule below is the derivative of the squared penalty,
# so this implementation is ONLY correct for the squaredPenalty function**
def gradientDescent(df_features,
                    df_output,
                    init_params_list,
                    num_iterations=100,
                    learning_rate=0.0001,
                    penalty_function=squaredPenalty):
    # df_features is a dataframe with the features
    # df_output is a dataframe of the output column
    # init_params_list is the list of initial W values, e.g., [-1.0, 3.53]
    # num_iterations is the number of steps taken as the algorithm descends the penalty surface
    # learning_rate is the multiplier that determines step size (smaller = smaller steps)
    # penalty_function is the penalty function applied to the machine learning problem
    # Get the inputs into matrix form so we can use matrix multiplication (more efficient)
    feature_matrix = df_addOnes(df_features)
    m = len(feature_matrix)  # number of rows of data
    # reshape so the output is always an m x 1 column
    output_matrix = np.matrix(df_output.values).reshape(-1, 1)
    parameter_matrix = np.matrix(init_params_list).T
    # Initial value of the parameters in matrix form
    w = parameter_matrix
    # Set up arrays to capture the running results
    running_w = np.array(parameter_matrix)
    # No dtype needed for the other arrays; they hold single values
    running_error = np.zeros(num_iterations)
    running_normError = np.zeros(num_iterations)
    running_penalty = np.zeros(num_iterations)
    # Iterate over the dataset num_iterations times, adjusting the parameters each time
    for i in range(num_iterations):
        # Compute every gradient component from the SAME w, then apply all
        # updates at once (a simultaneous update; updating w[j] in place would
        # let later components see a partially-updated w)
        prediction_error = (feature_matrix * w) - output_matrix
        w_new = np.matrix(np.zeros(w.shape))
        for j in range(len(parameter_matrix)):
            gradient_j = prediction_error.T * np.matrix(feature_matrix[:, j]).T
            w_new[j] = w[j] - (learning_rate / m) * gradient_j
        w = w_new
        # Record w, error, normalized error and penalty after each iteration
        running_w = np.append(running_w, w, axis=0)
        running_error[i] = np.sum((feature_matrix * w) - output_matrix)
        running_normError[i] = (learning_rate / m) * running_error[i]
        running_penalty[i] = penalty_function(running_error[i])
    # w now holds the parameter values after num_iterations
    # Group running_w into tuples of one full parameter vector per iteration
    # From https://jasonstitt.com/python-group-iterator-list-function
    running_w = list(zip(*[iter(running_w)] * len(parameter_matrix)))
    # Error and penalty after num_iterations
    final_error = np.sum((feature_matrix * w) - output_matrix)
    final_penalty = penalty_function(final_error)
    return w, final_penalty, running_w, running_penalty
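In [ ]:
# Usage sketch (not in the original): run gradient descent on a small synthetic
# dataset generated from y = 1 + 2x. The learning rate and iteration count are
# assumptions chosen so the run converges, not values from the original notebook.
demo_X = pd.DataFrame({'x': np.arange(0.0, 10.0)})
demo_y = pd.DataFrame({'y': 1.0 + 2.0 * demo_X['x']})
w, final_penalty, running_w, running_penalty = gradientDescent(
    demo_X, demo_y, [0.0, 0.0], num_iterations=5000, learning_rate=0.01)
print(w)              # expect values close to [[1.], [2.]]
print(final_penalty)  # expect a value close to 0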
In [ ]: