In [2]:
from __future__ import division
import numpy as np
import itertools
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate


/Users/jsubapple/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [3]:
# Symbolic partial differentiation -- a quick sympy example
from sympy import symbols, diff
x, y, z = symbols('x y z', real=True)
# d/dx of (x^3 + 4xy + y*z^8) should be 3x^2 + 4y
f = x**3 + 4*x*y + y*z**8
diff(f, x)


Out[3]:
3*x**2 + 4*y

In [4]:
theta0, theta1, theta2, theta3, theta4, x0, x1 = symbols('theta0, theta1, theta2, theta3, theta4, x0, x1', real=True)
# A hypothesis with polynomial and fractional-power terms in x1;
# differentiating w.r.t. theta4 isolates its feature, x1**2.5
g = theta0*x0 + theta1*x1 + theta2*x1**2 + theta3*x1**3 + theta4*x1**2.5
g.diff(theta4)


Out[4]:
x1**2.5

In [5]:
# Take a data set and split it into training, cross-validation, and test data sets
def splitDataSet(data, trainRatio, valRatio):
    """Randomly shuffle `data` and split it into train / validation / test sets.

    data       : sequence of [x, y] rows (anything np.array accepts)
    trainRatio : fraction of rows used for training
    valRatio   : fraction of rows used for cross-validation
                 (test set gets the remaining rows)

    Returns [X_train, y_train, X_val, y_val, X_test, y_test], each an
    (n, 1)-shaped array; column 0 of `data` is X, column 1 is y.
    NOTE - sklearn's train_test_split does something similar but not quite.
    """
    dataArr = np.array(data)       # copy, so the caller's data is not shuffled
    num_rows = dataArr.shape[0]
    np.random.shuffle(dataArr)     # shuffles rows in place

    # Row counts for each split, in the requested proportions
    num_train_rows = int(num_rows * trainRatio)
    num_val_rows = int(num_rows * valRatio)
    num_test_rows = num_rows - (num_train_rows + num_val_rows)

    train_end = num_train_rows
    val_end = num_train_rows + num_val_rows

    # Training Data
    X_train = dataArr[:train_end, 0].reshape((num_train_rows, 1))
    y_train = dataArr[:train_end, 1].reshape((num_train_rows, 1))

    # Cross Validation Data
    X_val = dataArr[train_end:val_end, 0].reshape((num_val_rows, 1))
    y_val = dataArr[train_end:val_end, 1].reshape((num_val_rows, 1))

    # Test Data
    # BUG FIX: previously this sliced dataArr[:num_test_rows], i.e. rows from
    # the START of the shuffled array -- those rows were already in the
    # training set, and the tail rows were never used at all.  The test set
    # must be the tail that follows the validation rows.
    X_test = dataArr[val_end:, 0].reshape((num_test_rows, 1))
    y_test = dataArr[val_end:, 1].reshape((num_test_rows, 1))

    return [X_train, y_train, X_val, y_val, X_test, y_test]

In [6]:
def computeCost(X, y, theta):
    """Squared-error cost J(theta) over the whole data set.

    Assumes X (an m x n np.matrix) already carries a leading column of ones,
    theta is n x 1 and y is m x 1, so `X * theta` is a matrix product.

    Returns sum((X*theta - y)^2) / (2m) -- the conventional 1/(2m)
    normalization used with gradient descent.
    """
    # m x 1 residual for every sample at this theta
    residuals = (X * theta) - y

    # element-wise square, then total over all samples
    squared = np.multiply(residuals, residuals)
    total = np.sum(squared)

    # normalize by 2m
    return total / (2 * len(X))

In [7]:
# Implement Gradient Descent
def gradientDescent(X, y, theta, alpha, iters):
    """Batch gradient descent for linear regression.

    X     : m x n np.matrix -- MUST already have a column of 1s added
    y     : m x 1 np.matrix of targets
    theta : n x 1 np.matrix of initial parameters (not mutated)
    alpha : learning rate
    iters : number of iterations to run

    Returns (theta, cost): the fitted n x 1 parameter matrix and an array of
    length `iters` holding the cost after each iteration.

    FIX: the original looped over parameters j and wrote updates into a
    shared `temp` matrix, then did `theta = temp` -- after the first
    iteration `theta` and `temp` aliased the same object, so the update
    silently relied on each element being written exactly once.  The
    per-parameter sums  sum(error * X[:, j])  are exactly the rows of
    X.T * error, so the whole update is one vectorized step that builds a
    fresh theta each iteration.
    """
    m = len(X)
    cost = np.zeros(iters)

    for i in range(iters):
        # m x 1 prediction error for the current theta
        error = (X * theta) - y

        # n x 1 gradient: X.T * error == [sum(error * X[:, j]) for each j]
        theta = theta - (alpha / m) * (X.T * error)

        # track cost after the update so convergence can be inspected
        cost[i] = computeCost(X, y, theta)

    return theta, cost

In [8]:
from matplotlib.colors import ListedColormap

In [9]:
# Visualize the decision boundary
# Adapted from Sebastian Raschka
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    """Plot a classifier's decision regions over a 2-feature data set.

    X          : (m, 2) array of samples (two feature columns)
    y          : (m,) array of class labels
    classifier : any fitted object with a .predict(X) method
    test_idx   : unused (kept for interface compatibility)
    resolution : grid step for the decision surface

    Returns the pyplot module so the caller can keep customizing / show().
    """
    # set up marker generator and color map (one color/marker per class)
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # grid covering the data range, padded by 1 on every side
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # classify every grid point, then reshape back to the grid for contourf
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)

    # FIX: the original created (fig, ax) and then ignored `ax`, driving the
    # pyplot state machine instead -- use the Axes object consistently.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    ax.set_xlim(xx1.min(), xx1.max())
    ax.set_ylim(xx2.min(), xx2.max())
    ax.set_xlabel('Exam 1 Score')
    ax.set_ylabel('Exam 2 Score')
    ax.set_title('Classifying the Data')

    # Plot all samples, one scatter per class
    for idx, cl in enumerate(np.unique(y)):
        # wrap the single color in a list so matplotlib treats it as a
        # color spec rather than a scalar mapped value (deprecation warning)
        ax.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                   c=[cmap(idx)], marker=markers[idx], label=cl)

    return plt

In [10]:
# Define the sigmoid function or transformation
def sigmoid(z):
    """Logistic sigmoid 1 / (1 + e^(-z)); works on scalars and ndarrays."""
    neg_exp = np.exp(-z)
    return 1.0 / (1.0 + neg_exp)