Linear Regression


In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [27]:
# Import libraries
from __future__ import absolute_import, division, print_function

# Ignore warnings
import warnings
#warnings.filterwarnings('ignore')

import sys
sys.path.append('/Users/omojumiller/mycode/tools')

import numpy as np
import pandas as pd
import scipy.stats as st
from tools import plot_features_by_target


# Use cPickle if available (Python 2); otherwise fall back to pickle
try:
    import cPickle as pickle
except ImportError:
    import pickle

# Graphing Libraries
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style("whitegrid")  

# Configure for presentation
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
mpl.rc('font', size=16)

from IPython.display import display

In [186]:
import re

def get_col_types(row):
    """Classify each column of a row as numeric or string.

    Returns the counts of numeric and string columns along with
    the column indices of each kind.
    """
    num_index = []
    str_index = []
    for i, dp in enumerate(row):
        # Raw string avoids invalid-escape warnings; matches ints,
        # floats, and scientific notation
        if re.findall(r"[-+]?\d+\.?\d*[eE]?[-+]?\d*", dp):
            num_index.append(i)
        else:
            str_index.append(i)

    return dict(num_of_numerics=len(num_index), num_index=num_index,
         num_of_str=len(str_index), str_index=str_index)
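
A quick sanity check of get_col_types on a made-up row (hypothetical values, not drawn from the dataset):

In [ ]:
# One string column and two numeric columns
get_col_types(['Harrison Ford', '4871.7', '41'])
# expected: {'num_of_numerics': 2, 'num_index': [1, 2],
#            'num_of_str': 1, 'str_index': [0]}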

In [302]:
import csv

# Load a CSV file
def load_csv(fname):
    """
    Read a csv file into a numeric array and a list of string columns.

    Parameters:
    fname: string (filename)

    Returns:
    (X, X_str): numeric columns as a numpy array, string columns as a
    list of lists, with column order given by get_col_types.
    """
    num_lines = sum(1 for line in open(fname)) - 1   # exclude the header row

    with open(fname) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        header = next(csv_reader)                    # skip the header
        first_row = next(csv_reader)
        my_dtypes = get_col_types(first_row)

        X = np.zeros(shape=(num_lines, my_dtypes['num_of_numerics']))
        X_str = []

        # fill the table, starting with the row used to infer the types
        for index, row in enumerate([first_row] + list(csv_reader)):
            X[index] = [float(row[i]) for i in my_dtypes['num_index']]
            X_str.append([row[i] for i in my_dtypes['str_index']])

    return (X, X_str)

In [303]:
fname = 'data/actors.csv'

In [304]:
X, X_str = load_csv(fname)

In [305]:
# Numeric columns, in order: TotalGross, NumberOfMovies, AveragePerMovie, #1 movie Gross
X


Out[305]:
array([[ 4871.7,    41. ,   118.8,   936.7],
       [ 4772.8,    69. ,    69.2,   623.4],
       [ 4468.3,    61. ,    73.3,   534.9],
       ..., 
       [ 2462.6,    35. ,    70.4,   336. ],
       [ 2457.8,    23. ,   106.9,   623.4],
       [ 2416.5,    25. ,    96.7,   448.1]])

In [224]:
df = pd.read_csv('data/actors.csv')

In [225]:
df.columns = ['Actor', 'TotalGross', 'NumberOfMovies', 'AveragePerMovie', 'Num1Movie', 'Gross']
df.head()


Out[225]:
                Actor  TotalGross  NumberOfMovies  AveragePerMovie                     Num1Movie  Gross
0       Harrison Ford      4871.7              41            118.8  Star Wars: The Force Awakens  936.7
1   Samuel L. Jackson      4772.8              69             69.2                  The Avengers  623.4
2      Morgan Freeman      4468.3              61             73.3               The Dark Knight  534.9
3           Tom Hanks      4340.8              44             98.7                   Toy Story 3  415.0
4  Robert Downey, Jr.      3947.3              53             74.5                  The Avengers  623.4

In [226]:
x_vars = ['NumberOfMovies', 'AveragePerMovie', 'Gross']
y_vars=['TotalGross']

In [227]:
plot_features_by_target(df, x_vars, y_vars)



In [230]:
# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(df[x_vars], alpha=0.3, figsize=(8, 4), diagonal='kde');



In [59]:
sns.jointplot(x='NumberOfMovies', y='TotalGross', data=df, kind='reg');



In [278]:
class LinearRegression():
    """Linear model for doing regression.
    Parameters:
    -----------
    n_iterations: float
        The number of training iterations over which the algorithm tunes the weights.
    learning_rate: float
        The step length that will be used when updating the weights.
    gradient_descent: boolean
        True if gradient descent should be used when training. If
        false, the weights are found by batch least squares.
    """
    def __init__(self, n_iterations=100, learning_rate=0.001, gradient_descent=False):
        self.w = None
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.gradient_descent = gradient_descent    # Opt. method. If False => Least squares

    def fit(self, X, y):
        # Insert constant ones as first column (for bias weights)
        X = np.insert(X, 0, 1, axis=1)
        # Get weights by gradient descent opt.
        if self.gradient_descent:
            n_features = np.shape(X)[1]
            # Initial weights randomly [0, 1]
            self.w = np.random.random((n_features, ))
            # Do gradient descent for n_iterations
            for _ in range(self.n_iterations):
                # Gradient of squared loss w.r.t the weights
                w_gradient = X.T.dot(X.dot(self.w) - y)
                # Move against the gradient to minimize loss
                self.w -= self.learning_rate * w_gradient
        # Get weights by least squares (via the SVD pseudoinverse)
        else:
            # np.linalg.svd returns V already transposed, hence V.T below
            U, S, V = np.linalg.svd(X.T.dot(X))
            S = np.diag(S)
            X_sq_inv = V.T.dot(np.linalg.pinv(S)).dot(U.T)
            self.w = X_sq_inv.dot(X.T).dot(y)

        return self.w

    def predict(self, X):
        # Insert constant ones for bias weights
        X = np.insert(X, 0, 1, axis=1)
        y_pred = X.dot(self.w)
        return y_pred
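
A note restating what the two fitting paths above do: with a bias column prepended to $X$, the least-squares branch computes the closed-form solution $w = (X^T X)^{+} X^T y$ via the SVD pseudoinverse, while the gradient-descent branch iterates $w \leftarrow w - \eta\, X^T (X w - y)$, the gradient of the squared loss.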

In [306]:
num_train = int(len(X) * .75)

In [335]:
# Column 0 of X is TotalGross (the target); column 1 is NumberOfMovies (the feature)
X_train, y_train = X[:, [1]][:num_train], X[:, [0]][:num_train]
X_test, y_test = X[:, [1]][num_train:], X[:, [0]][num_train:]
XX = np.insert(X[:, [1]], 0, 1, axis=1)   # design matrix: bias column + NumberOfMovies

In [336]:
clf = LinearRegression()
reg_weights = clf.fit(X_train, y_train)

In [337]:
y_preds = clf.predict(X_test)
error = y_test - y_preds   # residuals on the hold-out set

In [338]:
pyplt.plot(error);
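
To put a single number on the residuals plotted above, a minimal sketch (using the error array already in scope) is the root-mean-square error:

In [ ]:
# RMSE of the hold-out residuals
rmse = np.sqrt(np.mean(error ** 2))
print(rmse)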



In [344]:
# Regression line: bias + slope * NumberOfMovies for every actor
reg_line = XX.dot(reg_weights)

pyplt.plot(reg_line);
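
For context, the fitted line can be overlaid on the raw scatter; a minimal sketch, assuming the cells above have run:

In [ ]:
# NumberOfMovies vs TotalGross, with the fitted line drawn in x-sorted order
order = np.argsort(X[:, 1])
pyplt.scatter(X[:, 1], X[:, 0], alpha=0.3)
pyplt.plot(X[order, 1], reg_line[order], color='r');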


