In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import csv
import matplotlib.pylab as plt

class linReg:
    """Single-feature linear regression fitted with batch gradient descent.

    One column of a CSV file is used as the input and one as the target.
    Both columns are divided by their own maximum before training, so the
    fitted parameters refer to the scaled data, not the raw units.
    """

    # Class-level defaults; __init__ overwrites every one of them per instance.
    df = None            # DataFrame returned by pd.read_csv
    input_vars = None    # (m, 2) array: a column of ones, then the scaled input
    output_vars = None   # (m, 1) array: the scaled target
    thetas = None        # (2, 1) array: [intercept, slope], initialised to ones
    alpha = 0.0          # step size applied in `update`

    def __init__(self, fileName, xvar, yvar, alpha):
        """Load the CSV and build the scaled design matrix and target.

        Args:
            fileName: path handed to ``pd.read_csv``.
            xvar: name of the column used as the input feature.
            yvar: name of the column used as the target.
            alpha: step size used by ``update``.
        """
        self.df = pd.read_csv(fileName)
        n_rows = len(self.df[yvar])

        # `Series.as_matrix()` was removed in pandas 1.0; `.values` is available
        # on both old and new versions. Casting to float keeps the divisions
        # below as true divisions even for integer columns.
        y = self.df[yvar].values.astype(float).reshape(n_rows, 1)
        x = self.df[xvar].values.astype(float).reshape(n_rows, 1)

        # Scale each column by its own maximum.
        self.output_vars = y / y.max(0)
        x = x / x.max(0)

        # Prepend a constant x_0 = 1 column so the intercept is handled by the
        # same matrix product as the slope.
        bias = np.ones((n_rows, 1))
        self.input_vars = np.hstack((bias, x))
        self.thetas = np.ones((2, 1))
        self.alpha = alpha

    def plot(self):
        """Scatter-plot the scaled input against the scaled target."""
        plt.scatter(x=self.input_vars[:, 1], y=self.output_vars)
        plt.show()

    def graph(self, formula):
        """Draw the data scatter together with the curve given by `formula`.

        Args:
            formula: a Python expression in the variable ``x`` (the scaled
                input as a 1-D array), for example ``'0.5 + x*2.0'``.
        """
        # The local must stay named `x`: the evaluated expression refers to it.
        x = self.input_vars[:, 1]
        # WARNING: eval() runs arbitrary Python. Only pass formulas built by
        # this class or otherwise trusted; never pass user-supplied text.
        y = eval(formula)
        plt.scatter(x=x, y=self.output_vars)
        plt.plot(x, y)
        plt.show()

    @property
    def grad_iter(self):
        """Predictions ``theta0 * x0 + theta1 * x1`` computed row by row.

        Despite the name this returns the model output for every row, not a
        gradient. It gives the same (m, 1) result as `grad_vec` using an
        explicit Python loop.
        """
        n_rows = len(self.input_vars)
        preds = np.zeros((n_rows, 1))
        for i in range(n_rows):
            row = self.input_vars[i]
            preds[i] = self.thetas[0] * row[0] + self.thetas[1] * row[1]
        return preds

    @property
    def grad_vec(self):
        """Predictions for every row as one matrix product, shape (m, 1)."""
        return np.dot(self.input_vars, self.thetas)

    @property
    def update(self):
        """Perform ONE gradient-descent step and return the new thetas.

        This is a property with a side effect: merely reading
        ``instance.update`` changes ``self.thetas``. It is kept as a property
        so existing callers that rely on attribute access keep working.
        """
        residual = self.output_vars - self.grad_vec
        step = np.dot(self.input_vars.T, residual)
        self.thetas = self.thetas + self.alpha * step
        return self.thetas

    @property
    def cost(self):
        """Half the sum of squared errors, returned as a (1, 1) array."""
        error = self.grad_vec - self.output_vars
        return 0.5 * np.dot(error.T, error)

    def train(self, iterations):
        """Run `iterations` descent steps, then plot the fitted line.

        Args:
            iterations: number of times `update` is applied.
        """
        for _ in range(iterations):
            # Attribute access alone triggers the step (see `update`).
            self.update
        # Use plain Python floats so the formula is full precision and does not
        # depend on numpy's array print options.
        formula = '{!r} + x*{!r}'.format(self.thetas.item(0),
                                         self.thetas.item(1))
        self.graph(formula)

1. Reading the data


In [2]:
# Fit 'Price' as a function of 'Size' from the housing CSV with a small step size.
trainer = linReg(
    fileName='housesRegr.csv',
    xvar='Size',
    yvar='Price',
    alpha=0.0001,
)

2. Plotting the data


In [3]:
# Scatter the max-scaled 'Size' values against the max-scaled 'Price' values.
trainer.plot()



In [ ]: