In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import csv
import matplotlib.pylab as plt
class linReg:
    """Single-feature linear regression fitted with batch gradient descent.

    The model is ``h(x) = thetas[0] + thetas[1] * x``. Both the feature and
    the target column are scaled by their column maximum before training.

    Note that ``grad_iter``, ``grad_vec``, ``update`` and ``cost`` are
    properties: they are evaluated by attribute access (``model.cost``),
    not by calling them. ``update`` additionally mutates ``thetas``.
    """

    # Class-level defaults; __init__ overwrites all of them per instance.
    df = None
    input_vars = None
    output_vars = None
    thetas = None
    alpha = 0.0

    def __init__(self, fileName, xvar, yvar, alpha):
        """Read the CSV file and prepare the design matrix and targets.

        Args:
            fileName: Path (or anything ``pandas.read_csv`` accepts) of the
                CSV file holding the data.
            xvar: Name of the column used as the input feature.
            yvar: Name of the column used as the target.
            alpha: Learning rate applied on every ``update`` step.

        Raises:
            KeyError: If ``xvar`` or ``yvar`` is not a column of the file.
            ValueError: If a selected column cannot be converted to float.
        """
        self.df = pd.read_csv(fileName)

        # ``Series.as_matrix()`` was removed in pandas 1.0; ``.values`` exists
        # in old and new releases. Casting to float keeps the normalisation
        # below a true division regardless of the column's integer dtype.
        y = np.asarray(self.df[yvar].values, dtype=float).reshape(-1, 1)
        x = np.asarray(self.df[xvar].values, dtype=float).reshape(-1, 1)

        # Normalise both columns by their maximum value.
        self.output_vars = y / y.max(0)
        x = x / x.max(0)

        # Prepend a constant x_0 = 1 column so the intercept is handled by
        # the same matrix product as the slope.
        bias = np.ones_like(x)
        self.input_vars = np.hstack((bias, x))

        self.thetas = np.ones((2, 1))
        self.alpha = alpha

    def plot(self):
        """Show a scatter plot of the normalised feature against the target."""
        plt.scatter(x=self.input_vars[:, 1], y=self.output_vars)
        plt.show()

    def graph(self, formula):
        """Show the data as a scatter plot with the curve ``formula`` on top.

        Args:
            formula: A Python expression in the variable ``x`` (the
                normalised feature values), e.g. ``'0.1 + x*0.9'``.
        """
        x = self.input_vars[:, 1]
        # SECURITY NOTE: eval() executes arbitrary Python. Only pass formula
        # strings produced by trusted code, never user-supplied text.
        y = eval(formula)
        plt.scatter(x=x, y=self.output_vars)
        plt.plot(x, y)
        plt.show()

    @property
    def grad_iter(self):
        """Predictions for every sample, computed row by row.

        Loop-based counterpart of ``grad_vec``; returns an ``(n, 1)`` array.
        """
        grads = np.zeros((len(self.input_vars), 1))
        for i, row in enumerate(self.input_vars):
            grads[i] = self.thetas[0] * row[0] + self.thetas[1] * row[1]
        return grads

    @property
    def grad_vec(self):
        """Predictions for every sample as one matrix product, shape ``(n, 1)``."""
        return np.dot(self.input_vars, self.thetas)

    @property
    def update(self):
        """Perform one gradient-descent step and return the new ``thetas``.

        Although exposed as a property, reading it has a side effect:
        ``thetas`` is replaced by ``thetas + alpha * X^T (y - X thetas)``.
        """
        residual = self.output_vars - self.grad_vec
        step = np.dot(self.input_vars.T, residual)
        self.thetas = self.thetas + self.alpha * step
        return self.thetas

    @property
    def cost(self):
        """Half the sum of squared prediction errors, as a ``(1, 1)`` array."""
        error = self.grad_vec - self.output_vars
        return 0.5 * np.dot(error.T, error)

    def train(self, iterations):
        """Run ``iterations`` gradient-descent steps, then plot the fit.

        Args:
            iterations: Number of ``update`` steps to perform.
        """
        for _ in range(iterations):
            # Attribute access alone triggers the step (update is a property).
            self.update

        # Format the coefficients as plain floats at full precision instead
        # of relying on the printed (truncated) form of a NumPy array.
        intercept = float(self.thetas[0, 0])
        slope = float(self.thetas[1, 0])
        formula = repr(intercept) + '+ x*' + repr(slope)
        self.graph(formula)
1. Reading the data
In [2]:
# Fit 'Price' from 'Size' using the rows of housesRegr.csv, learning rate 0.0001.
trainer = linReg('housesRegr.csv', xvar='Size', yvar='Price', alpha=0.0001)
2. Plotting the data
In [3]:
# Scatter-plot the normalised feature column against the normalised target.
trainer.plot()
In [ ]: