This assignment can be done in teams of 2
Student 1: Roan de Jong (10791930)
Student 2: Ghislaine van den Boogerd (student_id)
This notebook provides a template for your programming assignment 2. You may want to use parts of your code from the previous assignment(s) as a starting point for this assignment.
The code you hand in should follow the structure of this document. Write your functions in the cells they belong to. Note that the structure corresponds to the structure of the actual programming assignment; make sure you read that for the full explanation of what is expected of you.
Submission:
One way to be sure your code runs without errors is to quit IPython completely, restart it, and run all cells again (via the menu bar above: Cell > Run All). This way you make sure that no old function definitions or variable values are left over (which your program might still be using).
If you have any questions, ask your teaching assistant. We are here for you.
In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import csv
import matplotlib.pylab as plt
class linReg:
    df = None
    input_vars = None
    output_vars = None
    thetas = None
    alpha = 0.0

    # read the csv file and store the formatted data on the instance
    def __init__(self, fileName, alpha):
        self.df = pd.read_csv(fileName, header=None)
        length_col = len(self.df[self.df.columns[-1]])
        x = self.df[self.df.columns[0:-1]].values
        y = self.df[self.df.columns[-1]].values.reshape(length_col, 1)
        # normalize the output values
        self.output_vars = y / y.max(0)
        # add a fake x_0 column of ones to make the matrix multiplications possible
        theta_0 = np.ones((length_col, 1))
        self.input_vars = np.hstack((theta_0, x))
        # one theta per input column (including the bias column x_0)
        self.thetas = np.ones((self.input_vars.shape[1], 1))
        self.alpha = alpha

    # the hypothesis vector X * theta for all training examples
    @property
    def grad_vec(self):
        return np.dot(self.input_vars, self.thetas)

    # one batch gradient descent step: theta := theta + alpha * X^T (y - X theta)
    @property
    def update(self):
        x = self.output_vars - self.grad_vec
        y = np.dot(self.input_vars.T, x)
        self.thetas = self.thetas + self.alpha * y
        return self.thetas

    # squared-error cost: 0.5 * (X theta - y)^T (X theta - y)
    @property
    def cost(self):
        summation = self.grad_vec - self.output_vars
        return 0.5 * np.dot(summation.T, summation)

    # run gradient descent for the given number of iterations and print the final cost
    def train(self, iterations):
        for i in range(iterations):
            self.update
        print(self.cost)
1) Reading in data
In [4]:
if __name__ == '__main__':
    trainer = linReg('housesRegr.csv', 0.0000000000001)
2) Gradient function
In [5]:
if __name__ == '__main__':
    trainer = linReg('housesRegr.csv', 0.0000000000001)
    print(trainer.grad_vec)
3) Parameter updating
In [6]:
if __name__ == '__main__':
    trainer = linReg('housesRegr.csv', 0.0000000000001)
    print(trainer.update)
4) Cost function
In [7]:
if __name__ == '__main__':
    trainer = linReg('housesRegr.csv', 0.0000000000001)
    print(trainer.cost)
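As a quick sanity check (not part of the template), the update rule theta := theta + alpha * X^T(y - X theta) and the cost 0.5 * (X theta - y)^T (X theta - y) can be tried on a tiny synthetic dataset where the answer is known; all values below are made up purely for illustration.
In [ ]:
# A minimal sketch, assuming nothing beyond numpy: on the made-up data y = 2 * x
# the fitted thetas should approach [[0], [2]] and the cost should approach 0.
X_toy = np.hstack((np.ones((4, 1)), np.array([[1.0], [2.0], [3.0], [4.0]])))
y_toy = np.array([[2.0], [4.0], [6.0], [8.0]])
theta_toy = np.ones((2, 1))
for _ in range(5000):
    theta_toy = theta_toy + 0.01 * np.dot(X_toy.T, y_toy - np.dot(X_toy, theta_toy))
residual = np.dot(X_toy, theta_toy) - y_toy
print(theta_toy)
print(0.5 * np.dot(residual.T, residual))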
5) Optimization learning rate and iterations
In [ ]:
if __name__ == '__main__':
    # the optimized learning rate
    trainer = linReg('housesRegr.csv', 0.0000000000001)
    trainer.train(1000000)
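One possible way (not required by the template) to choose the learning rate is to record the cost after every update and plot the curves for a few candidate values of alpha; the helper cost_history and the alpha values below are our own additions, not part of the assignment.
In [ ]:
# A sketch, assuming matplotlib.pylab is imported as plt (as above): compare
# how fast the cost drops for a few candidate learning rates.
def cost_history(fileName, alpha, iterations):
    model = linReg(fileName, alpha)
    history = []
    for _ in range(iterations):
        model.update
        history.append(model.cost[0, 0])
    return history

if __name__ == '__main__':
    for alpha in (1e-13, 1e-12, 1e-11):
        plt.plot(cost_history('housesRegr.csv', alpha, 200), label='alpha = %g' % alpha)
    plt.xlabel('iteration')
    plt.ylabel('cost')
    plt.legend()
    plt.show()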
Discussion:
[Your discussion comes here]
In [2]:
from __future__ import division
import numpy as np
import pandas as pd
import csv
import math
class logReg:
    df = None
    input_vars = None
    classifying_vars = None
    thetas = None
    alpha = 0.0

    # read the csv file, scale the feature columns and add the bias column x_0
    def __init__(self, fileName, alpha):
        self.df = pd.read_csv(fileName, header=None)
        length_col = len(self.df[self.df.columns[-1]])
        self.classifying_vars = self.df[self.df.columns[-1]].values\
            .reshape(length_col, 1)
        x = self.df[self.df.columns[0:-1]].values
        # this is the column for x_0
        temp_arr = np.ones((1, len(x.T[0])))
        # scale every feature column to the range [0, 1]
        for column in x.T:
            if column.max(0) > 0:
                column = column / column.max(0)
            temp_arr = np.vstack((temp_arr, column))
        self.input_vars = temp_arr.T
        self.thetas = np.full((len(self.input_vars[0]), 1), 0.5)
        self.alpha = alpha

    # the hypothesis vector: the sigmoid of X * theta for all training examples
    @property
    def gradient(self):
        theta_x = np.dot(self.input_vars, self.thetas)
        return 1 / (1 + np.exp(-theta_x))

    # update the thetas as described in the lecture notes
    def update(self, classifier):
        # work on a 0/1 copy of the labels so the originals are not overwritten
        output_vars = np.where(self.classifying_vars == classifier, 1, 0)
        x = self.gradient - output_vars
        y = np.dot(self.input_vars.T, x)
        self.thetas = self.thetas - self.alpha * y
        return self.thetas

    # calculate the (negative log-likelihood) cost for one classifier
    def cost(self, classifier):
        h_x = self.gradient
        cost = 0.0
        for training_example in zip(h_x, self.classifying_vars):
            if training_example[1] == classifier:
                cost = cost + math.log(training_example[0])
            else:
                cost = cost + math.log(1 - training_example[0])
        cost = -(1 / len(self.classifying_vars)) * cost
        return cost

    # train the model for the given classifier and number of iterations
    def train(self, classifier, iterations):
        for i in range(0, iterations):
            self.update(classifier)
        print(self.cost(classifier))
1) Reading the data
In [3]:
if __name__ == '__main__':
    trainer = logReg('digits123.csv', 0.0001)
2) Gradient calculation and parameter updating
In [4]:
if __name__ == '__main__':
    trainer = logReg('digits123.csv', 0.0001)
    print(trainer.update(1))
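As a quick sanity check (not part of the template), the logistic update theta := theta - alpha * X^T(sigmoid(X theta) - y) can be tried on a tiny, made-up binary dataset; the values below are purely for illustration.
In [ ]:
# A minimal sketch, assuming nothing beyond numpy: on this separable toy data
# the rounded hypothesis values should recover the labels [[0], [0], [1], [1]].
X_toy = np.hstack((np.ones((4, 1)), np.array([[0.0], [1.0], [2.0], [3.0]])))
y_toy = np.array([[0.0], [0.0], [1.0], [1.0]])
theta_toy = np.full((2, 1), 0.5)
for _ in range(2000):
    h_toy = 1 / (1 + np.exp(-np.dot(X_toy, theta_toy)))
    theta_toy = theta_toy - 0.1 * np.dot(X_toy.T, h_toy - y_toy)
print(np.round(1 / (1 + np.exp(-np.dot(X_toy, theta_toy)))))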
3) Cost function
In [7]:
if __name__ == '__main__':
    trainer = logReg('digits123.csv', 0.0001)
    trainer.train(3, 100)
4) Pairwise comparison of classes
In [8]:
if __name__ == '__main__':
    trainer = logReg('digits123.csv', 0.0001)
    trainer.train(1, 100)
    trainer.train(2, 100)
    trainer.train(3, 100)
    # the costs are quite similar right now, but the cost does seem to be lowest for '3'
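One possible way (not required by the template) to turn the three per-class models into an actual classifier is one-vs-all: train one set of thetas per digit and predict, for every example, the class whose hypothesis value is highest. The helper predict_digit below is our own addition, not part of the assignment.
In [ ]:
# A sketch, assuming the logReg class defined above: one-vs-all prediction.
def predict_digit(fileName, classes, alpha, iterations):
    models = {}
    for c in classes:
        model = logReg(fileName, alpha)
        for _ in range(iterations):
            model.update(c)
        models[c] = model
    # one column of hypothesis values per class; pick the class with the largest value
    scores = np.hstack([models[c].gradient for c in classes])
    return np.array(classes)[scores.argmax(axis=1)]

if __name__ == '__main__':
    predictions = predict_digit('digits123.csv', [1, 2, 3], 0.0001, 100)
    print(predictions[:10])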
5) Optimization learning rate and iterations
In [ ]:
if __name__ == '__main__':
    trainer = logReg('digits123.csv', 0.000000000001)
    trainer.train(1, 1000)
Discussion:
[Your discussion comes here]