In [1]:
import matplotlib.pyplot as plt
import numpy as np
import gzip as gz
from math import log, sqrt, pi, exp
import Queue
import sys
In [2]:
def pca(data, k_features):
    '''name: pca
    description: projects the data onto the leading k_features eigenvectors of
        its covariance matrix (principal components). The covariance is
        computed with np.cov, which centers the data internally; the
        eigenvectors are taken from the SVD of the symmetric covariance
        matrix, so they are ordered by descending eigenvalue.
    dependencies: numpy
    inputs: data - N x K matrix with rows representing observations and
            columns representing features
        k_features - integer number of principal components to keep,
            with 0 < k_features <= K
    outputs: reduced_data - N x k_features matrix; the original data object
        when k_features == K; None (after printing an error) otherwise
    '''
    # nothing to reduce when every feature is kept
    if k_features == data.shape[1]:
        return data
    # check 0 < k_features < number of features
    if 0 < k_features < data.shape[1]:
        # covariance matrix of the features (np.cov centers the data)
        sigma = np.cov(data.T)
        # for a symmetric PSD matrix the left singular vectors are the
        # eigenvectors, returned in descending order of eigenvalue
        eigen_vecs, _, _ = np.linalg.svd(sigma)
        # project all observations at once: (N x K) dot (K x k) -> N x k
        # (replaces a per-row Python loop with a single matrix product)
        return np.dot(data, eigen_vecs[:, :k_features])
    # invalid k_features: report and fall through, returning None
    print('ERROR: 0 < k_features <= %i' % data.shape[1])
In [3]:
def get_cases(data, cases):
    '''name: get_cases
    description: filters an N x K matrix down to the N' x K matrix of rows
        whose label (column 0) matches one of the specified cases
    dependencies: numpy
    inputs: data - N x K matrix of data with labels in column 0
        cases - tuple of labels to be kept
    outputs: N' x K matrix containing only the matching rows
    '''
    # accumulate a boolean row mask, one label at a time
    keep_mask = np.zeros(data.shape[0], dtype=bool)
    for label in cases:
        keep_mask = np.logical_or(keep_mask, data[:, 0] == label)
    # select only the rows whose label matched some case
    return data[keep_mask]
In [4]:
def clean_zip_code_data(filename, cases, num_features):
    '''name: clean_zip_code_data
    description: opens a gzipped file of labeled zip-code digit data, keeps
        only the rows matching the two given cases, converts the labels to
        binary (cases[0] -> 0, otherwise 1) and reduces the feature count
        with PCA
    dependencies: get_cases, pca
    inputs: filename - name of the gzipped data file
        cases - a tuple of two digits
        num_features - number of PCA features to keep
    outputs: N x (num_features + 1) matrix with binary labels in column zero
    '''
    # load the raw zip-code matrix from the gzipped file
    with gz.open(filename) as handle:
        raw = np.loadtxt(handle)
    # keep only the rows belonging to the requested digit pair
    kept = get_cases(raw, cases)
    # binarize the label column: first case -> 0, second case -> 1
    labels = np.where(kept[:, 0][:, np.newaxis] == cases[0], 0, 1)
    # reduce the feature columns with PCA
    features = pca(kept[:, 1:], num_features)
    # reattach the binary labels as column zero
    return np.hstack((labels, features))
In [5]:
class node(object):
    '''One node of a random-projection decision tree.

    A node holds a random unit vector onto which observations are projected,
    plus the mean/std of those projections for each of the two classes so a
    1-D gaussian likelihood can be evaluated per class, the node's training
    score, and links to its children.
    '''
    def __init__(self):
        # K x 1 random unit vector set by set_randomVec
        self.randomVector = None
        # gaussian parameters of the class-0 ("positive") projections
        self.mean_pos = None
        self.std_pos = None
        # gaussian parameters of the class-1 ("negative") projections
        self.mean_neg = None
        self.std_neg = None
        # fraction of this node's training rows classified correctly
        self.score = None
        # child receiving rows predicted 0 / rows predicted 1
        self.left = None
        self.right = None
        # score of the parent node (used as a stopping criterion)
        self.parent_score = None
    def set_randomVec(self, size):
        '''name: set_randomVec
        description: creates a random unit vector and stores it as a
            size x 1 column vector in self.randomVector
        dependencies: numpy
        inputs: size - the number of features in the data
        outputs: None
        '''
        # draw a random vector and normalize it to unit length
        x = np.random.rand(size)
        self.randomVector = (x / np.linalg.norm(x))[:, np.newaxis]
    def set_gaussian_variables(self, data):
        '''name: set_gaussian_variables
        description: fits a 1-D gaussian (mean, std) to the projected values
            in column 1 for each class label in column 0; a zero std is
            floored at 0.0001 so later density evaluations never divide by 0
        dependencies: numpy
        inputs: data - N x 2 matrix: column 0 is the 0/1 label,
            column 1 is the projected value
        outputs: None
        '''
        # split the projections by class once, instead of re-masking per stat
        pos_vals = data[data[:, 0] == 0][:, 1]
        neg_vals = data[data[:, 0] == 1][:, 1]
        # positive-class gaussian (std computed once, floored at 0.0001)
        self.mean_pos = np.mean(pos_vals)
        std_pos = np.std(pos_vals)
        self.std_pos = std_pos if std_pos != 0 else 0.0001
        # negative-class gaussian
        self.mean_neg = np.mean(neg_vals)
        std_neg = np.std(neg_vals)
        self.std_neg = std_neg if std_neg != 0 else 0.0001
    def set_score(self, predicted_labels, actual_labels):
        '''name: set_score
        description: stores the fraction of labels predicted correctly
            (labels are 0/1, so |predicted - actual| counts the errors)
        dependencies: numpy
        inputs: predicted_labels - 1-D array of 0/1 predictions
            actual_labels - 1-D array of 0/1 true labels
        outputs: None
        '''
        # float() guarantees true division even for int arrays under Python 2
        self.score = 1 - (np.sum(np.absolute(np.subtract(predicted_labels, actual_labels))) / float(actual_labels.shape[0]))
In [6]:
class random_forest(object):
    '''Random forest of breadth-first random-projection trees.

    Each tree node projects observations onto a random unit vector and models
    the projected values of each class with a 1-D gaussian; a row is sent to
    the left child when the class-0 gaussian is more likely and to the right
    child otherwise. Depends on the `node` class and the `Queue` module
    imported at the top of the file (Python 2 stdlib; renamed `queue` in
    Python 3).
    '''
    def __init__(self, num_trees=11, stoping_condition=5):
        # number of trees grown by train()
        self.num_trees = num_trees
        # list of root `node` objects, one per tree
        self.trees = []
        # a child is only expanded when it receives more than this many rows
        self.stoping_condition = stoping_condition
        # 0/1 prediction per observation, filled in by predict()
        self.predicitions = None
    def __create_tree(self, data):
        '''name: __create_tree
        description: grows one tree breadth-first. Every node draws a random
            unit vector, projects its rows onto it, fits a gaussian per class
            to the projections, classifies each row by the more likely
            gaussian, and forwards rows predicted 0 to the left child and
            rows predicted 1 to the right child
        dependencies: node, Queue, numpy, math.sqrt/pi/exp
        inputs: data - N x (K+1) matrix with binary labels in column 0
        outputs: the root node of the grown tree
        '''
        # number of feature columns (column 0 holds the labels)
        num_features = data.shape[1] - 1
        # FIFO queue of (node, rows routed to it) drives the BFS construction
        queue = Queue.Queue()
        # create root node and add to queue
        root = node()
        queue.put((root, data))
        # while queue is not empty
        while not queue.empty():
            # pop the next node together with its subset of the data
            current_tupple = queue.get()
            current_node = current_tupple[0]
            current_data = current_tupple[1]
            # number of observations routed to this node
            num_observation = current_data.shape[0]
            # one predicted label slot per observation at this node
            predicted_labels = np.empty(num_observation)
            # draw this node's random unit projection vector
            current_node.set_randomVec(num_features)
            # project each observation's feature row onto the random vector
            projected_data = np.empty(num_observation)
            for projection in range(num_observation):
                projected_data[projection] = np.dot(current_data[projection,1:][np.newaxis,:], current_node.randomVector)
            # fit per-class gaussians (mean, std) to the projected values
            current_node.set_gaussian_variables(np.hstack((current_data[:,0][:,np.newaxis], projected_data[:,np.newaxis])))
            # classify each observation with the two gaussian densities
            for observation_idx in range(num_observation):
                # gaussian pdf of the projection under each class model
                pos_prob = (1 / (current_node.std_pos * sqrt(2 * pi))) * exp(-((projected_data[observation_idx] - current_node.mean_pos)**2 / (2 * current_node.std_pos**2)))
                neg_prob = (1 / (current_node.std_neg * sqrt(2 * pi))) * exp(-((projected_data[observation_idx] - current_node.mean_neg)**2 / (2 * current_node.std_neg**2)))
                # label 0 when the class-0 ("positive") gaussian is more likely
                predicted_labels[observation_idx] = 0 if pos_prob > neg_prob else 1
            # node score = fraction of its rows classified correctly
            current_node.set_score(predicted_labels, current_data[:,0])
            # prepend the predictions so children can be split on column 0
            new_data = np.hstack((predicted_labels[:, np.newaxis], current_data))
            # only keep expanding while the score changed relative to the parent
            if current_node.score != current_node.parent_score: #and current_node.score > 0.1:
                # expand left only if enough rows were predicted 0
                if new_data[new_data[:,0] == 0][:,1:].shape[0] > self.stoping_condition:
                    # create left node and route the rows predicted positive to it
                    current_node.left = node()
                    current_node.left.parent_score = current_node.score
                    queue.put((current_node.left, new_data[new_data[:,0] == 0][:,1:]))
                # expand right only if enough rows were predicted 1
                if new_data[new_data[:,0] == 1][:,1:].shape[0] > self.stoping_condition:
                    # create right node and route the rows predicted negative to it
                    current_node.right = node()
                    current_node.right.parent_score = current_node.score
                    queue.put((current_node.right, new_data[new_data[:,0] == 1][:,1:]))
        # return root of tree
        return root
    def train(self, data):
        '''name: train
        description: grows self.num_trees trees on the same data set and
            stores their roots in self.trees
        dependencies: __create_tree
        inputs: data - N x (K+1) matrix with binary labels in column 0
        outputs: None
        '''
        # create the specified number of trees
        for tree in range(self.num_trees):
            #if (tree + 1 ) % 10 == 0:
            #print 'creating tree ', tree + 1
            #sys.stdout.flush()
            # add each grown tree to the list of trees
            self.trees.append(self.__create_tree(data))
    def predict(self, data, score=0.9):
        '''name: predict
        description: classifies every observation by majority vote over all
            trees and stores the 0/1 results in self.predicitions
        dependencies: __initial_prediction
        inputs: data - N x (K+1) matrix (column 0 labels are not used here)
            score - training-score threshold passed to each tree descent
        outputs: None (results are stored in self.predicitions)
        '''
        # one prediction slot per observation
        self.predicitions = np.empty(data.shape[0])
        # loop and grab each observation
        for observationIndex in range(data.shape[0]):
            # per-tree votes for this observation
            temp_preds = []
            # let every tree vote on the current observation (features only)
            for tree in self.trees:
                # predict on the current observation with the current tree
                temp_preds.append(self.__initial_prediction(data[observationIndex,1:][np.newaxis,:], tree, score))
            # majority vote over the trees
            # NOTE(review): under Python 2, self.num_trees / 2 is integer
            # division — confirm that is the intended tie handling
            self.predicitions[observationIndex] = 0 if (sum(temp_preds) < self.num_trees / 2) else 1
    def __initial_prediction(self, observation, tree, score):
        '''name: __initial_prediction
        description: walks one tree from the root, reclassifying the
            observation at each node with that node's gaussians, until the
            walk falls off the tree or reaches a child whose training score
            exceeds the requested threshold; returns the last classification
        dependencies: node attributes set during training, math.sqrt/pi/exp
        inputs: observation - 1 x K row vector of features
            tree - root node of the tree to descend
            score - stop descending once a child's training score exceeds this
        outputs: 0/1 classification of the observation
        '''
        # iterate through the tree while the node score is <= the threshold
        visitor = tree
        classifying = True
        # the visitor's score can be None, which under Python 2 always
        # compares as less than a number
        # NOTE(review): the same `None <= score` comparison raises TypeError
        # under Python 3 — confirm before porting
        while classifying :
            # project the observation onto this node's random vector
            projected_data = np.dot(observation, visitor.randomVector)
            # gaussian pdf of the projection under each class model
            pos_prob = (1 / (visitor.std_pos * sqrt(2 * pi))) * exp(-((projected_data - visitor.mean_pos)**2 / (2 * visitor.std_pos**2)))
            neg_prob = (1 / (visitor.std_neg * sqrt(2 * pi))) * exp(-((projected_data - visitor.mean_neg)**2 / (2 * visitor.std_neg**2)))
            # class 0 when the positive gaussian is more likely
            classification = 0 if pos_prob > neg_prob else 1
            # descend left for class 0, right for class 1
            visitor = visitor.left if classification == 0 else visitor.right
            # continue only while a child exists and its score is still <= score
            classifying = True if visitor and visitor.score <= score else False
        return classification
    def score(self,y):
        '''name: score
        description: accuracy of the stored predictions against true labels
        dependencies: predict must have been called first
        inputs: y - 1-D array of true 0/1 labels
        outputs: fraction of predictions matching y, in [0, 1]
        '''
        return 1 - (np.sum(np.absolute(np.subtract(self.predicitions, y))) / y.shape[0])
In [11]:
# digit pairs to discriminate; each tuple becomes one binary task
cases = [(3,5),(0,8),(7,9)]
for case in cases:
    # load each split, keep only the two digits, binarize labels,
    # and keep 256 PCA features
    data_train = clean_zip_code_data('../Data/zip.train.gz', case, 256)
    data_test = clean_zip_code_data('../Data/zip.test.gz', case, 256)
    # create a random forest of 501 trees
    rf = random_forest(num_trees=501, stoping_condition=5)
    # train the random forest
    rf.train(data_train)
    # classify the held-out test set
    rf.predict(data_test, score=0.9)
    # report accuracy; the parenthesized print form is valid under
    # both Python 2 and Python 3 (the bare print statement is not)
    print("Cases = %i, %i Score = %f" % (case[0], case[1], rf.score(data_test[:,0])))
    sys.stdout.flush()
In [ ]: