In [1]:
import matplotlib.pyplot as plt
import numpy as np
import gzip as gz
from math import log, sqrt, pi, exp
import Queue
import sys

In [2]:
def pca(data, k_features):
    '''name:         pca
       description:  principal component analysis. The data is centered about
                     the origin; the covariance matrix is computed; the
                     eigenvectors of the covariance (columns of U from SVD,
                     ordered by descending eigenvalue) are found; the centered
                     data is projected onto the first k eigenvectors, giving
                     an N x k_features matrix of principal components.
       dependencies: numpy
       inputs:       data - N x K matrix, rows are observations, columns are features
                     k_features - number of principal components to keep (0 < k_features <= K)
       outputs:      reduced_data - N x k_features matrix, or the original data
                     when k_features == K; prints an error and returns None for
                     an out-of-range k_features
    '''
    # keeping every feature: nothing to reduce, return the data unchanged
    if k_features == data.shape[1]:
        return data

    # check 0 < k_features < number of features
    if 0 < k_features < data.shape[1]:

        # center the data about the origin; np.cov centers internally for the
        # covariance, but the projection must also use centered data or the
        # components are offset by the feature means
        centered = data - np.mean(data, axis=0)

        # covariance matrix of the features (K x K)
        sigma = np.cov(centered.T)

        # SVD of a symmetric PSD matrix: columns of eigen_vecs are the
        # eigenvectors ordered by descending eigenvalue
        eigen_vecs, _, _ = np.linalg.svd(sigma)

        # project every observation onto the top k eigenvectors in one
        # matrix product (replaces the original per-row Python loop)
        return np.dot(centered, eigen_vecs[:, :k_features])

    # out-of-range k_features: report and fall through returning None
    # (parenthesized so the message formats correctly on Python 2 and 3)
    print('ERROR: 0 < k_features <= %i' % data.shape[1])

In [3]:
def get_cases(data, cases):
    '''name:         get_cases
       description:  filters an N x K matrix down to an N' x K matrix keeping
                     only the rows whose label (column zero) matches one of
                     the specified cases
       dependencies: numpy
       inputs:       data - N x K matrix with labels in column 0
                      cases - tuple of label values to keep
       outputs:      N' x K matrix containing only the matching rows
    '''
    # boolean mask: True where the label column matches any requested case
    keep_mask = np.isin(data[:, 0], cases)

    # keep only the matching observations
    return data[keep_mask]

In [4]:
def clean_zip_code_data(filename, cases, num_features):
    '''name:         clean_zip_code_data
       description:  opens a gzipped zip-code data file, keeps only the rows
                     for the two requested digit classes, converts the labels
                     to binary (cases[0] -> 0, anything else -> 1), and
                     reduces the feature columns with PCA
       dependencies: get_cases, pca
       inputs:       filename - path to the gzipped whitespace-delimited file
                     cases - a tuple of two digit labels
                     num_features - number of principal components to keep
       outputs:      N x (num_features + 1) matrix, binary labels in column 0
    '''
    # load the raw zip code matrix from the gzipped text file
    with gz.open(filename) as handle:
        raw = np.loadtxt(handle)

    # keep only the observations for the requested digit classes
    kept = get_cases(raw, cases)

    # binarize the label column: first case -> 0, second case -> 1
    labels = np.where(kept[:, :1] == cases[0], 0, 1)

    # reduce the feature columns with PCA
    features = pca(kept[:, 1:], num_features)

    # labels back in column zero, features after
    return np.hstack((labels, features))

In [5]:
class node(object):
    '''A single node of a random-projection decision tree.

    Each node holds a random unit projection vector, the per-class Gaussian
    parameters of the projected training data, its training accuracy, links
    to its children, and the parent's score.  Label 0 is treated as the
    "positive" class and label 1 as the "negative" class throughout.
    '''

    def __init__(self):
        # random unit column vector the data is projected onto
        self.randomVector = None
        # mean / std of the projected data for label-0 ("positive") rows
        self.mean_pos = None
        self.std_pos = None
        # mean / std of the projected data for label-1 ("negative") rows
        self.mean_neg = None
        self.std_neg = None
        # fraction of this node's training observations classified correctly
        self.score = None
        # children: left gets predicted-0 rows, right gets predicted-1 rows
        self.left = None
        self.right = None
        # score of the parent node (None at the root)
        self.parent_score = None

    def set_randomVec(self, size):
        '''name:         set_randomVec
           description:  creates a random unit vector and stores it as a
                         size x 1 column vector in self.randomVector
           dependencies: numpy
           inputs:       size - the number of features in the data
           outputs:      None
        '''
        # draw a random vector, normalize to unit length, store as a column
        x = np.random.rand(size)
        self.randomVector = (x / np.linalg.norm(x))[:, np.newaxis]

    def set_gaussian_variables(self, data):
        '''name:         set_gaussian_variables
           description:  fits a 1-D Gaussian (mean, std) per class to the
                         projected data; a zero std is floored at 0.0001 so
                         the Gaussian density never divides by zero
           dependencies: numpy
           inputs:       data - N x 2 matrix, binary labels in column 0 and
                         projected values in column 1
           outputs:      None
        '''
        # split the projected values by class label
        pos_vals = data[data[:, 0] == 0][:, 1]
        neg_vals = data[data[:, 0] == 1][:, 1]

        # positive (label 0) Gaussian; compute the std once (the original
        # evaluated np.std three times per class) and floor it at 0.0001
        self.mean_pos = np.mean(pos_vals)
        std_pos = np.std(pos_vals)
        self.std_pos = std_pos if std_pos != 0 else 0.0001

        # negative (label 1) Gaussian, same floor on the std
        self.mean_neg = np.mean(neg_vals)
        std_neg = np.std(neg_vals)
        self.std_neg = std_neg if std_neg != 0 else 0.0001

    def set_score(self, predicted_labels, actual_labels):
        '''name:         set_score
           description:  stores the node's training accuracy
           dependencies: numpy
           inputs:       predicted_labels - length-N vector of 0/1 predictions
                         actual_labels - length-N vector of 0/1 true labels
           outputs:      None
        '''
        # labels are 0/1, so summing |predicted - actual| counts the errors
        self.score = 1 - (np.sum(np.absolute(np.subtract(predicted_labels, actual_labels))) / actual_labels.shape[0])

In [6]:
class random_forest(object):
    '''Ensemble of random-projection decision trees for binary (0/1) labels.

    Each tree node projects the data onto a random unit vector, fits one
    1-D Gaussian per class to the projections, and classifies by the larger
    class likelihood.  The ensemble predicts by majority vote.

    NOTE(review): this class is Python 2 only -- it uses the `Queue` module
    (renamed `queue` in Python 3) and __initial_prediction order-compares a
    possibly-None node score with a float, which raises TypeError on
    Python 3.
    '''
    
    def __init__(self, num_trees=11, stoping_condition=5):
        # number of trees to build in train()
        self.num_trees = num_trees
        # list of trained tree roots, filled by train()
        self.trees = []
        # minimum number of observations a child subset must have to be split
        self.stoping_condition = stoping_condition
        # per-observation ensemble predictions from the last predict() call
        # (attribute name is misspelled but kept: it is public state)
        self.predicitions = None

    def __create_tree(self, data):
        '''name:         __create_tree
           description:  builds one random-projection tree breadth-first.
                         each dequeued node projects its data onto a fresh
                         random unit vector, fits per-class Gaussians to the
                         projections, classifies every observation by the
                         larger Gaussian likelihood, and enqueues children
                         for the predicted-0 (left) and predicted-1 (right)
                         subsets while the subsets are large enough and the
                         node's score differs from its parent's
           dependencies: node, Queue, numpy, math (sqrt, pi, exp)
           inputs:       data - N x (K+1) matrix, binary labels in column 0
           outputs:      root - the root node of the trained tree
        '''
        # get number of features (column 0 holds the labels)
        num_features = data.shape[1] - 1
        
        # FIFO queue drives the breadth-first construction
        queue = Queue.Queue()
        
        # create root node and add it, paired with the full data, to the queue
        root = node()
        queue.put((root, data))
        
        # while queue is not empty
        while not queue.empty():
        
            # get the next (node, data subset) pair
            current_tupple = queue.get()
            current_node = current_tupple[0]
            current_data = current_tupple[1]
            
            # get number of observations in this node's subset
            num_observation = current_data.shape[0]
            
            # create empty vector for the predicted labels
            predicted_labels = np.empty(num_observation)
        
            # create/set this node's random projection vector
            current_node.set_randomVec(num_features)
            
            # project each observation's features onto the random vector
            projected_data = np.empty(num_observation)
            for projection in range(num_observation):
                projected_data[projection] = np.dot(current_data[projection,1:][np.newaxis,:], current_node.randomVector)
            
            # fit the per-class Gaussians on (label, projection) pairs
            current_node.set_gaussian_variables(np.hstack((current_data[:,0][:,np.newaxis], projected_data[:,np.newaxis])))
            
            # classify each observation by Gaussian likelihood
            for observation_idx in range(num_observation):
            
                # calculate the class-0 and class-1 Gaussian densities
                pos_prob = (1 / (current_node.std_pos * sqrt(2 * pi))) * exp(-((projected_data[observation_idx] - current_node.mean_pos)**2 / (2 * current_node.std_pos**2)))
                neg_prob = (1 / (current_node.std_neg * sqrt(2 * pi))) * exp(-((projected_data[observation_idx] - current_node.mean_neg)**2 / (2 * current_node.std_neg**2)))
                
                # classify: 0 when the positive density is strictly larger
                predicted_labels[observation_idx] = 0 if pos_prob > neg_prob else 1
                
            # calculate/set this node's training accuracy
            current_node.set_score(predicted_labels, current_data[:,0])
            
            # prepend the predicted labels as a new column 0 so the data can
            # be partitioned by prediction (true labels shift to column 1)
            new_data = np.hstack((predicted_labels[:, np.newaxis], current_data))
            
            # split only while the score changed relative to the parent
            # (a repeated score means no progress was made at this node)
            if current_node.score != current_node.parent_score: #and current_node.score > 0.1:
                
                if new_data[new_data[:,0] == 0][:,1:].shape[0] > self.stoping_condition:

                    # create left child for the predicted-0 subset; strip the
                    # prediction column before enqueueing
                    current_node.left = node()
                    current_node.left.parent_score = current_node.score
                    queue.put((current_node.left, new_data[new_data[:,0] == 0][:,1:]))
                    
                if new_data[new_data[:,0] == 1][:,1:].shape[0] > self.stoping_condition:

                    # create right child for the predicted-1 subset
                    current_node.right = node()
                    current_node.right.parent_score = current_node.score
                    queue.put((current_node.right, new_data[new_data[:,0] == 1][:,1:]))

        # return root of the finished tree
        return root
    
    def train(self, data):
        '''name:         train
           description:  builds self.num_trees random-projection trees on the
                         same data (trees differ only through the random
                         projection vectors) and stores them in self.trees
           dependencies: __create_tree
           inputs:       data - N x (K+1) matrix, binary labels in column 0
           outputs:      None
        '''
        # create the specified number of trees
        for tree in range(self.num_trees):
            
            #if (tree + 1 ) % 10 == 0:
                #print 'creating tree ', tree + 1
                #sys.stdout.flush()
            
            # add each trained tree root to the list of trees
            self.trees.append(self.__create_tree(data))
    
    def predict(self, data, score=0.9):
        '''name:         predict
           description:  classifies every observation with each tree and
                         stores the majority vote in self.predicitions
           dependencies: __initial_prediction
           inputs:       data - N x (K+1) matrix (column 0 is ignored labels)
                         score - training-score threshold controlling how deep
                         each tree is descended (see __initial_prediction)
           outputs:      None (results land in self.predicitions)
        '''
        # create an empty vector for the ensemble predictions
        self.predicitions = np.empty(data.shape[0])
        
        # loop and grab each observation
        for observationIndex in range(data.shape[0]):
            
            # per-tree predictions for this observation
            temp_preds = []
            
            # ask every tree to classify the current observation
            for tree in self.trees:
            
                # predict on the current observation with the current tree
                temp_preds.append(self.__initial_prediction(data[observationIndex,1:][np.newaxis,:], tree, score))
                
            # majority vote; NOTE(review): `/` is floor division on ints in
            # Python 2, so with odd num_trees this is a strict majority
            self.predicitions[observationIndex] = 0 if (sum(temp_preds) < self.num_trees / 2) else 1
    
    def __initial_prediction(self, observation, tree, score):
        '''name:         __initial_prediction
           description:  classifies one observation by descending the tree:
                         at each node the observation is projected onto the
                         node's random vector and sent left (class 0) or
                         right (class 1) by Gaussian likelihood; descent stops
                         when the next node is missing or its training score
                         exceeds the requested score
           dependencies: numpy, math (sqrt, pi, exp)
           inputs:       observation - 1 x K row vector of features
                         tree - root node of the tree to descend
                         score - stop descending once a node's training score
                         exceeds this threshold
           outputs:      classification - 0 or 1
        '''
        # start the descent at the root
        visitor = tree
        
        classifying = True
        
        # loop until the wanted score is exceeded or a child is missing.
        # NOTE(review): visitor.score can be None here (a freshly enqueued
        # child whose parent finished last); `None <= score` evaluates as
        # less-than on Python 2 but raises TypeError on Python 3
        while classifying :
            
            
            # project the observation onto this node's random vector
            projected_data = np.dot(observation, visitor.randomVector)
            
            # calculate the class-0 and class-1 Gaussian densities
            pos_prob = (1 / (visitor.std_pos * sqrt(2 * pi))) * exp(-((projected_data - visitor.mean_pos)**2 / (2 * visitor.std_pos**2)))
            neg_prob = (1 / (visitor.std_neg * sqrt(2 * pi))) * exp(-((projected_data - visitor.mean_neg)**2 / (2 * visitor.std_neg**2)))
            
            classification = 0 if pos_prob > neg_prob else 1
            
            # follow the branch matching the classification
            visitor = visitor.left if classification == 0 else visitor.right
            
            # keep going only while a child exists and has not beaten the threshold
            classifying = True if visitor and visitor.score <= score else False
                
        return classification
        
    def score(self,y):
        '''name:         score
           description:  accuracy of the last predict() call against the true
                         labels; predict() must have been called first
           dependencies: numpy
           inputs:       y - length-N vector of true 0/1 labels
           outputs:      fraction of observations classified correctly
        '''
        return 1 - (np.sum(np.absolute(np.subtract(self.predicitions, y))) / y.shape[0])

In [11]:
# the three pairs of digit classes to discriminate between
cases = [(3,5),(0,8),(7,9)]

for case in cases:
    # load the train/test splits for this digit pair; 256 features means
    # every pixel of the 16x16 digit images is kept (PCA is a no-op here)
    data_train = clean_zip_code_data('../Data/zip.train.gz', case, 256)
    data_test = clean_zip_code_data('../Data/zip.test.gz', case, 256)

    # create a random forest of 501 random-projection trees (odd count, so
    # the majority vote cannot tie)
    rf = random_forest(num_trees=501, stoping_condition=5)

    # train the random forest on the binary-labeled training data
    rf.train(data_train)

    # predict: descend each tree until a node's training score exceeds 0.9
    rf.predict(data_test, score=0.9)

    # report the test accuracy (Python 2 print statement -- this notebook
    # targets Python 2, consistent with `import Queue` above)
    print "Cases = %i, %i Score = %f" % (case[0],case[1], rf.score(data_test[:,0]) )
    sys.stdout.flush()


(3, 5)
creating tree  10
Cases = 3, 5 Score = 0.613497
(0, 8)
creating tree  10
Cases = 0, 8 Score = 0.592025
(7, 9)
creating tree  10
Cases = 7, 9 Score = 0.634969

In [ ]: