In [ ]:
from __future__ import print_function
# from tabulate import tabulate
import numpy as np
import scipy
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import matplotlib.pyplot as plt
import pylab as pl

import pickle
import os
import sys
import time

In [ ]:
def get_class_mat_list (mat, mat_labels):
    assert (len(mat) == len(mat_labels))
    class_label_dict = {}
    class_mats = []
    class_label_idx = 0
    class_labels = []
    for samp_idx in range (len (mat)):
        class_label = mat_labels[samp_idx]
        if class_label not in class_label_dict:
            class_label_dict[class_label] = class_label_idx
            class_labels.append (class_label)
            class_mats.append (mat[samp_idx])
            class_label_idx += 1
        else:
            class_idx = class_label_dict[class_label]
            class_mats[class_idx] = np.vstack ([class_mats[class_idx],mat[samp_idx]])
    # note: np.array over the per-class matrices assumes equal class sizes,
    # which holds for the round-robin training splits used below
    return (np.array(class_mats),class_labels)

def list_to_contig_mat (data_matrix_list, class_vals):
    data_mat_contig = np.vstack (data_matrix_list)
    class_vals_vec_list = []
    for class_idx in range (len(data_matrix_list)):
        class_vals_vec_list += [class_vals[class_idx]] * len (data_matrix_list[class_idx])
    class_vals_contig = np.asarray(class_vals_vec_list)
    return (data_mat_contig,class_vals_contig)
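
A quick sanity check of the two helpers above, using a hypothetical toy matrix (not part of the original pipeline): split a labeled matrix into per-class blocks, then flatten it back into a contiguous matrix with matching labels.

In [ ]:
# hypothetical toy data: 4 samples, 2 features, 2 classes
toy_mat = np.array([[1., 2.], [3., 4.], [5., 6.], [7., 8.]])
toy_labels = ['a', 'b', 'a', 'b']
class_mats, class_labels = get_class_mat_list (toy_mat, toy_labels)
print (class_labels)                    # ['a', 'b']
print (class_mats[0])                   # rows of class 'a': [[1. 2.] [5. 6.]]
contig_mat, contig_vals = list_to_contig_mat (class_mats, class_labels)
print (contig_mat.shape, contig_vals)   # (4, 2) ['a' 'a' 'b' 'b']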

In [ ]:
def normalize (train, test):
    norm_train_set = train.copy() 
    mins, maxs = normalize_by_columns (norm_train_set)
    norm_test_set = test.copy() 
    normalize_by_columns (norm_test_set, mins, maxs)
    return (norm_train_set, norm_test_set)

def normalize_by_columns ( full_stack, mins = None, maxs = None ):
    """This is a global function to normalize a matrix by columns.
    If numpy 1D arrays of mins and maxs are provided, the matrix will be normalized against these ranges
    Otherwise, the mins and maxs will be determined from the matrix, and the matrix will be normalized
    against itself. The mins and maxs will be returned as a tuple.
    Out of range matrix values will be clipped to min and max (including +/- INF)
    zero-range columns will be set to 0.
    NANs in the columns will be set to 0.
    The normalized output range is hard-coded to 0-100
    """
    # Edge cases to deal with:
    # Range determination:
    # 1. features that are nan, inf, -inf
    # max and min determination must ignore invalid numbers
    # nan -> 0, inf -> max, -inf -> min
    # Normalization:
    # 2. feature values outside of range
    # values clipped to range (-inf to min -> min, max to inf -> max) - leaves nan as nan
    # 3. feature ranges that are 0 result in nan feature values
    # 4. all nan feature values set to 0

    # Turn off numpy warnings, since we're taking care of invalid values explicitly
    oldsettings = np.seterr(all='ignore')
    if (mins is None or maxs is None):
        # mask out NANs and +/-INFs to compute min/max
        full_stack_m = np.ma.masked_invalid (full_stack, copy=False)
        maxs = full_stack_m.max (axis=0)
        mins = full_stack_m.min (axis=0)

    # clip the values to the min-max range (NANs are left, but +/- INFs are taken care of)
    full_stack.clip (mins, maxs, full_stack)
    # remake a mask to account for NANs and divide-by-zero from max == min
    full_stack_m = np.ma.masked_invalid (full_stack, copy=False)

    # Normalize
    full_stack_m -= mins
    full_stack_m /= (maxs - mins)
    # Left over NANs and divide-by-zero from max == min become 0
    # Note the deep copy to change the numpy parameter in-place.
    full_stack[:] = full_stack_m.filled (0) * 100.0

    # return settings to original
    np.seterr(**oldsettings)

    return (mins,maxs)

def standardize (train, test):
    scaler = StandardScaler()
    new_train_set = scaler.fit_transform(train)
    new_test_set = scaler.transform(test)
    return (new_train_set,new_test_set)
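
A hypothetical toy check of the 0-100 normalization above: a zero-range training column maps to 0, and a test value outside the training range is clipped before scaling.

In [ ]:
toy_train = np.array([[0., 5., 1.], [10., 5., 3.]])
toy_test  = np.array([[20., 5., 2.]])
norm_train, norm_test = normalize (toy_train, toy_test)
print (norm_train)   # [[  0.   0.   0.] [100.   0. 100.]]
print (norm_test)    # [[100.   0.  50.]] -- 20 is clipped to the train max of 10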

In [ ]:
def round_robin_iteration (index, data_matrix_list):
    '''Does a leave-N-out split, where N is the number of classes (one test
    sample per class). The training set size per class is one less than the
    size of the smallest class (nsamples - 1). Training samples are picked
    from a circular list starting just after the held-out index.
    index ranges from 0 to the product of the number of samples in each class, minus 1.
    data_matrix_list is a list of data matrices, with one matrix per class'''
    lengths = [m.shape[0] for m in data_matrix_list]
    steps = [0]*len(lengths)
    steps[0] = 1
    for idx in range (1,len(lengths)):
        steps[idx] = lengths[idx-1]*steps[idx-1]
    nclasses = len(lengths)
    max_samples = min (lengths) - 1
    indexes = [0] * nclasses
    last_index = index

    # decompose the flat index into one held-out sample index per class
    # (integer division; true division breaks the decomposition under Python 3)
    for index_idx in range (nclasses-1,0,-1):
        cur_index = last_index // steps[index_idx]
        indexes[index_idx] = cur_index
        last_index -= (cur_index * steps[index_idx])
        if last_index < 0: break
    if last_index > 0: indexes[0] = last_index

    train_mats = []
    test_mats = []
    for class_num in range(nclasses):
        class_indexes = [ (count+indexes[class_num]+1) % lengths[class_num] for count in range (max_samples) ]
        train_mats.append (np.take (data_matrix_list[class_num], class_indexes, axis=0) )
        test_mats.append (np.take (data_matrix_list[class_num], [indexes[class_num]], axis=0) )
    return (train_mats, test_mats)
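
A hypothetical toy check of the splitter: two classes with 3 and 2 samples give 3 * 2 = 6 round-robin splits, each holding out one sample per class and training on min(lengths) - 1 samples per class.

In [ ]:
toy_classes = [np.arange (6.).reshape (3, 2), np.arange (10., 14.).reshape (2, 2)]
for idx in range (6):
    train, test = round_robin_iteration (idx, toy_classes)
    print ('split', idx, 'held-out rows:', [t[0].tolist() for t in test])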

In [ ]:
def Fisher(train_mat, train_vals):
    """Takes a FeatureSet_Discrete as input and calculates a Fisher score for
    each feature. Returns a newly instantiated instance of FisherFeatureWeights.

    For:
    N = number of classes
    F = number of features
    It = total number of images in training set
    Ic = number of images in a given class
    """

    # we deal with NANs/INFs separately, so turn off numpy warnings about invalid floats.
    oldsettings = np.seterr(all='ignore')
    (class_mats, class_vals) = get_class_mat_list (train_mat, train_vals)

    # 1D matrix 1 * F
    population_means = np.mean( train_mat, axis = 0 )
    n_classes = class_mats.shape[0]
    n_features = train_mat.shape[1]

    # 2D matrix shape N * F
    intra_class_means = np.empty( [n_classes, n_features] )
    # 2D matrix shape N * F
    intra_class_variances = np.empty( [n_classes, n_features] )

    class_index = 0
    for class_feature_matrix in class_mats:
        intra_class_means[ class_index ] = np.mean( class_feature_matrix, axis=0 )
        # Note that by default, numpy divides by N instead of the more common N-1, hence ddof=1.
        intra_class_variances[ class_index ] = np.var( class_feature_matrix, axis=0, ddof=1 )
        class_index += 1

    # 1D matrix 1 * F
    # we deal with NANs/INFs separately, so turn off numpy warnings about invalid floats.
    # for the record, in numpy:
    # 1./0. = inf, 0./inf = 0., 1./inf = 0. inf/0. = inf, inf/inf = nan
    # 0./0. = nan, nan/0. = nan, 0/nan = nan, nan/nan = nan, nan/inf = nan, inf/nan = nan
    # We can't deal with NANs only, must also deal with pos/neg infs
    # The masked array allows for dealing with "invalid" floats, which includes nan and +/-inf
    denom = np.mean( intra_class_variances, axis = 0 )
    denom[denom == 0] = np.nan
    feature_weights_m = np.ma.masked_invalid (
            ( np.square( population_means - intra_class_means ).sum( axis = 0 ) /
        (n_classes - 1) ) / denom
        )
    # return numpy error settings to original
    np.seterr(**oldsettings)

    # the filled(0) method of the masked array sets all nan and infs to 0
    fisher_values = feature_weights_m.filled(0).tolist()

    return (fisher_values)
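
A hypothetical toy check of the Fisher score: feature 0 cleanly separates the two classes, feature 1 is overlapping noise, so the first score should dominate.

In [ ]:
toy_mat = np.array([[0., 5.], [1., 3.], [10., 4.], [11., 6.]])
toy_labels = ['low', 'low', 'high', 'high']
print (Fisher (toy_mat, toy_labels))    # [100.0, 0.25]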

In [ ]:
def Pearson(train_mat, train_vals):
    """Calculate regression parameters and correlation statistics that fully define
    a continuous classifier.

    At present the feature weights are proportional to the squared Pearson
    correlation coefficient of each given feature with the target values."""

    from scipy import stats

    # Known issue: running stats.linregress() with np.seterr (all='raise') has caused
    # arithmetic underflow (FloatingPointError: 'underflow encountered in stdtr' )
    # I think this is something we can safely ignore in this function, and return settings
    # back to normal at the end. -CEC
    oldsettings = np.seterr (under='ignore')

    pearson_coeffs = np.zeros(train_mat.shape[1])

    for feature_index in range( train_mat.shape[1] ):
        slope, intercept, pearson_coeff, p_value, std_err = stats.linregress(
            train_vals, train_mat[:,feature_index]
        )

        pearson_coeffs[feature_index] = pearson_coeff
    # We return the squared Pearson coefficients (r^2) as the feature weights.
    pearson_coeffs *= pearson_coeffs

    # return numpy error settings to original
    np.seterr (**oldsettings)

    return pearson_coeffs
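
A hypothetical toy check of the Pearson weights: feature 0 is a noisy linear function of the target, feature 1 is unrelated noise, so the first squared coefficient should be near 1.

In [ ]:
rng = np.random.RandomState (0)
toy_vals = np.arange (20.)
toy_mat = np.column_stack ([2.0 * toy_vals + rng.randn (20), rng.randn (20)])
print (Pearson (toy_mat, toy_vals))     # approximately [1.0, ~0.0]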

In [ ]:
def sort_by_weight (the_mat, feature_weights):
    i = np.argsort(feature_weights)
    sort_mat = the_mat[:,i]
    sort_mat = np.fliplr(sort_mat)
    return (sort_mat)

def weigh_sort(train, test, feature_weights):
    weigh_train = np.multiply (train, feature_weights)
    weigh_test = np.multiply (test, feature_weights)

    sorted_train = sort_by_weight (weigh_train, feature_weights)
    sorted_test = sort_by_weight (weigh_test, feature_weights)
    return (sorted_train, sorted_test)
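
A hypothetical toy check of the weighting/sorting step: with weights [0.1, 2.0, 1.0], columns come back multiplied by their weights and ordered by descending weight (column 1, then 2, then 0).

In [ ]:
toy_train = np.array([[1., 1., 1.], [2., 2., 2.]])
toy_test = np.array([[3., 3., 3.]])
w = np.array([0.1, 2.0, 1.0])
sorted_train, sorted_test = weigh_sort (toy_train, toy_test, w)
print (sorted_train)   # [[2.  1.  0.1] [4.  2.  0.2]]
print (sorted_test)    # [[6.  3.  0.3]]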

In [ ]:
def marg_prob_to_pred_value (marg_probs, class_vals):
    weighted = np.array(marg_probs)*np.array(class_vals)
    return (np.sum(weighted))

def WND5 (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    '''Weighted Neighbor Distance classifier with exponent 5 (WND5).
    Class labels in contig_train_vals must be numeric, since both the
    predicted class and the marginal-probability-weighted value are
    returned as numpy float arrays.'''
    n_test_samples = contig_test_mat.shape[0]
    n_train_samples = contig_train_mat.shape[0]
    predicted_classes = np.zeros(n_test_samples)
    predicted_values = np.zeros(n_test_samples)
    
    epsilon = np.finfo( float ).eps
    
    for testimg_idx in range( n_test_samples ):
        # initialize
        class_dists = {}
        class_counts = {}
        classnames_list = []

        for trainimg_idx in range( n_train_samples ):
            train_class_label = contig_train_vals[trainimg_idx]
            if train_class_label not in class_dists:
                class_dists [train_class_label] = 0.0
                class_counts[train_class_label] = 0.0
                classnames_list.append (train_class_label)

            dists = np.absolute (contig_train_mat [trainimg_idx] - contig_test_mat [testimg_idx])
            w_dist = np.sum( dists )
            # skip training images that are identical to the test image
            if w_dist > epsilon:
                class_counts[train_class_label] += 1.0
            else:
                continue

            w_dist = np.sum( np.square( dists ) )
            # The exponent -5 is the "5" in "WND5"
            class_dists[ train_class_label ] += w_dist ** -5

        
        class_idx = 0
        class_similarities = [0]*len(class_dists)
        for class_label in classnames_list:
            class_similarities[class_idx] = class_dists[class_label] / class_counts[class_label]
            class_idx += 1

        norm_factor = sum( class_similarities )
        marg_probs = np.array( [ x / norm_factor for x in class_similarities ] )

        predicted_class_idx = marg_probs.argmax()

        predicted_classes[testimg_idx] = classnames_list[ predicted_class_idx ]
        predicted_values[testimg_idx] = marg_prob_to_pred_value (marg_probs, classnames_list)

    return (predicted_classes, predicted_values)

def WND5_Cls (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    predicted_classes, predicted_values = WND5 (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs)
    return (predicted_classes)

def WND5_Reg (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    predicted_classes, predicted_values = WND5 (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs)
    return (predicted_values)
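
A hypothetical toy check of WND5: the test row sits next to the class-1 training rows, so both the predicted class and the marginal-probability-weighted value should be close to 1.

In [ ]:
toy_train = np.array([[0., 0.], [0.1, 0.], [5., 5.], [5.1, 5.]])
toy_vals = np.array([1., 1., 2., 2.])
toy_test = np.array([[0.2, 0.1]])
print (WND5 (toy_train, toy_test, toy_vals))   # (array([1.]), array([~1.0]))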

In [ ]:
def rand_forest_clf (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    rnd_state = kwargs.get ('rnd_state', None)

    from sklearn.ensemble import RandomForestClassifier
    clf = RandomForestClassifier(n_estimators = 30, random_state = rnd_state)
    clf.fit(contig_train_mat, contig_train_vals)
    predicted_classes = clf.predict(contig_test_mat)
    return (predicted_classes)

def rand_forest_reg (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    rnd_state = kwargs.get ('rnd_state', None)

    from sklearn.ensemble import RandomForestRegressor
    forest = RandomForestRegressor(n_estimators=30, random_state = rnd_state)
    forest.fit(contig_train_mat, contig_train_vals)
    predicted = forest.predict(contig_test_mat)
    return (predicted)
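
A hypothetical toy check of the two forest wrappers: two separable blobs for the classifier, a linear target for the regressor; rnd_state makes the runs reproducible.

In [ ]:
rng = np.random.RandomState (0)
X = np.vstack ([rng.randn (10, 2), rng.randn (10, 2) + 5.0])
y_cls = np.array ([0]*10 + [1]*10)
print (rand_forest_clf (X, X[[0, -1]], y_cls, rnd_state = 0))   # [0 1]
y_reg = X[:, 0] + X[:, 1]
print (rand_forest_reg (X, X[[0, -1]], y_reg, rnd_state = 0))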

In [ ]:
def lin_reg(contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    from sklearn.linear_model import LinearRegression
    lin_reg = LinearRegression()
    lin_reg.fit(contig_train_mat, contig_train_vals)
    predicted = lin_reg.predict(contig_test_mat)
    return (predicted)
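
A hypothetical toy check of the linear-regression wrapper: exact recovery of y = 2x + 1.

In [ ]:
X = np.array([[0.], [1.], [2.], [3.]])
y = np.array([1., 3., 5., 7.])
print (lin_reg (X, np.array([[4.]]), y))   # [9.]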

In [ ]:
from __future__ import print_function
from scipy.stats import pearsonr
def R2_vs_N_features (regressor, contig_mat,mat_vals,feat_range):
    niter = contig_mat.shape[0]
    pred_vals = np.zeros ([niter,len(feat_range)])
    print ('Train set size    : {}'.format(contig_mat.shape[0]-1))
    print ('N Features        : {}-{}'.format(feat_range[0],feat_range[-1]))
    print ('Iterations        : {}'.format(niter))
    actual = []
    for iter_idx in range ( niter ):
    # Split
        contig_train_mat = np.delete(contig_mat,[iter_idx],axis=0)
        contig_test_mat = np.asarray([contig_mat[iter_idx]])

        contig_train_vals = np.delete (mat_vals, [iter_idx])
        contig_test_vals = np.asarray ([mat_vals[iter_idx]])

        # Normalize
        norm_train, norm_test = normalize (contig_train_mat, contig_test_mat)

        # Reduce
        feature_weights = Pearson(norm_train, contig_train_vals)
        sorted_train, sorted_test = weigh_sort (norm_train, norm_test, feature_weights)
        # Classify with different numbers of features
        nfeatures_idx = 0
        for nfeatures in feat_range:
            pred_val = regressor (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals, rnd_state = iter_idx*nfeatures)
            pred_vals[iter_idx][nfeatures_idx] = pred_val[0]
            nfeatures_idx += 1
        actual.append (mat_vals[iter_idx])
        print ("\rIteration {}".format(iter_idx), end="")

    print ()
    print ("Calculating R^2")
    r_squareds = [0]*len(feat_range)
    nfeatures_idx = 0
    for nfeatures in feat_range:
        score, p_value = pearsonr(pred_vals[:,nfeatures_idx], actual)
        r_squareds[nfeatures_idx] = score * score
        print ("\rN features {}".format(nfeatures), end="")
        nfeatures_idx += 1
    print ()
    return (r_squareds)
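
A hypothetical toy driver for the leave-one-out loop above: feature 0 tracks the target and the other nine are noise, so R^2 should peak with very few features. The data here is made up purely for illustration.

In [ ]:
rng = np.random.RandomState (0)
mat_vals = np.linspace (0., 10., 20)
contig_mat = np.column_stack ([2.0 * mat_vals + rng.randn (20), rng.randn (20, 9)])
feat_range = list (range (1, 11))
r_squareds = R2_vs_N_features (rand_forest_reg, contig_mat, mat_vals, feat_range)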

In [ ]:
def Graph_NFeat_vs_R2 (title, feat_range, avg_score):
    pl.figure()
    plot_score, = pl.plot(feat_range, avg_score, 'b', label="Without LDA")

    best_score_arg = np.nanargmax (avg_score)
    best_num_feat = feat_range[best_score_arg]
    best_score = avg_score[best_score_arg]

    text_x = feat_range[0]+(0.22*(feat_range[-1]-feat_range[0]))
    max_y = 0.5
    pl.text(text_x,0.9*max_y, 'Max = {:.4g} @ {} features'.format (best_score, best_num_feat))
    pl.title(title)
    pl.xlabel('Features')
    pl.ylabel('Coefficient of Determination ($R^2$)')
    pl.ylim([0.0, max_y])
    pl.xlim([feat_range[0],feat_range[-1]])
#    pl.savefig(os.path.join (Config.graph_dir(),title+'.png'), format='png', dpi=150)
    pl.show()
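
Plotting the toy R^2 curve from the previous cell (note the y-axis is hard-coded to top out at 0.5, so a strong toy fit will run off the top of the plot).

In [ ]:
Graph_NFeat_vs_R2 ('Toy R^2 vs N features', feat_range, r_squareds)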

In [ ]:
from __future__ import print_function
def ClsAcc_vs_N_features (classifier, feat_rank_alg, class_mat_list, class_vals, feat_range):
    niter = np.prod ( [m.shape[0] for m in class_mat_list] )
    n_classes = len (class_mat_list)
    n_correct = np.zeros( [len(feat_range), n_classes] )
    (train,test) = round_robin_iteration (0,class_mat_list)
    max_features = train[0].shape[1]
    print ('Train class sizes : {}'.format([x.shape[0] for x in train]))
    print ('N Features        : {}-{}'.format(feat_range[0],feat_range[-1]))
    print ('Iterations        : {}'.format(niter))
    print ('class_vals        : {}'.format(class_vals))
    for iter_idx in range ( niter ):
        # Split
        (train,test) = round_robin_iteration (iter_idx,class_mat_list)
        (contig_train_mat, contig_train_vals) = list_to_contig_mat (train, class_vals)
        (contig_test_mat, contig_test_vals) = list_to_contig_mat (test, class_vals)

        # Normalize
        (norm_train, norm_test) = normalize (contig_train_mat, contig_test_mat)

        # Reduce: rank features with the supplied algorithm (e.g. Fisher or Pearson)
        feature_weights = feat_rank_alg (norm_train, contig_train_vals)
        (sorted_train, sorted_test) = weigh_sort (norm_train, norm_test, feature_weights)

        # Classify with different numbers of features
        nfeatures_idx = 0
        for nfeatures in feat_range:
            preds = classifier (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals, rnd_state = iter_idx)
            for pred_idx in range (len(preds)):
                if (preds[pred_idx] == contig_test_vals[pred_idx]):
                    n_correct[nfeatures_idx][pred_idx] += 1
            cumul_acc = [(float(x) / (float(iter_idx)+1.0)) for x in n_correct[nfeatures_idx]]
            print ('\rIteration {}; cumulative accuracies: {}'.format(iter_idx, cumul_acc), end="")
            nfeatures_idx += 1
    print ()

    nfeatures_idx = 0
    mean_feat_acc = np.zeros([len(feat_range),n_classes])
    for nfeatures in feat_range:
        for cl in range (n_classes):
            mean_feat_acc[nfeatures_idx][cl] = float(n_correct[nfeatures_idx][cl]) / float(niter)
        print ("\rN features {}".format(nfeatures), end="")
        nfeatures_idx += 1
    print ()

    return (mean_feat_acc)
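
A hypothetical toy driver for the round-robin loop above: two well-separated classes of 4 samples each (16 iterations), ranking features with Fisher and classifying with WND5.

In [ ]:
rng = np.random.RandomState (0)
class_mat_list = [rng.randn (4, 6), rng.randn (4, 6) + 3.0]
class_vals = [0, 1]
feat_range = list (range (1, 7))
mean_feat_acc = ClsAcc_vs_N_features (WND5_Cls, Fisher, class_mat_list, class_vals, feat_range)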

In [ ]:
def Graph_NFeat_vs_ClsAcc (title, feat_range, mean_feat_acc, class_vals ):
    pl.figure()
    color_list = pl.cm.Dark2(np.linspace(0, 1, 12))
    pl.title(title)
    pl.xlabel('Features')
    pl.ylabel('Classification Accuracy')
    text_y = .8
    text_y_drop = 0.1
    pl.ylim([0.0, 1.1])
    pl.xlim([feat_range[0],feat_range[-1]])

    for cls in range(mean_feat_acc.shape[1]):
#        plot_score, = pl.plot(feat_range, avg_score, 'b', label="Without LDA")
        plot_score, = pl.plot(feat_range, mean_feat_acc[:,cls], color=color_list[cls], label="{}".format(class_vals[cls]))

        best_score_arg = np.nanargmax (mean_feat_acc[:,cls])
        best_num_feat = feat_range[best_score_arg]
        best_score = mean_feat_acc[best_score_arg][cls]

        text_x = feat_range[0]+(0.22*(feat_range[-1]-feat_range[0]))
        pl.text(text_x,text_y, 'Class {} Max = {:.4g} @ {} features'.format (class_vals[cls],best_score, best_num_feat))
        text_y -= text_y_drop

#    pl.savefig(os.path.join (Config.graph_dir(),title+'.png'), format='png', dpi=150)
    pl.show()
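
Plotting the toy per-class accuracy curves from the previous cell.

In [ ]:
Graph_NFeat_vs_ClsAcc ('Toy classifier accuracy', feat_range, mean_feat_acc, class_vals)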

In [ ]:
from __future__ import print_function
def Mean_Feat_weights (rank_alg, class_mat_list, class_vals, feat_names):
    niter = np.prod ( [m.shape[0] for m in class_mat_list] )
    n_classes = len (class_mat_list)
    (train,test) = round_robin_iteration (0,class_mat_list)
    max_features = train[0].shape[1]
    print ('Train class sizes : {}'.format([x.shape[0] for x in train]))
    print ('Iterations        : {}'.format(niter))
    print ('class_vals        : {}'.format(class_vals))
    mean_feature_weights = np.zeros(max_features)
    for iter_idx in range ( niter ):
        # Split
        (train,test) = round_robin_iteration (iter_idx,class_mat_list)
        (contig_train_mat, contig_train_vals) = list_to_contig_mat (train, class_vals)
        (contig_test_mat, contig_test_vals) = list_to_contig_mat (test, class_vals)

        # Normalize
        (norm_train, norm_test) = normalize (contig_train_mat, contig_test_mat)

        # Reduce: rank features with the supplied algorithm (e.g. Fisher or Pearson)
        feature_weights = rank_alg (norm_train, contig_train_vals)
        mean_feature_weights += feature_weights
        print ('\rIteration {}'.format(iter_idx), end="")
    print ()

    # average over all iterations (niter of them, not iter_idx, which ends at niter-1)
    mean_feature_weights /= niter
    # sort features by descending mean weight
    i = np.argsort(mean_feature_weights)[::-1]
    sorted_feature_names = feat_names[i]
    sorted_feature_means = mean_feature_weights[i]
    sorted_feature_indexes = np.arange(max_features)[i]

    return (sorted_feature_names, sorted_feature_indexes, sorted_feature_means)
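
A hypothetical toy driver, reusing class_mat_list and class_vals from the cell above; the feature names are made up for the example.

In [ ]:
feat_names = np.array (['feat_{}'.format (i) for i in range (6)])
names, idxs, means = Mean_Feat_weights (Fisher, class_mat_list, class_vals, feat_names)
print (names[:3])
print (means[:3])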

ProgressBar

In [ ]:
# Usage example; run this after executing the ProgressBar class definition in the next cell.
p = ProgressBar(120)
for i in range(1, 120+1):
    p.animate(i)

In [ ]:
from __future__ import print_function
try:
    from IPython.display import clear_output
    have_ipython = True
except ImportError:
    have_ipython = False

class ProgressBar(object):
    def __init__(self, iterations):
        self.iterations = iterations
        self.prog_bar = '[]'
        self.fill_char = '*'
        self.width = 20
        self.__update_amount(0)
        if have_ipython:
            self.animate = self.animate_ipython
        else:
            self.animate = self.animate_noipython

    def animate_ipython(self, iter):
        print ('\r' + str(self), end="")
        sys.stdout.flush()
        self.update_iteration(iter + 1)
        if (iter + 1 > self.iterations):
            print ()

    def update_iteration(self, elapsed_iter):
        self.__update_amount((elapsed_iter / float(self.iterations)) * 100.0)
        self.prog_bar += '  %d of %s complete ' % (elapsed_iter, self.iterations)

    def __update_amount(self, new_amount):
        percent_done = int(round(new_amount))
        all_full = self.width - 2
        num_hashes = int(round((percent_done / 100.0) * all_full))
        self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
        pct_place = (len(self.prog_bar) // 2) - len(str(percent_done))
        pct_string = '%d%%' % percent_done
        self.prog_bar = self.prog_bar[0:pct_place] + \
            (pct_string + self.prog_bar[pct_place + len(pct_string):])

    def __str__(self):
        return str(self.prog_bar)