In [ ]:
from __future__ import print_function
# from tabulate import tabulate
import numpy as np
import scipy
from sklearn.preprocessing import StandardScaler
%matplotlib inline
import matplotlib.pyplot as plt
import pylab as pl
import pickle
import os
import sys
import time
In [ ]:
def get_class_mat_list (mat, mat_labels):
assert (len(mat) == len(mat_labels))
class_label_dict = {}
class_mats = []
class_label_idx = 0
class_labels = []
for samp_idx in range (len (mat)):
class_label = mat_labels[samp_idx]
if not class_label in class_label_dict:
class_label_dict[class_label] = class_label_idx
class_labels.append (class_label)
class_mats.append (mat[samp_idx])
class_label_idx += 1
else:
class_idx = class_label_dict[class_label]
class_mats[class_idx] = np.vstack ([class_mats[class_idx],mat[samp_idx]])
    # Classes may have different numbers of samples, so return a plain list of
    # per-class matrices rather than forcing a possibly ragged ndarray.
    return (class_mats, class_labels)
def list_to_contig_mat (data_matrix_list, class_vals):
data_mat_contig = np.vstack (data_matrix_list)
class_vals_vec_list = []
for class_idx in range (len(data_matrix_list)):
class_vals_vec_list += [class_vals[class_idx]] * len (data_matrix_list[class_idx])
class_vals_contig = np.asarray(class_vals_vec_list)
return (data_mat_contig,class_vals_contig)
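A quick round-trip check of the two helpers above; a minimal sketch on made-up toy data (the matrix and labels are illustrative only), assuming the cell above has been run.
In [ ]:
# Toy data: 5 samples x 2 features, two interleaved classes
mat = np.arange(10.0).reshape(5, 2)
labels = [0, 1, 0, 1, 1]
class_mats, class_vals = get_class_mat_list (mat, labels)
print ([m.shape for m in class_mats])   # per-class matrices: (2, 2) and (3, 2)
contig, vals = list_to_contig_mat (class_mats, class_vals)
print (contig.shape, vals)              # (5, 2), labels now grouped: [0 0 1 1 1]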
In [ ]:
def normalize (train, test):
norm_train_set = train.copy()
mins, maxs = normalize_by_columns (norm_train_set)
norm_test_set = test.copy()
normalize_by_columns (norm_test_set, mins, maxs)
return (norm_train_set, norm_test_set)
def normalize_by_columns ( full_stack, mins = None, maxs = None ):
"""This is a global function to normalize a matrix by columns.
If numpy 1D arrays of mins and maxs are provided, the matrix will be normalized against these ranges
Otherwise, the mins and maxs will be determined from the matrix, and the matrix will be normalized
against itself. The mins and maxs will be returned as a tuple.
Out of range matrix values will be clipped to min and max (including +/- INF)
zero-range columns will be set to 0.
NANs in the columns will be set to 0.
The normalized output range is hard-coded to 0-100
"""
# Edge cases to deal with:
# Range determination:
# 1. features that are nan, inf, -inf
# max and min determination must ignore invalid numbers
# nan -> 0, inf -> max, -inf -> min
# Normalization:
# 2. feature values outside of range
# values clipped to range (-inf to min -> min, max to inf -> max) - leaves nan as nan
# 3. feature ranges that are 0 result in nan feature values
# 4. all nan feature values set to 0
# Turn off numpy warnings, since we're taking care of invalid values explicitly
oldsettings = np.seterr(all='ignore')
if (mins is None or maxs is None):
# mask out NANs and +/-INFs to compute min/max
full_stack_m = np.ma.masked_invalid (full_stack, copy=False)
maxs = full_stack_m.max (axis=0)
mins = full_stack_m.min (axis=0)
# clip the values to the min-max range (NANs are left, but +/- INFs are taken care of)
full_stack.clip (mins, maxs, full_stack)
# remake a mask to account for NANs and divide-by-zero from max == min
full_stack_m = np.ma.masked_invalid (full_stack, copy=False)
# Normalize
full_stack_m -= mins
full_stack_m /= (maxs - mins)
# Left over NANs and divide-by-zero from max == min become 0
# Note the deep copy to change the numpy parameter in-place.
full_stack[:] = full_stack_m.filled (0) * 100.0
# return settings to original
np.seterr(**oldsettings)
return (mins,maxs)
def standardize (train, test):
scaler = StandardScaler()
new_train_set = scaler.fit_transform(train)
new_test_set = scaler.transform(test)
return (new_train_set,new_test_set)
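A sanity check for the normalization helpers; a minimal sketch on toy arrays (values are illustrative). Each training column is rescaled to 0-100, and test values outside the training range are clipped.
In [ ]:
toy_train = np.array([[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]])
toy_test  = np.array([[-5.0, 25.0]])      # -5 falls below the training range
norm_train, norm_test = normalize (toy_train, toy_test)
print (norm_train)   # each column spans 0..100
print (norm_test)    # -5 clips to 0; 25 maps to 75 in the second column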
In [ ]:
def round_robin_iteration (index, data_matrix_list):
    '''Leave-N-out split, where N is the number of classes (one test sample per class).
    The class with the smallest number of samples determines the per-class training
    set size: (nsamples - 1). Training samples are picked from a circular list
    starting just past the test index.
    index ranges from 0 up to (but not including) the product of the number of samples in each class.
    data_matrix_list is a list of data matrices, with one matrix per class.'''
lengths = [m.shape[0] for m in data_matrix_list]
steps = [0]*len(lengths)
steps[0] = 1
for idx in range (1,len(lengths)):
steps[idx] = lengths[idx-1]*steps[idx-1]
nclasses = len(lengths)
max_samples = min (lengths) - 1
indexes = [0] * nclasses
last_index = index
for index_idx in range (nclasses-1,0,-1):
        cur_index = last_index // steps[index_idx]   # floor division: index math must stay integral under Python 3
indexes[index_idx] = cur_index
last_index -= (cur_index * steps[index_idx])
if last_index < 0: break
if last_index > 0: indexes[0] = last_index
train_mats = []
test_mats = []
for class_num in range(nclasses):
class_indexes = [ (count+indexes[class_num]+1) % lengths[class_num] for count in range (max_samples) ]
train_mats.append (np.take (data_matrix_list[class_num], class_indexes, axis=0) )
test_mats.append (np.take (data_matrix_list[class_num], [indexes[class_num]], axis=0) )
return (train_mats, test_mats)
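A minimal sketch of the round-robin iterator on two tiny toy classes (the shapes are what matter here): every iteration holds out one sample per class, and each class contributes min(lengths) - 1 training rows.
In [ ]:
class_a = np.arange(6.0).reshape(3, 2)    # 3 samples, 2 features
class_b = np.arange(4.0).reshape(2, 2)    # 2 samples, 2 features
for idx in range (3 * 2):                 # iterations = product of class sizes
    train, test = round_robin_iteration (idx, [class_a, class_b])
    print (idx, [t.shape for t in train], [t.shape for t in test])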
In [ ]:
def Fisher(train_mat, train_vals):
    """Takes a samples-by-features matrix and its class labels as input and calculates
    a Fisher discriminant score for each feature. Returns the scores as a list.
    For:
        N = number of classes
        F = number of features
        It = total number of images in training set
        Ic = number of images in a given class
    """
# we deal with NANs/INFs separately, so turn off numpy warnings about invalid floats.
oldsettings = np.seterr(all='ignore')
(class_mats, class_vals) = get_class_mat_list (train_mat, train_vals)
# 1D matrix 1 * F
population_means = np.mean( train_mat, axis = 0 )
    n_classes = len( class_mats )
n_features = train_mat.shape[1]
# 2D matrix shape N * F
intra_class_means = np.empty( [n_classes, n_features] )
# 2D matrix shape N * F
intra_class_variances = np.empty( [n_classes, n_features] )
class_index = 0
for class_feature_matrix in class_mats:
intra_class_means[ class_index ] = np.mean( class_feature_matrix, axis=0 )
# Note that by default, numpy divides by N instead of the more common N-1, hence ddof=1.
intra_class_variances[ class_index ] = np.var( class_feature_matrix, axis=0, ddof=1 )
class_index += 1
# 1D matrix 1 * F
# we deal with NANs/INFs separately, so turn off numpy warnings about invalid floats.
# for the record, in numpy:
# 1./0. = inf, 0./inf = 0., 1./inf = 0. inf/0. = inf, inf/inf = nan
# 0./0. = nan, nan/0. = nan, 0/nan = nan, nan/nan = nan, nan/inf = nan, inf/nan = nan
# We can't deal with NANs only, must also deal with pos/neg infs
# The masked array allows for dealing with "invalid" floats, which includes nan and +/-inf
denom = np.mean( intra_class_variances, axis = 0 )
denom[denom == 0] = np.nan
feature_weights_m = np.ma.masked_invalid (
( np.square( population_means - intra_class_means ).sum( axis = 0 ) /
(n_classes - 1) ) / denom
)
# return numpy error settings to original
np.seterr(**oldsettings)
# the filled(0) method of the masked array sets all nan and infs to 0
fisher_values = feature_weights_m.filled(0).tolist()
return (fisher_values)
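A quick check of the Fisher scores on a toy two-class matrix (values are illustrative): the first feature separates the classes while the second is pure noise, so the first score should dominate.
In [ ]:
rng = np.random.RandomState(0)
toy = np.column_stack ([
    np.r_[np.zeros(10), np.ones(10) * 5.0] + rng.randn(20) * 0.1,   # separates the classes
    rng.randn(20),                                                  # uninformative noise
])
toy_labels = [0] * 10 + [1] * 10
print (Fisher (toy, toy_labels))   # first score >> second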
In [ ]:
def Pearson(train_mat, train_vals):
"""Calculate regression parameters and correlation statistics that fully define
a continuous classifier.
At present the feature weights are proportional the Pearson correlation coefficient
for each given feature."""
from scipy import stats
# Known issue: running stats.linregress() with np.seterr (all='raise') has caused
# arithmetic underflow (FloatingPointError: 'underflow encountered in stdtr' )
# I think this is something we can safely ignore in this function, and return settings
# back to normal at the end. -CEC
    oldsettings = np.seterr (under='ignore')
pearson_coeffs = np.zeros(train_mat.shape[1])
for feature_index in range( train_mat.shape[1] ):
slope, intercept, pearson_coeff, p_value, std_err = stats.linregress(
train_vals, train_mat[:,feature_index]
)
pearson_coeffs[feature_index] = pearson_coeff
    # Return the squared Pearson coefficients (r^2) as the feature weights
pearson_coeffs *= pearson_coeffs
    # Restore the original numpy error settings
    np.seterr (**oldsettings)
return pearson_coeffs
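The same kind of check for the Pearson weights, sketched on synthetic regression data (illustrative only): the correlated feature's r^2 should come out near 1 and the noise feature's near 0.
In [ ]:
rng = np.random.RandomState(0)
y = np.linspace (0.0, 10.0, 50)
X = np.column_stack ([y + rng.randn(50) * 0.5,   # strongly correlated with y
                      rng.randn(50)])            # uncorrelated noise
print (Pearson (X, y))   # first value near 1, second near 0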
In [ ]:
def sort_by_weight (the_mat, feature_weights):
i = np.argsort(feature_weights)
sort_mat = the_mat[:,i]
sort_mat = np.fliplr(sort_mat)
return (sort_mat)
def weigh_sort(train, test, feature_weights):
weigh_train = np.multiply (train, feature_weights)
weigh_test = np.multiply (test, feature_weights)
sorted_train = sort_by_weight (weigh_train, feature_weights)
sorted_test = sort_by_weight (weigh_test, feature_weights)
return (sorted_train, sorted_test)
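A one-line illustration of weigh_sort (toy numbers): columns are multiplied by their weights, then reordered so the highest-weighted column comes first.
In [ ]:
w = np.array([0.1, 0.9, 0.5])
m = np.array([[1.0, 2.0, 3.0]])
print (weigh_sort (m, m, w))   # both outputs become [[1.8, 1.5, 0.1]]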
In [ ]:
def marg_prob_to_pred_value (marg_probs, class_vals):
weighted = np.array(marg_probs)*np.array(class_vals)
return (np.sum(weighted))
def WND5 (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
n_test_samples = contig_test_mat.shape[0]
n_train_samples = contig_train_mat.shape[0]
predicted_classes = np.zeros(n_test_samples)
predicted_values = np.zeros(n_test_samples)
    epsilon = np.finfo( float ).eps   # note: the np.float alias was removed in newer numpy
for testimg_idx in range( n_test_samples ):
# initialize
class_dists = {}
class_counts = {}
classnames_list = []
for trainimg_idx in range( n_train_samples ):
train_class_label = contig_train_vals[trainimg_idx]
if not train_class_label in class_dists:
class_dists [train_class_label] = 0.0
class_counts[train_class_label] = 0.0
classnames_list.append (train_class_label)
            dists = np.absolute (contig_train_mat [trainimg_idx] - contig_test_mat [testimg_idx])
            w_dist = np.sum( dists )
            # Skip training samples that are (nearly) identical to the test sample;
            # a near-zero distance would otherwise produce a near-infinite similarity.
            if w_dist > epsilon:
                class_counts[train_class_label] += 1.0
            else:
                continue
w_dist = np.sum( np.square( dists ) )
# The exponent -5 is the "5" in "WND5"
class_dists[ train_class_label ] += w_dist ** -5
class_idx = 0
class_similarities = [0]*len(class_dists)
for class_label in classnames_list:
class_similarities[class_idx] = class_dists[class_label] / class_counts[class_label]
class_idx += 1
norm_factor = sum( class_similarities )
marg_probs = np.array( [ x / norm_factor for x in class_similarities ] )
predicted_class_idx = marg_probs.argmax()
predicted_classes[testimg_idx] = classnames_list[ predicted_class_idx ]
predicted_values[testimg_idx] = marg_prob_to_pred_value (marg_probs, classnames_list)
return (predicted_classes, predicted_values)
def WND5_Cls (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
predicted_classes, predicted_values = WND5 (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs)
return (predicted_classes)
def WND5_Reg (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
predicted_classes, predicted_values = WND5 (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs)
return (predicted_values)
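A minimal sketch of WND5 on two well-separated toy clusters (labels and values are illustrative, assuming the cells above have been run): each test sample should land in its nearby class.
In [ ]:
wnd_train = np.array([[0.0, 0.0], [1.0, 1.0], [10.0, 10.0], [11.0, 11.0]])
wnd_labels = np.array([0, 0, 1, 1])
wnd_test = np.array([[0.5, 0.5], [10.5, 10.5]])
classes, values = WND5 (wnd_train, wnd_test, wnd_labels)
print (classes)   # expected [0., 1.]
print (values)    # marginal-probability-weighted label estimates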
In [ ]:
def rand_forest_clf (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    rnd_state = kwargs.get ('rnd_state', None)
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators = 30, random_state = rnd_state)
clf.fit(contig_train_mat, contig_train_vals)
predicted_classes = clf.predict(contig_test_mat)
return (predicted_classes)
def rand_forest_reg (contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    rnd_state = kwargs.get ('rnd_state', None)
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=30, random_state = rnd_state)
forest.fit(contig_train_mat, contig_train_vals)
predicted = forest.predict(contig_test_mat)
return (predicted)
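The random-forest wrappers can be smoke-tested the same way; a minimal sketch on one-feature toy data (rnd_state is the wrapper's keyword, values illustrative):
In [ ]:
rf_train = np.array([[0.0], [1.0], [10.0], [11.0]])
rf_vals = np.array([0.0, 1.0, 10.0, 11.0])
print (rand_forest_reg (rf_train, np.array([[5.0]]), rf_vals, rnd_state=0))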
In [ ]:
def lin_reg(contig_train_mat, contig_test_mat, contig_train_vals, **kwargs):
    from sklearn.linear_model import LinearRegression
    model = LinearRegression()   # local name must not shadow this function
    model.fit(contig_train_mat, contig_train_vals)
    predicted = model.predict(contig_test_mat)
return (predicted)
In [ ]:
def R2_vs_N_features (regressor, contig_mat, mat_vals, feat_range):
    from scipy.stats import pearsonr   # needed for the final R^2 calculation below
    niter = contig_mat.shape[0]
pred_vals = np.zeros ([niter,len(feat_range)])
    print ('Train set size    : {}'.format(contig_mat.shape[0]-1))
print ('N Features : {}-{}'.format(feat_range[0],feat_range[-1]))
print ('Iterations : {}'.format(niter))
actual = []
for iter_idx in range ( niter ):
# Split
contig_train_mat = np.delete(contig_mat,[iter_idx],axis=0)
contig_test_mat = np.asarray([contig_mat[iter_idx]])
contig_train_vals = np.delete (mat_vals, [iter_idx])
contig_test_vals = np.asarray ([mat_vals[iter_idx]])
# Normalize
norm_train, norm_test = normalize (contig_train_mat, contig_test_mat)
# Reduce
feature_weights = Pearson(norm_train, contig_train_vals)
sorted_train, sorted_test = weigh_sort (norm_train, norm_test, feature_weights)
# print (sorted_train, sorted_test)
# Classify with different numbers of features
nfeatures_idx = 0
for nfeatures in feat_range:
# Classify
# preds,pred_val = WND5(sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals)
# pred_val = rand_forest_reg (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals, iter_idx*nfeatures)
# pred_val = lin_reg (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals)
pred_val = regressor (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals, rnd_state = iter_idx*nfeatures)
# print ('nfeatures: {}, iter_idx: {}'.format(nfeatures,iter_idx))
# print ('\rIteration {}; Predictions: {}; actual: {}\n'.format(iter_idx, pred_val[0], mat_vals[iter_idx]), end="")
pred_vals[iter_idx][nfeatures_idx] = pred_val[0]
nfeatures_idx += 1
actual.append (mat_vals[iter_idx])
print ("\rIteration {}".format(iter_idx), end="")
print ()
print ("Calculating R^2")
r_squareds = [0]*len(feat_range)
nfeatures_idx = 0
for nfeatures in feat_range:
score, p_value = pearsonr(pred_vals[:,nfeatures_idx], actual)
r_squareds[nfeatures_idx] = score * score
print ("\rN features {}".format(nfeatures), end="")
nfeatures_idx += 1
print ()
return (r_squareds)
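A usage sketch for R2_vs_N_features on synthetic data (20 samples, 1 informative feature plus 9 noise features; all values illustrative). It assumes the cells above have been run and can take a little while, since one model is fit per (iteration, feature-count) pair.
In [ ]:
rng = np.random.RandomState(0)
toy_vals = np.linspace (1.0, 10.0, 20)
toy_feats = np.column_stack ([toy_vals + rng.randn(20) * 0.5,   # informative feature
                              rng.randn(20, 9)])                # 9 noise features
r_squareds = R2_vs_N_features (rand_forest_reg, toy_feats, toy_vals, range(1, 11))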
In [ ]:
def Graph_NFeat_vs_R2 (title, feat_range, avg_score):
pl.figure()
plot_score, = pl.plot(feat_range, avg_score, 'b', label="Without LDA")
best_score_arg = np.nanargmax (avg_score)
best_num_feat = feat_range[best_score_arg]
best_score = avg_score[best_score_arg]
text_x = feat_range[0]+(0.22*(feat_range[-1]-feat_range[0]))
max_y = 0.5
pl.text(text_x,0.9*max_y, 'Max = {:.4g} @ {} features'.format (best_score, best_num_feat))
pl.title(title)
pl.xlabel('Features')
pl.ylabel('Coefficient of Determination '+'( $R^2$)')
pl.ylim([0.0, max_y])
pl.xlim([feat_range[0],feat_range[-1]])
# pl.savefig(os.path.join (Config.graph_dir(),title+'.png'), format='png', dpi=150)
pl.show()
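Continuing the sketch above, the r_squareds curve from the synthetic run can be plotted directly:
In [ ]:
Graph_NFeat_vs_R2 ('Toy regression (random forest)', range(1, 11), r_squareds)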
In [ ]:
def ClsAcc_vs_N_features (classifier, feat_rank_alg, class_mat_list, class_vals, feat_range):
    niter = np.prod ( [m.shape[0] for m in class_mat_list] )
n_classes = len (class_mat_list)
n_correct = np.zeros( [len(feat_range), n_classes] )
(train,test) = round_robin_iteration (0,class_mat_list)
max_features = train[0].shape[1]
print ('Train class sizes : {}'.format([x.shape[0] for x in train]))
print ('N Features : {}-{}'.format(feat_range[0],feat_range[-1]))
print ('Iterations : {}'.format(niter))
print ('class_vals : {}'.format(class_vals))
for iter_idx in range ( niter ):
# Split
(train,test) = round_robin_iteration (iter_idx,class_mat_list)
(contig_train_mat, contig_train_vals) = list_to_contig_mat (train, class_vals)
(contig_test_mat, contig_test_vals) = list_to_contig_mat (test, class_vals)
# Normalize
(norm_train, norm_test) = normalize (contig_train_mat, contig_test_mat)
# Reduce
# feature_weights = Pearson (norm_train, contig_train_vals)
# feature_weights = Fisher (norm_train, contig_train_vals)
feature_weights = feat_rank_alg (norm_train, contig_train_vals)
(sorted_train, sorted_test) = weigh_sort (norm_train, norm_test, feature_weights)
# Classify with different numbers of features
# preds = rand_forest_clf (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals, rnd_state = iter_idx)
# preds = WND5_Cls (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals, rnd_state = iter_idx)
nfeatures_idx = 0
for nfeatures in feat_range:
preds = classifier (sorted_train[:,:nfeatures], sorted_test[:,:nfeatures], contig_train_vals, rnd_state = iter_idx)
for pred_idx in range (len(preds)):
if (preds[pred_idx] == contig_test_vals[pred_idx]):
n_correct[nfeatures_idx][pred_idx] += 1
cumul_acc = [(float(x) / (float(iter_idx)+1.0)) for x in n_correct[nfeatures_idx]]
print ('\rIteration {}; cumulative accuracies: {}'.format(iter_idx, cumul_acc), end="")
# print ('Iteration {}; preds: {}, cumul_acc: {}, n_correct[{}]: {}'.format(iter_idx, preds, cumul_acc, nfeatures_idx, n_correct[nfeatures_idx]))
nfeatures_idx += 1
print ()
nfeatures_idx = 0
mean_feat_acc = np.zeros([len(feat_range),n_classes])
for nfeatures in feat_range:
for cl in range (n_classes):
mean_feat_acc[nfeatures_idx][cl] = float(n_correct[nfeatures_idx][cl]) / float(niter)
print ("\rN features {}".format(nfeatures), end="")
nfeatures_idx += 1
print ()
return (mean_feat_acc)
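A usage sketch for the classification loop on two small synthetic classes (4 samples each, 2 features; values illustrative), pairing WND5_Cls with Fisher ranking; 16 round-robin iterations run in total.
In [ ]:
rng = np.random.RandomState(0)
cls0 = np.column_stack ([rng.randn(4), rng.randn(4)])        # centered at 0
cls1 = np.column_stack ([rng.randn(4) + 3.0, rng.randn(4)])  # shifted in feature 0
mean_acc = ClsAcc_vs_N_features (WND5_Cls, Fisher, [cls0, cls1], [0, 1], range(1, 3))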
In [ ]:
def Graph_NFeat_vs_ClsAcc (title, feat_range, mean_feat_acc, class_vals ):
pl.figure()
color_list = pl.cm.Dark2(np.linspace(0, 1, 12))
pl.title(title)
pl.xlabel('Features')
pl.ylabel('Classification Accuracy')
text_y = .8
text_y_drop = 0.1
pl.ylim([0.0, 1.1])
pl.xlim([feat_range[0],feat_range[-1]])
for cls in range(mean_feat_acc.shape[1]):
# plot_score, = pl.plot(feat_range, avg_score, 'b', label="Without LDA")
plot_score, = pl.plot(feat_range, mean_feat_acc[:,cls], color=color_list[cls], label="{}".format(class_vals[cls]))
best_score_arg = np.nanargmax (mean_feat_acc[:,cls])
best_num_feat = feat_range[best_score_arg]
best_score = mean_feat_acc[best_score_arg][cls]
text_x = feat_range[0]+(0.22*(feat_range[-1]-feat_range[0]))
pl.text(text_x,text_y, 'Class {} Max = {:.4g} @ {} features'.format (class_vals[cls],best_score, best_num_feat))
text_y -= text_y_drop
# pl.savefig(os.path.join (Config.graph_dir(),title+'.png'), format='png', dpi=150)
pl.show()
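Continuing the classification sketch, the per-class accuracy curves can be plotted with the same feature range:
In [ ]:
Graph_NFeat_vs_ClsAcc ('Toy classification (WND5 + Fisher)', range(1, 3), mean_acc, [0, 1])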
In [ ]:
def Mean_Feat_weights (rank_alg, class_mat_list, class_vals, feat_names):
    niter = np.prod ( [m.shape[0] for m in class_mat_list] )
n_classes = len (class_mat_list)
(train,test) = round_robin_iteration (0,class_mat_list)
max_features = train[0].shape[1]
print ('Train class sizes : {}'.format([x.shape[0] for x in train]))
print ('Iterations : {}'.format(niter))
print ('class_vals : {}'.format(class_vals))
mean_feature_weights = np.zeros(max_features)
for iter_idx in range ( niter ):
# Split
(train,test) = round_robin_iteration (iter_idx,class_mat_list)
(contig_train_mat, contig_train_vals) = list_to_contig_mat (train, class_vals)
(contig_test_mat, contig_test_vals) = list_to_contig_mat (test, class_vals)
# Normalize
(norm_train, norm_test) = normalize (contig_train_mat, contig_test_mat)
# Reduce
# feature_weights = Pearson (norm_train, contig_train_vals)
# feature_weights = Fisher (norm_train, contig_train_vals)
feature_weights = rank_alg (norm_train, contig_train_vals)
mean_feature_weights += feature_weights
print ('\rIteration {}'.format(iter_idx), end="")
print ()
    mean_feature_weights /= niter   # iter_idx ends at niter-1, so divide by the true iteration count
    i = np.argsort(mean_feature_weights)[::-1]   # indexes in descending weight order
    feat_names = np.asarray(feat_names)          # accept plain lists of names as well
    sorted_feature_names = feat_names[i]
    sorted_feature_means = mean_feature_weights[i]
    sorted_feature_indexes = np.arange(max_features)[i]
return (sorted_feature_names, sorted_feature_indexes, sorted_feature_means)
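Reusing the toy classes from the classification sketch above (the feature names are made up for illustration), the averaged Fisher ranking should put the shifted feature first:
In [ ]:
toy_feat_names = np.array(['separating_feature', 'noise_feature'])
names, idxs, means = Mean_Feat_weights (Fisher, [cls0, cls1], [0, 1], toy_feat_names)
print (names)   # 'separating_feature' should rank first
print (means)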
In [ ]:
try:
from IPython.display import clear_output
have_ipython = True
except ImportError:
have_ipython = False
class ProgressBar(object):
def __init__(self, iterations):
self.iterations = iterations
self.prog_bar = '[]'
self.fill_char = '*'
self.width = 20
self.__update_amount(0)
if have_ipython:
self.animate = self.animate_ipython
else:
self.animate = self.animate_noipython
    def animate_ipython(self, iter):
        print ('\r' + str(self), end="")
        sys.stdout.flush()
        self.update_iteration(iter + 1)
        if (iter + 1 >= self.iterations):
            print ()
    def animate_noipython(self, iter):
        # Fallback when IPython is unavailable; the carriage-return redraw
        # works in most plain terminals as well.
        self.animate_ipython(iter)
def update_iteration(self, elapsed_iter):
self.__update_amount((elapsed_iter / float(self.iterations)) * 100.0)
self.prog_bar += ' %d of %s complete ' % (elapsed_iter, self.iterations)
    def __update_amount(self, new_amount):
        percent_done = int(round(new_amount))
        all_full = self.width - 2
        num_hashes = int(round((percent_done / 100.0) * all_full))
self.prog_bar = '[' + self.fill_char * num_hashes + ' ' * (all_full - num_hashes) + ']'
pct_place = (len(self.prog_bar) // 2) - len(str(percent_done))
pct_string = '%d%%' % percent_done
self.prog_bar = self.prog_bar[0:pct_place] + \
(pct_string + self.prog_bar[pct_place + len(pct_string):])
def __str__(self):
return str(self.prog_bar)
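A short usage sketch for the ProgressBar (the sleep is only there to make the redraw visible):
In [ ]:
pb = ProgressBar (50)
for i in range (50):
    time.sleep (0.01)
    pb.animate (i)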