In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import model_selection as ms
from sklearn import datasets, metrics, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from smrt import smrt_balance
import time
from matplotlib.legend_handler import HandlerLine2D
seed = 42
In [2]:
def eval_unbalanced(train_data, label_data):
    """Fit the module-level ``rf`` on raw (unbalanced) data and score it.

    Relies on the notebook globals ``rf``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    train_data : array-like
        Training feature matrix.
    label_data : array-like
        Training labels.

    Returns
    -------
    tuple of (dict, dict)
        ``(scores, times)`` where ``scores`` holds Cohen kappa, accuracy,
        ROC AUC and average precision on ``X_test``/``y_test``, and
        ``times`` holds balance/fit/combined wall-clock seconds.
    """
    # FIX: the original timed an empty span (start/end back-to-back) to fake
    # a balance time. The baseline does no resampling, so its cost is zero.
    balance_time = 0.0
    start_fit = time.time()
    rf.fit(train_data, label_data)
    fit_time = time.time() - start_fit
    preds = rf.predict(X_test)
    base_scores_dict = {
        'Cohen Kappa Score': metrics.cohen_kappa_score(y_test, preds),
        'Accuracy Score': metrics.accuracy_score(y_test, preds),
        'AUC Score': metrics.roc_auc_score(y_test, preds),
        "Average Precision Score": metrics.average_precision_score(y_test, preds),
    }
    base_time_dict = {
        'Balance Time': balance_time,
        'Fit Time': fit_time,
        'Combined Time': balance_time + fit_time,
    }
    return base_scores_dict, base_time_dict
#function for returning evaluation metric and time dictionaries over a range of balance ratios for plotting
def eval_sampler(train_data, label_data, balance_ratio_array):
    """Balance with SMRT at each ratio, refit ``rf`` and collect scores/times.

    Relies on the notebook globals ``rf``, ``X_test``, ``y_test``, ``seed``
    and the imported ``smrt_balance``.

    Parameters
    ----------
    train_data : array-like
        Training feature matrix.
    label_data : array-like
        Training labels.
    balance_ratio_array : sequence of float
        Balance ratios to pass to ``smrt_balance`` (one model fit per ratio).

    Returns
    -------
    tuple of (dict, dict)
        ``(scores, times)`` — each value is a list aligned with
        ``balance_ratio_array``.
    """
    cohen_scores = []
    acc_scores = []
    auc_scores = []
    avg_prec_scores = []
    balance_time_list = []
    fit_time_list = []
    combined_time_list = []
    # BUG FIX: the loop previously iterated over the *global* ``balance_arr``
    # instead of the ``balance_ratio_array`` argument, silently ignoring the
    # parameter's length. Iterate the argument directly.
    for ratio in balance_ratio_array:
        start_bal = time.time()
        X_bal, y_bal = smrt_balance(train_data, label_data, seed=seed,
                                    shuffle=False, balance_ratio=ratio)
        balance_time = time.time() - start_bal
        start_fit = time.time()
        rf.fit(X_bal, y_bal)
        fit_time = time.time() - start_fit
        preds = rf.predict(X_test)
        cohen_scores.append(metrics.cohen_kappa_score(y_test, preds))
        acc_scores.append(metrics.accuracy_score(y_test, preds))
        auc_scores.append(metrics.roc_auc_score(y_test, preds))
        avg_prec_scores.append(metrics.average_precision_score(y_test, preds))
        balance_time_list.append(balance_time)
        fit_time_list.append(fit_time)
        combined_time_list.append(balance_time + fit_time)
    scores_dictionary = {'Cohen Kappa Score': cohen_scores, 'Accuracy Score': acc_scores,
                         'AUC Score': auc_scores, "Average Precision Score": avg_prec_scores}
    time_dictionary = {'Balance Time': balance_time_list, 'Fit Time': fit_time_list,
                       'Combined Time': combined_time_list}
    return scores_dictionary, time_dictionary
#function for plotting performance measure information over a range of balance ratios
def plot_performance(eval_score_dictionary, balance_ratio_array):
    """Draw one subplot per evaluation metric (2x2 grid) against balance ratio.

    ``eval_score_dictionary`` maps metric name -> list of scores aligned with
    ``balance_ratio_array`` (as produced by ``eval_sampler``).
    """
    plt.figure(1)
    # Place each metric in the next cell of the 2x2 grid, in dict order.
    for position, metric_name in enumerate(eval_score_dictionary, start=1):
        plt.subplot(2, 2, position)
        plt.title("{} Curve with SMRT".format(metric_name))
        plt.xlabel("Balance Ratio")
        plt.ylabel("{}".format(metric_name))
        plt.tight_layout()
        plt.plot(balance_ratio_array, eval_score_dictionary[metric_name])
    plt.show()
#function for plotting performance measure information over a range of balance ratios
def plot_performance_comp(eval_score_dictionary, base_score_dictionary, balance_ratio_array):
    """Draw each SMRT metric curve with the baseline score as a red reference line.

    ``eval_score_dictionary`` maps metric name -> list of SMRT scores;
    ``base_score_dictionary`` maps the same names -> single baseline score.
    """
    plt.figure(1)
    for position, metric_name in enumerate(eval_score_dictionary, start=1):
        plt.subplot(2, 2, position)
        plt.title("{} Curve with SMRT".format(metric_name))
        plt.xlabel("Balance Ratio")
        plt.ylabel("{}".format(metric_name))
        # Baseline (unbalanced) score drawn as a horizontal reference.
        plt.axhline(y=base_score_dictionary[metric_name], color='r', linestyle='-', label='Base Estimator')
        plt.tight_layout()
        plt.plot(balance_ratio_array, eval_score_dictionary[metric_name], label='SMRT Estimator')
        plt.legend(loc="lower right", ncol=1, shadow=True, fancybox=True)
    plt.show()
#function for plotting timing information over a range of balance ratios
def plot_time(time_dictionary, balance_ratio_array):
    """Plot balance/fit/combined times vs. balance ratio, plus an overview panel.

    Panels 1-3 show each timing series from ``time_dictionary`` individually;
    panel 4 overlays all three with a legend.
    """
    plt.figure(1)
    count = 0
    for key in time_dictionary:
        count += 1
        plt.subplot(2, 2, count)
        plt.title("{}".format(key))
        plt.xlabel("Balance Ratio")
        plt.ylabel("{}".format(key))
        plt.tight_layout()
        plt.plot(balance_ratio_array, time_dictionary[key])
    # Overview panel: all three timing curves together.
    plt.subplot(2, 2, 4)
    bal_line, = plt.plot(balance_ratio_array, time_dictionary['Balance Time'], label='Balance Time')
    plt.plot(balance_ratio_array, time_dictionary['Fit Time'], label='Fit Time')
    plt.plot(balance_ratio_array, time_dictionary['Combined Time'], label='Combined Time')
    plt.title('Time Overview')
    plt.xlabel('Balance Ratio')
    plt.ylabel('Time')
    # FIX: the original called plt.legend twice on this axes; one call suffices.
    plt.legend(handler_map={bal_line: HandlerLine2D(numpoints=4)})
    plt.tight_layout()
    plt.show()
#function for plotting timing information over a range of balance ratios
def plot_time_comp(eval_time_dictionary, base_time_dictionary, balance_ratio_array):
    """Plot SMRT timing curves vs. balance ratio against the baseline times.

    Panels 1-3 show each SMRT timing series with the matching baseline value
    as a red horizontal reference; panel 4 overlays all three SMRT curves
    plus the baseline combined time.
    """
    plt.figure(1)
    count = 0
    for key in eval_time_dictionary:
        count += 1
        plt.subplot(2, 2, count)
        plt.title("{}".format(key))
        plt.xlabel("Balance Ratio")
        plt.ylabel("{}".format(key))
        plt.axhline(y=base_time_dictionary[key], color='r', linestyle='-', label='Base Estimator')
        plt.tight_layout()
        plt.plot(balance_ratio_array, eval_time_dictionary[key], label='SMRT Estimator')
        plt.legend(loc="lower right", ncol=1, shadow=True, fancybox=True)
    # Overview panel: all three SMRT curves plus the baseline combined time.
    plt.subplot(2, 2, 4)
    bal_line, = plt.plot(balance_ratio_array, eval_time_dictionary['Balance Time'], label='Balance Time')
    plt.plot(balance_ratio_array, eval_time_dictionary['Fit Time'], label='Fit Time')
    plt.plot(balance_ratio_array, eval_time_dictionary['Combined Time'], label='Combined Time')
    # FIX: label the baseline reference so it appears in the legend
    # (it was previously drawn unlabeled and omitted).
    plt.axhline(y=base_time_dictionary['Combined Time'], color='r', linestyle='-', label='Base Combined Time')
    plt.title('Time Overview')
    plt.xlabel('Balance Ratio')
    plt.ylabel('Time')
    plt.legend(handler_map={bal_line: HandlerLine2D(numpoints=4)})
    plt.tight_layout()
    plt.show()
In [3]:
# Generate a synthetic, imbalanced binary classification dataset:
# 50,000 samples, 100 features, ~95%/5% class split (via ``weights``),
# seeded with the notebook-level ``seed`` for reproducibility.
X, y = datasets.make_classification(n_classes=2, class_sep=2,
weights=[.95, .05], n_informative=50,
n_redundant=10, flip_y=0, n_features=100,
n_clusters_per_class=4, n_samples=50000,
random_state=seed)
In [4]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.3)
In [5]:
# Baseline random forest fit on the raw (unbalanced) training data.
# n_jobs=-1 uses all available cores. NOTE: this ``rf`` instance is reused
# and refit by eval_unbalanced / eval_sampler above.
rf = RandomForestClassifier(random_state=seed,n_jobs = -1)
rf.fit(X_train,y_train)
preds = rf.predict(X_test)
In [6]:
print(metrics.classification_report(y_test,preds))
In [7]:
# Baseline (no balancing) scores for the four evaluation metrics.
cohen_score = metrics.cohen_kappa_score(y_test, preds)
acc_score = metrics.accuracy_score(y_test, preds)
auc_score = metrics.roc_auc_score(y_test, preds)
avg_prec = metrics.average_precision_score(y_test, preds)
# Report each score; message templates preserved verbatim.
for template, score in [("Cohen Kappa score is {}", cohen_score),
                        ("Accuracy Score is {}", acc_score),
                        ("AUC Score is {}", auc_score),
                        ("Average Precision is {}", avg_prec)]:
    print(template.format(score))
In [8]:
# Balance ratios to sweep: 0.1, 0.2, ..., 0.9 (passed to smrt_balance).
balance_arr = np.arange(.1,1,.1)
# Score/time dictionaries for the SMRT-balanced models, one entry per ratio.
bal_model_scores, bal_model_times = eval_sampler(X_train,y_train,balance_arr)
# Score/time dictionaries for the unbalanced baseline model.
base_scores, base_times = eval_unbalanced(X_train, y_train)
In [9]:
plot_performance(bal_model_scores,balance_arr)
In [10]:
plot_performance_comp(bal_model_scores,base_scores,balance_arr)
In [11]:
plot_time(bal_model_times, balance_arr)
In [12]:
plot_time_comp(bal_model_times,base_times,balance_arr)