In [47]:
## https://elitedatascience.com/imbalanced-classes
import pandas as pd
import numpy as np
df = pd.read_csv('balance-scale.data.txt',
                 names=['balance', 'var1', 'var2', 'var3', 'var4'])
df.head()
Out[47]:
In [48]:
%pwd
Out[48]:
In [49]:
df.balance.value_counts()  # equivalent to df['balance'].value_counts()
Out[49]:
In [50]:
# Transform into a binary problem: 1 if the scale is balanced ('B'), 0 otherwise
df.balance = [1 if b == 'B' else 0 for b in df.balance]
df.balance.value_counts()
Out[50]:
In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
y = df.balance
X = df.drop('balance', axis =1)
clf_0 = LogisticRegression().fit(X,y)
pred_y_0 = clf_0.predict(X)
In [52]:
print("Accuracy_score::", accuracy_score(pred_y_0,y))
In [53]:
print("np.unique(pred_y_0)::", np.unique(pred_y_0))
In [54]:
## As you can see, this model is only predicting 0, which means it's completely ignoring
## the minority class in favor of the majority class.
Next, we'll create a new DataFrame with an up-sampled minority class. Up-sampling randomly duplicates observations from the minority class to reinforce its signal; the most common way to do this is to resample with replacement. Here are the steps:
1. First, we'll separate observations from each class into different DataFrames.
2. Next, we'll resample the minority class with replacement, setting the number of samples to match that of the majority class.
3. Finally, we'll combine the up-sampled minority class DataFrame with the original majority class DataFrame.
In [14]:
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]
# Upsample minority class
df_minority_upsampled = resample(df_minority,
                                 replace=True,      # sample with replacement
                                 n_samples=576,     # to match majority class count
                                 random_state=123)  # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
# Display new class counts
df_upsampled.balance.value_counts()
Out[14]:
As you can see, the new DataFrame has more observations than the original, and the ratio of the two classes is now 1:1.
Let's train another model using Logistic Regression, this time on the balanced dataset:
In [15]:
y = df_upsampled.balance
X = df_upsampled.drop('balance', axis =1)
clf_1 = LogisticRegression().fit(X,y)
pred_y_1 = clf_1.predict(X)
print("np.unique(pred_y_1)::", np.unique(pred_y_1))
In [18]:
print("Accuracy_score after upscaling::", accuracy_score(y,pred_y_1))
Great, now the model is no longer predicting just one class. While the accuracy also took a nosedive, it's now more meaningful as a performance metric.
Down-sampling involves randomly removing observations from the majority class to keep its signal from dominating the learning algorithm. The most common heuristic for doing so is resampling without replacement.
The process is similar to that of up-sampling. Here are the steps:
1. First, we'll separate observations from each class into different DataFrames.
2. Next, we'll resample the majority class without replacement, setting the number of samples to match that of the minority class.
3. Finally, we'll combine the down-sampled majority class DataFrame with the original minority class DataFrame.
In [20]:
# Separate majority and minority classes
df_majority = df[df.balance == 0]
df_minority = df[df.balance == 1]
# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,     # sample without replacement
                                   n_samples=49,      # to match minority class count
                                   random_state=123)  # reproducible results
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled,df_minority])
df_downsampled.balance.value_counts()
Out[20]:
In [21]:
##### https://elitedatascience.com/imbalanced-classes
# Separate input features (X) and target variable (y)
y = df_downsampled.balance
X = df_downsampled.drop('balance', axis = 1)
clf_2 = LogisticRegression().fit(X,y)
pred_y_2 = clf_2.predict(X)
# Is our model still predicting just one class?
print(np.unique(pred_y_2))
In [22]:
# How's our accuracy?
print( accuracy_score(y, pred_y_2) )
In [26]:
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y, pred_y_2)
print("confusion::\n", confusion)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Accuracy = (TP + TN) / (TP + TN + FP + FN)
# Precision = TP / (TP + FP)
# Recall = TP / (TP + FN) Also known as sensitivity, or True Positive Rate
# F1 = 2 * Precision * Recall / (Precision + Recall)
print('Accuracy: {:.2f}'.format(accuracy_score(y, pred_y_2)))
print('Precision: {:.2f}'.format(precision_score(y, pred_y_2)))
print('Recall: {:.2f}'.format(recall_score(y, pred_y_2)))
print('F1: {:.2f}'.format(f1_score(y, pred_y_2)))
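To make the formulas in the comments above concrete, here is a small cross-check (an added sketch, not part of the original walkthrough) that re-derives the same metrics directly from the confusion matrix computed above; it assumes the positive class is 1, so the matrix is laid out as [[TN, FP], [FN, TP]]:
In [ ]:
# Added sketch: recompute the metrics by hand from the confusion matrix.
tn, fp, fn, tp = confusion.ravel()
accuracy_manual = (tp + tn) / (tp + tn + fp + fn)
precision_manual = tp / (tp + fp)
recall_manual = tp / (tp + fn)
f1_manual = 2 * precision_manual * recall_manual / (precision_manual + recall_manual)
print('Manual accuracy : {:.2f}'.format(accuracy_manual))
print('Manual precision: {:.2f}'.format(precision_manual))
print('Manual recall   : {:.2f}'.format(recall_manual))
print('Manual F1       : {:.2f}'.format(f1_manual))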
In [27]:
clf_2_decisionFunction = clf_2.decision_function(X)
In [28]:
clf_2_decisionFunction
Out[28]:
In [31]:
clf_2_y_score_list = list(zip(y, clf_2_decisionFunction))
clf_2_y_score_list
Out[31]:
Albert Einstein once said, "If you judge a fish on its ability to climb a tree, it will live its whole life believing that it is stupid." This quote highlights the importance of choosing the right evaluation metric.
For a general-purpose metric for classification, we recommend Area Under ROC Curve (AUROC).
We won't dive into its details in this guide, but you can read more about it here. Intuitively, AUROC represents the likelihood of your model distinguishing observations from two classes. In other words, if you randomly select one observation from each class, what's the probability that your model will be able to "rank" them correctly?
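As an added aside (not from the original article), this ranking interpretation can be checked directly: using the decision-function scores computed above, estimate the fraction of positive/negative pairs the model orders correctly and compare it with roc_auc_score further below.
In [ ]:
# Added sketch: estimate AUROC via its pairwise-ranking interpretation,
# using the decision-function scores from clf_2 computed above.
scores = np.asarray(clf_2_decisionFunction)
labels = np.asarray(y)
pos_scores = scores[labels == 1]
neg_scores = scores[labels == 0]
# Fraction of positive/negative pairs ranked correctly (ties count as half)
pairwise_auc = (pos_scores[:, None] > neg_scores[None, :]).mean() \
               + 0.5 * (pos_scores[:, None] == neg_scores[None, :]).mean()
print("Pairwise ranking estimate of AUROC:", pairwise_auc)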
In [33]:
from sklearn.metrics import roc_auc_score
# Predict class probabilities
prob_y_2 = clf_2.predict_proba(X)
prob_y_2
Out[33]:
In [35]:
# Keep only the positive class
prob_y_2 = [p[1] for p in prob_y_2 ]
prob_y_2[:5]
Out[35]:
In [41]:
## https://stackoverflow.com/questions/25009284/how-to-plot-roc-curve-in-python
from sklearn.metrics import roc_curve, auc
fpr, tpr, threshold = roc_curve(y,prob_y_2)
roc_auc = auc(fpr,tpr)
print("roc_auc:",roc_auc)
print("###################################")
import matplotlib.pyplot as plt
plt.figure()
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr, tpr, lw=3, label='LogRegr ROC curve (area = {:0.2f})'.format(roc_auc))
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('ROC curve balance', fontsize=16)
plt.legend(loc='lower right', fontsize=13)
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
plt.gca().set_aspect('equal')
plt.show()
In [36]:
###So how did this model (trained on the down-sampled dataset) do in terms of AUROC?
print("roc_auc_score::", roc_auc_score(y,prob_y_2))
Ok... and how does this compare to the original model trained on the imbalanced dataset?
In [37]:
prob_y_0 = clf_0.predict_proba(X)
prob_y_0 = [p[1] for p in prob_y_0]
print("roc_auc_score::", roc_auc_score(y,prob_y_0))
The next tactic is to use penalized learning algorithms that increase the cost of classification mistakes on the minority class.
A popular algorithm for this technique is Penalized-SVM (from sklearn.svm import SVC).
During training, we can use the argument class_weight='balanced' to penalize mistakes on the minority class by an amount proportional to how under-represented it is.
We also want to include the argument probability=True if we want to enable probability estimates for SVM algorithms.
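As a side note (an added sketch, not from the original article), scikit-learn derives the 'balanced' weights from the class frequencies as n_samples / (n_classes * np.bincount(y)), so the rarer class receives a proportionally larger penalty:
In [ ]:
# Added sketch: the per-class weights implied by class_weight='balanced'.
# weight_c = n_samples / (n_classes * count_c); rarer classes get larger weights.
counts = np.bincount(df.balance)             # class counts, [576, 49] for this dataset
balanced_weights = len(df) / (len(counts) * counts)
print(dict(zip(range(len(counts)), balanced_weights)))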
Let's train a model using Penalized-SVM on the original imbalanced dataset:
In [55]:
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis = 1)
print(y.value_counts())
from sklearn.svm import SVC
clf_3 = SVC(kernel='linear',
            class_weight='balanced',  # penalize mistakes on the minority class
            probability=True)
clf_3.fit(X,y)
# Predict on training set
pred_y_3 = clf_3.predict(X)
# Is our model still predicting just one class?
print('np.unique(pred_y_3):', np.unique(pred_y_3))
print("accuracy_score:", accuracy_score(y, pred_y_3))
# What about AUROC?
prob_y_3 = clf_3.predict_proba(X)
prob_y_3 = [p[1] for p in prob_y_3]
print("roc_auc_score(y, prob_y_3):", roc_auc_score(y,prob_y_3))
In [61]:
### Let's apply grid search to SVC
from sklearn.model_selection import GridSearchCV
clf_GS = SVC()
Cs = [0.001, 0.01, 0.1, 1, 10]
gammas = [0.001, 0.01, 0.1, 1]
kernels = ['linear', 'rbf']
params_grid = {'kernel': kernels, 'C': Cs, 'gamma': gammas}
grid_search_acc = GridSearchCV(clf_GS, param_grid=params_grid, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_acc.fit(X,y)
print('Grid best parameter(accuracy)::', grid_search_acc.best_params_)
print('Grid best score(accuracy):', grid_search_acc.best_score_)
print("#######################################################################################################")
grid_search_roc = GridSearchCV(clf_GS, param_grid = params_grid, scoring='roc_auc', n_jobs = -1,verbose = 1 )
grid_search_roc.fit(X,y)
print('Grid best parameter(roc)::', grid_search_roc.best_params_)
print('Grid best score(roc):', grid_search_roc.best_score_)
The final tactic we'll consider is using tree-based algorithms. Decision trees often perform well on imbalanced datasets because their hierarchical structure allows them to learn signals from both classes.
In modern applied machine learning, tree ensembles (Random Forests, Gradient Boosted Trees, etc.) almost always outperform singular decision trees, so we'll jump right into those:
In [46]:
from sklearn.ensemble import RandomForestClassifier
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)
print("y.value_counts():", y.value_counts())
clf_4 = RandomForestClassifier().fit(X,y)
# Predict on training set
pred_y_4 = clf_4.predict(X)
# Is our model still predicting just one class?
print('np.unique( pred_y_4 )::', np.unique( pred_y_4 ) )
# How's our accuracy?
print('accuracy_score(y,pred_y_4):',accuracy_score(y,pred_y_4))
# What about AUROC?
prob_y_4 = clf_4.predict_proba(X)
prob_y_4 =[p[1] for p in prob_y_4]
print('roc_auc_score(y, prob_y_4):',roc_auc_score(y, prob_y_4))
In [ ]: