In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# metrics to print
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# precision-recall curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
np.random.seed(1)  # to reproduce results
In [2]:
# use the Iris data and pick one flower to filter down to create an imbalance
# each class currently has 50 observations
col = ['sepal_length','sepal_width','petal_length','petal_width','type']
data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', names = col)
#pick a flower and select 10 out of the 50 observations
virginica = data[data.type == 'Iris-virginica'].sample(frac=0.2).copy()
not_virginica = data[data.type != 'Iris-virginica']
df = pd.concat([virginica,not_virginica])
#turn into binary
df['virginica'] = np.where(df['type']=='Iris-virginica', 1, 0)
df.drop('type',inplace=True, axis=1)
print('Pct Minority: ' + str(round((df.virginica.sum()/df.virginica.count())*100,2)) + '%')
print('Pct Majority: ' + str(round((1-df.virginica.sum()/df.virginica.count())*100,2)) + '%')
In [3]:
def evaluation(y, y_prob, ztype):
    # threshold the predicted probabilities at 0.5 to get hard classes
    y_class = np.where(y_prob > .5, 1, 0)
    acc = metrics.accuracy_score(y, y_class)
    f1 = metrics.f1_score(y, y_class)
    pre = precision_score(y, y_class)
    rec = recall_score(y, y_class)
    print('Evaluation for ' + ztype)
    print('Accuracy : ', str(round(acc, 4)))
    print('F1       : ', str(round(f1, 4)))
    print('Precision: ', str(round(pre, 4)))
    print('Recall   : ', str(round(rec, 4)))
    print()
    print(confusion_matrix(y, y_class))
It might sound obvious, but collecting more data is almost always overlooked.
Can you collect more data? Take a second and think about whether you can gather more observations for your problem.
A larger dataset might expose a different, and perhaps more balanced, view of the classes.
More examples of the minority class will also be useful later when we look at resampling the dataset.
As mentioned earlier, accuracy falls victim to the accuracy paradox and is no longer an appropriate measurement for imbalanced classes.
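To make the paradox concrete, here is a minimal sketch (with illustrative 90/10 counts close to the split built above): a model that always predicts the majority class looks accurate but has zero recall.
In [ ]:
# illustrative sketch: always predicting the majority class on a 90/10 split
y_true = np.array([0] * 90 + [1] * 10)
y_naive = np.zeros_like(y_true)  # predict "not virginica" every time
print('Accuracy:', metrics.accuracy_score(y_true, y_naive))  # 0.9 -- looks great
print('Recall  :', recall_score(y_true, y_naive))            # 0.0 -- finds nothing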
In [78]:
X_train, X_dev, y_train, y_dev = train_test_split(df.drop('virginica',axis=1), df.virginica, test_size=0.3,random_state=0)
print("virginica in train set = ", str(y_train.sum()))
print("virginica in dev set = ", str(y_dev.sum()))
print()
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
# get predicted probabilities for the positive class
y_train_pred = logistic.predict_proba(X_train)[:,1]
y_dev_pred = logistic.predict_proba(X_dev)[:,1]
evaluation(y_train,y_train_pred,'training set')
print()
evaluation(y_dev, y_dev_pred, 'dev set')
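The precision_recall_curve and average_precision_score imports above come in handy here; a quick sketch of the dev-set precision-recall curve for this baseline, assuming the cell above has just run:
In [ ]:
# sketch: precision-recall curve for the baseline model on the dev set
precision, recall, _ = precision_recall_curve(y_dev, y_dev_pred)
ap = average_precision_score(y_dev, y_dev_pred)
plt.step(recall, precision, where='post')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Dev-set PR curve (AP = {:.2f})'.format(ap))
plt.show()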
Risk if you undersample: you discard majority-class observations, and potentially useful signal with them.
Risk if you oversample: naively duplicating minority observations shows the model the same points over and over, which invites overfitting.
Some rules of thumb: resample only the training set, never the dev/test set, and always compare against the untouched baseline. A sketch of naive oversampling follows below; the next cells then demonstrate undersampling and SMOTE.
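For contrast with the undersampling and SMOTE cells below, here is a minimal sketch of naive random oversampling with pandas, assuming df from the data-loading cell above: it simply duplicates minority rows with replacement until the classes balance.
In [ ]:
# sketch: naive random oversampling -- duplicate minority rows with replacement
minority = df[df.virginica == 1]
majority = df[df.virginica == 0]
minority_upsampled = minority.sample(n=len(majority), replace=True, random_state=1)
df_oversample = pd.concat([majority, minority_upsampled])
print(df_oversample.virginica.value_counts())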
In [83]:
# undersample: keep all minority rows, drop half of the majority class
virginica = df[df.virginica == 1].copy()
not_virginica = df[df.virginica == 0].sample(frac=0.5).copy()
df_undersample = pd.concat([virginica,not_virginica])
X_train, X_dev, y_train, y_dev = train_test_split(df_undersample.drop('virginica',axis=1), df_undersample.virginica, test_size=0.3,random_state=0)
print("virginica in train set = ", str(y_train.sum()))
print("virginica in dev set = ", str(y_dev.sum()))
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
# get predicted probabilities for the positive class
y_train_pred = logistic.predict_proba(X_train)[:,1]
y_dev_pred = logistic.predict_proba(X_dev)[:,1]
evaluation(y_train,y_train_pred,'training set')
print()
evaluation(y_dev, y_dev_pred, 'dev set')
In [88]:
X_train, X_dev, y_train, y_dev = train_test_split(df.drop('virginica',axis=1), df.virginica, test_size=0.3,random_state=0)
print("virginica in train set = ", str(y_train.sum()))
print("virginica in dev set = ", str(y_dev.sum()))
print()
# SMOTE: synthesize new minority examples in the training set only
sm = SMOTE(sampling_strategy=0.5, k_neighbors=2, random_state=10)
X_train, y_train = sm.fit_resample(X_train, np.ravel(y_train))
print("AFTER SMOTE: virginica in train set = ", str(y_train.sum()))
print()
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
# get predicted probabilities for the positive class
y_train_pred = logistic.predict_proba(X_train)[:,1]
y_dev_pred = logistic.predict_proba(X_dev)[:,1]
evaluation(y_train,y_train_pred,'training set')
print()
evaluation(y_dev, y_dev_pred, 'dev set')
In [90]:
X_train, X_dev, y_train, y_dev = train_test_split(df.drop('virginica',axis=1), df.virginica, test_size=0.3,random_state=0)
print("virginica in train set = ", str(y_train.sum()))
print("virginica in dev set = ", str(y_dev.sum()))
print()
# LogisticRegression accepts class_weight='balanced', which reweights the cost function inversely to class frequencies
logistic = LogisticRegression(class_weight='balanced')
logistic.fit(X_train, y_train)
# get predicted probabilities for the positive class
y_train_pred = logistic.predict_proba(X_train)[:,1]
y_dev_pred = logistic.predict_proba(X_dev)[:,1]
evaluation(y_train,y_train_pred,'training set')
print()
evaluation(y_dev, y_dev_pred, 'dev set')
In [91]:
#random forest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
# get predicted probabilities for the positive class
y_train_pred = rfc.predict_proba(X_train)[:,1]
y_dev_pred = rfc.predict_proba(X_dev)[:,1]
evaluation(y_train,y_train_pred,'training set')
print()
evaluation(y_dev, y_dev_pred, 'dev set')
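RandomForestClassifier also accepts class_weight='balanced', the same idea as the weighted logistic regression above; a minimal variant of the cell just shown:
In [ ]:
# sketch: the same random forest with balanced class weights
rfc_bal = RandomForestClassifier(class_weight='balanced', random_state=1)
rfc_bal.fit(X_train, y_train)
y_dev_pred_bal = rfc_bal.predict_proba(X_dev)[:, 1]
evaluation(y_dev, y_dev_pred_bal, 'dev set (balanced RF)')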
There are fields of study dedicated to imbalanced datasets. They have their own algorithms, measures and terminology.
Taking a look and thinking about your problem from these perspectives can sometimes shake loose some ideas.
Two you might like to consider are anomaly detection and change detection.
Check out the NFDS GitHub repo from 2017-09 for examples of one-class SVM for anomaly detection.
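As a rough illustration of the anomaly-detection framing (not the NFDS example itself), here is a sketch using scikit-learn's OneClassSVM, fit on the majority class from the split above; the nu value is a placeholder to tune.
In [ ]:
# sketch: one-class SVM fit on the majority class only; -1 marks outliers
from sklearn.svm import OneClassSVM
ocs = OneClassSVM(nu=0.1, gamma='auto')   # nu is a placeholder -- tune it
ocs.fit(X_train[y_train == 0])            # learn what "normal" looks like
flags = ocs.predict(X_dev)                # +1 = inlier, -1 = outlier
print('Flagged as anomalies:', (flags == -1).sum())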
Really climb inside your problem and think about how to break it down into smaller problems that are more tractable.
For inspiration, take a look at the very creative answers on Quora in response to the question “In classification, how do you handle an unbalanced training set?”
Mix and match methods to find the right combination for your data!