We have two classification tasks: predicting the level label and the group label of each document from its text.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
Here, we load the DataFrame for the full training set and repeat the classification approach identified in step 2.
In [2]:
%%time
raw_input = pd.read_pickle('train_full.pkl')
In [3]:
raw_input.head()
Out[3]:
In [4]:
raw_input.info()
In [5]:
level_counts = raw_input.level.value_counts().sort_index()
group_counts = raw_input.group.value_counts().sort_index()
_, ax = plt.subplots(1, 2, figsize=(10, 5))
_ = level_counts.plot(kind='bar', title='Feature Instances per Level', ax=ax[0], rot=0)
_ = group_counts.plot(kind='bar', title='Feature Instances per Group', ax=ax[1], rot=0)
plt.tight_layout()
Here we apply the same approach of converting text to bag-of-words features and then fitting a maximum entropy classifier. The difference is that we are now running on the full dataset, which is much larger. The optimizer now requires more steps to converge, so we change the max_iter parameter of LogisticRegression to 1000. We address the label imbalance by setting class_weight='balanced'.
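As a sanity check on what class_weight='balanced' does, the sketch below (an unexecuted illustration on made-up labels, not part of the pipeline) reproduces the weighting rule scikit-learn applies: each class gets weight n_samples / (n_classes * count(class)), so rare classes contribute proportionally more to the loss.
In [ ]:
# Illustration only: reproduce the 'balanced' weighting rule on toy labels.
# Each class is weighted by n_samples / (n_classes * count(class)).
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

toy_y = np.array(['A'] * 80 + ['B'] * 15 + ['C'] * 5)
toy_classes = np.unique(toy_y)
toy_weights = compute_class_weight(class_weight='balanced', classes=toy_classes, y=toy_y)
print(dict(zip(toy_classes, toy_weights)))
# expected: {'A': ~0.42, 'B': ~2.22, 'C': ~6.67} -- rare classes weigh more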
In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
en_stopwords = set(stopwords.words('english'))
print(en_stopwords)
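Before building the full pipeline, a toy example (illustrative only, on a made-up two-document corpus; get_feature_names_out requires scikit-learn >= 1.0) shows the kind of features CountVectorizer produces when binary=True: a 0/1 term-presence indicator per document rather than a raw count.
In [ ]:
# Toy illustration: binary=True yields 0/1 term-presence features.
from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ['the cat sat', 'the cat sat on the cat']
toy_vec = CountVectorizer(lowercase=True, binary=True)
toy_X = toy_vec.fit_transform(toy_docs)
print(toy_vec.get_feature_names_out())  # ['cat' 'on' 'sat' 'the']
print(toy_X.toarray())  # [[1 0 1 1]
                        #  [1 1 1 1]] -- 'cat' stays 1 despite appearing twice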
In [7]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
def display_results(y, y_pred):
    """Given some predictions y_pred for a target label y,
    display the precision/recall/f1 score and the confusion matrix."""
    report = classification_report(y_true=y, y_pred=y_pred)
    print(report)
    level_values = y.unique()
    level_values.sort()
    cm = confusion_matrix(y_true=y, y_pred=y_pred.values, labels=level_values)
    cm = pd.DataFrame(index=level_values, columns=level_values, data=cm)
    fig, ax = plt.subplots(1, 1, figsize=(12, 10))
    ax = sns.heatmap(cm, annot=True, ax=ax, fmt='d')
In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
def build_pipeline():
    """Return the combination of a Feature Extractor and a LogisticRegression model in a ``Pipeline``."""
    counter = CountVectorizer(
        lowercase=True,
        stop_words=en_stopwords,
        # unigrams only
        ngram_range=(1, 1),
        # drop terms appearing in fewer than 5 documents ...
        min_df=5,
        # ... or in more than 40% of documents
        max_df=0.4,
        # 0/1 term-presence features rather than counts
        binary=True)
    model = LogisticRegression(
        # maximize log-likelihood + square norm of parameters
        penalty='l2',
        # steps required for the L-BFGS optimizer to converge, found by trial and error
        max_iter=1000,
        # use softmax instead of one-vs-rest style classification
        multi_class='multinomial',
        # use L-BFGS optimizer
        solver='lbfgs',
        # this prints out a warning if the optimizer hasn't converged
        verbose=True,
        # to handle the class imbalance, automatically adjust weights
        # inversely proportional to class frequencies in the input data
        class_weight='balanced',
        random_state=4321)
    pipeline = make_pipeline(counter, model)
    return pipeline

def classify(input_df, target_label='level'):
    """Build a classifier for the `target_label` column in the DataFrame `input_df` using the `text` column.
    Return the (labels, predicted_labels) tuple.
    Use a 10-fold Stratified K-fold cross-validator to generate the out-of-sample predictions."""
    assert target_label in input_df.columns
    pipeline = build_pipeline()
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    X = input_df.text
    y = input_df.loc[:, target_label]
    y_pred = cross_val_predict(pipeline, X=X.values, y=y.values, cv=cv, n_jobs=10, verbose=2)
    y_pred = pd.Series(index=input_df.index.copy(), data=y_pred)
    return y.copy(), y_pred
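Note that cross_val_predict returns exactly one out-of-fold prediction per input row, in the original row order, which is why we can wrap the result in a Series indexed like the input and compare it directly against the true labels. A minimal sketch on synthetic data (illustrative only, not part of the pipeline):
In [ ]:
# Minimal sketch: each row's prediction comes from the fold where it was
# held out, so predictions are out-of-sample and aligned with the input.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X_toy, y_toy = make_classification(n_samples=100, n_classes=2, random_state=0)
toy_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
toy_pred = cross_val_predict(LogisticRegression(), X_toy, y_toy, cv=toy_cv)
assert toy_pred.shape == y_toy.shape  # one out-of-fold prediction per row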
In [9]:
%%time
levels, levels_predicted = classify(raw_input, target_label='level')
In [11]:
display_results(levels, levels_predicted)
In [12]:
%%time
groups, groups_predicted = classify(raw_input, target_label='group')
In [ ]:
display_results(groups, groups_predicted)
Finally, we report the performance of our classifier on both the level and group classification tasks using the test dataset. For this we re-build the model using the hyperparameters chosen above, and train it on the entire training dataset.
In [19]:
from functools import lru_cache

@lru_cache(maxsize=1)
def get_test_dataset():
    return pd.read_pickle('test.pkl')

def report_test_perf(train_df, target_label='level'):
    """Produce classification report and confusion matrix on the test Dataset for a given ``target_label``."""
    test_df = get_test_dataset()
    assert target_label in train_df.columns
    assert target_label in test_df.columns
    # Train the model using the entire training dataset
    pipeline = build_pipeline()
    X_train, y_train = train_df.text, train_df.loc[:, target_label]
    pipeline = pipeline.fit(X_train.values, y_train.values)
    # Generate predictions using test data
    X_test, y_test = test_df.text, test_df.loc[:, target_label]
    predicted = pipeline.predict(X_test.values)
    predicted = pd.Series(index=test_df.index,
                          data=predicted,
                          name='{}_pred'.format(target_label))
    display_results(y_test, predicted)
In [21]:
%%time
train_df = raw_input
report_test_perf(train_df, 'level')
In [23]:
%%time
report_test_perf(train_df, 'group')