We have two classification tasks: predicting the fine-grained `level` (1-16) of each text, and predicting the coarser `group` (A1-C2) derived from the level.
In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
raw_input = pd.read_pickle('input.pkl')
gp_mapper = {
    1: 'A1', 2: 'A1', 3: 'A1',
    4: 'A2', 5: 'A2', 6: 'A2',
    7: 'B1', 8: 'B1', 9: 'B1',
    10: 'B2', 11: 'B2', 12: 'B2',
    13: 'C1', 14: 'C1', 15: 'C1',
    16: 'C2'
}
raw_input = raw_input.assign(group=raw_input.level.map(gp_mapper))
In [3]:
raw_input.info()
In [4]:
raw_input.head()
Out[4]:
In [5]:
from sklearn.model_selection import train_test_split
# Split the index of the `raw_input` DataFrame into train and test, then use the two index sets to split the DataFrame.
train_idx, test_idx = train_test_split(
    raw_input.index,
    test_size=0.2,
    stratify=raw_input.level,
    shuffle=True,
    random_state=0)
train_df, test_df = raw_input.loc[train_idx], raw_input.loc[test_idx]
train_df.to_pickle('train_full.pkl')
test_df.to_pickle('test.pkl')
# Build a small sample dataset from the train set by drawing 1000 examples per level
train_df_small = train_df.groupby('level').apply(lambda g: g.sample(n=1000, replace=False, random_state=1234))
train_df_small.index = train_df_small.index.droplevel(0)
train_df_small.to_pickle('train_small.pkl')
For the rest of this notebook, we use the small sample dataset as input.
In [6]:
raw_input = pd.read_pickle('train_small.pkl')
In [7]:
level_counts = raw_input.level.value_counts().sort_index()
group_counts = raw_input.group.value_counts().sort_index()
_, ax = plt.subplots(1, 2, figsize=(10, 5))
_ = level_counts.plot(kind='bar', title='Counts per Level', ax=ax[0], rot=0)
_ = group_counts.plot(kind='bar', title='Counts per Group', ax=ax[1], rot=0)
plt.tight_layout()
In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
en_stopwords = set(stopwords.words('english'))
print(en_stopwords)
In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
def classify_v1(input_df, target_label='level'):
    """
    Build a classifier for the `target_label` column in the DataFrame `input_df` using the `text` column.
    Return the (labels, predicted_labels) tuple.
    Use a 10-fold Stratified K-fold cross-validator to generate the out-of-sample predictions."""
    assert target_label in input_df.columns
    counter = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words=en_stopwords,
        max_df=0.4,
        min_df=25,
        max_features=3000,
        sublinear_tf=True
    )
    scaler = StandardScaler(with_mean=False)
    model = LogisticRegression(penalty='l2', max_iter=200, random_state=4321)
    pipeline = make_pipeline(counter, scaler, model)
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    # Use the passed-in DataFrame and target column rather than the global `raw_input`.
    X = input_df.text
    y = input_df.loc[:, target_label]
    y_pred = cross_val_predict(pipeline, X=X.values, y=y.values, cv=cv, n_jobs=16, verbose=2)
    y_pred = pd.Series(index=input_df.index.copy(), data=y_pred)
    return y.copy(), y_pred
In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
def display_results(y, y_pred):
    """Given some predictions y_pred for a target label y,
    display the precision/recall/f1 score and the confusion matrix."""
    report = classification_report(y_true=y, y_pred=y_pred)
    print(report)
    level_values = y.unique()
    level_values.sort()
    cm = confusion_matrix(y_true=y, y_pred=y_pred.values, labels=level_values)
    cm = pd.DataFrame(index=level_values, columns=level_values, data=cm)
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    ax = sns.heatmap(cm, annot=True, ax=ax, fmt='d')
In [11]:
%%time
levels, levels_predicted = classify_v1(raw_input, target_label='level')
display_results(levels, levels_predicted)
In [12]:
# assign the predicted level as a column to the input data
input_with_preds = raw_input.assign(level_predicted=levels_predicted)
input_with_preds.head()
Out[12]:
We can identify the misclassified examples by the condition
input_with_preds.level != input_with_preds.level_predicted
For these rows, we want to identify the pair ('level', 'level_predicted') that produces the greatest number of mismatches, because addressing these will produce the biggest improvement in the overall score.
In [13]:
misclassifications = input_with_preds[input_with_preds.level != input_with_preds.level_predicted]
m_counts = misclassifications.groupby(by=['level', 'level_predicted'])['text'].count()
m_counts.sort_values(ascending=False).head(8)
Out[13]:
As an example, we investigate the misclassifications between levels 7 and 8.
In [23]:
cond = (misclassifications.level.isin([7, 8])) & (misclassifications.level_predicted.isin([7, 8]))
mis_sample = misclassifications.loc[cond, ['topic_text', 'topic_id', 'text', 'level', 'level_predicted']]
mis_sample.groupby(['topic_id', 'topic_text', 'level', 'level_predicted'])['text'].count().sort_values(ascending=False)
Out[23]:
So, most of the misclassifications for true level 7 occur for the topic "Planning for the future", whereas for level 8, it is "Making a 'to do' list of your dreams". Intuitively, this makes sense. These two topics are similar, so the word frequency distributions could very well be similar.
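One quick way to test this intuition is to collapse each topic's articles into a single word-count vector and measure their cosine similarity; a value close to 1 would support the hypothesis. The cell below is a sketch, not part of the original run: it assumes the `mis_sample` DataFrame and `en_stopwords` from the cells above, and that topic IDs 50 and 59 (used in the next cells) correspond to these two topics.
In [ ]:
# Sketch (hypothetical check): compare the aggregate word-count distributions
# of topic 50 and topic 59 directly.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
cv = CountVectorizer(stop_words=list(en_stopwords))
counts = cv.fit_transform(mis_sample.text.values)
# Sum the per-article counts within each topic to obtain one vector per topic.
vec_50 = np.asarray(counts[(mis_sample.topic_id == 50).values].sum(axis=0))
vec_59 = np.asarray(counts[(mis_sample.topic_id == 59).values].sum(axis=0))
print(cosine_similarity(vec_50, vec_59))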
Next we extract bag-of-words count matrices for a subset of these articles and compare different aspects of them.
In [24]:
from sklearn.feature_extraction.text import CountVectorizer
def calc_bow_matrix_for_topic_id(df, topic_id, limit=5):
    """Return a dense DataFrame of word counts with words as index, article IDs as columns."""
    all_texts = df[df.topic_id == topic_id].text.head(limit)
    cv = CountVectorizer(stop_words=en_stopwords)
    t = cv.fit_transform(all_texts.values)
    words = cv.get_feature_names()
    # These are raw word counts, not tf-idf weights.
    bow_matrix = pd.DataFrame(index=all_texts.index.copy(), columns=words, data=t.todense()).T
    return bow_matrix
tid_50, tid_59 = map(lambda x: calc_bow_matrix_for_topic_id(mis_sample, x), [50, 59])
In [25]:
tid_50.head(20)
Out[25]:
In [26]:
tid_59.head(20)
Out[26]:
In [27]:
uncommon_words = tid_50.index.symmetric_difference(tid_59.index).tolist()
print(uncommon_words)
So the word-count matrices are extremely sparse, and a fair number of words appear in only one set of articles and not the other. Based on that, I concluded that the presence / absence of rare words could be a better indicator of level than tf-idf.
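To put a number on that overlap, here is a small sketch (not an original cell) that computes the shared-vocabulary fraction between the two topic matrices `tid_50` and `tid_59` built above.
In [ ]:
# Sketch: quantify vocabulary overlap between the two topics using the
# word-count DataFrames computed above (words are the index of each frame).
vocab_50, vocab_59 = set(tid_50.index), set(tid_59.index)
shared = vocab_50 & vocab_59
jaccard = len(shared) / len(vocab_50 | vocab_59)
print(f'topic 50 vocabulary: {len(vocab_50)} words')
print(f'topic 59 vocabulary: {len(vocab_59)} words')
print(f'shared words: {len(shared)} (Jaccard similarity: {jaccard:.2f})')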
Next we re-run the model evaluation step using binary valued features indicating presence / absence.
In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold
def classify_v2(input_df, target_label='level'):
    """
    Build a classifier for the `target_label` column in the DataFrame `input_df` using the `text` column.
    Return the (labels, predicted_labels) tuple.
    Use a 10-fold Stratified K-fold cross-validator to generate the out-of-sample predictions."""
    assert target_label in input_df.columns
    counter = CountVectorizer(
        lowercase=True,
        stop_words=en_stopwords,
        ngram_range=(1, 1),
        min_df=5,
        max_df=0.4,
        binary=True)
    model = LogisticRegression(
        penalty='l2',
        max_iter=200,
        multi_class='multinomial',
        solver='lbfgs',
        verbose=True,
        random_state=4321)
    pipeline = make_pipeline(counter, model)
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
    X = input_df.text
    y = input_df.loc[:, target_label]
    y_pred = cross_val_predict(pipeline, X=X.values, y=y.values, cv=cv, n_jobs=10, verbose=2)
    y_pred = pd.Series(index=input_df.index.copy(), data=y_pred)
    return y.copy(), y_pred
In [30]:
%%time
levels, levels_predicted = classify_v2(raw_input, target_label='level')
display_results(levels, levels_predicted)
Using binary features, the composite f1-score has improved from 0.85 to 0.87.
In [31]:
%%time
groups, groups_predicted = classify_v2(raw_input, target_label='group')
display_results(groups, groups_predicted)