In [ ]:
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import reuters, stopwords
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
from scipy.sparse import vstack
from functools import reduce
from pprint import pprint
random_state = 42
In [ ]:
# Load the Reuters collection and split fileids into train/test by prefix.
documents = reuters.fileids()
train_docs_id = [doc for doc in documents if doc.startswith("train")]
test_docs_id = [doc for doc in documents if doc.startswith("test")]
# Raw document text, aligned with the id lists above.
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
In [ ]:
# Minimal data analysis: number of categories and how documents spread across them.
categories = reuters.categories()
print("Number of categories: {}".format(len(categories)))
# Pair each category with its document count, then sort largest-first in place.
docs_per_category = [(cat, len(reuters.fileids(cat))) for cat in categories]
docs_per_category.sort(key=lambda pair: pair[1], reverse=True)
print("Most common categories")
pprint(docs_per_category[:10])
print()
print("Least common categories")
pprint(docs_per_category[-10:])
print()
In [ ]:
# Represent the dataset with TF-IDF features.
# Load the list of (english) stop-words from nltk.
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words=stop_words)
# fit_transform makes a single pass over the training corpus; the original
# called fit() and then transform() separately (two passes).
X_train = vectorizer.fit_transform(train_docs)
# Transform the test set once and reuse the matrix; the original code
# transformed test_docs twice (for X_test_by_id and again for X_test).
X_test = vectorizer.transform(test_docs)
# Per-document sparse rows keyed by fileid, used by the sampling strategies.
X_train_by_id = dict(zip(train_docs_id, X_train))
X_test_by_id = dict(zip(test_docs_id, X_test))
In [ ]:
# Sampling strategies.
# A pool is a mutable list of (doc_id, label) tuples; selected items are
# deleted from it, i.e. sampling is actually WITHOUT replacement despite the
# historical function name (kept for backward compatibility with callers).
def select_random_with_replacement(num_docs, pool, all_classes_present=False):
    """Randomly draw up to `num_docs` (doc_id, label) pairs from `pool`.

    Drawn items are removed from `pool` in place. When `all_classes_present`
    is True, one positive (label == True) and one negative (label == False)
    example are pre-selected so the returned sample contains both classes
    (needed to fit a binary classifier). The original code only guaranteed a
    positive example and raised IndexError when the pool held no positives;
    a missing class is now simply skipped.

    Note: with all_classes_present=True and num_docs < 2 the sample may
    contain up to 2 documents (one per pre-selected class).
    """
    selected = []
    if all_classes_present:
        # Pre-select one example of each class, when one exists in the pool.
        for wanted_label in (True, False):
            candidates = [idx for idx, (_, label) in enumerate(pool)
                          if label == wanted_label]
            if candidates:
                idx = random.choice(candidates)
                selected.append(pool[idx])
                del pool[idx]
    # Fill the rest of the sample uniformly at random, without replacement.
    while len(selected) < num_docs and pool:
        position = random.randrange(len(pool))
        selected.append(pool[position])
        del pool[position]
    return selected
# Pool is modeled as a list of (doc_id, label) tuples; chosen items are removed.
def select_us_with_replacement(num_docs, pool, model, plot_density_scores=False):
    """Uncertainty sampling: return the `num_docs` pool documents whose
    positive-class probability under `model` is closest to 0.5, removing
    them from `pool` in place.

    When `plot_density_scores` is True, also plot the density of the signed
    margins (a positive margin means the model would assign the positive
    class).
    """
    X = vstack([X_train_by_id[doc_id] for (doc_id, _) in pool])
    # Call predict_proba once and derive both the signed margin and the
    # distance to the boundary from it (the original predicted twice when
    # plotting was enabled).
    margins = [pos_prob - 0.5 for (_, pos_prob) in model.predict_proba(X)]
    if plot_density_scores:
        df = pd.DataFrame({'margin': margins})
        df.plot(kind='density', xlim=(-0.5, 0.5))
    # Rank pool indices, most uncertain (smallest |margin|) first.
    # sorted() is stable, so ties keep their original pool order, matching
    # the previous implementation.
    ranked = sorted(range(len(pool)), key=lambda i: abs(margins[i]))
    chosen = ranked[:num_docs]
    selected = [pool[i] for i in chosen]
    # Delete from the highest index down so earlier deletions do not shift
    # the positions of the remaining ones.
    for i in sorted(chosen, reverse=True):
        del pool[i]
    return selected
In [ ]:
# ML utility functions
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
def train(X, y):
    """Fit a logistic-regression classifier on (X, y) and return it."""
    # fit() returns the estimator itself, so build and fit in one expression.
    return LogisticRegression(n_jobs=8, random_state=random_state).fit(X, y)
def evaluate(labels, predictions):
    """Return the F1 score of `predictions` against the gold `labels`."""
    score = f1_score(labels, predictions)
    return score
In [ ]:
def random_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels, size_first_iteration=20):
    """Baseline curve: grow the labelled training set by random batches and
    record the test-set F1 after each batch.

    Returns the list of per-split F1 scores (length n_splits).
    """
    # Pool of (doc_id, label) tuples; it shrinks as batches are drawn.
    pool = list(zip(train_docs_id, train_labels))
    labelled_so_far = []
    qualities = []
    for split in range(n_splits):
        if split == 0:
            # The first batch must contain both classes and at least
            # `size_first_iteration` documents so a classifier can be fitted.
            first_size = max(docs_per_split, size_first_iteration)
            batch = select_random_with_replacement(first_size, pool, all_classes_present=True)
        else:
            batch = select_random_with_replacement(docs_per_split, pool)
        labelled_so_far.extend(batch)
        # Retrain from scratch on everything labelled so far.
        X = vstack([X_train_by_id[doc_id] for (doc_id, _) in labelled_so_far])
        y = [label for (_, label) in labelled_so_far]
        model = train(X, y)
        qualities.append(evaluate(test_labels, model.predict(X_test)))
    return qualities
def us_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels, size_first_iteration=20):
    """Active-learning curve: grow the labelled set with uncertainty
    sampling (first batch random) and record the test-set F1 after each
    batch. Returns the list of per-split F1 scores (length n_splits).
    """
    remain_us_pool = list(zip(train_docs_id, train_labels))
    train_subset_us = []
    qualities_us = []
    # We only want ~10 density graphs so the output is not overloaded.
    # The original used n_splits/10, a float, which made the modulo below a
    # float operation and plotted on EVERY split whenever n_splits < 10;
    # max(1, n_splits // 10) keeps the stride integral and never zero.
    density_control_points = max(1, n_splits // 10)
    model_us = None
    for split in range(n_splits):
        if split == 0:
            # First iteration is random (no model exists yet) and must
            # contain both classes so the classifier can be fitted.
            new_batch = select_random_with_replacement(
                max(docs_per_split, size_first_iteration),
                remain_us_pool, all_classes_present=True)
        else:
            plot_density_scores = (split % density_control_points == 0)
            new_batch = select_us_with_replacement(
                docs_per_split, remain_us_pool, model_us,
                plot_density_scores=plot_density_scores)
        train_subset_us.extend(new_batch)
        # Retrain from scratch on everything labelled so far.
        X = vstack([X_train_by_id[doc_id] for (doc_id, _) in train_subset_us])
        y = [label for (_, label) in train_subset_us]
        model_us = train(X, y)
        y_predicted_us = model_us.predict(X_test)
        qualities_us.append(evaluate(test_labels, y_predicted_us))
    return qualities_us
In [ ]:
# Export and plotting utility functions
def plot_al_quality_curve(qualities_random, qualities_us, doc_ratios, n_splits, docs_per_split, topic):
    """Plot the random vs uncertainty-sampling F1 curves and save the figure
    as a PDF named 'AL<n_splits>-<topic>.pdf'."""
    curves = pd.DataFrame(
        {'Random Sampling': qualities_random,
         'Uncertainty Sampling': qualities_us},
        index=doc_ratios)
    title = ("AL vs Random Selection. Topic: {}, Batches: {} ({} docs/split)"
             .format(topic, n_splits, docs_per_split))
    ax = curves.plot(title=title)
    ax.set(xlabel='Ratio of Training Documents used', ylabel='F1')
    plt.show()
    # Persist the figure so results survive the notebook session.
    ax.get_figure().savefig('AL{}-{}.pdf'.format(n_splits, topic))
In [ ]:
# N iterations comparing Random Sampling and Uncertainty Sampling
def active_learning_evaluation(n_splits, train_docs_id, train_labels, test_labels, topic):
    """Run one random-sampling curve and one uncertainty-sampling curve for
    `topic`, print a small summary, and plot the comparison.
    """
    docs_per_split = int(len(train_docs_id) / n_splits)
    print("Training {} with {} splits with {} documents each".format(topic, n_splits, docs_per_split))
    # Fraction of the training set labelled after each split. The original
    # expression len/n_splits*(i+1)/len reduces algebraically to
    # (i+1)/n_splits.
    doc_ratios = [(i + 1) / n_splits for i in range(n_splits)]
    # TODO: We have to ensure pos/neg documents are presented. Code will break if the sampling only selects negatives.
    # Random baseline
    qualities_random = random_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels)
    best_quality = max(qualities_random)
    # Active Learning (uncertainty sampling)
    qualities_us = us_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels)
    best_us_quality = max(qualities_us)
    # Report the first split where AL reaches `threshold` of the best random quality.
    threshold = 0.99
    for quality, ratio in zip(qualities_us, doc_ratios):
        if quality > best_quality * threshold:
            print("AL reaches >{}% of max. quality with {} documents ({:.2f}% of dataset)"
                  .format(threshold * 100, int(ratio * len(train_docs_id)), ratio * 100))
            print("Best US Quality: {:.2f}, Best Random quality: {:.2f}".format(100 * best_us_quality, 100 * best_quality))
            break
    # Plotting
    plot_al_quality_curve(qualities_random, qualities_us, doc_ratios, n_splits, docs_per_split, topic)
In [ ]:
# For each topic, build one-vs-rest binary labels and compare the two
# sampling strategies at several split granularities.
for topic in ['acq', 'earn', 'ship', 'crude', 'wheat']:
    print(" ==================================== ")
    # A document is positive iff `topic` is among its Reuters categories.
    topic_train_labels = [topic in reuters.categories(doc_id) for doc_id in train_docs_id]
    topic_test_labels = [topic in reuters.categories(doc_id) for doc_id in test_docs_id]
    for n_splits in [10, 100, 500]:
        active_learning_evaluation(n_splits, train_docs_id, topic_train_labels, topic_test_labels, topic)