In [ ]:
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import reuters, stopwords
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
from scipy.sparse import vstack
from functools import reduce
from pprint import pprint

random_state = 42

In [ ]:
# Load Reuters collection
documents = reuters.fileids()

train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
    
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
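
A quick sanity check of the split sizes (the exact counts depend on the installed NLTK Reuters distribution; the usual ModApte split has roughly 7,769 training and 3,019 test documents):

In [ ]:
print("Training documents: {}".format(len(train_docs_id)))
print("Test documents: {}".format(len(test_docs_id)))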

In [ ]:
# Minimal data analysis

# List of categories 
categories = reuters.categories()
print("Number of categories: {}".format(len(categories)))

# Documents per category.
category_distribution = [(category, len(reuters.fileids(category))) 
                         for category in categories]

category_distribution = sorted(category_distribution, 
                               key=operator.itemgetter(1), 
                               reverse=True)

print("Most common categories")
pprint(category_distribution[:10])
print()

print("Least common categories")
pprint(category_distribution[-10:])
print()
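
The same distribution can be visualised as a bar chart to make the long tail of rare categories obvious; this is a minimal sketch that reuses the `category_distribution` list built above:

In [ ]:
# Bar chart of document counts for the 20 most frequent categories
names, counts = zip(*category_distribution[:20])
pd.Series(counts, index=names).plot(kind='bar', title='Documents per category (top 20)', figsize=(10, 4))
plt.ylabel('Number of documents')
plt.show()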

In [ ]:
# Represent dataset with TFIDF

# Load the list of (english) stop-words from nltk
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words=stop_words)
vectorizer.fit(train_docs)

X_train_by_id = dict(zip(train_docs_id, vectorizer.transform(train_docs)))
X_test = vectorizer.transform(test_docs)
X_test_by_id = dict(zip(test_docs_id, X_test))
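
To get a feel for the TF-IDF representation, the vocabulary size and matrix shapes can be inspected (exact numbers depend on the corpus and scikit-learn version):

In [ ]:
print("Vocabulary size: {}".format(len(vectorizer.vocabulary_)))
print("Test matrix shape: {}".format(X_test.shape))
print("One training document: {}".format(X_train_by_id[train_docs_id[0]].shape))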

In [ ]:
# Sampling strategies
# The pool of unlabelled candidates is modelled as a list of (id, label) tuples.
# Selected documents are removed from the pool, so sampling is effectively without
# replacement despite the function names.
# all_classes_present ensures the sample contains at least one positive example,
# so the first model can be trained on both classes.
# Returns a list of randomly selected (id, label) tuples.

def select_random_with_replacement(num_docs, pool, all_classes_present=False):
    selected = []
    
    # Filter the positives, shuffle them and pick the first one
    if all_classes_present:
        # Preselect one positive example so both classes can appear in the sample
        pool_with_idx = list(enumerate(pool))
        
        randomised_pos = [(idx, doc_id, label) for (idx, (doc_id, label)) in pool_with_idx if label]
        random.shuffle(randomised_pos)
        idx, doc_id, label = randomised_pos[0]
        selected.append((doc_id, label))
        del pool[idx]
    
    # Draw the remaining documents uniformly at random, removing them from the pool
    for _ in range(num_docs - len(selected)):
        if len(pool) == 0:
            break
        position = random.randint(0, len(pool) - 1)
        selected.append(pool[position])
        del pool[position]
    return selected

# Uncertainty Sampling: pick the documents whose predicted probability of the positive
# class is closest to the decision boundary (0.5). Pool is modelled as a list of (id, label);
# selected documents are removed from the pool.
def select_us_with_replacement(num_docs, pool, model, plot_density_scores=False):
    selected = []
    X = vstack([X_train_by_id[doc_id] for (doc_id, _) in pool])
    
    # Distance to the decision boundary: |P(positive) - 0.5|; smaller means more uncertain
    pos_probs = model.predict_proba(X)[:, 1]
    probs = [abs(p - 0.5) for p in pos_probs]

    # Signed margins from the boundary. Positive values mean the document would be assigned to the class
    if plot_density_scores:
        margin = [p - 0.5 for p in pos_probs]
        df = pd.DataFrame({'margin': margin})
        df.plot(kind='density', xlim=(-0.5, 0.5))
    
    # Sort candidates by distance to the boundary, most uncertain first
    docs = list(zip(probs, pool, range(len(pool))))
    sorted_docs = sorted(docs, key=operator.itemgetter(0))
    
    for (score, d, pos) in sorted_docs[:num_docs]:
        selected.append(d)
    
    # Delete the selected documents from the pool, largest indices first so positions stay valid
    for idx in sorted([pos for (_, _, pos) in sorted_docs[:num_docs]], reverse=True):
        del pool[idx]
    
    return selected
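
A toy example (with made-up document ids, so only the random strategy is exercised here) showing how a sampler consumes the pool; selected documents are removed, so repeated calls never return the same document:

In [ ]:
toy_pool = [('d1', True), ('d2', False), ('d3', False), ('d4', True), ('d5', False)]
batch = select_random_with_replacement(3, toy_pool, all_classes_present=True)
print("Selected: {}".format(batch))
print("Remaining pool: {}".format(toy_pool))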

In [ ]:
# ML utility functions
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def train(X, y):
    clf = LogisticRegression(n_jobs=8, random_state=random_state)
    clf.fit(X, y)
    return clf
    
def evaluate(labels, predictions):
    return f1_score(labels, predictions)
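
A minimal check of the two helpers on a tiny synthetic problem (purely illustrative, not part of the experiment):

In [ ]:
from scipy.sparse import csr_matrix

X_toy = csr_matrix([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
y_toy = [True, True, False, False]
toy_model = train(X_toy, y_toy)
print("F1 on the training points: {}".format(evaluate(y_toy, toy_model.predict(X_toy))))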

In [ ]:
def random_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels, size_first_iteration=20):
    # Random selection
    # (Content, label) tuples
    remain_random_pool = list(zip(train_docs_id, train_labels))
    train_subset_random = []
    qualities_random = []

    for split in range(n_splits):
        # Ensure a minimum number of documents in the first iteration
        if split == 0:
            new_batch = select_random_with_replacement(max(docs_per_split, size_first_iteration), remain_random_pool, all_classes_present=True)
        else:
            new_batch = select_random_with_replacement(docs_per_split, remain_random_pool)
        train_subset_random.extend(new_batch)
        #print("Split: {}, Batch: {}, Remain: {}, Subset: {}".format(split, len(new_batch), len(remain_random_pool), len(train_subset_random)))
        #print("Training with: {}/{})".format(len(train_subset_random), len(train_docs_id)))

        X = vstack([X_train_by_id[doc_id] for (doc_id, _) in train_subset_random])
        y = [label for (_, label) in train_subset_random]
        model_random = train(X, y)

        y_predicted_random = model_random.predict(X_test)
        quality_random_split = evaluate(test_labels, y_predicted_random)
        qualities_random.append(quality_random_split)
        #print("Quality Random: {}".format(quality_random_split))
        
    return qualities_random

def us_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels, size_first_iteration=20):
    remain_us_pool = list(zip(train_docs_id, train_labels))
    train_subset_us = []
    qualities_us = []
    
    # Only plot about 10 density graphs so the output is not overloaded
    density_plot_interval = max(1, n_splits // 10)
    
    for split in range(n_splits):       
        # The first iteration is random: there is no model to query yet
        if split == 0:
            # Ensure a minimum number of documents in the first iteration
            new_batch = select_random_with_replacement(max(docs_per_split, size_first_iteration), remain_us_pool, all_classes_present=True)
            train_subset_us.extend(new_batch)
        else:
            plot_density_scores = (split % density_plot_interval == 0)
            new_batch = select_us_with_replacement(docs_per_split, remain_us_pool, model_us, plot_density_scores=plot_density_scores)
            train_subset_us.extend(new_batch)
        #print("Split: {}, Batch: {}, Remain: {}, Subset: {}".format(split, len(new_batch), len(remain_us_pool), len(train_subset_us)))
        #print("Training with: {}/{})".format(len(train_subset_us), len(train_docs_id)))

        X = vstack([X_train_by_id[doc_id] for (doc_id, _) in train_subset_us])
        y = [label for (_, label) in train_subset_us]
        model_us = train(X, y)

        y_predicted_us = model_us.predict(X_test)
        quality_us_split = evaluate(test_labels, y_predicted_us)
        qualities_us.append(quality_us_split)
        #print("Quality US: {}".format(quality_us_split))
        #print("===")
        
    return qualities_us

In [ ]:
# Export and plotting utility functions
def plot_al_quality_curve(qualities_random, qualities_us, doc_ratios, n_splits, docs_per_split, topic):
    df = pd.DataFrame({'Random Sampling': qualities_random,
                       'Uncertainty Sampling': qualities_us}, index=doc_ratios)
    ax = df.plot(title="AL vs Random Selection. Topic: {}, Batches: {} ({} docs/split)"
                 .format(topic, n_splits, docs_per_split))
    ax.set(xlabel='Ratio of Training Documents used', ylabel='F1')
    # Save the figure before showing it, so the PDF is not blank with some backends
    fig = ax.get_figure()
    fig.savefig('AL{}-{}.pdf'.format(n_splits, topic))
    plt.show()

In [ ]:
# N iterations comparing Random Sampling and Uncertainty Sampling 
def active_learning_evaluation(n_splits, train_docs_id, train_labels, test_labels, topic):
    docs_per_split = len(train_docs_id) // n_splits
    print("Training {} with {} splits with {} documents each".format(topic, n_splits, docs_per_split))

    # Fraction of the training set available after each split
    doc_ratios = [(i + 1) / n_splits for i in range(n_splits)]

    # TODO: We have to ensure pos/neg documents are present. The code will break if the sampling only selects negatives.
    
    # Random
    qualities_random = random_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels)
    best_quality = max(qualities_random)

    # Active Learning (US)
    qualities_us = us_selection_split_quality(n_splits, docs_per_split, train_labels, test_labels)
    best_us_quality = max(qualities_us)

    # Minimal Analysis
    threshold = 0.99
    for quality, ratio in zip(qualities_us, doc_ratios):
        if quality>best_quality*threshold:
            print("AL reaches >{}% of max. quality with {} documents ({:.2f}% of dataset)"
                  .format(threshold*100, int(ratio*len(train_docs_id)), ratio*100))
            print("Best US Quality: {:.2f}, Best Random quality: {:.2f}".format(100*best_us_quality, 100*best_quality))
            break

    # Plotting
    plot_al_quality_curve(qualities_random, qualities_us, doc_ratios, n_splits, docs_per_split, topic)

In [ ]:
for topic in ['acq', 'earn', 'ship', 'crude', 'wheat']:
    print(" ==================================== ")
    train_labels = [topic in reuters.categories(doc_id) for doc_id in train_docs_id]
    test_labels = [topic in reuters.categories(doc_id) for doc_id in test_docs_id]

    for n_splits in [10, 100, 500]:
        active_learning_evaluation(n_splits, train_docs_id, train_labels, test_labels, topic)