In [1]:
import os
# Notebook working-directory hack: if the kernel was started inside the
# package directory 'dswont', move one level up so that `import dswont.*`
# resolves against the project root.
if (os.path.basename(os.getcwd()) == 'dswont'):
    os.chdir(os.path.dirname(os.getcwd()))
In [74]:
import logging
logging.basicConfig(level=logging.ERROR)
import collections
from contextlib import closing
import copy
import itertools
from matplotlib import pyplot as plt
import nltk
import numpy as np
import pandas as pd
import random
import semidbm
from sklearn.svm import SVC
from sklearn.base import BaseEstimator
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model.base import LinearModel
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn import cross_validation
from dswont import dbpedia
from dswont.dbpedia import uri_to_title, title_to_uri, to_title, to_category_uri
from dswont import util
from dswont import wikiapi
from dswont.topics import CategorySelectionClassifier
from dswont.topics import default_classifier
from dswont.topics import default_classifier_params
from dswont.topics import default_cross_validation
from dswont.topics import default_data
from dswont.topics import default_instance_weight_fn
from dswont.topics import default_features
from dswont.topics import depth_based_selection
from dswont.topics import evaluate_classifier
from dswont.topics import new_training_params
from dswont.topics import precompute_full_selection
from dswont.topics import read_ground_truth_data
from dswont.topics import train_topic_classifier
from dswont.topics import TrainingDataSelection
In [11]:
def make_topic_df(full_selection):
    """Builds a DataFrame of topics with their depth, title and ground-truth label.

    One row per topic in `full_selection`, outer-merged with the ground-truth
    relevance labels, so topics without a label get NaN in 'is_relevant'.

    :param full_selection: iterable of topic URIs exposing get_depth(topic).
    :return: DataFrame with columns ['topic', 'title', 'depth', 'is_relevant'].
    """
    full_selection_df = pd.DataFrame({'topic': list(full_selection)})
    full_selection_df['depth'] = full_selection_df['topic'].apply(full_selection.get_depth)
    # Read the ground truth once and reuse it (the original code called
    # read_ground_truth_data() twice and discarded the first result).
    ground_truth_items = read_ground_truth_data().items()
    ground_truth_topics, ground_truth_relevance = zip(*ground_truth_items)
    ground_truth_df = pd.DataFrame(
        {'topic': ground_truth_topics,
         'is_relevant': ground_truth_relevance})
    # Outer merge keeps topics that have no ground-truth label (is_relevant = NaN).
    full_selection_df = full_selection_df.merge(ground_truth_df, how='outer')
    full_selection_df['title'] = full_selection_df['topic'].apply(to_title)
    full_selection_df = \
        full_selection_df.reindex(columns=['topic', 'title', 'depth', 'is_relevant'])
    return full_selection_df
In [12]:
def make_feature_df(selection, features, topics):
    """Computes the feature maps of `topics` and returns them as a DataFrame.

    Each row holds the feature values of one topic plus a 'topic' column
    identifying which topic the row belongs to.
    """
    rows = features.compute(selection, topics)
    # Tag every feature map with the topic it was computed for.
    for idx, topic in enumerate(topics):
        rows[idx]['topic'] = topic
    return pd.DataFrame(rows)
In [13]:
def make_mistake_df(clf, feature_df):
    """Returns the rows of `feature_df` on which `clf` predicts the wrong class.

    Rows containing NaN anywhere (e.g. topics without a ground-truth label)
    are dropped from the result.
    """
    predicted = feature_df.copy()
    predicted['prediction'] = clf.predict(predicted['topic'])
    is_mistake = predicted['prediction'] != predicted['is_relevant']
    return predicted[is_mistake].dropna()
In [14]:
def precompute_topic_data_frame(precomputed_data={}):
    """Returns the full topic DataFrame, computing it only on the first call.

    The mutable default argument is used deliberately as a per-process cache,
    so the expensive selection/feature computation runs at most once.
    """
    # Idiomatic membership test (the original used `if not 'x' in d`,
    # inconsistent with the `'clf' not in` style used elsewhere in this file).
    if 'topic_df' not in precomputed_data:
        full_selection = precompute_full_selection()
        # Skip the first element — presumably the root of the selection;
        # TODO confirm against precompute_full_selection().
        topics = [topic for topic in full_selection][1:]
        feature_df = make_feature_df(full_selection, default_features, topics)
        topic_only_df = make_topic_df(full_selection)
        topic_df = topic_only_df.merge(feature_df)
        precomputed_data['full_selection'] = full_selection
        precomputed_data['topics'] = topics
        precomputed_data['topic_df'] = topic_df
    return precomputed_data['topic_df']
In [15]:
def stats_on_relevant_topics_in_ground_truth_vs_depth():
    """Tabulates and plots relevant vs. irrelevant ground-truth topic counts per depth.

    :return: DataFrame indexed by depth with columns
        'irrelevant', 'relevant', 'total' and 'percent_relevant'.
    """
    # Dead locals removed: the original computed `feature_columns` and
    # `feature_and_prediction_columns` and never used them.
    topic_df = precompute_topic_data_frame()
    # Count labelled topics per (depth, is_relevant) pair, then unstack so the
    # relevance values become columns. NOTE: DataFrame.sort() is the old
    # (pre-0.20) pandas API for sorting by index, used throughout this notebook.
    stats_df = topic_df.dropna().groupby(['depth', 'is_relevant'])\
        ['title']\
        .count()\
        .unstack(['is_relevant'])\
        .fillna(0)\
        .sort(ascending=False)
    stats_df = stats_df.rename(columns={False: 'irrelevant', True: 'relevant'})
    stats_df.plot(kind='barh', stacked=True)
    plt.xlabel('The number of topics')
    stats_df['total'] = stats_df['relevant'] + stats_df['irrelevant']
    stats_df['percent_relevant'] = (stats_df['relevant'] / stats_df['total']).round(3)
    return stats_df.sort(ascending=True)
stats_on_relevant_topics_in_ground_truth_vs_depth()
Out[15]:
In [16]:
def default_trained_topic_selection_classifier(precomputed_data={}):
    """Trains (once) and returns the default topic-selection classifier.

    The mutable default argument serves as a per-process cache so the
    training only happens on the first call.

    NOTE(review): a function with this same name is redefined near the end of
    the notebook with different training settings; on a top-to-bottom run the
    later definition shadows this one.
    """
    if 'clf' not in precomputed_data:
        full_selection = precompute_full_selection()
        training_data = default_data()
        training_params = new_training_params()
        clf = train_topic_classifier(training_data.keys(), training_data.values(),
                                     full_selection,
                                     **training_params)
        precomputed_data['clf'] = clf
    return precomputed_data['clf']
In [25]:
# Evaluate the default classifier on the labelled data with four measures,
# printing one result per measure (accuracy, F1 of each class, weighted F1).
topics, classes = list(zip(*default_data().items()))
clf = default_trained_topic_selection_classifier()
for measure in (util.accuracy_score, util.f1_pos_class,
                util.f1_neg_class, util.weighted_f1):
    print(evaluate_classifier(clf, topics, classes, measure))
In [30]:
def report_test_performance_of_the_learning_based_classifier():
    """Trains the learning-based topic classifier on all labelled data and
    prints its performance measured on that same data (training-set
    performance, not a held-out estimate)."""
    # Preparing the data.
    topic_df = precompute_topic_data_frame().dropna()
    topic_df['is_relevant'] = topic_df['is_relevant'].astype(bool)
    topics_and_classes = topic_df[['topic', 'is_relevant']].values
    topics = topics_and_classes[:, 0]
    classes = topics_and_classes[:, 1]
    # Preparing the params.
    classifier_params = default_classifier_params.copy()
    classifier_params['C'] = 3
    classifier_params['class_weight'] = {1: 2, 0: 1}
    # Dead store removed: the original set kernel='linear' and immediately
    # overwrote it with 'rbf'; only the effective value is kept.
    classifier_params['kernel'] = 'rbf'
    classifier_fn = default_classifier
    full_selection = precompute_full_selection()
    clf = train_topic_classifier(topics, classes,
                                 full_selection, full_selection._max_depth,
                                 default_features, classifier_fn,
                                 (lambda x: 1),  # uniform instance weights
                                 classifier_params)
    print(clf)
    for measure in [util.accuracy_score, util.f1_pos_class, util.f1_neg_class, util.weighted_f1]:
        print(evaluate_classifier(clf, topics, classes, measure))
report_test_performance_of_the_learning_based_classifier()
In [51]:
def default_classifier_evaluation_params():
    """Builds the shared evaluation setup used by all cross-validated runs below.

    :return: 6-tuple (topics, classes, inner_cross_validation,
        outer_cross_validation, model_selection_measure, evaluation_measures).
    """
    topic_df = precompute_topic_data_frame().dropna()
    topic_df['is_relevant'] = topic_df['is_relevant'].astype(bool)
    topics_and_classes = topic_df[['topic', 'is_relevant']].values
    # Shuffle the (topic, class) rows together *before* splitting the array
    # into columns. The original shuffled after taking the column slices,
    # which only worked because numpy slices are views on the shuffled array.
    np.random.shuffle(topics_and_classes)
    topics = topics_and_classes[:, 0]
    classes = topics_and_classes[:, 1]
    inner_cross_validation = None  # No inner cross-validation.
    outer_cross_validation = default_cross_validation
    # outer_cross_validation = lambda outputs : cross_validation.KFold(len(outputs), n_folds=2)
    def model_selection_measure(*args, **params):
        # Model selection optimizes the weighted F1 score.
        return util.weighted_f1(*args, **params)
    evaluation_measures = [util.accuracy_score, util.f1_pos_class, util.f1_neg_class, util.weighted_f1]
    return topics, classes, inner_cross_validation,\
        outer_cross_validation, model_selection_measure,\
        evaluation_measures
def evaluate_learning_based_classifier_cross_validated(training_size=None):
    """Cross-validated evaluation of the learning-based (linear-kernel SVM)
    topic-selection classifier.

    :param training_size: optional cap on the training-set size, forwarded to
        train_evaluate_topic_classifier_cv (used for learning curves).
    :return: whatever train_evaluate_topic_classifier_cv returns — presumably
        one array of per-fold scores per evaluation measure; confirm against
        its definition (not visible / not imported in this notebook, so this
        cell relies on it being defined elsewhere in the session).
    """
    np.random.seed(0)  # fixed seed: reproducible shuffling and CV splits
    topics, classes, inner_cross_validation,\
    outer_cross_validation, model_selection_measure,\
    evaluation_measures = default_classifier_evaluation_params()
    # Leftovers from parameter-tuning experiments, kept for reference:
    # def smaller_cross_validation(outputs):
    #     return cross_validation.StratifiedKFold(outputs, n_folds=2)
    # inner_cross_validation = lambda outputs: cross_validation.StratifiedKFold(outputs, n_folds=3)
    # param_grid = new_training_params_cv()['param_grid']
    # param_grid[0]['C'] = [0.25, 0.5, 1, 3, 7, 15]
    # param_grid = None
    full_selection = precompute_full_selection()
    features = default_features.copy()
    classifier_params = default_classifier_params.copy()
    classifier_params['C'] = 1
    classifier_params['kernel'] = 'linear'
    # classifier_params['kernel'] = 'rbf'
    # classifier_params['gamma'] = 1
    tuned_clf = CategorySelectionClassifier(full_selection=full_selection,
                                            features=features,
                                            classifier_fn=default_classifier,
                                            max_depth=full_selection._max_depth,
                                            instance_weight=default_instance_weight_fn,
                                            **classifier_params)
    return train_evaluate_topic_classifier_cv(tuned_clf, topics, classes,
                                              inner_cross_validation,
                                              outer_cross_validation,
                                              # smaller_cross_validation,
                                              model_selection_measure,
                                              evaluation_measures,
                                              param_grid=None,
                                              learning=True,
                                              training_size=training_size)
def evaluate_depth_based_classifier_cross_validated(depth):
    """Cross-validated evaluation of the depth-based baseline that selects all
    topics up to the given depth.

    Uses the same data and cross-validation setup as the learning-based
    evaluation, but with learning disabled.

    NOTE(review): train_evaluate_topic_classifier_cv is not imported in this
    notebook — it must be defined elsewhere in the session.
    """
    np.random.seed(0)  # reproducible CV splits
    topics, classes, inner_cross_validation,\
    outer_cross_validation, model_selection_measure,\
    evaluation_measures = default_classifier_evaluation_params()
    # Dead code removed: the original defined an unused local helper
    # 'smaller_cross_validation' here.
    full_selection = precompute_full_selection()
    tuned_clf = depth_based_selection(full_selection._root, depth)
    return train_evaluate_topic_classifier_cv(tuned_clf, topics, classes,
                                              inner_cross_validation,
                                              outer_cross_validation,
                                              model_selection_measure,
                                              evaluation_measures,
                                              # param_grid=False (not None) kept
                                              # as-is — presumably treated the
                                              # same by the callee; confirm.
                                              param_grid=False,
                                              learning=False)
def plot_performance_depth_based_classifier():
    """Evaluates the depth-based baseline for depths 1..7 and plots each
    metric's mean with std error bars as a function of depth.

    :return: numpy array of shape (2, n_metrics, n_depths) after the
        transpose — index [0] holds the means, [1] the standard deviations.
    """
    metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
    performance = []
    depth_range = range(1, 8)
    for depth in depth_range:
        performance.append([])
        # NOTE(review): the local name 'metrics' shadows the sklearn.metrics
        # module imported at the top of the notebook.
        metrics = evaluate_depth_based_classifier_cross_validated(depth)
        for metric_name, metric in zip(metric_names, metrics):
            # One (mean, std) pair per metric, per depth.
            performance[-1].append((metric.mean(), metric.std()))
    # (depths, metrics, 2) -> (2, metrics, depths) via the transpose.
    performance = np.array(performance).T
    for idx in range(len(metric_names)):
        plt.figure()
        plt.errorbar(depth_range, performance[0, idx, :], yerr=performance[1, idx, :])
        plt.xlabel('depth')
        plt.ylabel(metric_names[idx])
    return performance
def evaluate_majority_classifier_cross_validated(training_size=None):
    """Cross-validated evaluation of a majority-class baseline.

    NOTE(review): MajorityClassClassifier is neither imported nor defined in
    this notebook — running this cell as-is raises NameError; the class is
    presumably defined elsewhere in the original session. Same caveat for
    train_evaluate_topic_classifier_cv.
    """
    np.random.seed(0)  # reproducible CV splits
    topics, classes, inner_cross_validation,\
    outer_cross_validation, model_selection_measure,\
    evaluation_measures = default_classifier_evaluation_params()
    tuned_clf = MajorityClassClassifier()
    return train_evaluate_topic_classifier_cv(tuned_clf, topics, classes,
                                              inner_cross_validation,
                                              outer_cross_validation,
                                              model_selection_measure,
                                              evaluation_measures,
                                              param_grid=None,
                                              learning=True,
                                              training_size=training_size)
def evaluate_random_classifier_cross_validated(training_size=None):
    """Cross-validated evaluation of a stratified-random baseline.

    NOTE(review): StratifiedRandomClassifier is neither imported nor defined
    in this notebook — running this cell as-is raises NameError; the class is
    presumably defined elsewhere in the original session. Same caveat for
    train_evaluate_topic_classifier_cv.
    """
    np.random.seed(0)  # reproducible CV splits
    topics, classes, inner_cross_validation,\
    outer_cross_validation, model_selection_measure,\
    evaluation_measures = default_classifier_evaluation_params()
    tuned_clf = StratifiedRandomClassifier()
    return train_evaluate_topic_classifier_cv(tuned_clf, topics, classes,
                                              inner_cross_validation,
                                              outer_cross_validation,
                                              model_selection_measure,
                                              evaluation_measures,
                                              param_grid=None,
                                              learning=True,
                                              training_size=training_size)
def produce_learning_curve(evaluation_fn_of_training_size, training_sizes):
    """Runs the given evaluation function once per training-set size.

    :param evaluation_fn_of_training_size: callable accepting a
        `training_size` keyword argument and returning the metric values.
    :param training_sizes: iterable of training-set sizes to evaluate at.
    :return: numpy array stacking one result row per training size.
    """
    return np.array([evaluation_fn_of_training_size(training_size=size)
                     for size in training_sizes])
In [41]:
# Evaluate the depth-based baseline for depths 1..7 (slow: one full CV run per
# depth) and keep the (mean, std) results around for re-plotting below.
depth_based_perf = plot_performance_depth_based_classifier()
In [43]:
%matplotlib inline
# Plot the depth-based baseline's accuracy (with error bars) against our
# learned classifier's accuracy. The constants 0.922 +- 0.011 look like the
# mean/std accuracy of the learned classifier taken from a CV run below —
# TODO confirm; they are hard-coded here.
plt.errorbar(range(1,8), depth_based_perf.T[:,0,0], yerr=depth_based_perf.T[:,0,1], linewidth=2, elinewidth=1)
plt.plot(range(1, 8), [0.922] * 7, linewidth=2)
plt.xlabel('Depth of selection')
plt.ylabel('Accuracy')
# Shaded band: +- one std around our method's accuracy.
plt.fill_between(range(1, 8), [0.922-0.011] * 7, [0.922+0.011] * 7, alpha=0.25, color='green')
plt.legend(['depth-based selection', 'our method'], loc='lower left')
plt.tight_layout()
plt.savefig(util.resource('accuracy-depth-based.pdf'))
In [46]:
# Report the cross-validated performance of the stratified-random baseline.
# (The local 'metrics' shadows the sklearn.metrics module imported above.)
metrics = evaluate_random_classifier_cross_validated()
metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
for metric_name, metric in zip(metric_names, metrics):
    print("{:<11s} : {:.2f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))
In [119]:
# training_sizes = [10, 15, 20, 25, 30, 35]
# metrics_ours_lc = produce_learning_curve(evaluate_learning_based_classifier_cross_validated, training_sizes)
In [137]:
# accuracies = metrics_ours_lc[:,0]
# f1s = metrics_ours_lc[:,3]
# plt.errorbar(training_sizes, accuracies.mean(axis=1), accuracies.std(axis=1))
# plt.errorbar(training_sizes, f1s.mean(axis=1), f1s.std(axis=1))
# plt.ylim(0.4, 1)
# plt.xlim(9, 30.2)
# plt.legend(['Accuracy', 'Weighted F1 score'], loc='lower right')
# plt.xlabel('Training data size (# points)')
# plt.tight_layout()
# plt.savefig(util.resource('node-selection-learning-curve.pdf'))
# plt.figure()
# plt.errorbar(training_sizes, f1s.mean(axis=1), f1s.std(axis=1))
# plt.plot(training_sizes, np.full_like(training_sizes, 0.908, dtype=float))
# plt.ylim(0.7, 1)
# (f1s[-1].mean(), f1s[-1].std())
In [59]:
# Cross-validated performance of the learning-based classifier.
metrics = evaluate_learning_based_classifier_cross_validated()
metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
for metric_name, metric in zip(metric_names, metrics):
    print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))
In [61]:
# Cross-validated performance of the depth-4 baseline.
metrics = evaluate_depth_based_classifier_cross_validated(depth=4)
metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
for metric_name, metric in zip(metric_names, metrics):
    print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))
In [63]:
# Cross-validated performance of the majority-class baseline.
metrics = evaluate_majority_classifier_cross_validated()
metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
for metric_name, metric in zip(metric_names, metrics):
    print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))
In [64]:
# Cross-validated performance of the stratified-random baseline (3-decimal
# formatting; this repeats the earlier random-baseline cell).
metrics = evaluate_random_classifier_cross_validated()
metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
for metric_name, metric in zip(metric_names, metrics):
    print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))
In [65]:
def stats_on_relevant_topics_according_to_classifier(clf, topic_df):
    """Tabulates and plots, per depth, how many topics `clf` predicts as
    relevant vs. irrelevant (the ground-truth column is dropped first).

    :return: DataFrame indexed by depth with columns
        'irrelevant', 'relevant', 'total' and 'percent_relevant'.
    """
    # Dead locals removed: the original computed `feature_columns` and
    # `feature_and_prediction_columns` and never used them.
    topic_df = topic_df.copy()
    topic_df['prediction'] = clf.predict(topic_df['topic'])
    topic_df = topic_df.drop('is_relevant', axis=1)
    # Count topics per (depth, prediction) pair, then unstack the predictions
    # into columns. NOTE: DataFrame.sort() is the old (pre-0.20) pandas API.
    stats_df = topic_df.dropna().groupby(['depth', 'prediction'])\
        ['topic']\
        .count()\
        .unstack('prediction')\
        .fillna(0)\
        .sort(ascending=False)
    stats_df = stats_df.rename(columns={False: 'irrelevant', True: 'relevant'})
    stats_df.plot(kind='barh', stacked=True)
    plt.xlabel('The number of topics')
    stats_df['total'] = stats_df['relevant'] + stats_df['irrelevant']
    stats_df['percent_relevant'] = (stats_df['relevant'] / stats_df['total']).round(3)
    return stats_df.sort(ascending=True)
stats_on_relevant_topics_according_to_classifier(
    default_trained_topic_selection_classifier(),
    precompute_topic_data_frame())
Out[65]:
In [66]:
def compare_and_plot_the_performances_of_3_classifiers(topic_df):
    """Prints F1 scores (positive and negative class) of several baseline
    classifiers on the labelled topics.

    NOTE(review): despite the name, four classifiers are compared (depth-4,
    overfitting SVM, decision tree, 1-NN). Also, the three *_clf helper
    builders ignore their `topic_df` parameter and train on the enclosing
    `topics` / `classes` closure variables.
    """
    topics = topic_df.dropna()['topic']
    classes = topic_df.dropna()['is_relevant']
    def eval_clf(clf, topic_df):
        # F1 for the 'relevant' (positive) and 'irrelevant' (negative) class.
        topics = topic_df.dropna()['topic']
        classes = topic_df.dropna()['is_relevant']
        return [evaluate_classifier(clf, topics, classes, util.f1_pos_class),
                evaluate_classifier(clf, topics, classes, util.f1_neg_class)]
    def overfitting_svm_clf(topic_df):
        # SVM with a very large C — deliberately prone to overfitting.
        training_params = new_training_params()
        training_params['classifier_params']['C'] = 1000
        full_selection = precompute_full_selection()
        return train_topic_classifier(topics, classes, full_selection, **training_params)
    def default_tree_clf(topic_df):
        # Decision tree with default parameters.
        training_params = new_training_params()
        training_params['classifier_fn'] = DecisionTreeClassifier
        training_params['classifier_params'] = {}
        full_selection = precompute_full_selection()
        return train_topic_classifier(topics, classes, full_selection, **training_params)
    def default_k1n_clf(topic_df):
        # 1-nearest-neighbour classifier.
        training_params = new_training_params()
        training_params['classifier_fn'] = KNeighborsClassifier
        training_params['classifier_params'] = {'n_neighbors' : 1}
        full_selection = precompute_full_selection()
        return train_topic_classifier(topics, classes, full_selection, **training_params)
    print("F1 measures, classes 'relevant' and 'irrelevant' for different classifiers:")
    print("Depth-4 classifier:", eval_clf(depth_based_selection(max_depth=4), topic_df))
    print("SVM, with C=1000:", eval_clf(overfitting_svm_clf(topic_df), topic_df))
    print("Decision tree:", eval_clf(default_tree_clf(topic_df), topic_df))
    print("1-nearest neighbor:", eval_clf(default_k1n_clf(topic_df), topic_df))
compare_and_plot_the_performances_of_3_classifiers(precompute_topic_data_frame())
In [69]:
def plot_points_2d(topic_df, var1, var2, response):
    """Scatter-plots the topics in the (var1, var2) feature plane, coloured by
    the boolean `response` column (red = False/irrelevant, green = True).

    A small Gaussian jitter is added to both coordinates so that topics that
    map to exactly the same feature values remain visible as separate marks.
    """
    from matplotlib.pylab import figure, subplot
    from matplotlib.colors import ListedColormap
    h = .02  # grid step; only used to build the meshgrid below
    cm_bright = ListedColormap(['#FF0000', '#00FF00'])
    X1 = topic_df.dropna()[var1].values
    X2 = topic_df.dropna()[var2].values
    y = topic_df.dropna()[response]
    x1_min, x1_max = X1.min() - .1, X1.max() + .1
    x2_min, x2_max = X2.min() - .1, X2.max() + .1
    # NOTE(review): the meshgrid is only used for its min/max when setting the
    # axis limits — building the full grid is unnecessary work (kept as-is
    # because arange excludes its endpoint, so xx1.max() != x1_max exactly).
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, h),
                           np.arange(x2_min, x2_max, h))
    figure(figsize=(10, 8))
    ax = subplot(111)
    # Plot the testing points, jittered.
    plt.scatter(X1 + np.random.normal(0, 0.02, len(X1)),
                X2 + np.random.normal(0, 0.02, len(X2)),
                c=y.astype(int), cmap=cm_bright,
                marker='+', alpha=1)
    ax.set_xlim(xx1.min(), xx1.max())
    ax.set_ylim(xx2.min(), xx2.max())
    ax.set_xlabel(var1)
    ax.set_ylabel(var2)
    ax.set_xticks(())
    ax.set_yticks(())
def plot_topics_against_couple_of_dimensions(topic_df):
    """Draws the 2-D relevance scatter plot for several feature pairs."""
    feature_pairs = [
        ('frac_parents_in_graph', 'avg_normalized_parent_depth'),
        ('normalized_depth', 'avg_normalized_parent_depth'),
        ('normalized_depth', 'frac_parents_in_graph'),
        ('median_normalized_parent_depth', 'avg_normalized_parent_depth'),
    ]
    # One figure per pair, in the same order as the original cell.
    for x_feature, y_feature in feature_pairs:
        plot_points_2d(topic_df, x_feature, y_feature, 'is_relevant')
plot_topics_against_couple_of_dimensions(precompute_topic_data_frame())
In [70]:
def list_topics_that_map_to_same_points(topic_df):
    """Prints the groups of labelled topics that share an identical feature
    vector yet carry conflicting relevance labels — points that no classifier
    over these features can separate."""
    feature_cols = default_features.feature_names()
    labelled = topic_df.dropna()
    def has_conflicting_labels(group):
        # True when the group contains both relevant and irrelevant topics.
        return group['is_relevant'].nunique() > 1
    conflicting = (labelled.groupby(feature_cols)
                   .filter(has_conflicting_labels)
                   .groupby(feature_cols))
    for group_idx in conflicting.groups.values():
        print(topic_df.ix[group_idx]
              [['depth', 'title', 'is_relevant']]
              .sort('is_relevant'))
list_topics_that_map_to_same_points(precompute_topic_data_frame())
In [71]:
def report_topics_level_up_to_4_predicted_as_irrelevant(clf, topic_df):
    """Lists the topics at depth <= 4 that `clf` predicts as irrelevant.

    :param clf: classifier exposing predict(topics) -> boolean predictions.
    :param topic_df: DataFrame with at least 'topic', 'title', 'depth' columns.
    :return: DataFrame with the 'title' and 'depth' of those topics.
    """
    topic_df = topic_df.copy()
    topic_df['predicted_relevant'] = clf.predict(topic_df['topic'])
    # Combine both conditions into a single boolean mask; the original's
    # chained indexing (df[a][b]) relied on index alignment and triggers
    # pandas indexing warnings.
    shallow_and_rejected = (topic_df['depth'] <= 4) & (topic_df['predicted_relevant'] == False)
    return topic_df[shallow_and_rejected][['title', 'depth']]
# Sanity check: shallow topics (depth <= 4) that the trained classifier
# rejects — likely misclassifications worth inspecting by hand.
report_topics_level_up_to_4_predicted_as_irrelevant(
    default_trained_topic_selection_classifier(),
    precompute_topic_data_frame())
Out[71]:
In [72]:
def report_shortest_path(selection, topic):
    """Returns the shortest path from the root of `selection` to the topic
    given by its plain title, via the project's TrainingDataSelection wrapper."""
    return TrainingDataSelection(selection)._shortest_path_from_root(to_category_uri(topic));
In [75]:
# Example: trace how "Library science" is reached from the root category.
report_shortest_path(precompute_full_selection(), "Library science")
Out[75]:
In [76]:
def get_subcats(topic):
    """Returns the subcategories of `topic` from the Wikipedia graph index.

    NOTE(review): WikipediaGraphIndex is not imported in this notebook —
    presumably it lives in dswont.wikiapi; as written this cell raises
    NameError when called. Confirm and qualify the name.
    """
    with WikipediaGraphIndex() as wiki:
        return wiki.get_subcats(topic)
def get_supercats(topic):
    """Returns the supercategories (parents) of `topic` from the graph index.

    NOTE(review): see get_subcats about the undefined WikipediaGraphIndex.
    """
    with WikipediaGraphIndex() as wiki:
        return wiki.get_supercats(topic)
def childrens_parents_with_duplicates(topic):
    """Returns, with duplicates, the other parents of each child of `topic`
    (i.e. the co-parents of `topic`'s children, excluding `topic` itself)."""
    return [parent for child in get_subcats(topic)
            for parent in get_supercats(child)
            if parent != topic]
In [78]:
def default_trained_topic_selection_classifier(precomputed_data={}):
    """Trains (once) and returns a topic-selection classifier trained on the
    full ground-truth data with C=1 and uniform instance weights.

    NOTE(review): this redefinition shadows the earlier function of the same
    name (which trained on default_data() with default params); after this
    cell runs, only this version is in effect.

    The mutable default argument serves as a per-process cache.
    """
    if 'default_clf' not in precomputed_data:
        full_selection = precompute_full_selection()
        training_data = read_ground_truth_data()
        training_params = new_training_params()
        training_params['classifier_params']['C'] = 1
        # Uniform instance weights override whatever new_training_params() set.
        training_params['instance_weight_fn'] = lambda x : 1
        clf = train_topic_classifier(
            training_data.keys(), training_data.values(),
            full_selection,
            **training_params)
        precomputed_data['default_clf'] = clf
    return precomputed_data['default_clf']