In [1]:
# %matplotlib inline
# %load_ext autoreload
# %autoreload 2

In [2]:
import os
# Notebook import workaround: when the kernel's working directory is the
# 'dswont' package directory itself, step up to its parent so that the
# `from dswont import ...` statements in the next cell can resolve.
# NOTE(review): this changes the process-wide working directory, so any
# relative paths used later are resolved against the parent directory.
if (os.path.basename(os.getcwd()) == 'dswont'):
    os.chdir(os.path.dirname(os.getcwd()))

In [3]:
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats

import logging
# Configure the root logger to emit WARNING and above; the "Loop ..." messages
# visible in the cell outputs below are logged at this level on the root logger.
logging.basicConfig(level=logging.WARN)
# Silence the verbose urllib logger (only warnings and above get through).
logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(logging.WARN)

from dswont import topics
from dswont import util
from dswont import dbpedia

In [4]:
# DBpedia URI of the category used as the root of every music selection.
ROOT_CATEGORY_MUSIC = 'http://dbpedia.org/resource/Category:Music'
# Maximum depth passed to the full (unfiltered) selection, see
# precompute_full_selection().
DEFAULT_SELECTION_DEPTH = 9
# Relation cache shared by all selections built in this notebook; it is
# backed by two music-specific index files (sub-categories and
# super-categories). NOTE(review): the caching semantics live in
# topics.CategoryRelationCache and are not visible from here.
DEFAULT_RELATION_CACHE = topics.CategoryRelationCache(
    subcat_index_file=util.resource('wikipedia/uri-to-subcats-music'),
    supercat_index_file=util.resource('wikipedia/uri-to-supercats-music'))

def music_category_selection(**params):
    """
    Build and run a topics.CategorySelection for the music domain.

    The selection defaults to the music root category and the shared relation
    cache; any keyword given in ``params`` is forwarded to the constructor and
    overrides the default of the same name.

    Returns the selection after its ``run()`` method has been called.
    """
    selection_kwargs = dict(
        root=ROOT_CATEGORY_MUSIC,
        relation_cache=DEFAULT_RELATION_CACHE)
    # Caller-supplied keywords win over the defaults above.
    selection_kwargs.update(params)
    music_selection = topics.CategorySelection(**selection_kwargs)
    music_selection.run()
    return music_selection

def precompute_full_selection(precomputed_data={}):
    """
    Return the music selection down to DEFAULT_SELECTION_DEPTH, computing it once.

    The mutable default ``precomputed_data`` is intentional: it is the memo
    dictionary that persists between calls, so the selection is only built on
    the first call. Pass your own dict to use a separate cache.
    """
    cache_key = 'full_selection'
    if cache_key not in precomputed_data:
        precomputed_data[cache_key] = music_category_selection(
            max_depth=DEFAULT_SELECTION_DEPTH)
    return precomputed_data[cache_key]

In [5]:
def make_topic_data_frame(selection):
    """
    Tabulate the topics of ``selection`` as a DataFrame.

    One row per item yielded by iterating ``selection``, with the columns
    'topic' (the item itself), 'title' (``dbpedia.to_title`` of the topic) and
    'depth' (``selection.get_depth`` of the topic), in that order.
    """
    frame = pd.DataFrame({'topic': list(selection)})
    topic_column = frame['topic']
    frame['depth'] = topic_column.apply(selection.get_depth)
    frame['title'] = topic_column.apply(dbpedia.to_title)
    # Fix the column order for display.
    return frame.reindex(columns=['topic', 'title', 'depth'])

def precompute_unlabeled_topic_data_frame(precomputed_data={}):
    """
    Return the topic DataFrame of the full music selection, computing it once.

    The mutable default ``precomputed_data`` is the memo dictionary that keeps
    the frame between calls; this is deliberate.
    """
    cache_key = 'unlabeled_topic_df'
    if cache_key not in precomputed_data:
        full_selection = precompute_full_selection()
        precomputed_data[cache_key] = make_topic_data_frame(full_selection)
    return precomputed_data[cache_key]

In [6]:
def report_level_distribution(topic_df):
    """
    Count the topics at each depth.

    Returns a Series indexed by 'depth' whose values are the number of
    non-null 'title' entries at that depth.
    """
    titles_by_depth = topic_df.groupby('depth')['title']
    return titles_by_depth.count()

In [7]:
# report_level_distribution(precompute_unlabeled_topic_data_frame())


WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
WARNING:root:Loop 'Chinese language by country'<->'Chinese languages in Singapore'
WARNING:root:Loop 'Magazines'<->'Magazine publishing'
WARNING:root:Loop 'Basketball venues in the United States'<->'American Basketball Association (2000–present) venues'
WARNING:root:Loop 'Islamic Republic of Iran Broadcasting'<->'Television stations in Iran'
WARNING:root:Loop 'Punjab'<->'Punjabi culture'
WARNING:root:Loop 'Punjab, Pakistan'<->'Punjabi culture'
WARNING:root:Loop 'Rosh Hashanah'<->'Ten Days of Repentance'
WARNING:root:Loop 'Yom Kippur'<->'Ten Days of Repentance'
WARNING:root:Loop 'Computer vision'<->'Image processing'
WARNING:root:Loop 'Chinese society'<->'Chinese culture'
WARNING:root:Loop 'Singaporean society'<->'Languages of Singapore'
WARNING:root:Loop 'Singaporean society'<->'Singaporean culture'
WARNING:root:Loop 'Punjab, India'<->'Punjabi culture'
WARNING:root:Loop 'Maccabees'<->'Cultural depictions of the Maccabees'
WARNING:root:Loop 'Optical materials'<->'Transparent materials'
Out[7]:
depth
0            1
1           35
2          382
3         2689
4        12043
5        25191
6        22872
7         6927
8         5637
9        15752
Name: title, dtype: int64

In [52]:
def sample_from_df(df, nrows = 10, seed=0):
    """
    Draw ``nrows`` distinct rows from ``df``, reproducibly.

    Parameters:
        df: DataFrame to sample from; rows are chosen by index label.
        nrows: number of rows to draw, without replacement.
        seed: seed of the random generator; the same seed always yields the
            same rows for the same frame.

    Returns the selected rows, in the order they were drawn.

    Raises ValueError (from NumPy) when ``nrows`` exceeds the number of rows.
    """
    # A private RandomState produces exactly the sample that
    # `np.random.seed(seed); np.random.choice(...)` would, but without
    # resetting the global NumPy random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    rows = rng.choice(df.index.values, nrows, replace=False)
    # `.ix` was removed from pandas; `.loc` is the label-based equivalent.
    return df.loc[rows]

In [53]:
def sample_from_level(df, level, nrows=10, seed=0):
    """
    Sample ``nrows`` rows from the rows of ``df`` whose 'depth' equals ``level``.

    The sampling itself (and the meaning of ``seed``) is delegated to
    sample_from_df.
    """
    at_level = df['depth'] == level
    return sample_from_df(df[at_level], nrows, seed)

In [54]:
# sample_from_level(precompute_unlabeled_topic_data_frame(), 9, 100)['title'].values
# # Number of relevant topics : 1
# # 'Trauma Records albums'
# # 95% conf. interval: [0.000, 0.054]
# # Conclusion: could probably discard level 9

# sample_from_level(precompute_unlabeled_topic_data_frame(), 8, 100)['title'].values
# # Number of relevant topics : 21
# # 'Raised by Swans albums'
# # 'Low-importance Madonna articles'
# # 'Cub Country albums'
# # 'The Revolution Smile albums'
# # 'Island Records albums'
# # 'Polar Music albums'
# # 'Hannah Georgas albums'
# # 'Operas set in Turkey'
# # 'Category-Class Madonna articles'
# # 'Free multimedia codecs, containers, and splitters'
# # 'J Storm albums'
# # 'Portal-Class Madonna articles'
# # 'The Folk Implosion albums'
# # 'Hawksley Workman albums'
# # 'Skipping Girl Vinegar albums'
# # 'Loveless albums'
# # 'The Hours albums'
# # 'Nadine songs'
# # 'Two Hours Traffic albums'
# # 'Alternative rock groups from Maryland'
# # 'Cusco (band) albums'
# # 95% conf. interval: (0.135, 0.303)

None

In [55]:
def clopper_pearson(k, n, alpha):
    """
    Clopper-Pearson ("exact") confidence interval for a binomial proportion.

    http://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval

    Parameters:
        k: number of observed successes.
        n: number of trials.
        alpha: significance level; the interval has confidence 1 - alpha
            (alpha=0.05 gives a 95% interval).

    Returns a (lo, hi) pair of bounds on the success probability.
    Clopper-Pearson intervals are a conservative estimate.
    """
    lo = stats.beta.ppf(alpha/2, k, n-k+1)
    hi = stats.beta.ppf(1 - alpha/2, k+1, n-k)
    # At the extremes one Beta shape parameter is 0 and ppf returns NaN.
    # By definition the lower bound is 0 when no successes were observed and
    # the upper bound is 1 when every trial succeeded. `[()]` unwraps the
    # 0-d array np.where produces for scalar inputs; arrays pass through.
    lo = np.where(np.asarray(k) == 0, 0.0, lo)[()]
    hi = np.where(np.asarray(k) == np.asarray(n), 1.0, hi)[()]
    return lo, hi

In [56]:
def generate_and_save_topics_for_labeling(filename):
    """
    Sample topics from the full music selection and save them, unlabeled.

    Topics are drawn with
    ``TrainingDataSelection.sample_paths_through_from_anywhere(1000)`` and
    written through ``TrainingDataProcessing.save_topic_labels`` with every
    label set to None, so that they can be labeled by hand afterwards.

    Parameters:
        filename: destination passed to ``save_topic_labels``.
    """
    topic_data_sampler = topics.TrainingDataSelection(precompute_full_selection())
    topic_sample = topic_data_sampler.sample_paths_through_from_anywhere(1000)
    data_processing = topics.TrainingDataProcessing()
    topics_for_labeling = list(topic_sample)
    # No labels exist yet: one None placeholder per sampled topic.
    unlabeled = [None] * len(topics_for_labeling)
    # The original passed the undefined name `topic_data_file` here, which
    # raised NameError on every call; the destination is `filename`.
    data_processing.save_topic_labels(topics_for_labeling,
                                      unlabeled,
                                      filename,
                                      topic_data_sampler)
    
# generate_and_save_topics_for_labeling()

def read_ground_truth_topic_labels():
    """
    Load the hand-labeled music topics from the bundled resource file.

    Returns whatever ``TrainingDataProcessing.read_topic_labels`` produces;
    the rest of this notebook uses it as a mapping (``.items()``, ``.keys()``,
    ``.values()``) from topic to relevance label.
    """
    label_reader = topics.TrainingDataProcessing()
    labels_path = util.resource('labeled-topics-music-1000-dm.txt')
    return label_reader.read_topic_labels(labels_path)

def make_labeled_topic_data_frame(selection, ground_truth_data):
    """
    Join the topics of ``selection`` with their ground-truth relevance labels.

    Parameters:
        selection: iterable of topics that also provides ``get_depth(topic)``.
        ground_truth_data: mapping from topic to relevance label.

    Returns a DataFrame with the columns 'topic', 'title', 'depth' and
    'is_relevant'. The join is an outer merge, so topics present on only one
    side are kept, with missing values in the columns of the other side.
    """
    merged = pd.DataFrame({'topic': list(selection)})
    merged['depth'] = merged['topic'].apply(selection.get_depth)

    # Split the mapping into two parallel tuples: topics and their labels.
    labeled_topics, relevance_labels = zip(*ground_truth_data.items())
    ground_truth_df = pd.DataFrame({
        'topic': labeled_topics,
        'is_relevant': relevance_labels,
    })

    # No explicit key: pandas merges on the shared column(s), here 'topic'.
    merged = merged.merge(ground_truth_df, how='outer')
    merged['title'] = merged['topic'].apply(dbpedia.to_title)
    return merged.reindex(columns=['topic', 'title', 'depth', 'is_relevant'])

def precompute_labeled_topic_data_frame(precomputed_data={}):
    """
    Return the labeled topic DataFrame of the full music selection, computing it once.

    The mutable default ``precomputed_data`` is the memo dictionary that keeps
    the frame between calls; this is deliberate.
    """
    cache_key = 'labeled_topic_df'
    if cache_key in precomputed_data:
        return precomputed_data[cache_key]
    full_selection = precompute_full_selection()
    labels = read_ground_truth_topic_labels()
    labeled_df = make_labeled_topic_data_frame(full_selection, labels)
    precomputed_data[cache_key] = labeled_df
    return labeled_df

In [15]:
# precompute_labeled_topic_data_frame().groupby('depth')['topic'].count()


Out[15]:
depth
0            1
1           35
2          382
3         2689
4        12043
5        25191
6        22872
7         6927
8         5637
9        15752
Name: topic, dtype: int64

In [33]:
def apply_to_new_domain(selection_classifier:topics.CategorySelectionClassifier,
                        new_domain_full_selection:topics.CategorySelection):
    """
    Re-target a trained selection classifier at another domain's categories.

    Works on a copy of ``selection_classifier`` (the argument itself is not
    reassigned): the copy keeps the topic classifier of the original selection,
    but its full selection, maximum depth and selection are replaced with ones
    derived from ``new_domain_full_selection``. The new selection is run before
    the copy is returned.
    """
    retargeted = selection_classifier.copy()
    # Keep the already-trained topic classifier; only the graph changes.
    trained_topic_classifier = retargeted.selection._classifier
    retargeted.full_selection = new_domain_full_selection
    retargeted.max_depth = new_domain_full_selection._max_depth
    new_selection = topics.CategorySelection(
        new_domain_full_selection._root,
        trained_topic_classifier,
        new_domain_full_selection._max_depth,
        new_domain_full_selection._relations)
    retargeted.selection = new_selection
    new_selection.run()
    return retargeted

In [34]:
# cs_clf = topics.default_trained_topic_selection_classifier()
# music_clf = apply_to_new_domain(cs_clf, precompute_full_selection())


WARNING:root:Loop 'Musicology'<->'Philosophy of music'
WARNING:root:Loop 'Philosophy of music'<->'Musicology'
WARNING:root:Loop 'Contemporary classical musicians'<->'Contemporary classical composers'
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Freak folk'<->'New Weird America'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Distributed data storage'<->'File sharing networks'
WARNING:root:Loop 'File sharing networks'<->'Distributed data storage'
WARNING:root:Loop 'Electronic musical instruments'<->'Quintephones'
WARNING:root:Loop 'Contemporary classical composers'<->'Contemporary classical musicians'
WARNING:root:Loop 'Band-centric video games'<->'Musician video games'
WARNING:root:Loop 'Musician video games'<->'Band-centric video games'
WARNING:root:Loop 'New Weird America'<->'Freak folk'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Quintephones'<->'Electronic musical instruments'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Marathi music'<->'Marathi film music composers'
WARNING:root:Loop 'Middle Eastern music'<->'Israeli music'
WARNING:root:Loop 'Central American music'<->'Latin American music'
WARNING:root:Loop 'Latin American music'<->'Central American music'
WARNING:root:Loop 'Marathi film music composers'<->'Marathi music'
WARNING:root:Loop 'Johnny Cash'<->'Cash-Carter family'
WARNING:root:Loop 'Linda McCartney'<->'McCartney family'
WARNING:root:Loop 'Wynonna Judd'<->'The Judds'
WARNING:root:Loop 'Yoko Ono'<->'Lennon family'
WARNING:root:Loop 'John Lennon'<->'Lennon family'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
WARNING:root:Loop 'Latvian classical musicians by instrument'<->'Latvian classical organists'
WARNING:root:Loop 'Australian electronic music groups'<->'Australian electronic dance music groups'
WARNING:root:Loop 'Basketball venues in the United States'<->'American Basketball Association (2000–present) venues'
WARNING:root:Loop 'Latvian classical organists'<->'Latvian classical musicians'
WARNING:root:Loop 'Latvian classical organists'<->'Latvian classical musicians by instrument'

In [60]:
# categories, classes = zip(*read_ground_truth_topic_labels().items())
# print(topics.evaluate_classifier(music_clf, categories, classes, util.accuracy_score))
# print(topics.evaluate_classifier(music_clf, categories, classes, util.f1_pos_class))
# print(topics.evaluate_classifier(music_clf, categories, classes, util.f1_neg_class))
# print(topics.evaluate_classifier(music_clf, categories, classes, util.weighted_f1))


0.858378378378
0.896278701504
0.776831345826
0.860767325492

In [66]:
def depth_based_selection(full_selection, depth):
    """
    Build and run a selection limited to ``depth`` with no topic classifier.

    The root and the category relations are taken from ``full_selection``; the
    classifier argument is None, so (presumably - confirm against
    topics.CategorySelection) depth is the only selection criterion.
    """
    depth_limited = topics.CategorySelection(
        full_selection._root,
        None,  # no classifier
        depth,
        full_selection._relations)
    depth_limited.run()
    return depth_limited

# for depth in range(1, 10):
#     clf = depth_based_selection(precompute_full_selection(), depth)
#     print("Depth:", depth)
#     print("Accuracy:", topics.evaluate_classifier(clf, categories, classes, util.accuracy_score))
#     print("Positive F1:", topics.evaluate_classifier(clf, categories, classes, util.f1_pos_class))
#     print("Negative F1:", topics.evaluate_classifier(clf, categories, classes, util.f1_neg_class))
#     print("Weighted F1:", topics.evaluate_classifier(clf, categories, classes, util.weighted_f1))


Depth: 1
Accuracy: 0.297297297297
Positive F1: 0.0
Negative F1: 0.458333333333
Weighted F1: 0.136261261261
Depth: 2
Accuracy: 0.353513513514
Positive F1: 0.148148148148
Negative F1: 0.479094076655
Weighted F1: 0.246537478245
Depth:
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
 3
Accuracy: 0.48
Positive F1: 0.419782870929
Negative F1: 0.52889324192
Weighted F1: 0.452221089332
Depth:
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
 4
Accuracy: 0.648648648649
Positive F1: 0.6772591857
Negative F1: 0.614472123369
Weighted F1: 0.658592761764
Depth:
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
 5
Accuracy: 0.810810810811
Positive F1: 0.854288093256
Negative F1: 0.730354391371
Weighted F1: 0.817442938641
Depth:
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
WARNING:root:Loop 'Chinese language by country'<->'Chinese languages in Singapore'
WARNING:root:Loop 'Magazines'<->'Magazine publishing'
WARNING:root:Loop 'Basketball venues in the United States'<->'American Basketball Association (2000–present) venues'
WARNING:root:Loop 'Islamic Republic of Iran Broadcasting'<->'Television stations in Iran'
 6
Accuracy: 0.885405405405
Positive F1: 0.921013412817
Negative F1: 0.791338582677
Weighted F1: 0.882461436289
Depth:
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
WARNING:root:Loop 'Chinese language by country'<->'Chinese languages in Singapore'
WARNING:root:Loop 'Magazines'<->'Magazine publishing'
WARNING:root:Loop 'Basketball venues in the United States'<->'American Basketball Association (2000–present) venues'
WARNING:root:Loop 'Islamic Republic of Iran Broadcasting'<->'Television stations in Iran'
WARNING:root:Loop 'Punjab'<->'Punjabi culture'
WARNING:root:Loop 'Punjab, Pakistan'<->'Punjabi culture'
WARNING:root:Loop 'Rosh Hashanah'<->'Ten Days of Repentance'
WARNING:root:Loop 'Yom Kippur'<->'Ten Days of Repentance'
WARNING:root:Loop 'Computer vision'<->'Image processing'
 7
Accuracy: 0.84
Positive F1: 0.896503496503
Negative F1: 0.647619047619
Weighted F1: 0.822510822511
Depth:
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
WARNING:root:Loop 'Chinese language by country'<->'Chinese languages in Singapore'
WARNING:root:Loop 'Magazines'<->'Magazine publishing'
WARNING:root:Loop 'Basketball venues in the United States'<->'American Basketball Association (2000–present) venues'
WARNING:root:Loop 'Islamic Republic of Iran Broadcasting'<->'Television stations in Iran'
WARNING:root:Loop 'Punjab'<->'Punjabi culture'
WARNING:root:Loop 'Punjab, Pakistan'<->'Punjabi culture'
WARNING:root:Loop 'Rosh Hashanah'<->'Ten Days of Repentance'
WARNING:root:Loop 'Yom Kippur'<->'Ten Days of Repentance'
WARNING:root:Loop 'Computer vision'<->'Image processing'
WARNING:root:Loop 'Chinese society'<->'Chinese culture'
WARNING:root:Loop 'Singaporean society'<->'Languages of Singapore'
WARNING:root:Loop 'Singaporean society'<->'Singaporean culture'
WARNING:root:Loop 'Punjab, India'<->'Punjabi culture'
WARNING:root:Loop 'Maccabees'<->'Cultural depictions of the Maccabees'
WARNING:root:Loop 'Optical materials'<->'Transparent materials'
 8
Accuracy: 0.76972972973
Positive F1: 0.859033752482
Negative F1: 0.371681415929
Weighted F1: 0.714145219993
Depth: 9
Accuracy: 0.702702702703
Positive F1: 0.825396825397
Negative F1: 0.0
Weighted F1: 0.580008580009

In [111]:
from sklearn import cross_validation

def default_classifier_evaluation_params():
    """
    Assemble the shared inputs of the cross-validated evaluations below.

    Returns a 6-tuple:
        categories: array of labeled topics, shuffled with the global NumPy RNG
            (seed it beforehand for a reproducible order);
        classes: boolean array of relevance labels, aligned with categories;
        inner_cross_validation: None (no inner cross-validation);
        outer_cross_validation: topics.default_cross_validation;
        model_selection_measure: callable forwarding to util.weighted_f1;
        evaluation_measures: list of accuracy, positive-class F1,
            negative-class F1 and weighted F1.
    """
    labeled_pairs = list(read_ground_truth_topic_labels().items())
    # Shuffle (topic, label) pairs together so they stay aligned.
    np.random.shuffle(labeled_pairs)
    topic_column, label_column = zip(*labeled_pairs)
    classes = np.array(label_column, dtype=bool)
    categories = np.array(topic_column)

    def model_selection_measure(*args, **params):
        return util.weighted_f1(*args, **params)

    evaluation_measures = [
        util.accuracy_score,
        util.f1_pos_class,
        util.f1_neg_class,
        util.weighted_f1,
    ]
    inner_cross_validation = None  # No inner cross-validation.
    outer_cross_validation = topics.default_cross_validation
    return (categories, classes, inner_cross_validation,
            outer_cross_validation, model_selection_measure,
            evaluation_measures)

def evaluate_learning_based_classifier_cross_validated(training_size=None):
    """
    Cross-validate the learning-based topic selection classifier on music.

    Seeds the global NumPy RNG with 0, takes the shared evaluation inputs from
    default_classifier_evaluation_params(), and tunes C over a fixed grid with
    a 3-fold stratified inner cross-validation.

    Parameters:
        training_size: forwarded unchanged to
            topics.train_evaluate_topic_classifier_cv.

    Returns the result of topics.train_evaluate_topic_classifier_cv; the
    notebook zips it with the four evaluation-measure names.
    """
    np.random.seed(0)
    # The default inner cross-validation (None) is replaced below.
    (categories, classes, _default_inner_cross_validation,
     outer_cross_validation, model_selection_measure,
     evaluation_measures) = default_classifier_evaluation_params()

    # Cheaper 2-fold splitter; not used at the moment, kept as a drop-in
    # replacement for outer_cross_validation when iterating quickly.
    def smaller_cross_validation(outputs):
        return cross_validation.StratifiedKFold(outputs, n_folds=2)

    def inner_cross_validation(outputs):
        return cross_validation.StratifiedKFold(outputs, n_folds=3)

    param_grid = topics.new_training_params_cv()['param_grid']
    param_grid[0]['C'] = [0.25, 0.5, 1, 3, 7, 15]

    full_selection = precompute_full_selection()
    features = topics.default_features.copy()
    classifier_params = topics.default_classifier_params.copy()
    classifier_params['C'] = 0.25

    def unit_instance_weight(x):
        # Every training instance carries the same weight.
        return 1

    tuned_clf = topics.CategorySelectionClassifier(
        full_selection=full_selection,
        features=features,
        classifier_fn=topics.default_classifier,
        max_depth=full_selection._max_depth,
        instance_weight=unit_instance_weight,
        **classifier_params)
    print(classes.dtype)
    return topics.train_evaluate_topic_classifier_cv(
        tuned_clf, categories, classes,
        inner_cross_validation,
        outer_cross_validation,
        model_selection_measure,
        evaluation_measures,
        param_grid=param_grid,
        learning=True,
        training_size=training_size)

In [112]:
# metrics = evaluate_learning_based_classifier_cross_validated()


WARNING:root:Loop 'Musicology'<->'Philosophy of music'
WARNING:root:Loop 'Philosophy of music'<->'Musicology'
WARNING:root:Loop 'Contemporary classical musicians'<->'Contemporary classical composers'
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Freak folk'<->'New Weird America'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Distributed data storage'<->'File sharing networks'
WARNING:root:Loop 'File sharing networks'<->'Distributed data storage'
WARNING:root:Loop 'Electronic musical instruments'<->'Quintephones'
WARNING:root:Loop 'Contemporary classical composers'<->'Contemporary classical musicians'
WARNING:root:Loop 'Band-centric video games'<->'Musician video games'
WARNING:root:Loop 'Musician video games'<->'Band-centric video games'
WARNING:root:Loop 'New Weird America'<->'Freak folk'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Quintephones'<->'Electronic musical instruments'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Marathi music'<->'Marathi film music composers'
WARNING:root:Loop 'Middle Eastern music'<->'Israeli music'
WARNING:root:Loop 'Central American music'<->'Latin American music'
WARNING:root:Loop 'Latin American music'<->'Central American music'
WARNING:root:Loop 'Marathi film music composers'<->'Marathi music'
WARNING:root:Loop 'Johnny Cash'<->'Cash-Carter family'
WARNING:root:Loop 'Linda McCartney'<->'McCartney family'
WARNING:root:Loop 'Wynonna Judd'<->'The Judds'
WARNING:root:Loop 'Yoko Ono'<->'Lennon family'
WARNING:root:Loop 'John Lennon'<->'Lennon family'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
WARNING:root:Loop 'Latvian classical musicians by instrument'<->'Latvian classical organists'
WARNING:root:Loop 'Chinese language by country'<->'Chinese languages in Singapore'
WARNING:root:Loop 'Magazines'<->'Magazine publishing'
WARNING:root:Loop 'Australian electronic music groups'<->'Australian electronic dance music groups'
WARNING:root:Loop 'Basketball venues in the United States'<->'American Basketball Association (2000–present) venues'
WARNING:root:Loop 'Islamic Republic of Iran Broadcasting'<->'Television stations in Iran'
WARNING:root:Loop 'The Judds'<->'Wynonna Judd'
WARNING:root:Loop 'Latvian classical organists'<->'Latvian classical musicians'
WARNING:root:Loop 'Latvian classical organists'<->'Latvian classical musicians by instrument'
WARNING:root:Loop 'Igboid languages'<->'Igbo language'
WARNING:root:Loop 'Australian electronic dance music groups'<->'Australian electronic music groups'
bool
Fold 1
Fold
WARNING:root:Loop 'Musicology'<->'Philosophy of music'
WARNING:root:Loop 'Philosophy of music'<->'Musicology'
WARNING:root:Loop 'Contemporary classical musicians'<->'Contemporary classical composers'
WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Freak folk'<->'New Weird America'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Distributed data storage'<->'File sharing networks'
WARNING:root:Loop 'File sharing networks'<->'Distributed data storage'
WARNING:root:Loop 'Electronic musical instruments'<->'Quintephones'
WARNING:root:Loop 'Contemporary classical composers'<->'Contemporary classical musicians'
WARNING:root:Loop 'Band-centric video games'<->'Musician video games'
WARNING:root:Loop 'Musician video games'<->'Band-centric video games'
WARNING:root:Loop 'New Weird America'<->'Freak folk'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Quintephones'<->'Electronic musical instruments'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Marathi music'<->'Marathi film music composers'
WARNING:root:Loop 'Middle Eastern music'<->'Israeli music'
WARNING:root:Loop 'Central American music'<->'Latin American music'
WARNING:root:Loop 'Latin American music'<->'Central American music'
WARNING:root:Loop 'Marathi film music composers'<->'Marathi music'
WARNING:root:Loop 'Johnny Cash'<->'Cash-Carter family'
WARNING:root:Loop 'Linda McCartney'<->'McCartney family'
WARNING:root:Loop 'Wynonna Judd'<->'The Judds'
WARNING:root:Loop 'Yoko Ono'<->'Lennon family'
WARNING:root:Loop 'John Lennon'<->'Lennon family'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
WARNING:root:Loop 'Latvian classical musicians by instrument'<->'Latvian classical organists'
WARNING:root:Loop 'Magazines'<->'Magazine publishing'
WARNING:root:Loop 'Australian electronic music groups'<->'Australian electronic dance music groups'
WARNING:root:Loop 'Basketball venues in the United States'<->'American Basketball Association (2000–present) venues'
WARNING:root:Loop 'The Judds'<->'Wynonna Judd'
WARNING:root:Loop 'Latvian classical organists'<->'Latvian classical musicians'
WARNING:root:Loop 'Latvian classical organists'<->'Latvian classical musicians by instrument'
WARNING:root:Loop 'Igboid languages'<->'Igbo language'
WARNING:root:Loop 'Australian electronic dance music groups'<->'Australian electronic music groups'
 2

In [115]:
# metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
# for metric_name, metric in zip(metric_names, metrics):
#     print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))


accuracy    : 0.883 +- 0.004
f1_pos      : 0.920 +- 0.004
f1_neg      : 0.782 +- 0.003
weighted_f1 : 0.879 +- 0.003

In [107]:
def evaluate_depth_based_classifier_cross_validated(depth):
    """
    Cross-validate the depth-only baseline for the given ``depth``.

    Seeds the global NumPy RNG with 0, takes the shared evaluation inputs from
    default_classifier_evaluation_params(), and evaluates the selection built
    by depth_based_selection() without any learning or parameter search.

    Parameters:
        depth: depth limit of the baseline selection.

    Returns the result of topics.train_evaluate_topic_classifier_cv; the
    notebook zips it with the four evaluation-measure names.
    """
    np.random.seed(0)
    categories, classes, inner_cross_validation,\
        outer_cross_validation, model_selection_measure,\
        evaluation_measures = default_classifier_evaluation_params()
    # The original hard-coded 6 here and silently ignored `depth`.
    baseline_clf = depth_based_selection(precompute_full_selection(), depth)
    return topics.train_evaluate_topic_classifier_cv(
        baseline_clf, categories, classes,
        inner_cross_validation,
        outer_cross_validation,
        model_selection_measure,
        evaluation_measures,
        param_grid=None,
        learning=False)

# metrics = evaluate_depth_based_classifier_cross_validated(6)
# metric_names = ['accuracy', 'f1_pos', 'f1_neg', 'weighted_f1']
# for metric_name, metric in zip(metric_names, metrics):
#     print("{:<11s} : {:.3f} +- {:.3f}".format(metric_name, metric.mean(), metric.std()))


WARNING:root:Loop 'Israeli music'<->'Middle Eastern music'
WARNING:root:Loop 'Distributed data storage'<->'Distributed data storage systems'
WARNING:root:Loop 'Cloud storage'<->'File hosting'
WARNING:root:Loop 'Cash-Carter family'<->'Johnny Cash'
WARNING:root:Loop 'Lennon family'<->'John Lennon'
WARNING:root:Loop 'Lennon family'<->'Yoko Ono'
WARNING:root:Loop 'McCartney family'<->'Linda McCartney'
WARNING:root:Loop 'Igbo language'<->'Igboid languages'
WARNING:root:Loop 'Sound'<->'Hearing'
WARNING:root:Loop 'Sound'<->'Music'
WARNING:root:Loop 'Victoria Beckham'<->'David Beckham'
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10
accuracy    : 0.885 +- 0.029
f1_pos      : 0.921 +- 0.020
f1_neg      : 0.790 +- 0.058
weighted_f1 : 0.882 +- 0.031

In [1]:
def default_trained_topic_selection_classifier(precomputed_data={}):
    """
    Return the topic classifier trained on the labeled music topics, training it once.

    Training uses topics.new_training_params() with C set to 0.5 and a
    constant instance weight of 1. The mutable default ``precomputed_data`` is
    the memo dictionary that keeps the classifier between calls; this is
    deliberate.
    """
    cache_key = 'music_clf'
    if cache_key in precomputed_data:
        return precomputed_data[cache_key]

    full_selection = precompute_full_selection()
    training_data = read_ground_truth_topic_labels()
    training_params = topics.new_training_params()
    training_params['classifier_params']['C'] = 0.5
    training_params['instance_weight_fn'] = lambda x : 1
    music_clf = topics.train_topic_classifier(
        training_data.keys(), training_data.values(),
        full_selection,
        **training_params)
    precomputed_data[cache_key] = music_clf
    return music_clf