In [1]:
import json, ast, re
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

Utility Functions


In [2]:
def load_items(filepath):
    print("Loading %s ..." % filepath)
    lines = open(filepath,"r").readlines()
    items = []
    category_map = {}
    for i, line in enumerate(lines):
        line = line.strip()
        if i == 0:
            task_name = line
        elif len(line) > 0:
            parts = line.split(":",1)
            category = parts[0].strip()
            for item in parts[1].strip().split(";"):
                if len(item) > 0:
                    items.append(item)
                    category_map[item] = category
    return task_name,items,category_map

In [3]:
def get_dict_data(filepath):
    print("Loading %s ..." % filepath)
    dict_data = json.load(open(filepath))
    dict_data = {ast.literal_eval(k): v for k, v in dict_data.items()}
    #dict_data = {frozenset(k): v for k, v in dict_data.items()}
    return dict_data

In [4]:
def create_similarity_matrix( items, dict_data ):
    n = len(items)
    S = np.zeros([n,n])
    index_map = {}
    for ind, item in enumerate(items):
        index_map[item] = ind
    for pair in dict_data:
        ind1 = index_map[pair[0]]
        ind2 = index_map[pair[1]]
        sim = dict_data[pair]
        S[ind1,ind2] = sim
        S[ind2,ind1] = sim
    return S

In [5]:
def create_distance_matrix( items, filepath ):
    sim_dict = get_dict_data( filepath )
    S = create_similarity_matrix( items, sim_dict )
    D = 1-(S/S.max())
    return D

In [6]:
# From sklearn calinski_harabaz_score, but need dispersion separately:
def between_within_dispersion(X, labels, cluster_names):
    """
    Based on
    .. [1] `T. Calinski and J. Harabasz, 1974. "A dendrite method for cluster
       analysis". Communications in Statistics
       <http://www.tandfonline.com/doi/abs/10.1080/03610927408827101>`_
    """    
    n_samples, _ = X.shape
    n_labels = max(labels)+1
    mean = np.mean(X, axis=0)
    extra_disp = 0.0
    intra_disp = 0.0
    results = pd.DataFrame(columns=["Categoey","Size","Within-SS", "Between-SS"])
    for k in range(n_labels):
        cluster_k = X[labels == k]
        mean_k = np.mean(cluster_k, axis=0)
        bgss = len(cluster_k) * np.sum((mean_k - mean) ** 2)
        wgss = np.sum((cluster_k - mean_k) ** 2)
        extra_disp += bgss
        intra_disp += wgss        
        results.loc[k] = (cluster_names[k], len(cluster_k), np.round(wgss,2), np.round(bgss,2)) 
    mean_within = wgss/k
    mean_between = bgss/k
    ch = extra_disp * (n_samples - n_labels) / (intra_disp * (n_labels - 1.))    
    return (results,ch,mean_within,mean_between)

Perform the Validation


In [7]:
methods = ["eve","word2vec_cbow","word2vec_sg","fasttext_cbow","fasttext_sg","glove"]
dataset_ids = ["european_cities", "movie_genres", "animal_classes", "cuisine", "music_genres", "nobel_laureates",
               "country_continent"]

In [8]:
result_cols = ["Dataset","Method","Mean Within","Mean Between","CH-Index"]
result_rows = []
for dataset_id in dataset_ids:
    # Load the data
    print()
    task_name,items,category_map = load_items("../dataset/tasks/%s.txt" % dataset_id )
    categories = list(set(category_map.values()))
    categories.sort()
    print("Task: %s" % task_name)
    print("%d categories: %s" % (len(categories),categories) )
    # Convert to labels
    labels = []
    for item in items:
        labels.append( categories.index( category_map[item] ) )
    labels = np.array(labels)    
    # Create distance matrices
    print("Creating distance matrices ...")
    D = {}
    for method_id in methods:
        filepath = "../output/pairwise_similarity/%s_%s.json" % (dataset_id,method_id) 
        D[method_id] = create_distance_matrix(items, filepath )
    # Calculate the scores
    print("Validating by measures ...")
    for method_id in methods:
        results,ch,mean_within,mean_between = between_within_dispersion( D[method_id], labels, categories )
        result_rows.append( [dataset_id, method_id, mean_within, mean_between, ch ] )


Loading ../dataset/tasks/european_cities.txt ...
Task: European cities
5 categories: ['France', 'Germany', 'Great Britain', 'Italy', 'Spain']
Creating distance matrices ...
Loading ../output/pairwise_similarity/european_cities_eve.json ...
Loading ../output/pairwise_similarity/european_cities_word2vec_cbow.json ...
Loading ../output/pairwise_similarity/european_cities_word2vec_sg.json ...
Loading ../output/pairwise_similarity/european_cities_fasttext_cbow.json ...
Loading ../output/pairwise_similarity/european_cities_fasttext_sg.json ...
Loading ../output/pairwise_similarity/european_cities_glove.json ...
Validating by measures ...

Loading ../dataset/tasks/movie_genres.txt ...
Task: Movie genres
5 categories: ['Animation', 'Crime film', 'Horror film', 'Science fiction film', 'Western (genre)']
Creating distance matrices ...
Loading ../output/pairwise_similarity/movie_genres_eve.json ...
Loading ../output/pairwise_similarity/movie_genres_word2vec_cbow.json ...
Loading ../output/pairwise_similarity/movie_genres_word2vec_sg.json ...
Loading ../output/pairwise_similarity/movie_genres_fasttext_cbow.json ...
Loading ../output/pairwise_similarity/movie_genres_fasttext_sg.json ...
Loading ../output/pairwise_similarity/movie_genres_glove.json ...
Validating by measures ...

Loading ../dataset/tasks/animal_classes.txt ...
Task: Animal classes
5 categories: ['Amphibian', 'Bird', 'Fish', 'Mammal', 'Reptile']
Creating distance matrices ...
Loading ../output/pairwise_similarity/animal_classes_eve.json ...
Loading ../output/pairwise_similarity/animal_classes_word2vec_cbow.json ...
Loading ../output/pairwise_similarity/animal_classes_word2vec_sg.json ...
Loading ../output/pairwise_similarity/animal_classes_fasttext_cbow.json ...
Loading ../output/pairwise_similarity/animal_classes_fasttext_sg.json ...
Loading ../output/pairwise_similarity/animal_classes_glove.json ...
Validating by measures ...

Loading ../dataset/tasks/cuisine.txt ...
Task: Cuisine
5 categories: ['Italian cuisine', 'Mexican cuisine', 'Pakistani cuisine', 'Swedish cuisine', 'Vietnamese cuisine']
Creating distance matrices ...
Loading ../output/pairwise_similarity/cuisine_eve.json ...
Loading ../output/pairwise_similarity/cuisine_word2vec_cbow.json ...
Loading ../output/pairwise_similarity/cuisine_word2vec_sg.json ...
Loading ../output/pairwise_similarity/cuisine_fasttext_cbow.json ...
Loading ../output/pairwise_similarity/cuisine_fasttext_sg.json ...
Loading ../output/pairwise_similarity/cuisine_glove.json ...
Validating by measures ...

Loading ../dataset/tasks/music_genres.txt ...
Task: Music genres
5 categories: ['Britpop', 'Classical music', 'Grunge', 'Hip hop music', 'Jazz']
Creating distance matrices ...
Loading ../output/pairwise_similarity/music_genres_eve.json ...
Loading ../output/pairwise_similarity/music_genres_word2vec_cbow.json ...
Loading ../output/pairwise_similarity/music_genres_word2vec_sg.json ...
Loading ../output/pairwise_similarity/music_genres_fasttext_cbow.json ...
Loading ../output/pairwise_similarity/music_genres_fasttext_sg.json ...
Loading ../output/pairwise_similarity/music_genres_glove.json ...
Validating by measures ...

Loading ../dataset/tasks/nobel_laureates.txt ...
Task: Nobel laureates
5 categories: ['List of Nobel Memorial Prize laureates in Economics', 'List of Nobel Peace Prize laureates', 'List of Nobel laureates in Chemistry', 'List of Nobel laureates in Literature', 'List of Nobel laureates in Physics']
Creating distance matrices ...
Loading ../output/pairwise_similarity/nobel_laureates_eve.json ...
Loading ../output/pairwise_similarity/nobel_laureates_word2vec_cbow.json ...
Loading ../output/pairwise_similarity/nobel_laureates_word2vec_sg.json ...
Loading ../output/pairwise_similarity/nobel_laureates_fasttext_cbow.json ...
Loading ../output/pairwise_similarity/nobel_laureates_fasttext_sg.json ...
Loading ../output/pairwise_similarity/nobel_laureates_glove.json ...
Validating by measures ...

Loading ../dataset/tasks/country_continent.txt ...
Task: Countries by continent
6 categories: ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America']
Creating distance matrices ...
Loading ../output/pairwise_similarity/country_continent_eve.json ...
Loading ../output/pairwise_similarity/country_continent_word2vec_cbow.json ...
Loading ../output/pairwise_similarity/country_continent_word2vec_sg.json ...
Loading ../output/pairwise_similarity/country_continent_fasttext_cbow.json ...
Loading ../output/pairwise_similarity/country_continent_fasttext_sg.json ...
Loading ../output/pairwise_similarity/country_continent_glove.json ...
Validating by measures ...

In [9]:
df_results = pd.DataFrame( result_rows, columns = result_cols )
df_results = df_results.round(3)
df_results


Out[9]:
Dataset Method Mean Within Mean Between CH-Index
0 european_cities eve 3.126 2.391 29.081
1 european_cities word2vec_cbow 7.715 17.141 48.572
2 european_cities word2vec_sg 5.455 7.448 28.980
3 european_cities fasttext_cbow 8.303 13.244 33.725
4 european_cities fasttext_sg 5.750 10.857 41.884
5 european_cities glove 6.859 3.835 15.534
6 movie_genres eve 6.922 1.582 12.447
7 movie_genres word2vec_cbow 11.982 0.396 1.358
8 movie_genres word2vec_sg 6.044 0.184 1.431
9 movie_genres fasttext_cbow 9.807 0.408 1.509
10 movie_genres fasttext_sg 5.611 0.178 1.873
11 movie_genres glove 17.964 0.476 1.269
12 animal_classes eve 1.996 0.466 7.643
13 animal_classes word2vec_cbow 13.029 1.298 5.983
14 animal_classes word2vec_sg 6.228 0.739 4.092
15 animal_classes fasttext_cbow 10.313 1.144 3.914
16 animal_classes fasttext_sg 7.713 1.134 4.438
17 animal_classes glove 12.203 0.460 5.463
18 cuisine eve 2.924 8.184 54.176
19 cuisine word2vec_cbow 17.314 2.115 2.376
20 cuisine word2vec_sg 8.884 2.122 3.512
21 cuisine fasttext_cbow 9.742 14.520 14.252
22 cuisine fasttext_sg 6.247 10.803 15.997
23 cuisine glove 12.362 0.882 2.227
24 music_genres eve 1.900 2.234 25.044
25 music_genres word2vec_cbow 8.248 2.791 18.010
26 music_genres word2vec_sg 5.247 1.597 14.803
27 music_genres fasttext_cbow 6.723 1.158 13.056
28 music_genres fasttext_sg 5.772 1.383 12.931
29 music_genres glove 7.722 1.681 6.092
30 nobel_laureates eve 2.876 1.945 21.847
31 nobel_laureates word2vec_cbow 14.561 0.787 3.580
32 nobel_laureates word2vec_sg 8.989 0.388 3.341
33 nobel_laureates fasttext_cbow 12.396 0.559 1.728
34 nobel_laureates fasttext_sg 10.586 0.547 3.156
35 nobel_laureates glove 15.131 0.200 2.909
36 country_continent eve 2.343 3.328 15.834
37 country_continent word2vec_cbow 2.632 3.862 11.838
38 country_continent word2vec_sg 2.254 1.782 8.194
39 country_continent fasttext_cbow 2.826 4.077 13.690
40 country_continent fasttext_sg 2.563 2.834 12.291
41 country_continent glove 2.597 1.630 7.522

In [10]:
df_results.to_csv("cluster-validation.csv")

In [ ]: