In [1]:
# from kmapper import jupyter
import kmapper as km
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import Isomap
from sklearn.preprocessing import MinMaxScaler
In [2]:
newsgroups = fetch_20newsgroups(subset='train')
X, y, target_names = np.array(newsgroups.data), np.array(newsgroups.target), np.array(newsgroups.target_names)
print("SAMPLE",X[0])
print("SHAPE",X.shape)
print("TARGET",target_names[y[0]])
To project the unstructured text dataset down to 2 fixed dimensions, we set up a function pipeline: each function takes as input the output of the previous one.
We will try out "Latent Semantic Char-Gram Analysis followed by Isometric Mapping".
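Passing a list of projections chains their fit_transform calls. As a rough, hand-rolled sketch of what that chaining boils down to (using the same hyperparameters as the cell below; in practice we let mapper.fit_transform do this for us):

# Illustrative only: roughly what the chained projection below computes
step1 = TfidfVectorizer(analyzer="char", ngram_range=(1, 6), max_df=0.83, min_df=0.05).fit_transform(X)
step2 = TruncatedSVD(n_components=100, random_state=1729).fit_transform(step1)
step3 = Isomap(n_components=2, n_jobs=-1).fit_transform(step2)
projected_manual = MinMaxScaler().fit_transform(step3)   # scaler applied only to the final step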
In [3]:
mapper = km.KeplerMapper(verbose=2)
projected_X = mapper.fit_transform(X,
    projection=[TfidfVectorizer(analyzer="char",
                                ngram_range=(1, 6),
                                max_df=0.83,
                                min_df=0.05),
                TruncatedSVD(n_components=100,
                             random_state=1729),
                Isomap(n_components=2,
                       n_jobs=-1)],
    scaler=[None, None, MinMaxScaler()])
print("SHAPE",projected_X.shape)
We cover the projection with 10 intervals per dimension, each overlapping its neighbours by 33% (10*10 = 100 cubes in total).
We cluster on the projection (note that we could also cluster on an inverse_X created by vectorizing the original text data).
For clustering we use Agglomerative Clustering with complete linkage, the cosine distance, and 3 clusters. Agglomerative Clustering is a good clustering algorithm for TDA: it creates pleasing, informative networks and has strong theoretical guarantees (see functor and functoriality).
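To make the covering concrete, here is a small hand-rolled sketch (our own construction, not KeplerMapper's internal code) of 10 intervals with 33% overlap on one min-max scaled axis:

# Illustrative only: one way to lay out 10 overlapping intervals on a [0, 1] axis
n_intervals, overlap = 10, 0.33
length = 1.0 / n_intervals            # base interval length
pad = length * overlap                # extra length shared with neighbouring intervals
edges = [(max(0.0, i * length - pad), min(1.0, (i + 1) * length + pad))
         for i in range(n_intervals)]
print(edges)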
In [4]:
from sklearn import cluster
graph = mapper.map(projected_X,
inverse_X=None,
clusterer=cluster.AgglomerativeClustering(n_clusters=3,
linkage="complete",
affinity="cosine"),
overlap_perc=0.33)
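The returned graph is a plain dictionary; assuming the usual "nodes"/"links" layout, a quick optional look at its size:

# Optional inspection: count nodes and edges in the Mapper graph
print("NODES", len(graph["nodes"]))
print("EDGES", sum(len(linked) for linked in graph["links"].values()))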
Here we show the flexibility of KeplerMapper by creating an interpretable_inverse_X that is easier for humans to interpret.
For text, this can be TF-IDF over (1-3)-wordgrams, as we do here. For structured data it could be regulatory/protected variables of interest, or features selected by another model, say the top 10% most informative features.
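As a hedged sketch of that last alternative (the vectorizer settings and the 10% threshold here are our own choices, not from the notebook), one could select the top word features with a chi-squared test against the labels:

# Illustrative alternative (not used below): keep the top 10% of word features by chi-squared score
from sklearn.feature_selection import SelectPercentile, chi2

word_tfidf = TfidfVectorizer(analyzer="word", stop_words="english", min_df=0.02).fit_transform(X)
selected_inverse_X = SelectPercentile(chi2, percentile=10).fit_transform(word_tfidf, y).toarray()
print("SELECTED SHAPE", selected_inverse_X.shape)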
In [5]:
vec = TfidfVectorizer(analyzer="word",
strip_accents="unicode",
stop_words="english",
ngram_range=(1,3),
max_df=0.97,
min_df=0.02)
interpretable_inverse_X = vec.fit_transform(X).toarray()
interpretable_inverse_X_names = vec.get_feature_names()
print("SHAPE", interpretable_inverse_X.shape)
print("FEATURE NAMES SAMPLE", interpretable_inverse_X_names[:400])
We use interpretable_inverse_X as the inverse_X during visualization. This way we get cluster statistics that are more informative and interpretable to humans (wordgrams are easier to read than chargrams).
We also pass projected_X to get cluster statistics for the projection. For custom_tooltips we use the textual description of each label.
The color function is simply the multi-class ground truth, represented as non-negative integers.
In [6]:
html = mapper.visualize(graph,
                        inverse_X=interpretable_inverse_X,
                        inverse_X_names=interpretable_inverse_X_names,
                        path_html="newsgroups20.html",
                        projected_X=projected_X,
                        projected_X_names=["ISOMAP1", "ISOMAP2"],
                        title="Newsgroups20: Latent Semantic Char-gram Analysis with Isometric Embedding",
                        custom_tooltips=np.array([target_names[ys] for ys in y]),
                        color_function=y)
# jupyter.display("newsgroups20.html")
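Outside a notebook you can open the saved visualization directly in a browser; a small optional convenience using only the standard library:

# Optional: open the generated HTML in the default browser
import os, webbrowser
webbrowser.open("file://" + os.path.abspath("newsgroups20.html"))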