In [1]:
# Auto-reload edited Python modules before each cell executes, so changes
# to the clumpy source tree are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
In [2]:
# Plotting setup: render figures inline, use seaborn's "poster" context
# (large fonts for presentation) and its color-code shorthands ('b', 'g', ...).
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_context('poster')
sns.set_color_codes()
In [3]:
import clumpy
from clumpy import datasets
In [ ]:
# Fetch the raw 10k-diabetes frame and run the default clumpy clustering
# pipeline on it. (The duplicate `import clumpy` that was here has been
# removed — clumpy is already imported in the imports cell above.)
diabetes = clumpy.datasets.fetch_10kdiabetes().as_raw()
clusterer = clumpy.analysis.cluster(diabetes)
In [ ]:
# Fit k-means (k fixed to 4) on the 2-D embedding and visualize the clusters.
kmeans = clumpy.cluster.auto_kmeans(clusterer.embedding_, n_clusters=[4])
clumpy.plots.plot_clusters(clusterer.embedding_, kmeans.labels_)
In [114]:
# Box limits of the rule describing cluster 0 (displayed as cell output).
clusterer.rules_[0].limits
Out[114]:
In [113]:
# Box limits of the rule describing cluster 1 (displayed as cell output).
clusterer.rules_[1].limits
Out[113]:
In [110]:
# Plot diagnostics for cluster 0. NOTE(review): signature of
# clumpy.analysis.plot is not visible here — presumably
# (clusterer, data, cluster_id); confirm against the library.
clumpy.analysis.plot(clusterer, diabetes, 0)
In [8]:
from clumpy.datasets import fetch_10kdiabetes
from clumpy.datasets.utils import numeric_columns

# Reload the raw 10k-diabetes frame and drop the target column —
# clustering is done on features only.
diabetes = fetch_10kdiabetes()
data = diabetes.as_raw()
data.pop('readmitted')

# Partition the columns: numeric first, then everything else as categorical.
num_columns = numeric_columns(data)
numeric_set = set(num_columns)
categorical_columns = [col for col in data.columns if col not in numeric_set]
feature_names = num_columns + categorical_columns
In [17]:
from clumpy.preprocessing import process_data
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import numpy as np  # np is used below but was never imported in this notebook

# One-hot encode categoricals, mean-impute, and standardize numerics.
X, num_cols, cat_cols = process_data(data, categorical_columns=categorical_columns,
                                     impute='mean', cat_preprocessing='onehot',
                                     num_preprocessing='standardize')

# PCA on the one-hot block only. RandomizedPCA was deprecated and removed
# from scikit-learn (0.20); PCA(svd_solver='randomized') is its replacement.
pca = PCA(n_components=50, svd_solver='randomized', random_state=124,
          iterated_power=7).fit_transform(X[:, len(num_columns):])
X = np.hstack((X[:, :len(num_columns)], pca))

# Center, then scale each column by its (centered) maximum.
# NOTE(review): a constant column would have a centered max of 0 and divide
# by zero here — confirm none exist after preprocessing.
scaled_X = X - np.mean(X, axis=0)
scaled_X /= np.max(scaled_X, axis=0)
scaled_X

#tsne = TSNE(n_components=2, random_state=1234, verbose=True, init='pca')
#embedding = tsne.fit_transform(scaled_X)
#embedding -= np.mean(embedding, axis=0)
Out[17]:
In [109]:
from clumpy.preprocessing import process_data
from sklearn.manifold import TSNE
# Preprocess without one-hot/standardize options (mean imputation only).
# NOTE(review): the call at In[17] unpacks (X, num_cols, cat_cols) from
# process_data, but here the result is bound to a single name — confirm
# what the function returns when the preprocessing kwargs are omitted.
X = process_data(data, categorical_columns=categorical_columns, impute='mean')
#indices = np.arange(X.shape[1])
#dist_func = clumpy.metrics.GowerDistance(
#    numeric_indices=indices[:len(num_columns)], categorical_indices=indices[len(num_columns):], n_jobs=-1, gamma='heuristic')
#dis = dist_func(X)
#tsne = TSNE(n_components=2, random_state=1234, verbose=True, init='random', metric='precomputed')
#embedding = tsne.fit_transform(dis)
#embedding -= np.mean(embedding, axis=0)
Out[109]:
In [103]:
import hdbscan
# Density-based clustering on the t-SNE embedding, with a minimum cluster
# size of 1% of the points. NOTE(review): `embedding` is only produced by
# commented-out code above, so this cell depends on hidden kernel state and
# will fail under Restart & Run All. It also rebinds `clusterer`, shadowing
# the clumpy clusterer from the earlier cells.
clusterer = hdbscan.HDBSCAN(min_cluster_size=int(embedding.shape[0] * .01)).fit(embedding)
clumpy.plots.plot_clusters(embedding, clusterer.labels_)
Out[103]:
In [104]:
from clumpy.base import convert_to_kmeans
# Convert the HDBSCAN result into an equivalent KMeans model (provides
# cluster centers for plotting). Depends on `embedding`, which is only
# defined in commented-out code above — hidden kernel state.
kmeans = convert_to_kmeans(embedding, clusterer)
clumpy.plots.plot_clusters(embedding, kmeans.labels_, kmeans.cluster_centers_)
Out[104]:
In [18]:
from sklearn.cluster import KMeans

# Flat k-means on the scaled feature matrix. The original passed n_jobs=4,
# which was deprecated in scikit-learn 0.23 and removed in 1.0; a fixed
# random_state is set so the (otherwise stochastic) clustering is
# reproducible under Restart & Run All.
kmeans = KMeans(n_clusters=4, random_state=1234).fit(scaled_X)
In [23]:
from clumpy import importance
# Re-run preprocessing with default options to recover the feature layout,
# then rank features per cluster by ANOVA, keeping the top 5 for each.
X, num_cols, cat_cols = process_data(data, categorical_columns=categorical_columns)
feature_names = num_cols + cat_cols
importances = importance.anova_importance(X, kmeans.labels_, feature_names=feature_names, n_features=5)
In [25]:
from clumpy.rules import tree_descriptions
import pandas as pd  # pd is used below but was never imported in this notebook

# Decision-tree rule descriptions, one per cluster, restricted to the most
# important features and capped at depth 5.
rules = tree_descriptions(
    data,
    kmeans.labels_,
    categorical_columns=categorical_columns,
    feature_names=importances,
    max_depth=5)

# Summary frame of the rules. NOTE(review): the original built this frame
# mid-cell and discarded it (only a cell's last expression is displayed);
# it is now bound so it can be inspected or displayed.
rules_df = pd.DataFrame({'cluster_id': range(len(rules)), 'description': rules})

for cluster_id, rule in enumerate(rules):
    print('cluster_id: {}'.format(cluster_id))
    print()  # bare `print` was a Python 2 leftover — a no-op in Python 3
    print(rule)
    print()
In [43]:
# Display the per-cluster feature importances computed above.
importances
Out[43]:
In [46]:
from clumpy.rules import prim_descriptions
# PRIM boxes describing each cluster in terms of its important features.
boxes = prim_descriptions(data, kmeans.labels_, feature_names=importances)
In [50]:
# Print the box limits for every cluster's PRIM description.
for box in boxes:
    print(box.limits)
In [203]:
from clumpy.plots import plot_cluster_statistics
# Detailed statistics plot for a single cluster: its important features are
# split into categorical vs. numeric and passed separately.
cluster_id = 0
cluster_importances = importances[cluster_id]
cat_vars = [var for var in cluster_importances if var in cat_cols]
num_vars = [var for var in cluster_importances if var in num_cols]
plot_cluster_statistics(
    cluster_labels=kmeans.labels_,
    cluster_id=cluster_id,
    data=data, scale=True,
    quant_var=num_vars,
    qual_var=cat_vars,
    figsize=(15,15))
In [9]:
import mca
# Re-run the full preprocessing (one-hot + standardize) so the categorical
# block can be fed to multiple correspondence analysis (MCA).
X, num_cols, cat_cols = clumpy.preprocessing.process_data(data, categorical_columns=categorical_columns,
                                                          impute='mean', cat_preprocessing='onehot', num_preprocessing='standardize')
In [10]:
# Keep only the one-hot (categorical) columns; numerics occupy the first
# len(num_cols) columns of X.
cat_X = X[:, len(num_cols):]
In [14]:
# NOTE(review): assumes `cat_cols` holds the expanded one-hot column names
# (otherwise the column count will not match cat_X) — confirm what
# process_data returns. Also requires pandas imported as `pd`, which is not
# visible anywhere in this notebook.
df = pd.DataFrame(cat_X, columns=cat_cols)
# Count missing values; expected to be 0 after mean imputation.
pd.isnull(df.values).sum()
Out[14]:
In [12]:
# Fit MCA on the one-hot frame. The original `mca = mca.mca(...)` bound the
# result over the `mca` module itself, so re-running this cell raised
# AttributeError; the fitted object now gets its own name.
mca_fit = mca.mca(df, ncols=df.shape[1])
In [ ]: