In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project source are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide seaborn appearance: 'poster' context, seaborn colour codes for
# the matplotlib shorthand colours, and the 'white' style.
sns.set_context('poster')
sns.set_color_codes()
sns.set_style('white')
In [3]:
from clumpy.datasets import fetch_cars

# Load the cars dataset and keep everything except the 'name' column.
# drop() is used without inplace=True so the object returned by fetch_cars()
# is never mutated; `data` is simply bound to the new, reduced frame, which
# keeps this cell safe to re-run.
data = fetch_cars().drop(['name'], axis=1)

# Preview the first ten rows.
data.head(10)
Out[3]:
In [4]:
from clumpy.preprocessing import process_data

# Feature groups used for clustering; later cells refer to these names.
numeric_cols = [
    'economy (mpg)',
    'displacement (cc)',
    'power (hp)',
    'weight (lb)',
    '0-60 mph (s)',
]
ordinal_cols = ['cylinders', 'year']
columns = numeric_cols + ordinal_cols

# Build the matrix that gets clustered from the selected features only,
# requesting mean imputation and min-max preprocessing of numeric features.
X = process_data(
    data[columns],
    impute='mean',
    num_preprocessing='minmax',
)
In [5]:
from clumpy import auto_kmeans

# Cluster counts offered to auto_kmeans.
candidate_n_clusters = [3, 4, 5]
kmeans = auto_kmeans(X, n_clusters=candidate_n_clusters)
In [6]:
# Show n_clusters of the fitted model -- presumably the value auto_kmeans
# selected from the candidates it was given; confirm against clumpy.
kmeans.n_clusters
Out[6]:
In [61]:
from clumpy.rules import tree_descriptions

# Build per-cluster descriptions from the unprocessed feature values and the
# k-means cluster labels.
rules = tree_descriptions(
    data[columns],
    kmeans.labels_,
    feature_names=columns,
    max_depth=20,
    n_features=5,
)
In [60]:
# Print each cluster description followed by a blank separator line.
for rule in rules:
    print(rule)
    # A bare `print` only emits a newline as a Python 2 statement; on Python 3
    # it is a no-op expression. print('') writes the blank line on both.
    print('')
In [58]:
from clumpy import importance
from clumpy.plots import plot_cluster_statistics

# Re-process only the clustering features so the matrix handed to
# anova_importance covers the same features that feature_names=columns names.
# NOTE(review): this rebinds X (first built in the preprocessing cell) and uses
# process_data's default settings, as the original cell did -- confirm intended.
X = process_data(data[columns])
importances = importance.anova_importance(
    X,
    kmeans.labels_,
    feature_names=columns,
    n_features=5)

# Cluster to inspect.
cluster_id = 0
cluster_importances = importances[cluster_id]

# Split the important variables by type. Numeric variables are matched against
# numeric_cols (not the combined `columns` list) so ordinal variables are not
# plotted as both quantitative and qualitative.
cat_vars = [var for var in cluster_importances if var in ordinal_cols]
num_vars = [var for var in cluster_importances if var in numeric_cols]

plot_cluster_statistics(
    cluster_labels=kmeans.labels_,
    cluster_id=cluster_id,
    data=data,
    scale=True,
    quant_var=num_vars,
    qual_var=cat_vars,
    figsize=(15, 10))
In [ ]: