notebook.community

Edit and run



In [1]:

    
%load_ext autoreload
%autoreload 2



In [2]:

    
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_context('poster')
sns.set_color_codes()
sns.set_style('white')



In [3]:

    
from clumpy.datasets import fetch_cars
data = fetch_cars()
data.drop(['name'], axis=1, inplace=True)
data.head(10)









    



/home/joshua/workspace/scikit-learn/sklearn/cross_validation.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/joshua/workspace/scikit-learn/sklearn/grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)






    Out[3]:






  
    
      
      economy (mpg)
      cylinders
      displacement (cc)
      power (hp)
      weight (lb)
      0-60 mph (s)
      year
    
  
  
    
      0
      13.0
      8
      360.0
      175.0
      3821
      11.0
      73
    
    
      1
      15.0
      8
      390.0
      190.0
      3850
      8.5
      70
    
    
      2
      17.0
      8
      304.0
      150.0
      3672
      11.5
      72
    
    
      3
      20.2
      6
      232.0
      90.0
      3265
      18.2
      79
    
    
      4
      18.1
      6
      258.0
      120.0
      3410
      15.1
      78
    
    
      5
      23.0
      4
      151.0
      NaN
      3035
      20.5
      82
    
    
      6
      19.4
      6
      232.0
      90.0
      3210
      17.2
      78
    
    
      7
      24.3
      4
      151.0
      90.0
      3003
      20.1
      80
    
    
      8
      18.0
      6
      232.0
      100.0
      2789
      15.0
      73
    
    
      9
      19.0
      6
      232.0
      100.0
      2634
      13.0
      71



In [4]:

    
from clumpy.preprocessing import process_data

numeric_cols = ['economy (mpg)', 'displacement (cc)', 'power (hp)', 'weight (lb)', '0-60 mph (s)']
ordinal_cols = ['cylinders', 'year']
columns = numeric_cols + ordinal_cols

X = process_data(data[columns], impute='mean', num_preprocessing='minmax')



In [5]:

    
from clumpy import auto_kmeans
kmeans = auto_kmeans(X, n_clusters=[3, 4, 5])



In [6]:

    
kmeans.n_clusters









    Out[6]:





4



In [61]:

    
from clumpy.rules import tree_descriptions

rules = tree_descriptions(data[columns], 
                  kmeans.labels_, 
                  feature_names=columns,
                  max_depth=20,
                  n_features=5)



In [60]:

    
for rule in rules:
    print(rule)
    print









    



displacement (cc) <= 140.5 AND
year <= 76.5 

displacement (cc) > 284.5 AND
economy (mpg) <= 21.6 

cylinders > 4.5 AND
displacement (cc) <= 284.5 AND
economy (mpg) <= 29.75 

displacement (cc) <= 162.0 AND
economy (mpg) > 26.9 AND
year > 76.5



In [58]:

    
from clumpy import importance
from clumpy.plots import plot_cluster_statistics

X = process_data(data)
importances = importance.anova_importance(X, kmeans.labels_, feature_names=columns, n_features=5)

cluster_id = 0
cluster_importances = importances[cluster_id]
cat_vars = [var for var in cluster_importances if var in ordinal_cols]
num_vars = [var for var in cluster_importances if var in columns]

plot_cluster_statistics(
    cluster_labels=kmeans.labels_, 
    cluster_id=cluster_id, 
    data=data, 
    scale=True,
    quant_var=num_vars,
    qual_var=cat_vars,
    figsize=(15,10))









    



   displacement (cc)  power (hp)  0-60 mph (s)  cylinders       year        id
0         193.425879  104.469388     15.568090   5.454774  76.010050  marginal
0         106.405263   81.563830     16.524211   3.968421  73.484211   cluster



In [ ]:

	economy (mpg)	cylinders	displacement (cc)	power (hp)	weight (lb)	0-60 mph (s)	year
0	13.0	8	360.0	175.0	3821	11.0	73
1	15.0	8	390.0	190.0	3850	8.5	70
2	17.0	8	304.0	150.0	3672	11.5	72
3	20.2	6	232.0	90.0	3265	18.2	79
4	18.1	6	258.0	120.0	3410	15.1	78
5	23.0	4	151.0	NaN	3035	20.5	82
6	19.4	6	232.0	90.0	3210	17.2	78
7	24.3	4	151.0	90.0	3003	20.1	80
8	18.0	6	232.0	100.0	2789	15.0	73
9	19.0	6	232.0	100.0	2634	13.0	71