In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_context('poster')
sns.set_color_codes()

In [3]:
import clumpy
from clumpy import datasets

10k Diabetes Processing

  • Numerics: center and scale
  • Categoricals: drop categoricals with fewer than 2 levels, add an explicit level for missing values, then one-hot encode (a sketch of this recipe follows below)
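
These steps are what the clumpy preprocessing below is expected to perform. As a point of reference, here is a minimal pandas/scikit-learn sketch of the same recipe (the helper name and arguments are illustrative, not clumpy's API):

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess(df, numeric_cols, categorical_cols):
    # Numerics: center and scale.
    num = StandardScaler().fit_transform(df[numeric_cols])

    cat = df[categorical_cols].copy()
    # Categoricals: add an explicit level for missing values ...
    cat = cat.fillna('missing')
    # ... drop columns with fewer than 2 levels (they carry no information) ...
    keep = [c for c in cat.columns if cat[c].nunique() >= 2]
    # ... and one-hot encode what remains.
    onehot = pd.get_dummies(cat[keep])
    return np.hstack([num, onehot.values])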

In [ ]:
import clumpy
diabetes = clumpy.datasets.fetch_10kdiabetes().as_raw()
clusterer = clumpy.analysis.cluster(diabetes)


/home/joshua/workspace/scikit-learn/sklearn/utils/extmath.py:368: UserWarning: The number of power iterations is increased to 7 to achieve higher precision.
  warnings.warn("The number of power iterations is increased to "
[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 1.110843
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.430321

In [ ]:
kmeans = clumpy.cluster.auto_kmeans(clusterer.embedding_, n_clusters=[4])
clumpy.plots.plot_clusters(clusterer.embedding_, kmeans.labels_)
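
clumpy.analysis.cluster and auto_kmeans wrap an embed-then-cluster pipeline. The same idea in plain scikit-learn, assuming a preprocessed matrix X (a sketch of the general pattern, not clumpy's internals):

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Embed to 2-D, then run k-means in the embedding space.
embedding = TSNE(n_components=2, random_state=1234).fit_transform(X)
kmeans = KMeans(n_clusters=4, random_state=1234).fit(embedding)
labels = kmeans.labels_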

In [114]:
clusterer.rules_[0].limits


Out[114]:
                     min   max  qp values
number_diagnoses     1.0   8.5        0.0
num_procedures       0.0   3.5        0.0
num_lab_procedures   1.0  62.5        0.0
time_in_hospital     1.0   1.5        0.0
num_medications      1.0  13.5        0.0

In [113]:
clusterer.rules_[1].limits


Out[113]:
                     min    max     qp values
num_medications     18.5   81.0  1.225095e-48
number_diagnoses     7.5    9.0  1.257108e-19
time_in_hospital     4.5   14.0  1.876100e-17
num_lab_procedures  28.5  120.0  2.876519e-04

In [110]:
clumpy.analysis.plot(clusterer, diabetes, 0)


   time_in_hospital  num_lab_procedures  num_procedures  num_medications  number_diagnoses        id
0          4.434700           43.078600        1.399200        15.563800          7.025300  marginal
0          2.972764           36.895246        0.902498        11.612893          6.339726   cluster

In [8]:
from clumpy.datasets import fetch_10kdiabetes
from clumpy.datasets.utils import numeric_columns
diabetes = fetch_10kdiabetes()
data = diabetes.as_raw()
data.pop('readmitted')

num_columns = numeric_columns(data)
categorical_columns = [col for col in data.columns if col not in num_columns]
feature_names = num_columns + categorical_columns

PCA + Scaled Numeric


In [17]:
import numpy as np

from clumpy.preprocessing import process_data
from sklearn.decomposition import RandomizedPCA
from sklearn.manifold import TSNE

X, num_cols, cat_cols = process_data(data, categorical_columns=categorical_columns,
                                     impute='mean', cat_preprocessing='onehot',
                                     num_preprocessing='standardize')

# PCA on the one-hot block only; the standardized numerics pass through untouched.
pca = RandomizedPCA(n_components=50, random_state=124, iterated_power=7).fit_transform(X[:, len(num_cols):])

X = np.hstack((X[:, :len(num_cols)], pca))
# Center each column, then scale it by its maximum.
scaled_X = X - np.mean(X, axis=0)
scaled_X /= np.max(scaled_X, axis=0)
scaled_X
#tsne = TSNE(n_components=2, random_state=1234, verbose=True, init='pca')
#embedding = tsne.fit_transform(scaled_X)
#embedding -= np.mean(embedding, axis=0)


/home/joshua/workspace/scikit-learn/sklearn/utils/deprecation.py:52: DeprecationWarning: Class RandomizedPCA is deprecated; RandomizedPCA was deprecated in 0.18 and will be removed in 0.20. Use PCA(svd_solver='randomized') instead. The new implementation DOES NOT store whiten components_. Apply transform to get them.
  warnings.warn(msg, category=DeprecationWarning)
Out[17]:
array([[-0.35907917, -0.10502409,  0.56529299, ...,  0.0175151 ,
         0.01313631,  0.05929169],
       [-0.25453462, -0.45603174,  0.7826465 , ...,  0.08007108,
        -0.00091255, -0.11509268],
       [ 0.26818814, -0.40403061, -0.30412102, ..., -0.1300456 ,
        -0.02781727, -0.18802612],
       ..., 
       [ 0.89545545,  0.03797903,  0.13058599, ...,  0.0238098 ,
         0.11360666, -0.09321078],
       [-0.25453462,  0.24598356, -0.08676752, ...,  0.03997871,
         0.13922643, -0.14722971],
       [ 0.37273269,  0.23298328, -0.30412102, ..., -0.40680363,
        -0.3883513 , -0.24963658]])

Dissimilarity measure


In [109]:
from clumpy.preprocessing import process_data
from sklearn.manifold import TSNE

X, num_cols, cat_cols = process_data(data, categorical_columns=categorical_columns, impute='mean')
#indices = np.arange(X.shape[1])
#dist_func = clumpy.metrics.GowerDistance(
#    numeric_indices=indices[:len(num_cols)], categorical_indices=indices[len(num_cols):], n_jobs=-1, gamma='heuristic')
#dis = dist_func(X)

#tsne = TSNE(n_components=2, random_state=1234, verbose=True, init='random', metric='precomputed')
#embedding = tsne.fit_transform(dis)
#embedding -= np.mean(embedding, axis=0)
X


Out[109]:
array([[   1.,   35.,    4., ...,  344.,  430.,  203.],
       [   2.,    8.,    5., ...,   45.,  184.,  437.],
       [   7.,   12.,    0., ...,  455.,    1.,  136.],
       ..., 
       [  13.,   46.,    2., ...,  344.,    1.,    1.],
       [   2.,   62.,    1., ...,    1.,  207.,  461.],
       [   8.,   61.,    0., ...,  138.,  371.,  404.]])
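
The commented-out GowerDistance call mixes numeric and categorical columns in a single dissimilarity. For reference, a minimal NumPy version of Gower's measure (range-normalized absolute differences for numerics, simple matching for categoricals; clumpy's gamma weighting is omitted):

import numpy as np

def gower_distance(X, numeric_idx, categorical_idx):
    n = X.shape[0]
    D = np.zeros((n, n))
    # Numeric features: absolute difference scaled by the feature's range.
    for j in numeric_idx:
        col = X[:, j].astype(float)
        rng = col.max() - col.min()
        if rng > 0:
            D += np.abs(col[:, None] - col[None, :]) / rng
    # Categorical features: simple matching (0 if equal, 1 otherwise).
    for j in categorical_idx:
        D += (X[:, j][:, None] != X[:, j][None, :]).astype(float)
    # Gower: average the per-feature dissimilarities.
    return D / (len(numeric_idx) + len(categorical_idx))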

Cluster using HDBSCAN


In [103]:
import hdbscan

clusterer = hdbscan.HDBSCAN(min_cluster_size=int(embedding.shape[0] * .01)).fit(embedding)
clumpy.plots.plot_clusters(embedding, clusterer.labels_)


Out[103]:
<seaborn.axisgrid.FacetGrid at 0xe9e5990>

In [104]:
from clumpy.base import convert_to_kmeans

kmeans = convert_to_kmeans(embedding, clusterer)
clumpy.plots.plot_clusters(embedding, kmeans.labels_, kmeans.cluster_centers_)


Out[104]:
<seaborn.axisgrid.FacetGrid at 0x10016f10>
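
convert_to_kmeans presumably re-expresses the HDBSCAN partition as a fitted KMeans so that centers can be plotted and reused. One way to do that, sketched under that assumption (not clumpy's actual code):

import numpy as np
from sklearn.cluster import KMeans

def to_kmeans(embedding, labels):
    # Drop HDBSCAN's noise points (label == -1).
    ids = np.unique(labels[labels >= 0])
    # Seed k-means with the centroid of each HDBSCAN cluster.
    centers = np.array([embedding[labels == i].mean(axis=0) for i in ids])
    return KMeans(n_clusters=len(ids), init=centers, n_init=1).fit(embedding)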

In [18]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, n_jobs=4).fit(scaled_X)

Feature Importance


In [23]:
from clumpy import importance
X, num_cols, cat_cols = process_data(data, categorical_columns=categorical_columns)
feature_names = num_cols + cat_cols

importances = importance.anova_importance(X, kmeans.labels_, feature_names=feature_names, n_features=5)


/home/joshua/.virtualenvs/plot/local/lib/python2.7/site-packages/numpy/lib/arraysetops.py:200: FutureWarning: numpy not_equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.
  flag = np.concatenate(([True], aux[1:] != aux[:-1]))
/home/joshua/.virtualenvs/plot/local/lib/python2.7/site-packages/numpy/lib/arraysetops.py:259: FutureWarning: numpy equal will not check object identity in the future. The comparison did not return the same result as suggested by the identity (`is`)) and will change.
  return aux[:-1][aux[1:] == aux[:-1]]
/home/joshua/workspace/scikit-learn/sklearn/feature_selection/univariate_selection.py:113: UserWarning: Features [27 35 37 38 42 43 44] are constant.
  UserWarning)
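
A plausible reading of anova_importance is a one-vs-rest ANOVA F-test per cluster, keeping the top-scoring features. That interpretation can be sketched with sklearn.feature_selection.f_classif (an assumption about the helper, not its source):

import numpy as np
from sklearn.feature_selection import f_classif

def anova_top_features(X, labels, feature_names, n_features=5):
    importances = {}
    for cluster_id in np.unique(labels):
        # One-vs-rest: this cluster against everything else.
        y = (labels == cluster_id).astype(int)
        F, _ = f_classif(X, y)
        F = np.nan_to_num(F)  # constant features produce NaN scores
        top = np.argsort(F)[::-1][:n_features]
        importances[cluster_id] = [feature_names[i] for i in top]
    return importances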

Text Descriptions


In [25]:
import pandas as pd

from clumpy.rules import tree_descriptions

rules = tree_descriptions(
    data,
    kmeans.labels_,
    categorical_columns=categorical_columns,
    feature_names=importances,
    max_depth=5)

pd.DataFrame({'cluster_id': range(len(rules)), 'description': rules})
for cluster_id, rule in enumerate(rules):
    print('cluster_id: {}'.format(cluster_id))
    print('')
    print(rule)
    print('')


cluster_id: 0

gender != Female AND
number_diagnoses > 7.5 

cluster_id: 1

gender != Male AND
number_diagnoses > 7.5 

cluster_id: 2

4.5 < number_diagnoses <= 5.5 

cluster_id: 3

5.5 < number_diagnoses <= 6.5 
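
The rules above read like root-to-leaf paths of shallow one-vs-rest decision trees. A sketch of that mechanism (a guess at how tree_descriptions works, using numeric features only):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def describe_cluster(X, labels, cluster_id, feature_names, max_depth=5):
    # One-vs-rest tree, then read off the path to the purest leaf.
    y = (labels == cluster_id).astype(int)
    tree = DecisionTreeClassifier(max_depth=max_depth).fit(X, y).tree_
    # Find the leaf with the highest fraction of in-cluster samples.
    leaves = np.where(tree.children_left == -1)[0]
    purity = tree.value[leaves, 0, 1] / tree.value[leaves, 0].sum(axis=1)
    target = leaves[np.argmax(purity)]
    # Walk from the root to that leaf, recording each split.
    conditions, node = [], 0
    while node != target:
        name = feature_names[tree.feature[node]]
        thresh = tree.threshold[node]
        # Node ids are in depth-first order, so every id in the left
        # subtree is smaller than the right child's id.
        if target < tree.children_right[node]:
            conditions.append('{} <= {:.1f}'.format(name, thresh))
            node = tree.children_left[node]
        else:
            conditions.append('{} > {:.1f}'.format(name, thresh))
            node = tree.children_right[node]
    return ' AND '.join(conditions)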


In [43]:
importances


Out[43]:
{0: ['time_in_hospital',
  'num_lab_procedures',
  'num_medications',
  'number_diagnoses',
  'gender'],
 1: ['time_in_hospital',
  'num_medications',
  'number_diagnoses',
  'gender',
  'age'],
 2: ['time_in_hospital',
  'num_medications',
  'number_diagnoses',
  'age',
  'admission_source_id'],
 3: ['time_in_hospital',
  'num_medications',
  'number_diagnoses',
  'change',
  'diabetesMed']}

In [46]:
from clumpy.rules import prim_descriptions

boxes = prim_descriptions(data, kmeans.labels_, feature_names=importances)

In [50]:
for box in boxes:
    print(box.limits)


                     min     max  qp values
gender            {Male}  {Male}        0.0
number_diagnoses     7.5       9        0.0
                       min       max  qp values
gender            {Female}  {Female}        0.0
number_diagnoses       7.5         9        0.0
                  min  max  qp values
number_diagnoses  1.0  4.5        0.0
                   min   max      qp values
number_diagnoses   4.5   7.5   0.000000e+00
diabetesMed       {No}  {No}  1.038991e-187
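
prim_descriptions follows the Patient Rule Induction Method (PRIM): repeatedly "peel" a small fraction of points off one edge of one variable, keeping whichever peel most increases the concentration of the target cluster inside the box. The core loop for numeric features (an illustration with the usual alpha/support knobs, not clumpy's implementation):

import numpy as np

def prim_peel(X, y, alpha=0.05, min_support=0.1):
    # y is 1 inside the target cluster, 0 outside.
    n = len(y)
    box = np.array([X.min(axis=0), X.max(axis=0)], dtype=float)
    inside = np.ones(n, dtype=bool)
    while inside.sum() > min_support * n:
        best_gain, best_peel = 0.0, None
        for j in range(X.shape[1]):
            for side, q in ((0, alpha), (1, 1 - alpha)):
                # Candidate cut: peel off an alpha-fraction at this edge.
                cut = np.percentile(X[inside, j], 100 * q)
                keep = (X[:, j] >= cut) if side == 0 else (X[:, j] <= cut)
                trial = inside & keep
                if trial.sum() in (0, inside.sum()):
                    continue
                gain = y[trial].mean() - y[inside].mean()
                if gain > best_gain:
                    best_gain, best_peel = gain, (j, side, cut, trial)
        if best_peel is None:  # no peel improves the in-box mean
            break
        j, side, cut, inside = best_peel
        box[side, j] = cut  # row 0 holds the mins, row 1 the maxs
    return box, inside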

Visualization


In [203]:
from clumpy.plots import plot_cluster_statistics

cluster_id = 0
cluster_importances = importances[cluster_id]
cat_vars = [var for var in cluster_importances if var in cat_cols]
num_vars = [var for var in cluster_importances if var in num_cols]

plot_cluster_statistics(
    cluster_labels=kmeans.labels_, 
    cluster_id=cluster_id, 
    data=data, scale=True,
    quant_var=num_vars,
    qual_var=cat_vars,
    figsize=(15,15))


   number_diagnoses        id
0           7.02530  marginal
0           8.50506   cluster

In [9]:
import mca
X, num_cols, cat_cols = clumpy.preprocessing.process_data(data, categorical_columns=categorical_columns,
                 impute='mean', cat_preprocessing='onehot', num_preprocessing='standardize')

In [10]:
cat_X = X[:, len(num_cols):]

In [14]:
df = pd.DataFrame(cat_X, columns=cat_cols)
pd.isnull(df.values).sum()


Out[14]:
0

In [12]:
mca = mca.mca(df, ncols=df.shape[1])


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-354035f3c5cd> in <module>()
----> 1 mca = mca.mca(df, ncols=df.shape[1])

/home/joshua/.virtualenvs/plot/local/lib/python2.7/site-packages/mca.pyc in __init__(self, DF, cols, ncols, benzecri, TOL)
     50 
     51                 # another option, not pursued here, is sklearn.decomposition.TruncatedSVD
---> 52                 self.P, self.s, self.Q = scipy.linalg.svd(_mul(self.D_r, Z_c, self.D_c))
     53 
     54 		if benzecri: self.E = numpy.array([(K/(K-1)*(_ - 1/K))**2 

/home/joshua/.virtualenvs/plot/local/lib/python2.7/site-packages/scipy-0.18.0-py2.7-linux-x86_64.egg/scipy/linalg/decomp_svd.pyc in svd(a, full_matrices, compute_uv, overwrite_a, check_finite, lapack_driver)
     94 
     95     """
---> 96     a1 = _asarray_validated(a, check_finite=check_finite)
     97     if len(a1.shape) != 2:
     98         raise ValueError('expected matrix')

/home/joshua/.virtualenvs/plot/local/lib/python2.7/site-packages/scipy-0.18.0-py2.7-linux-x86_64.egg/scipy/_lib/_util.pyc in _asarray_validated(a, check_finite, sparse_ok, objects_ok, mask_ok, as_inexact)
    226             raise ValueError('masked arrays are not supported')
    227     toarray = np.asarray_chkfinite if check_finite else np.asarray
--> 228     a = toarray(a)
    229     if not objects_ok:
    230         if a.dtype is np.dtype('O'):

/home/joshua/.virtualenvs/plot/local/lib/python2.7/site-packages/numpy/lib/function_base.pyc in asarray_chkfinite(a, dtype, order)
   1031     if a.dtype.char in typecodes['AllFloat'] and not np.isfinite(a).all():
   1032         raise ValueError(
-> 1033             "array must not contain infs or NaNs")
   1034     return a
   1035 

ValueError: array must not contain infs or NaNs
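
The one-hot frame itself contains no NaNs (Out[14]), so the infs most likely appear inside mca when it normalizes by row and column totals: all-zero indicator columns (the same constant features flagged by the ANOVA warning earlier) give a division by zero. Dropping constant columns first, and not shadowing the mca module with the fitted model, is a plausible fix (untested here):

import mca as mca_lib  # keep the module name distinct from the result

# Keep only indicator columns that actually vary.
varying = [c for c in df.columns if df[c].nunique() > 1]
fit = mca_lib.mca(df[varying], ncols=len(varying))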

In [ ]: