In [6]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from Orange.data import Table, Domain
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', RuntimeWarning)
In [17]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
n_samples = 1000
np.random.seed(0)
colors = np.array([x for x in 'rcbgmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
# Noisy circles
circles_X, circles_Y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
circles_X = StandardScaler().fit_transform(circles_X)
plt.scatter(circles_X[:, 0], circles_X[:, 1], color=colors[circles_Y].tolist(), s=10);
circles = Table(circles_X, Y=circles_Y)
In [18]:
# Noisy moons
moons_X, moons_Y = datasets.make_moons(n_samples=n_samples, noise=0.05)
moons_X = StandardScaler().fit_transform(moons_X)
plt.scatter(moons_X[:, 0], moons_X[:, 1], color=colors[moons_Y].tolist(), s=10);
moons = Table(moons_X, Y=moons_Y)
In [19]:
# Noisy blobs
centers = np.array([[0, 0], [3, 3], [4, 4]])
cluster_std = 0.4
blobs_X, blobs_Y = datasets.make_blobs(n_samples=n_samples, centers=centers, cluster_std=cluster_std, random_state=8)
blobs_X = StandardScaler().fit_transform(blobs_X)
plt.scatter(blobs_X[:, 0], blobs_X[:, 1], color=colors[blobs_Y].tolist(), s=10);
blobs = Table(blobs_X, Y=blobs_Y)
In [20]:
# No structure
no_structure_X, no_structure_Y = np.random.rand(n_samples, 2), None
plt.scatter(no_structure_X[:, 0], no_structure_X[:, 1], color='b', s=10);
no_structure = Table(no_structure_X, Y=no_structure_Y)
In [21]:
from Orange.clustering import HierarchicalClustering, KMeans, DBSCAN
from Orange.clustering.hierarchical import AVERAGE
from Orange.distance import Euclidean
plot_num = 1
plt.figure(figsize=(17, 15))
plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96, wspace=.05, hspace=.01)
for i_dataset, dataset in enumerate([circles, moons, blobs, no_structure]):
    dist_matrix = Euclidean(dataset)
    two_means = KMeans(n_clusters=2)(dataset)
    average = HierarchicalClustering(n_clusters=2, linkage=AVERAGE).fit_predict(dist_matrix)
    dbscan = DBSCAN(eps=.2)(dataset)
    for name, model in [('KMeans', two_means), ('Hierarchical', average), ('DBSCAN', dbscan)]:
        # hierarchical clustering returns a plain label array; the other models expose labels_
        if hasattr(model, 'labels_'):
            y_pred = model.labels_.astype(int)
        else:
            y_pred = model.astype(int)
        # plot
        plt.subplot(4, 3, plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        plt.scatter(dataset.X[:, 0], dataset.X[:, 1], color=colors[y_pred].tolist(), s=10)
        if hasattr(model, 'cluster_centers_'):
            centers = model.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1
plt.show()
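In [ ]:
# A quick quantitative companion to the grid above (a sketch, not part of the
# original comparison): ARI of k-means against the generating labels, for the
# toy datasets that have ground truth. KMeans models expose labels_, as used
# in the hasattr() branch above.
from sklearn.metrics import adjusted_rand_score
for name, tbl, y in [('circles', circles, circles_Y), ('moons', moons, moons_Y), ('blobs', blobs, blobs_Y)]:
    labels = KMeans(n_clusters=2)(tbl).labels_
    print('%s: k-means ARI %.3f' % (name, adjusted_rand_score(y, labels)))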
In [7]:
import orangecontrib.bio.geo
# Analysis of cultured skin fibroblasts prepared from patients with Marfan syndrome (MFS).
# MFS is a heritable connective tissue disorder caused by mutations in the fibrillin-1 gene.
# Results provide insight into the molecular pathogenesis of MFS.
# Y = [control, Marfan syndrome]
# http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS2960
gds = orangecontrib.bio.geo.GDS('GDS2960')
print('Title: %s' % gds.info['title'])
print('Description: %s' % gds.info['description'])
print('PubMed ID: %s' % gds.info['pubmed_id'])
data = gds.getdata(transpose=True, report_genes=True)
print('Attributes: %d' % len(data.domain.attributes))
print('Samples: %d' % len(data))
In [8]:
# Remove all-NaN columns
non_nan_idx = np.nonzero(np.sum(np.isnan(data.X), 0) < data.X.shape[0])[0]
non_nan_attr = [data.domain.attributes[i] for i in non_nan_idx]
non_nan_domain = Domain(attributes=non_nan_attr, class_vars=data.domain.class_vars, metas=data.domain.metas)
non_nan_data = Table(non_nan_domain, data.X[:, non_nan_idx], Y=data.Y, metas=data.metas)
print('Attributes: %d' % len(non_nan_data.domain.attributes))
print('Samples: %d' % len(non_nan_data))
# Impute remaining missing values with the column mean (Imputer's default;
# later scikit-learn versions replace Imputer with SimpleImputer)
from sklearn.preprocessing import Imputer
imputed = Imputer().fit_transform(non_nan_data.X)
data = Table(non_nan_data.domain, imputed, Y=non_nan_data.Y, metas=non_nan_data.metas)
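In [ ]:
# Sanity check (sketch, not in the original notebook): after imputation no
# missing values should remain in the feature matrix.
assert not np.isnan(data.X).any()
print('Remaining NaNs: %d' % np.isnan(data.X).sum())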
In [9]:
from Orange.projection import CUR
# Gene selection
cur = CUR(rank=43, max_error=1, random_state=1)
cur_model = cur(data)
new_data = cur_model(data, axis=0)
print('Attributes: %d' % len(new_data.domain.attributes))
print('Samples: %d' % len(new_data))
# Gene statistical leverage scores
n_genes = len(data.domain.attributes)
plt.plot(np.arange(n_genes), cur_model.lev_features_);
plt.xlim(0, n_genes);
# Genes with highest leverage scores
lev_name = sorted([(lev, attr.name) for lev, attr in zip(cur_model.lev_features_, data.domain.attributes)], reverse=True)
print()
print('Top scoring genes:\n %s' % '\n '.join('%5.4f %s' % (lev, name) for lev, name in lev_name[:10]))
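In [ ]:
# A hypothetical follow-up (sketch): build a reduced table from the ten
# highest-leverage genes, reusing the Domain construction pattern from the
# NaN-filtering cell above.
top_names = set(name for _, name in lev_name[:10])
top_attrs = [attr for attr in data.domain.attributes if attr.name in top_names]
top_domain = Domain(attributes=top_attrs, class_vars=data.domain.class_vars, metas=data.domain.metas)
top_data = Table.from_table(top_domain, data)
print('Attributes: %d' % len(top_data.domain.attributes))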
In [25]:
from Orange.distance import Euclidean
from Orange.clustering import hierarchical
# Compute distance
dist_matrix = Euclidean(new_data)
In [26]:
from scipy.cluster.hierarchy import dendrogram
Z = hierarchical.dist_matrix_linkage(dist_matrix, linkage=hierarchical.COMPLETE)
d = dendrogram(Z)
plt.title('Dendrogram')
plt.xticks(())
plt.show()
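In [ ]:
# Cross-check (sketch): the same 4-cluster cut can be obtained directly from
# the scipy linkage matrix with fcluster; its labels are 1-based, hence the -1.
from scipy.cluster.hierarchy import fcluster
labels_scipy = fcluster(Z, t=4, criterion='maxclust') - 1
print('Cluster sizes: %s' % np.bincount(labels_scipy))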
In [27]:
hc = HierarchicalClustering(n_clusters=4, linkage=hierarchical.COMPLETE)
y_pred = hc.fit_predict(dist_matrix)
y_true = new_data.Y
# An evaluation metric should not take the absolute values of the cluster labels
# into account, but rather whether the clustering defines separations of the data
# similar to some ground-truth set of classes, or satisfies the assumption that
# members of the same class are more similar to each other than to members of
# different classes according to some similarity metric.
print('Predicted labels: %s' % y_pred)
print()
print('Ground-truth labels: %s' % y_true)
In [28]:
from sklearn import metrics
from collections import defaultdict
ari = defaultdict(dict)
for linkage in [hierarchical.COMPLETE, hierarchical.SINGLE, hierarchical.AVERAGE, hierarchical.WEIGHTED]:
    for n_clusters in [2, 3, 4, 5]:
        hc = HierarchicalClustering(n_clusters=n_clusters, linkage=linkage)
        y_pred = hc.fit_predict(dist_matrix)
        y_true = new_data.Y
        ari[linkage][n_clusters] = metrics.adjusted_rand_score(y_true, y_pred)
    scores = '\n '.join('%d: %5.4f' % (ncls, score) for ncls, score in ari[linkage].items())
    print('Linkage: %s\n %s' % (linkage, scores))
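In [ ]:
# Convenience (sketch): pick the (linkage, n_clusters) pair with the highest
# ARI from the grid just computed.
best = max(((l, n, s) for l, d in ari.items() for n, s in d.items()), key=lambda t: t[2])
print('Best: linkage=%s, n_clusters=%d, ARI=%5.4f' % best)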
In [29]:
# ARI is invariant to relabeling: one can permute the labels or, as below,
# rename class 1 to 42 in the ground truth and get the same ARI score
y_true_renamed = y_true.copy()
y_true_renamed[y_true == 1] = 42
assert metrics.adjusted_rand_score(y_true_renamed, y_pred) == metrics.adjusted_rand_score(y_true, y_pred)
# ARI is symmetric, swapping the argument does not change the score
assert metrics.adjusted_rand_score(y_true, y_pred) == metrics.adjusted_rand_score(y_pred, y_true)
# 1.0 is the perfect match score
assert metrics.adjusted_rand_score(y_true, y_true) == 1
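In [ ]:
# ARI is adjusted for chance (sketch): random labelings score near 0 and can
# even be slightly negative, unlike the raw Rand index.
y_rnd = np.random.randint(0, 4, y_true.size)
print('Random assignment ARI: %5.4f' % metrics.adjusted_rand_score(y_true, y_rnd))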
In [30]:
nmi = defaultdict(dict)
for linkage in [hierarchical.COMPLETE, hierarchical.SINGLE, hierarchical.AVERAGE, hierarchical.WEIGHTED]:
    for n_clusters in [2, 3, 4, 5]:
        hc = HierarchicalClustering(n_clusters=n_clusters, linkage=linkage)
        y_pred = hc.fit_predict(dist_matrix)
        y_true = new_data.Y
        nmi[linkage][n_clusters] = metrics.normalized_mutual_info_score(y_true, y_pred)
    scores = '\n '.join('%d: %5.4f' % (ncls, score) for ncls, score in nmi[linkage].items())
    print('Linkage: %s\n %s' % (linkage, scores))
In [31]:
# Random (uniform) label assignments have an AMI score close to 0.0
y_rnd = np.random.randint(0, 5, y_true.size)
print('Random assignment: %5.4f' % metrics.adjusted_mutual_info_score(y_rnd, y_true))
# Bounded range in [0,1]
assert metrics.adjusted_mutual_info_score(y_true, y_true) == 1.
In [32]:
# homogeneity: each cluster contains only members of a single class
h_score = metrics.homogeneity_score(y_true, y_pred)
print('Homogeneity: %5.4f' % h_score)
# completeness: all members of a given class are assigned to the same cluster
c_score = metrics.completeness_score(y_true, y_pred)
print('Completeness: %5.4f' % c_score)
# v-measure is the harmonic mean of the homogeneity and completeness scores
v_score = metrics.v_measure_score(y_true, y_pred)
print('V-measure: %5.4f' % v_score)
from scipy.stats import hmean
# v = 2 * h * c / (h + c); allow for floating-point rounding
assert np.isclose(hmean([h_score, c_score]), v_score)
In [33]:
# Bounded range: 0.0 is as bad as it can be, 1.0 is a perfect score
assert metrics.v_measure_score(y_true, y_true) == 1.
# These metrics are not normalized with regard to random labeling
y_rnd = np.random.randint(0, 5, y_true.size)
print('Random assignment: %5.4f' % metrics.v_measure_score(y_rnd, y_true))
In [34]:
from Orange.clustering.kmeans import KMeans
from Orange.evaluation.clustering import ClusteringEvaluation, Silhouette
print('Attributes: %d' % len(data.domain.attributes))
print('Samples: %d' % len(data))
cluster_range = list(range(2, 10))
learners = [KMeans(n_clusters=r) for r in cluster_range]
cr = ClusteringEvaluation(data, learners, k=3)
print('\n'.join('Cluster: %d - score: %5.3f' % (n, sc) for n, sc in zip(cluster_range, Silhouette(cr))))
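In [ ]:
# Cross-check (sketch): fit each k-means learner on the full data and score its
# labels with scikit-learn's silhouette_score directly. This skips the k-fold
# evaluation above, so the numbers will differ slightly.
from sklearn.metrics import silhouette_score
for n, learner in zip(cluster_range, learners):
    labels = learner(data)(data).X.ravel()
    print('Clusters: %d - silhouette: %5.3f' % (n, silhouette_score(data.X, labels)))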
In [35]:
# Perform clustering with a chosen number of clusters
N = 5
colors = ['g', 'b', 'r', 'y', 'c']
kmeans = KMeans(n_clusters=N, random_state=1)
km_model = kmeans(data)
y = km_model(data)
X, y = data.X, y.X.ravel()
# Score silhouettes
from sklearn.metrics import silhouette_samples
s = silhouette_samples(X, y)
# Silhouette plot - short example
s = s[np.argsort(y)] # sort by clusters
parts = []
# within clusters, sort by silhouette scores; after the argsort above, s is
# ordered by cluster label, so walk through it cluster by cluster
start = 0
for label in range(N):
    count = np.sum(y == label)
    parts.append((sorted(s[start:start + count]), label))
    start += count
# Silhouette
fig, ax = plt.subplots()
ax.set_title('Silhouette coefficient')
total = 0
centers = []
for scores, label in parts:
    ax.barh(range(total, total + len(scores)), scores, color=colors[label], edgecolor=colors[label])
    centers.append(total + len(scores)/2)
    total += len(scores)
ax.set_yticks(centers)
ax.set_yticklabels(range(N));
ax.set_ylabel('Cluster label');
In [36]:
from Orange.projection.manifold import MDS
from Orange.distance import Euclidean
mds = MDS(dissimilarity=Euclidean)
mds_model = mds(data)
labels = km_model.labels_
plt.scatter(mds_model.embedding_[:, 0], mds_model.embedding_[:, 1], c=labels);
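In [ ]:
# Side-by-side sanity check (sketch): color the same MDS embedding by the
# k-means labels and by the ground-truth classes to eyeball their agreement.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].scatter(mds_model.embedding_[:, 0], mds_model.embedding_[:, 1], c=labels)
axes[0].set_title('K-means labels')
axes[1].scatter(mds_model.embedding_[:, 0], mds_model.embedding_[:, 1], c=data.Y)
axes[1].set_title('Ground truth')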
In [37]:
# DBSCAN finds core points, then forms clusters from all points (core or
# non-core) that are density-reachable from those core points
from Orange.clustering import DBSCAN
from sklearn import metrics, datasets
# Generate sample data
centers = [[1, 1], [2, 3], [1, -1]]
X, y_true = datasets.make_blobs(n_samples=500, centers=centers, cluster_std=0.3, random_state=0)
data = Table(X)
db = DBSCAN(eps=0.3, min_samples=20)
db_model = db(data)
y_pred = db_model.labels_
# Noisy samples are given the label -1
n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)
print('Clusters: %d' % n_clusters)
print('V-measure: %0.3f' % metrics.v_measure_score(y_true, y_pred))
print('ARI: %0.3f' % metrics.adjusted_rand_score(y_true, y_pred))
print('NMI: %0.3f' % metrics.normalized_mutual_info_score(y_true, y_pred))
unique_labels = set(y_pred)
colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    # core samples of this cluster: large markers
    cluster_core_mask = list(set(np.nonzero(y_pred == k)[0]).intersection(db_model.core_sample_indices_))
    xy = data[cluster_core_mask]
    plt.plot(xy.X[:, 0], xy.X[:, 1], 'o', markerfacecolor=col, markersize=15)
    # non-core samples (border points and noise): small markers
    cluster_noncore_mask = list(set(np.nonzero(y_pred == k)[0]).difference(db_model.core_sample_indices_))
    xy = data[cluster_noncore_mask]
    plt.plot(xy.X[:, 0], xy.X[:, 1], 'o', markerfacecolor=col, markersize=5)
plt.show()
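In [ ]:
# DBSCAN is sensitive to eps (sketch): a small sweep shows how the number of
# clusters and noise points changes with the neighborhood radius.
for eps in [0.1, 0.2, 0.3, 0.5]:
    labels = DBSCAN(eps=eps, min_samples=20)(data).labels_
    n = len(set(labels)) - (1 if -1 in labels else 0)
    print('eps=%.2f: %d clusters, %d noise points' % (eps, n, np.sum(labels == -1)))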