In [6]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from Orange.data import Table, Domain
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
warnings.simplefilter('ignore', RuntimeWarning)
In [17]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
n_samples = 1000
np.random.seed(0)
colors = np.array([x for x in 'rcbgmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)
# Noisy circles
circles_X, circles_Y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
circles_X = StandardScaler().fit_transform(circles_X)
plt.scatter(circles_X[:, 0], circles_X[:, 1], color=colors[circles_Y].tolist(), s=10);
circles = Table(circles_X, Y=circles_Y)
In [18]:
# Noisy moons
moons_X, moons_Y = datasets.make_moons(n_samples=n_samples, noise=0.05)
moons_X = StandardScaler().fit_transform(moons_X)
plt.scatter(moons_X[:, 0], moons_X[:, 1], color=colors[moons_Y].tolist(), s=10);
moons = Table(moons_X, Y=moons_Y)
In [19]:
# Noisy blobs
centers = np.array([[0, 0], [3, 3], [4, 4]])
cluster_std = 0.4
blobs_X, blobs_Y = datasets.make_blobs(n_samples=n_samples, centers=centers, cluster_std=cluster_std, random_state=8)
blobs_X = StandardScaler().fit_transform(blobs_X)
plt.scatter(blobs_X[:, 0], blobs_X[:, 1], color=colors[blobs_Y].tolist(), s=10);
blobs = Table(blobs_X, Y=blobs_Y)
In [20]:
# No structure
no_structure_X, no_structure_Y = np.random.rand(n_samples, 2), None
plt.scatter(no_structure_X[:, 0], no_structure_X[:, 1], color='b', s=10);
no_structure = Table(no_structure_X, Y=no_structure_Y)
In [21]:
from Orange.clustering import HierarchicalClustering, KMeans, DBSCAN
from Orange.clustering.hierarchical import AVERAGE
from Orange.distance import Euclidean
plot_num = 1
plt.figure(figsize=(17, 15))
plt.subplots_adjust(left=.001, right=.999, bottom=.001, top=.96, wspace=.05, hspace=.01)
for i_dataset, dataset in enumerate([circles, moons, blobs, no_structure]):
    dist_matrix = Euclidean(dataset)
    two_means = KMeans(n_clusters=2)(dataset)
    average = HierarchicalClustering(n_clusters=2, linkage=AVERAGE).fit_predict(dist_matrix)
    dbscan = DBSCAN(eps=.2)(dataset)
    for name, model in [('KMeans', two_means), ('Hierarchical', average), ('DBSCAN', dbscan)]:
        # hierarchical clustering returns a plain label array; the other models expose labels_
        if hasattr(model, 'labels_'):
            y_pred = model.labels_.astype(int)
        else:
            y_pred = model.astype(int)
        # plot
        plt.subplot(4, 3, plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        plt.scatter(dataset.X[:, 0], dataset.X[:, 1], color=colors[y_pred].tolist(), s=10)
        if hasattr(model, 'cluster_centers_'):
            centers = model.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1
plt.show()
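In [ ]:
# A quick quantitative companion to the grid above (a sketch, not part of the
# original comparison): ARI of k-means against the generating labels, for the
# toy datasets that have ground truth. KMeans models expose labels_, as used
# in the hasattr() branch above.
from sklearn.metrics import adjusted_rand_score
for name, tbl, y in [('circles', circles, circles_Y), ('moons', moons, moons_Y), ('blobs', blobs, blobs_Y)]:
    labels = KMeans(n_clusters=2)(tbl).labels_
    print('%s: k-means ARI %.3f' % (name, adjusted_rand_score(y, labels)))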
In [7]:
import orangecontrib.bio.geo
# Analysis of cultured skin fibroblasts prepared from patients with Marfan syndrome (MFS).
# MFS is a heritable connective tissue disorder caused by mutations in the fibrillin-1 gene.
# Results provide insight into the molecular pathogenesis of MFS.
# Y = [control, Marfan syndrome]
# http://www.ncbi.nlm.nih.gov/sites/GDSbrowser?acc=GDS2960
gds = orangecontrib.bio.geo.GDS('GDS2960')
print('Title: %s' % gds.info['title'])
print('Description: %s' % gds.info['description'])
print('PubMed ID: %s' % gds.info['pubmed_id'])
data = gds.getdata(transpose=True, report_genes=True)
print('Attributes: %d' % len(data.domain.attributes))
print('Samples: %d' % len(data))
In [8]:
# Remove all-NaN columns
non_nan_idx = np.nonzero(np.sum(np.isnan(data.X), 0) < data.X.shape[0])[0]
non_nan_attr = [data.domain.attributes[i] for i in non_nan_idx]
non_nan_domain = Domain(attributes=non_nan_attr, class_vars=data.domain.class_vars, metas=data.domain.metas)
non_nan_data = Table(non_nan_domain, data.X[:, non_nan_idx], Y=data.Y, metas=data.metas)
print('Attributes: %d' % len(non_nan_data.domain.attributes))
print('Samples: %d' % len(non_nan_data))
# Impute remaining missing values with the column mean (Imputer's default;
# later scikit-learn versions replace Imputer with SimpleImputer)
from sklearn.preprocessing import Imputer
imputed = Imputer().fit_transform(non_nan_data.X)
data = Table(non_nan_data.domain, imputed, Y=non_nan_data.Y, metas=non_nan_data.metas)
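In [ ]:
# Sanity check (sketch, not in the original notebook): after imputation no
# missing values should remain in the feature matrix.
assert not np.isnan(data.X).any()
print('Remaining NaNs: %d' % np.isnan(data.X).sum())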
In [9]:
from Orange.projection import CUR
# Gene selection
cur = CUR(rank=43, max_error=1, random_state=1)
cur_model = cur(data)
new_data = cur_model(data, axis=0)
print('Attributes: %d' % len(new_data.domain.attributes))
print('Samples: %d' % len(new_data))
# Gene statistical leverage scores
n_genes = len(data.domain.attributes)
plt.plot(np.arange(n_genes), cur_model.lev_features_);
plt.xlim(0, n_genes);
# Genes with highest leverage scores
lev_name = sorted([(lev, attr.name) for lev, attr in zip(cur_model.lev_features_, data.domain.attributes)], reverse=True)
print()
print('Top scoring genes:\n %s' % '\n '.join('%5.4f %s' % (lev, name) for lev, name in lev_name[:10]))
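In [ ]:
# A hypothetical follow-up (sketch): build a reduced table from the ten
# highest-leverage genes, reusing the Domain construction pattern from the
# NaN-filtering cell above.
top_names = set(name for _, name in lev_name[:10])
top_attrs = [attr for attr in data.domain.attributes if attr.name in top_names]
top_domain = Domain(attributes=top_attrs, class_vars=data.domain.class_vars, metas=data.domain.metas)
top_data = Table.from_table(top_domain, data)
print('Attributes: %d' % len(top_data.domain.attributes))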
In [25]:
from Orange.distance import Euclidean
from Orange.clustering import hierarchical
# Compute distance
dist_matrix = Euclidean(new_data)
In [26]:
from scipy.cluster.hierarchy import dendrogram
Z = hierarchical.dist_matrix_linkage(dist_matrix, linkage=hierarchical.COMPLETE)
d = dendrogram(Z)
plt.title('Dendrogram')
plt.xticks(())
plt.show()
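In [ ]:
# Cross-check (sketch): the same 4-cluster cut can be obtained directly from
# the scipy linkage matrix with fcluster; its labels are 1-based, hence the -1.
from scipy.cluster.hierarchy import fcluster
labels_scipy = fcluster(Z, t=4, criterion='maxclust') - 1
print('Cluster sizes: %s' % np.bincount(labels_scipy))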
In [27]:
hc = HierarchicalClustering(n_clusters=4, linkage=hierarchical.COMPLETE)
y_pred = hc.fit_predict(dist_matrix)
y_true = new_data.Y
# An evaluation metric should not take the absolute values of the cluster labels
# into account, but rather whether the clustering defines separations of the data
# similar to some ground-truth set of classes, or satisfies the assumption that
# members of the same class are more similar to each other than to members of
# different classes according to some similarity metric.
print('Predicted labels: %s' % y_pred)
print()
print('Ground-truth labels: %s' % y_true)
In [28]:
from sklearn import metrics
from collections import defaultdict
ari = defaultdict(dict)
for linkage in [hierarchical.COMPLETE, hierarchical.SINGLE, hierarchical.AVERAGE, hierarchical.WEIGHTED]:
    for n_clusters in [2, 3, 4, 5]:
        hc = HierarchicalClustering(n_clusters=n_clusters, linkage=linkage)
        y_pred = hc.fit_predict(dist_matrix)
        y_true = new_data.Y
        ari[linkage][n_clusters] = metrics.adjusted_rand_score(y_true, y_pred)
    scores = '\n '.join('%d: %5.4f' % (ncls, score) for ncls, score in ari[linkage].items())
    print('Linkage: %s\n %s' % (linkage, scores))
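In [ ]:
# Convenience (sketch): pick the (linkage, n_clusters) pair with the highest
# ARI from the grid just computed.
best = max(((l, n, s) for l, d in ari.items() for n, s in d.items()), key=lambda t: t[2])
print('Best: linkage=%s, n_clusters=%d, ARI=%5.4f' % best)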
In [29]:
# ARI is invariant to relabeling: one can permute the labels or, as below,
# rename class 1 to 42 in the ground truth and get the same ARI score
y_true_renamed = y_true.copy()
y_true_renamed[y_true == 1] = 42
assert metrics.adjusted_rand_score(y_true_renamed, y_pred) == metrics.adjusted_rand_score(y_true, y_pred)
# ARI is symmetric, swapping the argument does not change the score
assert metrics.adjusted_rand_score(y_true, y_pred) == metrics.adjusted_rand_score(y_pred, y_true)
# 1.0 is the perfect match score
assert metrics.adjusted_rand_score(y_true, y_true) == 1
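In [ ]:
# ARI is adjusted for chance (sketch): random labelings score near 0 and can
# even be slightly negative, unlike the raw Rand index.
y_rnd = np.random.randint(0, 4, y_true.size)
print('Random assignment ARI: %5.4f' % metrics.adjusted_rand_score(y_true, y_rnd))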
In [30]:
nmi = defaultdict(dict)
for linkage in [hierarchical.COMPLETE, hierarchical.SINGLE, hierarchical.AVERAGE, hierarchical.WEIGHTED]:
    for n_clusters in [2, 3, 4, 5]:
        hc = HierarchicalClustering(n_clusters=n_clusters, linkage=linkage)
        y_pred = hc.fit_predict(dist_matrix)
        y_true = new_data.Y
        nmi[linkage][n_clusters] = metrics.normalized_mutual_info_score(y_true, y_pred)
    scores = '\n '.join('%d: %5.4f' % (ncls, score) for ncls, score in nmi[linkage].items())
    print('Linkage: %s\n %s' % (linkage, scores))
In [31]:
# Random (uniform) label assignments have an AMI score close to 0.0
y_rnd = np.random.randint(0, 5, y_true.size)
print('Random assignment: %5.4f' % metrics.adjusted_mutual_info_score(y_rnd, y_true))
# Bounded range in [0,1]
assert metrics.adjusted_mutual_info_score(y_true, y_true) == 1.
In [32]:
# homogeneity: each cluster contains only members of a single class
h_score = metrics.homogeneity_score(y_true, y_pred)
print('Homogeneity: %5.4f' % h_score)
# completeness: all members of a given class are assigned to the same cluster
c_score = metrics.completeness_score(y_true, y_pred)
print('Completeness: %5.4f' % c_score)
# v-measure is the harmonic mean of the homogeneity and completeness scores
v_score = metrics.v_measure_score(y_true, y_pred)
print('V-measure: %5.4f' % v_score)
from scipy.stats import hmean
# v = 2 * h * c / (h + c); allow for floating-point rounding
assert np.isclose(hmean([h_score, c_score]), v_score)
In [33]:
# Bounded range: 0.0 is as bad as it can be, 1.0 is a perfect score
assert metrics.v_measure_score(y_true, y_true) == 1.
# These metrics are not normalized with regard to random labeling
y_rnd = np.random.randint(0, 5, y_true.size)
print('Random assignment: %5.4f' % metrics.v_measure_score(y_rnd, y_true))
In [34]:
from Orange.clustering.kmeans import KMeans
from Orange.evaluation.clustering import ClusteringEvaluation, Silhouette
print('Attributes: %d' % len(data.domain.attributes))
print('Samples: %d' % len(data))
cluster_range = list(range(2, 10))
learners = [KMeans(n_clusters=r) for r in cluster_range]
cr = ClusteringEvaluation(data, learners, k=3)
print('\n'.join('Cluster: %d - score: %5.3f' % (n, sc) for n, sc in zip(cluster_range, Silhouette(cr))))
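In [ ]:
# Cross-check (sketch): fit each k-means learner on the full data and score its
# labels with scikit-learn's silhouette_score directly. This skips the k-fold
# evaluation above, so the numbers will differ slightly.
from sklearn.metrics import silhouette_score
for n, learner in zip(cluster_range, learners):
    labels = learner(data)(data).X.ravel()
    print('Clusters: %d - silhouette: %5.3f' % (n, silhouette_score(data.X, labels)))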
In [35]:
# Perform clustering with a chosen number of clusters
N = 5
colors = ['g', 'b', 'r', 'y', 'c']
kmeans = KMeans(n_clusters=N, random_state=1)
km_model = kmeans(data)
y = km_model(data)
X, y = data.X, y.X.ravel()
# Score silhouettes
from sklearn.metrics import silhouette_samples
s = silhouette_samples(X, y)
# Silhouette plot - short example
s = s[np.argsort(y)] # sort by clusters
parts = []
# within clusters, sort by silhouette scores; after the argsort above, s is
# ordered by cluster label, so walk through it cluster by cluster
start = 0
for label in range(N):
    count = np.sum(y == label)
    parts.append((sorted(s[start:start + count]), label))
    start += count
# Silhouette
fig, ax = plt.subplots()
ax.set_title('Silhouette coefficient')
total = 0
centers = []
for scores, label in parts:
    ax.barh(range(total, total + len(scores)), scores, color=colors[label], edgecolor=colors[label])
    centers.append(total + len(scores)/2)
    total += len(scores)
ax.set_yticks(centers)
ax.set_yticklabels(range(N));
ax.set_ylabel('Cluster label');
In [36]:
from Orange.projection.manifold import MDS
from Orange.distance import Euclidean
mds = MDS(dissimilarity=Euclidean)
mds_model = mds(data)
labels = km_model.labels_
plt.scatter(mds_model.embedding_[:, 0], mds_model.embedding_[:, 1], c=labels);
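In [ ]:
# Side-by-side sanity check (sketch): color the same MDS embedding by the
# k-means labels and by the ground-truth classes to eyeball their agreement.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
axes[0].scatter(mds_model.embedding_[:, 0], mds_model.embedding_[:, 1], c=labels)
axes[0].set_title('K-means labels')
axes[1].scatter(mds_model.embedding_[:, 0], mds_model.embedding_[:, 1], c=data.Y)
axes[1].set_title('Ground truth')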
In [37]:
# DBSCAN finds core points, then forms clusters from all points (core or
# non-core) that are density-reachable from those core points
from Orange.clustering import DBSCAN
from sklearn import metrics, datasets
# Generate sample data
centers = [[1, 1], [2, 3], [1, -1]]
X, y_true = datasets.make_blobs(n_samples=500, centers=centers, cluster_std=0.3, random_state=0)
data = Table(X)
db = DBSCAN(eps=0.3, min_samples=20)
db_model = db(data)
y_pred = db_model.labels_
# Noisy samples are given the label -1
n_clusters = len(set(y_pred)) - (1 if -1 in y_pred else 0)
print('Clusters: %d' % n_clusters)
print('V-measure: %0.3f' % metrics.v_measure_score(y_true, y_pred))
print('ARI: %0.3f' % metrics.adjusted_rand_score(y_true, y_pred))
print('NMI: %0.3f' % metrics.normalized_mutual_info_score(y_true, y_pred))
unique_labels = set(y_pred)
colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    # core samples of this cluster: large markers
    cluster_core_mask = list(set(np.nonzero(y_pred == k)[0]).intersection(db_model.core_sample_indices_))
    xy = data[cluster_core_mask]
    plt.plot(xy.X[:, 0], xy.X[:, 1], 'o', markerfacecolor=col, markersize=15)
    # non-core samples (border points and noise): small markers
    cluster_noncore_mask = list(set(np.nonzero(y_pred == k)[0]).difference(db_model.core_sample_indices_))
    xy = data[cluster_noncore_mask]
    plt.plot(xy.X[:, 0], xy.X[:, 1], 'o', markerfacecolor=col, markersize=5)
plt.show()
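In [ ]:
# DBSCAN is sensitive to eps (sketch): a small sweep shows how the number of
# clusters and noise points changes with the neighborhood radius.
for eps in [0.1, 0.2, 0.3, 0.5]:
    labels = DBSCAN(eps=eps, min_samples=20)(data).labels_
    n = len(set(labels)) - (1 if -1 in labels else 0)
    print('eps=%.2f: %d clusters, %d noise points' % (eps, n, np.sum(labels == -1)))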