author: Diogo Silva

SKL = SciKit-Learn

``````

In [1]:

%pylab inline

``````
``````

Populating the interactive namespace from numpy and matplotlib

``````
``````

In [2]:

import seaborn as sns

``````
``````

In [3]:

home = %env HOME

``````
``````

In [4]:

cd \$home/QCThesis/

``````
``````

/home/chiroptera/workspace/QCThesis

``````
``````

In [13]:

from sklearn.cluster import KMeans as KMeans_skl
import MyML.cluster.eac as eac
import MyML.cluster.K_Means3 as K_Means3
import MyML.metrics.accuracy as determine_ci

``````
``````

Out[13]:

<module 'MyML.metrics.accuracy' from 'MyML/metrics/accuracy.pyc'>

``````

# Helper functions

``````

In [14]:

def stat_my_kmeans(data,nclusters,gtruth,rounds=20):
    """Fit the MyML K-Means `rounds` times on `data` and summarise the accuracy.

    Each round builds a fresh `K_Means3.K_Means` (NumPy mode, run until
    convergence, "iter" centroid mode), fits it on `data` and scores the
    resulting labels against `gtruth` with `HungarianIndex`.

    Parameters
    ----------
    data : array whose first dimension is the number of samples.
    nclusters : number of clusters requested from K-Means.
    gtruth : ground-truth labels passed to `HungarianIndex.score`.
    rounds : number of independent fits.

    Returns
    -------
    (mean, variance, max, min) of the per-round accuracies.
    """
    total = data.shape[0]
    scores = []

    for _ in xrange(rounds):
        km = K_Means3.K_Means(n_clusters=nclusters, mode="numpy",
                              cuda_mem='manual', tol=1e-4,
                              max_iters="converge")
        km._centroid_mode = "iter"
        km.fit(data)

        scorer = determine_ci.HungarianIndex(nsamples=total)
        scorer.score(gtruth, km.labels_, format='array')
        scores.append(scorer.accuracy)

    return np.mean(scores), np.var(scores), np.max(scores), np.min(scores)

def stat_skl_kmeans(data,nclusters,gtruth,rounds=20,init='random'):
    """Fit scikit-learn's K-Means `rounds` times on `data` and summarise the accuracy.

    Parameters
    ----------
    data : array whose first dimension is the number of samples.
    nclusters : number of clusters requested from K-Means.
    gtruth : ground-truth labels passed to `HungarianIndex.score`.
    rounds : number of independent fits.
    init : forwarded unchanged to the `init` argument of `KMeans`.

    Returns
    -------
    (mean, variance, max, min) of the per-round accuracies.
    """
    nsamples = data.shape[0]
    all_acc = list()
    for r in xrange(rounds):
        gSKL = KMeans_skl(n_clusters=nclusters,n_init=1,init=init)
        gSKL.fit(data)

        myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
        # Score the estimator fitted in this round. The previous code read
        # `grouper.labels_`, which is not defined in this function, so it
        # either failed or silently scored an unrelated global estimator.
        myAcc.score(gtruth,gSKL.labels_,format='array')

        all_acc.append(myAcc.accuracy)

    return np.mean(all_acc),np.var(all_acc),np.max(all_acc),np.min(all_acc)

``````
``````

In [15]:

# Leftover debugging aid: prints a fixed string and has no effect on the analysis.
# NOTE(review): the text looks like the start of a pdb breakpoint command for
# K_Means3.py -- confirm, and consider deleting this cell.
print "b MyML/cluster/K_Means3.py:"

``````
``````

b MyML/cluster/K_Means3.py:

``````
``````

In [16]:

def k_analysis(partition_files,ground_truth,nprots,iters="converge",rounds=20,files=True):
    """Repeat EAC co-association + MyML K-Means `rounds` times and summarise the accuracy.

    Each round builds a prototype-based co-association matrix (random
    prototypes) from the ensemble, clusters that matrix with
    `K_Means3.K_Means` and scores the labels against `ground_truth`.

    Parameters
    ----------
    partition_files : ensemble passed to `EAC.fit` (file names or in-memory
        partitions, depending on `files`).
    ground_truth : one label per sample; also fixes the number of samples and
        the number of final clusters.
    nprots : number of prototypes used for the co-association matrix.
    iters : forwarded to `max_iters` of the K-Means estimator.
    rounds : number of repetitions.
    files : forwarded to `EAC.fit`.

    Returns
    -------
    (mean, variance, max, min) of the per-round accuracies.
    """
    # The sample count comes from the labels handed in, not from a global
    # `data` array that happens to exist in the notebook namespace.
    nsamples = len(ground_truth)
    # Loop-invariant: the final clustering always uses the true number of clusters.
    nclusters = np.unique(ground_truth).shape[0]
    all_acc = list()

    for r in xrange(rounds):
        estimator = eac.EAC(nsamples)
        estimator.fit(partition_files,files=files,assoc_mode='prot', prot_mode="random", nprot=nprots,build_only=True)

        grouper = K_Means3.K_Means(n_clusters=nclusters,mode="numpy", cuda_mem='manual',tol=1e-4,max_iters=iters)
        grouper._centroid_mode = "iter"
        grouper.fit(estimator._coassoc)

        myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
        myAcc.score(ground_truth,grouper.labels_,format='array')

        all_acc.append(myAcc.accuracy)
    return np.mean(all_acc),np.var(all_acc),np.max(all_acc),np.min(all_acc)

def k_skl_analysis(partition_files,ground_truth,nprots,rounds=20,files=True):
    """Repeat EAC co-association + scikit-learn K-Means `rounds` times and summarise the accuracy.

    Each round builds a prototype-based co-association matrix (random
    prototypes) from the ensemble, clusters that matrix with scikit-learn's
    `KMeans` (single random initialisation) and scores the labels against
    `ground_truth`.

    Parameters
    ----------
    partition_files : ensemble passed to `EAC.fit` (file names or in-memory
        partitions, depending on `files`).
    ground_truth : one label per sample; also fixes the number of samples and
        the number of final clusters.
    nprots : number of prototypes used for the co-association matrix.
    rounds : number of repetitions.
    files : forwarded to `EAC.fit`.

    Returns
    -------
    (mean, variance, max, min) of the per-round accuracies.
    """
    # The sample count comes from the labels handed in, not from a global
    # `data` array that happens to exist in the notebook namespace.
    nsamples = len(ground_truth)
    # Loop-invariant: the final clustering always uses the true number of clusters.
    nclusters = np.unique(ground_truth).shape[0]
    all_acc = list()

    for r in xrange(rounds):
        estimator = eac.EAC(nsamples)
        estimator.fit(partition_files,files=files,assoc_mode='prot', prot_mode="random", nprot=nprots,build_only=True)

        grouper = KMeans_skl(n_clusters=nclusters,n_init=1,init="random")
        grouper.fit(estimator._coassoc)

        myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
        myAcc.score(ground_truth,grouper.labels_,format='array')

        all_acc.append(myAcc.accuracy)
    return np.mean(all_acc),np.var(all_acc),np.max(all_acc),np.min(all_acc)

``````

# Generate data

``````

In [47]:

# Two 2-D Gaussian blobs, one per class.
center1=(0,0)
center2=(10,10)

# Passed as `scale` to np.random.normal, i.e. these are standard deviations
# despite the `cov` names.
cov1=1
cov2=1

n1=500000
n2=500000
nsamples=n1+n2
dim=2

g1 = np.random.normal(loc=center1,scale=cov1,size=(n1,dim)).astype(np.float32)
g2 = np.random.normal(loc=center2,scale=cov2,size=(n2,dim)).astype(np.float32)

# Samples are stacked class by class: rows [0, n1) come from g1, rows [n1, nsamples) from g2.
data = np.vstack((g1,g2))
gt=np.zeros(data.shape[0],dtype=np.int32)
# The class boundary must follow n1; the previous hard-coded `gt[100:]=1`
# labelled all but the first 100 samples of g1 as class 1.
gt[n1:]=1

figData=plt.figure()
plt.plot(g1[:,0],g1[:,1],'.')
plt.plot(g2[:,0],g2[:,1],'.')

``````
``````

Out[47]:

[<matplotlib.lines.Line2D at 0x7f6659668990>]

``````
``````

In [48]:

import MyML.helper.partition

``````
``````

Out[48]:

<module 'MyML.helper.partition' from 'MyML/helper/partition.pyc'>

``````
``````

In [49]:

# Two MyML K-Means estimators (20 clusters) that differ only in backend:
# `mode="numpy"` versus `mode="cuda"`. They are timed against each other below.
# NOTE(review): these two calls pass `max_iter=3`, whereas every other
# K_Means3.K_Means call in this notebook passes `max_iters=...` -- confirm
# which keyword the constructor actually accepts.
py_estimator=K_Means3.K_Means(n_clusters=20,mode="numpy", cuda_mem='manual',tol=1e-4,max_iter=3)
cu_estimator=K_Means3.K_Means(n_clusters=20,mode="cuda", cuda_mem='manual',tol=1e-4,max_iter=3)

``````
``````

In [50]:

# Time the generation of a 30-partition ensemble (n_clusters=[6,30], iters=3)
# with the CUDA-mode estimator first, then with the NumPy-mode estimator.
# Requires `MyML.helper.partition` to have been imported in an earlier cell.
%timeit MyML.helper.partition.generateEnsemble(data,cu_estimator,n_clusters=[6,30],npartitions=30,iters=3)
%timeit MyML.helper.partition.generateEnsemble(data,py_estimator,n_clusters=[6,30],npartitions=30,iters=3)

``````
``````

1 loops, best of 3: 48.1 s per loop
1 loops, best of 3: 2min 6s per loop

``````
``````

In [174]:

# Profile a single `fit` call on `data`; judging by the recorded profile this
# is a scikit-learn K-Means estimator.
# NOTE(review): neither `cProfile` nor `grouperSKL` is imported or defined in
# the visible cells, so this cell fails on a fresh kernel -- confirm where
# they come from and add the missing import/definition.
cProfile.run("grouperSKL.fit(data)")

``````
``````

481 function calls in 0.063 seconds

Ordered by: standard name

ncalls  tottime  percall  cumtime  percall filename:lineno(function)
1    0.000    0.000    0.063    0.063 <string>:1(<module>)
3    0.002    0.001    0.002    0.001 _k_means.pyx:244(_centers_dense)
19    0.000    0.000    0.001    0.000 _methods.py:31(_sum)
3    0.000    0.000    0.000    0.000 _methods.py:43(_count_reduce_items)
2    0.000    0.000    0.001    0.001 _methods.py:53(_mean)
1    0.000    0.000    0.003    0.003 _methods.py:77(_var)
6    0.000    0.000    0.000    0.000 _weakrefset.py:70(__contains__)
3    0.000    0.000    0.000    0.000 abc.py:128(__instancecheck__)
45    0.000    0.000    0.000    0.000 base.py:865(isspmatrix)
3    0.000    0.000    0.010    0.003 extmath.py:168(safe_sparse_dot)
3    0.000    0.000    0.000    0.000 extmath.py:44(squared_norm)
4    0.000    0.000    0.000    0.000 extmath.py:54(row_norms)
3    0.000    0.000    0.000    0.000 fromnumeric.py:1291(ravel)
1    0.000    0.000    0.000    0.000 fromnumeric.py:2651(mean)
1    0.000    0.000    0.003    0.003 fromnumeric.py:2838(var)
1    0.000    0.000    0.003    0.003 k_means_.py:142(_tolerance)
1    0.000    0.000    0.063    0.063 k_means_.py:151(k_means)
1    0.000    0.000    0.058    0.058 k_means_.py:296(_kmeans_single)
3    0.022    0.007    0.048    0.016 k_means_.py:400(_labels_inertia_precompute_dense)
3    0.000    0.000    0.049    0.016 k_means_.py:447(_labels_inertia)
1    0.000    0.000    0.007    0.007 k_means_.py:500(_init_centroids)
1    0.000    0.000    0.000    0.000 k_means_.py:688(_check_fit_data)
1    0.000    0.000    0.063    0.063 k_means_.py:716(fit)
3    0.000    0.000    0.000    0.000 numeric.py:141(ones)
13    0.000    0.000    0.000    0.000 numeric.py:394(asarray)
30    0.000    0.000    0.000    0.000 numeric.py:464(asanyarray)
3    0.015    0.005    0.026    0.009 pairwise.py:143(euclidean_distances)
3    0.000    0.000    0.001    0.000 pairwise.py:60(check_pairwise_arrays)
10    0.000    0.000    0.000    0.000 shape_base.py:60(atleast_2d)
10    0.000    0.000    0.001    0.000 validation.py:115(array2d)
7    0.000    0.000    0.001    0.000 validation.py:128(_atleast2d_or_sparse)
7    0.000    0.000    0.001    0.000 validation.py:157(atleast2d_or_csr)
4    0.000    0.000    0.000    0.000 validation.py:337(check_random_state)
16    0.000    0.000    0.001    0.000 validation.py:37(_assert_all_finite)
6    0.000    0.000    0.000    0.000 validation.py:57(safe_asarray)
1    0.000    0.000    0.000    0.000 validation.py:83(as_float_array)
3    0.000    0.000    0.000    0.000 {getattr}
2    0.000    0.000    0.000    0.000 {hasattr}
60    0.000    0.000    0.000    0.000 {isinstance}
4    0.000    0.000    0.000    0.000 {issubclass}
31    0.000    0.000    0.000    0.000 {len}
1    0.000    0.000    0.000    0.000 {max}
10    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
12    0.000    0.000    0.000    0.000 {method 'copy' of 'numpy.ndarray' objects}
1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
6    0.000    0.000    0.000    0.000 {method 'fill' of 'numpy.ndarray' objects}
1    0.000    0.000    0.001    0.001 {method 'mean' of 'numpy.ndarray' objects}
1    0.007    0.007    0.007    0.007 {method 'permutation' of 'mtrand.RandomState' objects}
3    0.000    0.000    0.000    0.000 {method 'ravel' of 'numpy.ndarray' objects}
23    0.005    0.000    0.005    0.000 {method 'reduce' of 'numpy.ufunc' objects}
19    0.000    0.000    0.001    0.000 {method 'sum' of 'numpy.ndarray' objects}
6    0.010    0.002    0.010    0.002 {numpy.core._dotblas.dot}
49    0.000    0.000    0.000    0.000 {numpy.core.multiarray.array}
3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.copyto}
4    0.000    0.000    0.000    0.000 {numpy.core.multiarray.einsum}
9    0.000    0.000    0.000    0.000 {numpy.core.multiarray.empty}
1    0.000    0.000    0.000    0.000 {numpy.core.multiarray.zeros}
6    0.000    0.000    0.000    0.000 {range}
3    0.000    0.000    0.002    0.001 {sklearn.cluster._k_means._centers_dense}

``````
``````

In [200]:

# Profile a single `fit` call on `data`; judging by the recorded profile this
# is a K_Means3 estimator using CUDA labelling.
# NOTE(review): `cProfile` is never imported and no global `grouper` is
# defined before this cell in notebook order, so this cell fails on a fresh
# kernel -- confirm which estimator was meant and define it here.
cProfile.run("grouper.fit(data)")

``````
``````

2991 function calls (2940 primitive calls) in 0.066 seconds

Ordered by: standard name

ncalls  tottime  percall  cumtime  percall filename:lineno(function)
1    0.000    0.000    0.066    0.066 <string>:1(<module>)
9    0.000    0.000    0.000    0.000 <string>:8(__new__)
1    0.000    0.000    0.000    0.000 K_Means3.py:133(_init_centroids)
3    0.000    0.000    0.021    0.007 K_Means3.py:161(_label)
3    0.000    0.000    0.021    0.007 K_Means3.py:341(_cu_label)
3    0.000    0.000    0.001    0.000 K_Means3.py:398(_cu_label_kernel)
3    0.000    0.000    0.045    0.015 K_Means3.py:563(_recompute_centroids)
3    0.018    0.006    0.045    0.015 K_Means3.py:625(_np_recompute_centroids_good)
1    0.000    0.000    0.066    0.066 K_Means3.py:79(fit)
3    0.000    0.000    0.000    0.000 _methods.py:31(_sum)
18    0.000    0.000    0.000    0.000 _methods.py:34(_prod)
60    0.000    0.000    0.000    0.000 _methods.py:43(_count_reduce_items)
60    0.002    0.000    0.010    0.000 _methods.py:53(_mean)
3    0.000    0.000    0.009    0.003 api.py:116(synchronize)
6    0.000    0.000    0.000    0.000 api.py:121(_prepare_shape_strides_dtype)
6    0.000    0.000    0.005    0.001 api.py:150(device_array_like)
3    0.000    0.000    0.004    0.001 api.py:23(to_device)
6    0.000    0.000    0.004    0.001 api.py:63(device_array)
18    0.001    0.000    0.002    0.000 arrayobj.py:46(make_array_ctype)
18    0.000    0.000    0.000    0.000 arrayobj.py:61(c_array)
3    0.000    0.000    0.005    0.002 arraysetops.py:96(unique)
3    0.000    0.000    0.000    0.000 compiler.py:172(copy)
3    0.000    0.000    0.000    0.000 compiler.py:175(configure)
3    0.000    0.000    0.000    0.000 compiler.py:201(__getitem__)
3    0.000    0.000    0.000    0.000 compiler.py:250(get)
3    0.000    0.000    0.001    0.000 compiler.py:301(__call__)
3    0.000    0.000    0.001    0.000 compiler.py:326(_kernel_call)
12    0.000    0.000    0.000    0.000 compiler.py:377(_prepare_args)
9    0.000    0.000    0.000    0.000 contextlib.py:12(__init__)
9    0.000    0.000    0.000    0.000 contextlib.py:15(__enter__)
9    0.000    0.000    0.000    0.000 contextlib.py:21(__exit__)
9    0.000    0.000    0.000    0.000 contextlib.py:82(helper)
6    0.000    0.000    0.000    0.000 copy.py:306(_reconstruct)
6    0.000    0.000    0.000    0.000 copy.py:66(copy)
6    0.000    0.000    0.000    0.000 copy_reg.py:92(__newobj__)
9    0.000    0.000    0.000    0.000 devicearray.py:123(__del__)
9    0.000    0.000    0.000    0.000 devicearray.py:129(_default_stream)
9    0.000    0.000    0.000    0.000 devicearray.py:141(device_ctypes_pointer)
3    0.000    0.000    0.000    0.000 devicearray.py:151(copy_to_device)
6    0.000    0.000    0.001    0.000 devicearray.py:168(copy_to_host)
12    0.000    0.000    0.000    0.000 devicearray.py:232(as_cuda_arg)
3    0.000    0.000    0.004    0.001 devicearray.py:332(from_array_like)
12    0.000    0.000    0.000    0.000 devicearray.py:353(auto_device)
9    0.000    0.000    0.008    0.001 devicearray.py:56(__init__)
33    0.000    0.000    0.001    0.000 devices.py:108(current_context)
33    0.000    0.000    0.001    0.000 devices.py:183(get_or_create_context)
33    0.000    0.000    0.001    0.000 devices.py:224(get_context)
9    0.000    0.000    0.009    0.001 devices.py:236(_require_cuda_context)
3    0.000    0.000    0.000    0.000 driver.py:1018(configure)
3    0.000    0.000    0.000    0.000 driver.py:1035(__call__)
3    0.000    0.000    0.000    0.000 driver.py:1070(launch_kernel)
18    0.000    0.000    0.000    0.000 driver.py:1247(host_pointer)
3    0.000    0.000    0.000    0.000 driver.py:1258(host_memory_extents)
9    0.000    0.000    0.000    0.000 driver.py:1263(memory_size_from_info)
3    0.000    0.000    0.000    0.000 driver.py:1273(host_memory_size)
27    0.000    0.000    0.001    0.000 driver.py:1280(device_pointer)
39    0.000    0.000    0.001    0.000 driver.py:1285(device_ctypes_pointer)
66    0.000    0.000    0.000    0.000 driver.py:1293(is_device_memory)
39    0.000    0.000    0.000    0.000 driver.py:1304(require_device_memory)
9    0.000    0.000    0.000    0.000 driver.py:1311(device_memory_depends)
12    0.000    0.000    0.001    0.000 driver.py:1321(host_to_device)
6    0.000    0.000    0.001    0.000 driver.py:1339(device_to_host)
94    0.012    0.000    0.012    0.000 driver.py:212(safe_cuda_api_call)
94    0.000    0.000    0.000    0.000 driver.py:241(_check_error)
33    0.000    0.000    0.001    0.000 driver.py:270(get_context)
9    0.000    0.000    0.000    0.000 driver.py:291(add_trash)
9    0.000    0.000    0.001    0.000 driver.py:294(process)
9    0.000    0.000    0.002    0.000 driver.py:498(memalloc)
3    0.000    0.000    0.009    0.003 driver.py:614(synchronize)
9    0.000    0.000    0.000    0.000 driver.py:664(_make_mem_finalizer)
9    0.000    0.000    0.000    0.000 driver.py:665(mem_finalize)
9    0.000    0.000    0.000    0.000 driver.py:669(core)
10    0.000    0.000    0.001    0.000 driver.py:670(cleanup)
9    0.000    0.000    0.000    0.000 driver.py:728(__init__)
10    0.000    0.000    0.000    0.000 driver.py:739(__del__)
9    0.000    0.000    0.000    0.000 driver.py:746(own)
9    0.000    0.000    0.000    0.000 driver.py:749(free)
39    0.000    0.000    0.000    0.000 driver.py:778(device_ctypes_pointer)
9    0.000    0.000    0.000    0.000 driver.py:838(__init__)
9    0.000    0.000    0.000    0.000 driver.py:847(__del__)
90    0.000    0.000    0.000    0.000 driver.py:858(__getattr__)
9    0.000    0.000    0.000    0.000 driver.py:911(query)
9    0.000    0.000    0.000    0.000 driver.py:925(record)
18    0.000    0.000    0.000    0.000 dummyarray.py:104(is_contiguous)
18    0.000    0.000    0.000    0.000 dummyarray.py:108(compute_index)
42    0.000    0.000    0.000    0.000 dummyarray.py:109(<genexpr>)
9    0.000    0.000    0.001    0.000 dummyarray.py:148(from_desc)
9    0.000    0.000    0.001    0.000 dummyarray.py:157(__init__)
21    0.000    0.000    0.000    0.000 dummyarray.py:160(<genexpr>)
21    0.000    0.000    0.000    0.000 dummyarray.py:161(<genexpr>)
9    0.000    0.000    0.000    0.000 dummyarray.py:167(_compute_layout)
15    0.000    0.000    0.000    0.000 dummyarray.py:172(is_contig)
9    0.000    0.000    0.000    0.000 dummyarray.py:184(_compute_extent)
12    0.000    0.000    0.000    0.000 dummyarray.py:27(__init__)
24    0.000    0.000    0.000    0.000 dummyarray.py:80(get_offset)
18    0.000    0.000    0.001    0.000 fromnumeric.py:2259(prod)
9    0.000    0.000    0.004    0.000 ndarray.py:130(ndarray_populate_head)
9    0.000    0.000    0.000    0.000 ndarray.py:42(__new__)
18    0.000    0.000    0.000    0.000 ndarray.py:69(get_stage)
9    0.000    0.000    0.001    0.000 ndarray.py:80(allocate)
9    0.000    0.000    0.000    0.000 ndarray.py:91(free)
9    0.000    0.000    0.002    0.000 ndarray.py:96(write)
63    0.000    0.000    0.000    0.000 numeric.py:464(asanyarray)
3    0.000    0.000    0.000    0.000 numeric.py:81(zeros_like)
1    0.000    0.000    0.000    0.000 random.py:293(sample)
9    0.000    0.000    0.001    0.000 service.py:18(service)
66    0.000    0.000    0.000    0.000 threadlocal.py:13(stack)
33    0.000    0.000    0.000    0.000 threadlocal.py:29(top)
33    0.000    0.000    0.000    0.000 threadlocal.py:33(is_empty)
33    0.000    0.000    0.000    0.000 threadlocal.py:37(__bool__)
33    0.000    0.000    0.000    0.000 threadlocal.py:40(__nonzero__)
9    0.000    0.000    0.000    0.000 utils.py:142(__setitem__)
12    0.000    0.000    0.000    0.000 {_ctypes.addressof}
42    0.000    0.000    0.000    0.000 {_ctypes.byref}
18    0.000    0.000    0.000    0.000 {_ctypes.sizeof}
18    0.000    0.000    0.000    0.000 {_weakref.proxy}
15    0.000    0.000    0.000    0.000 {built-in method __new__ of type object at 0x7eff0af34d00}
177/126    0.000    0.000    0.000    0.000 {getattr}
7    0.000    0.000    0.000    0.000 {hasattr}
6    0.000    0.000    0.000    0.000 {id}
204    0.000    0.000    0.000    0.000 {isinstance}
60    0.000    0.000    0.000    0.000 {issubclass}
6    0.000    0.000    0.000    0.000 {iter}
100    0.000    0.000    0.000    0.000 {len}
1    0.000    0.000    0.000    0.000 {math.ceil}
1    0.000    0.000    0.000    0.000 {math.log}
6    0.000    0.000    0.000    0.000 {method '__reduce_ex__' of 'object' objects}
29    0.000    0.000    0.000    0.000 {method 'add' of 'set' objects}
18    0.000    0.000    0.000    0.000 {method 'append' of 'collections.deque' objects}
39    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
3    0.012    0.004    0.012    0.004 {method 'argsort' of 'numpy.ndarray' objects}
1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
9    0.000    0.000    0.000    0.000 {method 'discard' of 'set' objects}
9    0.000    0.000    0.000    0.000 {method 'extend' of 'list' objects}
3    0.000    0.000    0.000    0.000 {method 'flatten' of 'numpy.ndarray' objects}
24    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
60    0.000    0.000    0.010    0.000 {method 'mean' of 'numpy.ndarray' objects}
10    0.000    0.000    0.000    0.000 {method 'pop' of 'list' objects}
18    0.000    0.000    0.000    0.000 {method 'popleft' of 'collections.deque' objects}
20    0.000    0.000    0.000    0.000 {method 'random' of '_random.Random' objects}
81    0.008    0.000    0.008    0.000 {method 'reduce' of 'numpy.ufunc' objects}
3    0.004    0.001    0.004    0.001 {method 'sort' of 'numpy.ndarray' objects}
3    0.000    0.000    0.000    0.000 {method 'sum' of 'numpy.ndarray' objects}
6    0.000    0.000    0.000    0.000 {method 'update' of 'dict' objects}
3    0.000    0.000    0.000    0.000 {min}
24    0.000    0.000    0.001    0.000 {next}
18    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_buffer}
9    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents_info}
3    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents}
63    0.000    0.000    0.000    0.000 {numpy.core.multiarray.array}
3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.concatenate}
3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.copyto}
3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.empty_like}
3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.empty}
6    0.000    0.000    0.000    0.000 {numpy.core.multiarray.zeros}
18    0.000    0.000    0.000    0.000 {sum}
30    0.000    0.000    0.000    0.000 {zip}

``````

## Generate partitions, k=6,10,[4,25]

``````

In [9]:

def formatPartition(partition):
    """Convert a flat label array into a list of per-cluster sample indices.

    Parameters
    ----------
    partition : 1-D array-like with one cluster label per sample.

    Returns
    -------
    list with one 1-D integer array per distinct label (labels in sorted
    order); each array holds the indices of the samples carrying that label.
    An empty input gives an empty list.
    """
    partition = np.asarray(partition)
    clusters = np.unique(partition)
    # Compare against the partition itself (not against the unique labels) and
    # take element [0] of np.where's result to get an index array rather than a
    # 1-tuple. The previous loop also tried to unpack each scalar label into
    # two names, which raised a TypeError.
    return [np.where(partition == label)[0] for label in clusters]

def generatePartitions(data,npartitions,nclusters,iters=3):
    """Build an ensemble of partitions of `data` with the MyML K-Means.

    Parameters
    ----------
    data : samples handed to `K_Means.fit`.
    npartitions : number of partitions to generate.
    nclusters : either a fixed number of clusters, or a list `[low, high]`
        from which the number of clusters of every partition is drawn with
        `np.random.randint(low, high)` (so `high` itself is never drawn).
    iters : forwarded to `max_iters` of the K-Means estimator.

    Returns
    -------
    list with the `partition` attribute of each fitted estimator.
    """
    draw_k = type(nclusters) is list
    if draw_k:
        low, high = nclusters[0], nclusters[1]

    ensemble = []
    for _ in xrange(npartitions):
        n_k = np.random.randint(low, high) if draw_k else nclusters

        km = K_Means3.K_Means(n_clusters=n_k, mode="numpy", cuda_mem='manual',
                              tol=1e-4, max_iters=iters)
        km._centroid_mode = "index"
        km.fit(data)
        ensemble.append(km.partition)

    return ensemble

def generatePartitionsSKL(data,npartitions,nclusters,iters=3):
    """Build an ensemble of partitions of `data` with scikit-learn's K-Means.

    Parameters
    ----------
    data : samples handed to `KMeans.fit`.
    npartitions : number of partitions to generate.
    nclusters : either a fixed number of clusters, or a list `[low, high]`
        from which the number of clusters of every partition is drawn with
        `np.random.randint(low, high)` (so `high` itself is never drawn).
    iters : forwarded to `max_iter` of `KMeans`.

    Returns
    -------
    list with one entry per partition: the fitted labels converted by
    `formatPartition`.
    """
    draw_k = type(nclusters) is list
    if draw_k:
        low, high = nclusters[0], nclusters[1]

    ensemble = []
    for _ in xrange(npartitions):
        n_k = np.random.randint(low, high) if draw_k else nclusters

        skl = KMeans_skl(n_clusters=n_k, n_init=1, init="random", max_iter=iters)
        skl.fit(data)
        ensemble.append(formatPartition(skl.labels_))

    return ensemble

``````
``````

In [11]:

``````
``````

Out[11]:

<module 'MyML.cluster.K_Means3' from 'MyML/cluster/K_Means3.py'>

``````
``````

In [13]:

npartitions=30
iters=3

nclusters=10
partitions_my_10 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)
partitions_skl_10 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)

if type(nclusters) is not list:
allGood=True
for p in xrange(npartitions):
if len(partitions_my_10[p]) != nclusters:
print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_my_10[p]))
allGood=False
if len(partitions_skl_10[p]) != nclusters:
print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_skl_10[p]))
allGood=False
if allGood:
print 'All partitions have good number of clusters.'

nclusters=6
partitions_my_6 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)
partitions_skl_6 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)

if type(nclusters) is not list:
allGood=True
for p in xrange(npartitions):
if len(partitions_my_6[p]) != nclusters:
print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_my_6[p]))
allGood=False
if len(partitions_skl_6[p]) != nclusters:
print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_skl_6[p]))
allGood=False
if allGood:
print 'All partitions have good number of clusters.'

nclusters=[4,25]
partitions_my_rand = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)
partitions_skl_rand = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)

if type(nclusters) is not list:
allGood=True
for p in xrange(npartitions):
if len(partitions_my_rand[p]) != nclusters:
print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_my[p]))
allGood=False
if len(partitions_skl_rand[p]) != nclusters:
print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_skl[p]))
allGood=False
if allGood:
print 'All partitions have good number of clusters.'

``````
``````

All partitions have good number of clusters.
All partitions have good number of clusters.

``````

### Visualizing some partitions

``````

In [14]:

# Show two partitions from each k=10 ensemble on a 2x2 grid, one colour per cluster.
figEnsemble=plt.figure(figsize=(16,12))
# The axes have to be created explicitly; without these lines the cell only
# works when `ax1En`..`ax4En` are left over from an earlier kernel session.
ax1En=figEnsemble.add_subplot(2,2,1)
ax2En=figEnsemble.add_subplot(2,2,2)
ax3En=figEnsemble.add_subplot(2,2,3)
ax4En=figEnsemble.add_subplot(2,2,4)

# Each element `c` of a partition is used to index the rows of `data` that belong to one cluster.
for c in partitions_my_10[0]:
    ax1En.plot(data[c,0],data[c,1],'.')
ax1En.set_title("Sample of one partition generated with my K-Means")

for c in partitions_my_10[1]:
    ax2En.plot(data[c,0],data[c,1],'.')
ax2En.set_title("Sample of one partition generated with my K-Means")

for c in partitions_skl_10[0]:
    ax3En.plot(data[c,0],data[c,1],'.')
ax3En.set_title("Sample of one partition generated with SKL's K-Means")

for c in partitions_skl_10[1]:
    ax4En.plot(data[c,0],data[c,1],'.')
ax4En.set_title("Sample of one partition generated with SKL's K-Means")

``````
``````

Out[14]:

<matplotlib.text.Text at 0x7fc5b408fdd0>

``````

# EAC K-Means

## 6 clusters per partition

``````

In [15]:

# generate coassoc
prot_mode="random"
assoc_mode='prot' # prot or full
nprots=nsamples # number of prototypes

partitions_used = partitions_my_6

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# cluster with my K-Means
kmeans_mode = "numpy"

grouper = K_Means3.K_Means(n_clusters=true_nclusters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
grouper._centroid_mode = "index"
grouper.fit(myEstimator._coassoc)

# cluster with SKL K-Means
gSKL = KMeans_skl(n_clusters=true_nclusters,n_init=1,init="random")
gSKL.fit(myEstimator._coassoc)

# Hungarian accuracy
myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
myAcc.score(gt,grouper.labels_,format='array')

sklAcc = determine_ci.HungarianIndex(nsamples=nsamples)
sklAcc.score(gt,gSKL.labels_,format='array')

print 'My Accuracy:\t',myAcc.accuracy
print 'SKL Accuracy:\t',sklAcc.accuracy

figEAC=plt.figure(figsize=(16,6))

for c in np.unique(grouper.labels_):
clusterData=grouper.labels_==c
ax1EAC.plot(data[clusterData,0],data[clusterData,1],'.')
ax1EAC.set_title("Final EAC partition with my K-Means")

for c in np.unique(gSKL.labels_):
clusterData=gSKL.labels_==c
ax2EAC.plot(data[clusterData,0],data[clusterData,1],'.')
ax2EAC.set_title("Final EAC partition with SKL's K-Means")

``````
``````

My Accuracy:	1.0
SKL Accuracy:	1.0

MyML/helper/partition.py:56: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
if clusts == None:

Out[15]:

<matplotlib.text.Text at 0x7fc5b6770d50>

``````

Accuracy is usually 100% in both cases (clustering with my K-Means and with SciKit-Learn's), but this depends on the ensemble: for some ensembles both implementations always reach an accuracy of 1, while for others one or both of them (mine vs SKL) occasionally fall short of it.

The number of prototypes is equal to the number of samples and, since there are no repeated prototypes, all the samples are being used. The visualizations of the solutions are shown above.

### Statistical analysis

``````

In [17]:

# Accuracy statistics over 100 rounds for both K-Means implementations on the
# ensemble currently bound to `partitions_used`, with as many prototypes as samples.
stat_nprots=nsamples
# Header row, then one row per implementation holding the four values returned
# by the analysis helpers (mean, var, max, min). The trailing commas keep each
# row on a single output line.
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "skl \t",
for metric in k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100):
    print "{}\t".format(metric),
print "\nmy  \t",
for metric in k_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,iters="converge",rounds=100):
    print "{}\t".format(metric),

``````
``````

type	mean	var	max	min
skl 	1.0	0.0	1.0	1.0
my  	1.0	0.0	1.0	1.0

``````
``````

In [29]:

nprots=[5,20,40,60,80,100,120,140,160,180,200]

results_k10=list()
for n in nprots:
print '.',
r=k_analysis(partitions_used,files=False,ground_truth=gt,nprots=n,rounds=100)
results_k10.append(r)

mean_k10=[res[0] for res in results_k10]
var_k10=[res[1] for res in results_k10]
best_k10=[res[2] for res in results_k10]
worst_k10=[res[3] for res in results_k10]

plt.plot(mean_k10,label='mean')
plt.plot(best_k10,label='best')
plt.plot(worst_k10,label='worst')
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)
plt.title("Analysis of the influence of the number of prototypes")
plt.legend(loc='best')

``````
``````

.

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-29-329d395e5d0e> in <module>()
4 for n in nprots:
5     print '.',
----> 6     r=k_analysis(partitions_used,files=False,ground_truth=gt,nprots=n,rounds=100)
7     results_k10.append(r)
8

<ipython-input-7-935179233b8c> in k_analysis(partition_files, ground_truth, nprots, iters, rounds, files)
14         grouper = K_Means3.K_Means(n_clusters=nclusters,mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
15         grouper._centroid_mode = "iter"
---> 16         grouper.fit(estimator._coassoc)
17
18         myAcc = determine_ci.HungarianIndex(nsamples=nsamples)

/home/diogoaos/QCThesis/MyML/cluster/K_Means3.py in fit(self, data)
121
122             # compute new centroids
--> 123             self.centroids =  self._recompute_centroids(data,self.centroids,labels)
124
125         self.labels_ = labels

/home/diogoaos/QCThesis/MyML/cluster/K_Means3.py in _recompute_centroids(self, data, centroids, labels)
563             new_centroids = self._np_recompute_centroids_index(data,centroids,labels)
564         elif self._centroid_mode == "iter":
--> 565             new_centroids = self._np_recompute_centroids_iter(data,centroids,labels)
566         elif self._centroid_mode == "good":
567             new_centroids = self._np_recompute_centroids_good(data,centroids,labels)

/home/diogoaos/QCThesis/MyML/cluster/K_Means3.py in _np_recompute_centroids_iter(self, data, centroids, labels)
770         for n in xrange(N):
771             new_centroids[labels[n]] += data[n]
--> 772             centroid_count[labels[n]] += 1
773
774

KeyboardInterrupt:

``````

## 10 clusters per partition

``````

In [19]:

partitions_used = partitions_my_10

# generate coassoc
prot_mode="random"
assoc_mode='prot' # prot or full
nprots=nsamples # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# cluster with my K-Means
kmeans_mode = "numpy"

grouper = K_Means3.K_Means(n_clusters=true_nclusters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
grouper._centroid_mode = "iter"
grouper.fit(myEstimator._coassoc)

# cluster with SKL K-Means
gSKL = KMeans_skl(n_clusters=true_nclusters,n_init=1,init="random")
gSKL.fit(myEstimator._coassoc)

# Hungarian accuracy
myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
myAcc.score(gt,grouper.labels_,format='array')

sklAcc = determine_ci.HungarianIndex(nsamples=nsamples)
sklAcc.score(gt,gSKL.labels_,format='array')

print 'My Accuracy:\t',myAcc.accuracy
print 'SKL Accuracy:\t',sklAcc.accuracy

figEAC2=plt.figure(figsize=(16,12))

for c in np.unique(grouper.labels_):
clusterData=grouper.labels_==c
ax1EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax1EAC2.set_title("Final EAC partition with my K-Means")

for c in np.unique(gSKL.labels_):
clusterData=gSKL.labels_==c
ax2EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax2EAC2.set_title("Final EAC partition with SKL's K-Means")

nprots=[5,20,40,60,80,100,120,140,160,180,200]
results_k6=list()
for n in nprots:
r=k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100)
results_k6.append(r)

mean_k6=[res[0] for res in results_k6]
var_k6=[res[1] for res in results_k6]
best_k6=[res[2] for res in results_k6]
worst_k6=[res[3] for res in results_k6]

ax3EAC2.plot(mean_k6)
ax3EAC2.plot(best_k6)
ax3EAC2.plot(worst_k6)
ax3EAC2.plot([0, 10], [0.5, 0.5], 'k-', lw=1)
ax3EAC2.set_title("Analysis of the influence of the number of prototypes (SKL)")

print "\nStatistical analysis"
stat_nprots=nsamples
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "skl \t",
for metric in k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100):
print "{}\t".format(metric),
print "\nmy  \t",
for metric in k_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,iters="converge",rounds=100):
print "{}\t".format(metric),

``````
``````

My Accuracy:	1.0
SKL Accuracy:	1.0

Statistical analysis
type	mean	var	max	min
skl 	0.98525	0.0034096875	1.0	0.75
my  	0.9925	0.00181875	1.0	0.75

``````

## Random number of clusters per partition

``````

In [20]:

# Build the EAC co-association matrix from the partitions_my_rand ensemble
# (defined outside this cell), cluster it with both K-Means implementations
# and print the accuracy of each against the ground truth gt.
partitions_used = partitions_my_rand

# generate coassoc
prot_mode="random"
assoc_mode='prot' # prot or full
nprots=nsamples # number of prototypes

# NOTE(review): build_only=True presumably stops after building the
# co-association matrix and leaves the final clustering to the code below --
# confirm in eac.EAC.fit
myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# cluster with my K-Means
kmeans_mode = "numpy"

# NOTE(review): `iters` is not assigned anywhere in this cell; it has to exist
# in the kernel already when the cell runs
grouper = K_Means3.K_Means(n_clusters=true_nclusters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
grouper._centroid_mode = "iter"
# the co-association matrix itself is the input that gets clustered
grouper.fit(myEstimator._coassoc)

# cluster with SKL K-Means
gSKL = KMeans_skl(n_clusters=true_nclusters,n_init=1,init="random")
gSKL.fit(myEstimator._coassoc)

# Hungarian accuracy
# one scorer object per implementation, both scored against gt
myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
myAcc.score(gt,grouper.labels_,format='array')

sklAcc = determine_ci.HungarianIndex(nsamples=nsamples)
sklAcc.score(gt,gSKL.labels_,format='array')

print 'My Accuracy:\t',myAcc.accuracy
print 'SKL Accuracy:\t',sklAcc.accuracy

figEAC2=plt.figure(figsize=(16,12))

for c in np.unique(grouper.labels_):
clusterData=grouper.labels_==c
ax1EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax1EAC2.set_title("Final EAC partition with my K-Means")

for c in np.unique(gSKL.labels_):
clusterData=gSKL.labels_==c
ax2EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax2EAC2.set_title("Final EAC partition with SKL's K-Means")

nprots=[5,20,40,60,80,100,120,140,160,180,200]
results_k6=list()
for n in nprots:
r=k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100)
results_k6.append(r)

mean_k6=[res[0] for res in results_k6]
var_k6=[res[1] for res in results_k6]
best_k6=[res[2] for res in results_k6]
worst_k6=[res[3] for res in results_k6]

ax3EAC2.plot(mean_k6)
ax3EAC2.plot(best_k6)
ax3EAC2.plot(worst_k6)
ax3EAC2.plot([0, 10], [0.5, 0.5], 'k-', lw=1)
ax3EAC2.set_title("Analysis of the influence of the number of prototypes (SKL)")

print "\nStatistical analysis"
stat_nprots=nsamples
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "skl \t",
for metric in k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100):
print "{}\t".format(metric),
print "\nmy  \t",
for metric in k_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,iters="converge",rounds=100):
print "{}\t".format(metric),

``````
``````

My Accuracy:	0.78
SKL Accuracy:	1.0

---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-20-c7997f7bef87> in <module>()
52 results_k6=list()
53 for n in nprots:
---> 54     r=k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100)
55     results_k6.append(r)
56

<ipython-input-7-935179233b8c> in k_skl_analysis(partition_files, ground_truth, nprots, rounds, files)
30
31         estimator=eac.EAC(nsamples)
---> 32         estimator.fit(partition_files,files=files,assoc_mode='prot', prot_mode=prot_mode, nprot=nprots,build_only=True)
33
34         kmeans_mode = "numpy"

/home/diogoaos/QCThesis/MyML/cluster/eac.pyc in fit(self, ensemble, files, assoc_mode, prot_mode, nprot, link, build_only)
75                 else:
76                         for partition in ensemble:
---> 77                                 self._update_coassoc_matrix(partition) # update co-association matrix
78
79         def _create_coassoc(self,mode,nsamples,nprot=None):

/home/diogoaos/QCThesis/MyML/cluster/eac.pyc in _update_coassoc_matrix(self, clusters)
211                 # reduced matrix
212                 elif self._assoc_mode is "other":
--> 213                         self._update_coassoc_k(self._coassoc,clusters,self.k_labels)
214                 elif self._assoc_mode is "knn":
215                         self._update_coassoc_knn(self._coassoc,clusters,self.k_neighbours)

/home/diogoaos/QCThesis/MyML/cluster/eac.pyc in _update_coassoc_k(self, assoc_mat, clusters, k_labels)
260
261                                 # this indexing selects the rows and columns specified by n_in_cluster and k_in_cluster
--> 262                                 assoc_mat[n_in_cluster[:,np.newaxis],k_in_cluster] += 1 # np.newaxis is alias for None
263
264

KeyboardInterrupt:

``````
``````

In [ ]:

# Pseudocolor plot of the co-association matrix held by myEstimator.
plt.pcolor(myEstimator._coassoc)

``````

# K-Means only

``````

In [ ]:

stat_nprots=nsamples
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "my  \t",
for metric in stat_my_kmeans(data,true_nclusters,gt,rounds=100):
print "{}\t".format(metric),
print "\nskl \t",
for metric in stat_skl_kmeans(data,true_nclusters,gt,rounds=100):
print "{}\t".format(metric),

``````

# EAC K-Medoids

``````

In [ ]:

import MyML.cluster.KMedoids as KMedoids

``````

## 6 clusters per partition

``````

In [ ]:

#%%debug
partitions_used = partitions_my_6

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=50 # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# compute diassociation from co-assoc
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc

#k-medoids
labels,medoids=KMedoids.cluster(diassoc,k=true_nclusters)

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'K-Medoids Accuracy:\t',acc.accuracy

``````

### Statistical analysis

``````

In [ ]:

class acc_medoids():
    """Single-run K-Medoids accuracy, shaped for use with stat_analysis.

    data is handed unchanged to KMedoids.cluster, nclusters is the number of
    clusters requested and gt holds the ground-truth labels used for scoring.
    """

    def __init__(self,data,nclusters,gt):
        self.data, self.nclusters, self.gt = data, nclusters, gt
        # number of samples = first dimension of the input
        self.nsamples = data.shape[0]

    def run(self):
        """Cluster once and return the accuracy reported by HungarianIndex."""
        # only the labels are needed; the medoids are discarded
        assigned, _ = KMedoids.cluster(self.data, k=self.nclusters)
        scorer = determine_ci.HungarianIndex(nsamples=self.nsamples)
        scorer.score(self.gt, assigned, format='array')
        return scorer.accuracy

class acc_my_kmeans():
    """Single-run accuracy of the MyML K-Means (K_Means3), for use with stat_analysis.

    Parameters
    ----------
    data : array whose first dimension indexes the samples; passed to fit().
    nclusters : number of clusters requested from K-Means.
    gt : ground-truth labels used for scoring.
    mode : K_Means3 computation mode (default "numpy").
    max_iters : iteration limit passed to K_Means3 (default "converge").

    The configuration is held on the instance, so run() does not depend on the
    notebook globals true_nclusters, kmeans_mode or iters.
    """
    def __init__(self,data,nclusters,gt,mode="numpy",max_iters="converge"):
        self.data=data
        self.nclusters=nclusters
        self.nsamples=data.shape[0]
        self.gt=gt
        self.mode=mode
        self.max_iters=max_iters

    def run(self):
        """Fit K-Means once on self.data and return the HungarianIndex accuracy."""
        # cluster with my K-Means, using the cluster count given to this object
        grouper = K_Means3.K_Means(n_clusters=self.nclusters,mode=self.mode, cuda_mem='manual',tol=1e-4,max_iters=self.max_iters)
        grouper._centroid_mode = "iter"
        grouper.fit(self.data)

        # Hungarian accuracy
        myAcc = determine_ci.HungarianIndex(nsamples=self.nsamples)
        myAcc.score(self.gt,grouper.labels_,format='array')

        return myAcc.accuracy

class acc_skl_kmeans():
    """Single-run accuracy of scikit-learn's K-Means, for use with stat_analysis.

    data is the input passed to fit(), nclusters the number of clusters
    requested and gt the ground-truth labels used for scoring.
    """

    def __init__(self,data,nclusters,gt):
        self.data, self.nclusters, self.gt = data, nclusters, gt
        # number of samples = first dimension of the input
        self.nsamples = data.shape[0]

    def run(self):
        """Fit K-Means once (one random initialisation) and return the accuracy."""
        model = KMeans_skl(n_clusters=self.nclusters, n_init=1, init="random")
        model.fit(self.data)

        # score the resulting labels against the ground truth
        scorer = determine_ci.HungarianIndex(nsamples=self.nsamples)
        scorer.score(self.gt, model.labels_, format='array')
        return scorer.accuracy

def stat_analysis(method,rounds=20):
    """Call method.run() `rounds` times and summarise the returned values.

    Returns the tuple (mean, variance, maximum, minimum) of the collected values.
    """
    scores = np.zeros(rounds)
    for trial in xrange(rounds):
        scores[trial] = method.run()
    return np.mean(scores), np.var(scores), np.max(scores), np.min(scores)

``````
``````

In [ ]:

# Repeated-run accuracy for K-Medoids and my K-Means on both the dissociation
# and the co-association matrix of the current myEstimator. Each printed tuple
# is (mean, variance, max, min) as returned by stat_analysis.
rounds=100
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc
# K-Medoids on the dissociation matrix
x=acc_medoids(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmedoids\t',stat_analysis(x,rounds=rounds)

# my K-Means on the dissociation matrix
x2=acc_my_kmeans(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmeans  \t',stat_analysis(x2,rounds=rounds)

# K-Medoids on the co-association matrix itself
x3=acc_medoids(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmedoids  \t',stat_analysis(x3,rounds=rounds)

# my K-Means on the co-association matrix itself
x4=acc_my_kmeans(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmeans    \t',stat_analysis(x4,rounds=rounds)

``````

## 10 clusters per partition

``````

In [ ]:

#%%debug
partitions_used = partitions_my_10

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=50 # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# compute diassociation from co-assoc
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc

#k-medoids
labels,medoids=KMedoids.cluster(diassoc,k=true_nclusters)

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'K-Medoids Accuracy:\t',acc.accuracy

``````

### Statistical analysis

``````

In [ ]:

# Repeated-run accuracy for K-Medoids and scikit-learn's K-Means on both the
# dissociation and the co-association matrix of the current myEstimator.
# Each printed tuple is (mean, variance, max, min) as returned by stat_analysis.
rounds=20
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc
# K-Medoids on the dissociation matrix
x=acc_medoids(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmedoids\t',stat_analysis(x,rounds=rounds)

# the "kmeans" rows of this cell come from acc_skl_kmeans (scikit-learn)
x2=acc_skl_kmeans(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmeans  \t',stat_analysis(x2,rounds=rounds)

# K-Medoids on the co-association matrix itself
x3=acc_medoids(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmedoids  \t',stat_analysis(x3,rounds=rounds)

# scikit-learn K-Means on the co-association matrix itself
x4=acc_skl_kmeans(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmeans    \t',stat_analysis(x4,rounds=rounds)

``````

## Random number of clusters per partition

``````

In [ ]:

#%%debug
npartitions=30
nclusters=[4,25]
iters=3
partitions_used = partitions_my_rand

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=50 # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# compute diassociation from co-assoc
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc

#k-medoids
labels,medoids=KMedoids.cluster(diassoc,k=true_nclusters)

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'K-Medoids Accuracy:\t',acc.accuracy

``````

### Statistical analysis

``````

In [ ]:

# Repeated-run accuracy for K-Medoids and scikit-learn's K-Means on both the
# dissociation and the co-association matrix of the current myEstimator.
# Each printed tuple is (mean, variance, max, min) as returned by stat_analysis.
rounds=20
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc
# K-Medoids on the dissociation matrix
x=acc_medoids(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmedoids\t',stat_analysis(x,rounds=rounds)

# the "kmeans" rows of this cell come from acc_skl_kmeans (scikit-learn)
x2=acc_skl_kmeans(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmeans  \t',stat_analysis(x2,rounds=rounds)

# K-Medoids on the co-association matrix itself
x3=acc_medoids(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmedoids  \t',stat_analysis(x3,rounds=rounds)

# scikit-learn K-Means on the co-association matrix itself
x4=acc_skl_kmeans(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmeans    \t',stat_analysis(x4,rounds=rounds)

``````

## K-Medoids only

``````

In [ ]:

from sklearn.metrics.pairwise import pairwise_distances

``````
``````

In [ ]:

# K-Medoids directly on the pairwise distance matrix of the raw data, with no
# EAC step.
pairwise=pairwise_distances(data)
# NOTE(review): the cluster count is hard-coded to 2 here, whereas the other
# cells pass true_nclusters -- confirm this is intended
y=acc_medoids(pairwise,2,gt=gt)
# last expression of the cell: the (mean, variance, max, min) tuple over 20 runs
stat_analysis(y,rounds=20)

``````

``````

In [ ]:

partitions_used = partitions_my_rand

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=nsamples # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

#k-medoids

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'EAC SL Accuracy:\t',acc.accuracy

``````
``````

In [ ]:

from scipy.cluster import hierarchy as hie
from scipy.spatial.distance import squareform

``````
``````

In [ ]:

# pairwise distances
dists = np.zeros((nsamples,nsamples))
for i,dp in enumerate(data):
dist = (data - dp)**2
dist = np.sqrt(dist.sum(axis=1))
dists[i]=dist

``````
``````

In [ ]:

# Hierarchical clustering of the raw pairwise distances, scored against gt.
#pairwise=pairwise_distances(data)
# square distance matrix -> condensed vector expected by hie.linkage
condensed_dists = squareform(dists)
# NOTE(review): single linkage is assumed from the "SL" label used elsewhere in
# the notebook -- confirm the intended method
Z = hie.linkage(condensed_dists, method='single')
# last row of Z is the final merge; its first two columns are the two merged clusters
parents=Z[-1,:2]
labels=myEstimator._buildLabels(Z=Z,parents=parents)

acc.score(gt,labels,format='array')

``````
``````

In [ ]:

#generated from: http://tools.medialab.sciences-po.fr/iwanthue/
my_colors=["#D37E30",
"#6F6FD8",
"#3AA579",
"#D5337B",
"#4595B8",
"#3EA729",
"#D150D7",
"#4E6E23",
"#8F4D79",
"#D64430",
"#A1952B",
"#C15257",
"#AA5BB3",
"#6A76B0",
"#8E5723",
"#2A7464",
"#D66C9F",
"#60994E",
"#73A32D",
"#33A74F"]
my_pallete=sns.color_palette(my_colors,len(my_colors))
sns.palplot(my_pallete)
sns.set_palette(my_pallete,len(my_colors))
#marker_types=['.','^','*','h','x']
marker_types=matplotlib.markers.MarkerStyle.filled_markers

``````
``````

In [ ]:

# Use seaborn's "whitegrid" style for the figures drawn after this cell.
sns.set_style("whitegrid")

``````
``````

In [ ]:

# One panel per partition in partitions_my_rand, laid out two per row.
ncols=2
nrows=(len(partitions_my_rand)+ncols-1)//ncols
# 6 units of height per row gives the 12x90 canvas for 30 partitions
figX=plt.figure(figsize=(12,6*max(nrows,1)))
for i,p in enumerate(partitions_my_rand):
    # every partition gets its own axes
    ax=figX.add_subplot(nrows,ncols,i+1)
    for j,c in enumerate(p):
        # switch marker shape every 6 clusters; wrap around so a partition with
        # many clusters cannot index past the end of marker_types
        marker=marker_types[(j//6)%len(marker_types)]
        ax.plot(data[c,0],data[c,1],ls=u'None',marker=marker,markersize=8)
    # len(p) is the cluster count even when the partition is empty
    ax.set_title("partition {}, {} clusters".format(i+1,len(p)))

``````
``````

In [ ]:

``````