author: Diogo Silva

SKL = SciKit-Learn



In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
import seaborn as sns



In [3]:

    
home = %env HOME



In [4]:

    
cd $home/QCThesis/









    



/home/chiroptera/workspace/QCThesis



In [13]:

    
from sklearn.cluster import KMeans as KMeans_skl
import MyML.cluster.eac as eac
reload(eac)
import MyML.cluster.K_Means3 as K_Means3
reload(K_Means3)
import MyML.metrics.accuracy as determine_ci
reload(determine_ci)









    Out[13]:





<module 'MyML.metrics.accuracy' from 'MyML/metrics/accuracy.pyc'>

Helper functions



In [14]:

    
def stat_my_kmeans(data,nclusters,gtruth,rounds=20):
    nsamples=data.shape[0]
    all_acc = list()
    for r in xrange(rounds):
        iters="converge"
        kmeans_mode="numpy"

        grouper = K_Means3.K_Means(n_clusters=nclusters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
        grouper._centroid_mode = "iter"
        grouper.fit(data)


        myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
        myAcc.score(gtruth,grouper.labels_,format='array')
        
        all_acc.append(myAcc.accuracy)
        
        
    return np.mean(all_acc),np.var(all_acc),np.max(all_acc),np.min(all_acc)
    
def stat_skl_kmeans(data,nclusters,gtruth,rounds=20,init='random'):
    nsamples=data.shape[0]
    all_acc = list()
    for r in xrange(rounds):
        iters="converge"
        kmeans_mode="numpy"

        gSKL = KMeans_skl(n_clusters=nclusters,n_init=1,init=init)
        gSKL.fit(data)

        myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
        myAcc.score(gtruth,grouper.labels_,format='array')
        
        all_acc.append(myAcc.accuracy)
        
        
    return np.mean(all_acc),np.var(all_acc),np.max(all_acc),np.min(all_acc)



In [15]:

    
print "b MyML/cluster/K_Means3.py:"









    



b MyML/cluster/K_Means3.py:



In [16]:

    
def k_analysis(partition_files,ground_truth,nprots,iters="converge",rounds=20,files=True):
    nsamples=data.shape[0]
    all_acc = list()
    
    for r in xrange(rounds):
        prot_mode="random"

        estimator=eac.EAC(nsamples)
        estimator.fit(partition_files,files=files,assoc_mode='prot', prot_mode=prot_mode, nprot=nprots,build_only=True)

        kmeans_mode = "numpy"
        nclusters = np.unique(ground_truth).shape[0]

        grouper = K_Means3.K_Means(n_clusters=nclusters,mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
        grouper._centroid_mode = "iter"
        grouper.fit(estimator._coassoc)

        myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
        myAcc.score(ground_truth,grouper.labels_,format='array')
        
        all_acc.append(myAcc.accuracy)
    return np.mean(all_acc),np.var(all_acc),np.max(all_acc),np.min(all_acc)

def k_skl_analysis(partition_files,ground_truth,nprots,rounds=20,files=True):
    nsamples=data.shape[0]
    all_acc = list()
    
    for r in xrange(rounds):
        prot_mode="random"

        estimator=eac.EAC(nsamples)
        estimator.fit(partition_files,files=files,assoc_mode='prot', prot_mode=prot_mode, nprot=nprots,build_only=True)

        kmeans_mode = "numpy"
        nclusters = np.unique(ground_truth).shape[0]


        grouper = KMeans_skl(n_clusters=nclusters,n_init=1,init="random")
        grouper.fit(estimator._coassoc)
        
        myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
        myAcc.score(ground_truth,grouper.labels_,format='array')
        
        all_acc.append(myAcc.accuracy)
    return np.mean(all_acc),np.var(all_acc),np.max(all_acc),np.min(all_acc)

Generate data



In [47]:

    
center1=(0,0)
center2=(10,10)

cov1=1
cov2=1

n1=500000
n2=500000
nsamples=n1+n2
dim=2

g1 = np.random.normal(loc=center1,scale=cov1,size=(n1,dim)).astype(np.float32)
g2 = np.random.normal(loc=center2,scale=cov2,size=(n2,dim)).astype(np.float32)

data = np.vstack((g1,g2))
gt=np.zeros(data.shape[0],dtype=np.int32)
gt[100:]=1

figData=plt.figure()
plt.plot(g1[:,0],g1[:,1],'.')
plt.plot(g2[:,0],g2[:,1],'.')









    Out[47]:





[<matplotlib.lines.Line2D at 0x7f6659668990>]



In [48]:

    
import MyML.helper.partition
reload(MyML.helper.partition)









    Out[48]:





<module 'MyML.helper.partition' from 'MyML/helper/partition.pyc'>



In [49]:

    
py_estimator=K_Means3.K_Means(n_clusters=20,mode="numpy", cuda_mem='manual',tol=1e-4,max_iter=3)
cu_estimator=K_Means3.K_Means(n_clusters=20,mode="cuda", cuda_mem='manual',tol=1e-4,max_iter=3)



In [50]:

    
%timeit MyML.helper.partition.generateEnsemble(data,cu_estimator,n_clusters=[6,30],npartitions=30,iters=3)
%timeit MyML.helper.partition.generateEnsemble(data,py_estimator,n_clusters=[6,30],npartitions=30,iters=3)









    



1 loops, best of 3: 48.1 s per loop
1 loops, best of 3: 2min 6s per loop



In [174]:

    
cProfile.run("grouperSKL.fit(data)")









    



         481 function calls in 0.063 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.063    0.063 <string>:1(<module>)
        3    0.002    0.001    0.002    0.001 _k_means.pyx:244(_centers_dense)
       19    0.000    0.000    0.001    0.000 _methods.py:31(_sum)
        3    0.000    0.000    0.000    0.000 _methods.py:43(_count_reduce_items)
        2    0.000    0.000    0.001    0.001 _methods.py:53(_mean)
        1    0.000    0.000    0.003    0.003 _methods.py:77(_var)
        6    0.000    0.000    0.000    0.000 _weakrefset.py:70(__contains__)
        3    0.000    0.000    0.000    0.000 abc.py:128(__instancecheck__)
       45    0.000    0.000    0.000    0.000 base.py:865(isspmatrix)
        3    0.000    0.000    0.010    0.003 extmath.py:168(safe_sparse_dot)
        3    0.000    0.000    0.000    0.000 extmath.py:44(squared_norm)
        4    0.000    0.000    0.000    0.000 extmath.py:54(row_norms)
        3    0.000    0.000    0.000    0.000 fromnumeric.py:1291(ravel)
        1    0.000    0.000    0.000    0.000 fromnumeric.py:2651(mean)
        1    0.000    0.000    0.003    0.003 fromnumeric.py:2838(var)
        1    0.000    0.000    0.003    0.003 k_means_.py:142(_tolerance)
        1    0.000    0.000    0.063    0.063 k_means_.py:151(k_means)
        1    0.000    0.000    0.058    0.058 k_means_.py:296(_kmeans_single)
        3    0.022    0.007    0.048    0.016 k_means_.py:400(_labels_inertia_precompute_dense)
        3    0.000    0.000    0.049    0.016 k_means_.py:447(_labels_inertia)
        1    0.000    0.000    0.007    0.007 k_means_.py:500(_init_centroids)
        1    0.000    0.000    0.000    0.000 k_means_.py:688(_check_fit_data)
        1    0.000    0.000    0.063    0.063 k_means_.py:716(fit)
        3    0.000    0.000    0.000    0.000 numeric.py:141(ones)
       13    0.000    0.000    0.000    0.000 numeric.py:394(asarray)
       30    0.000    0.000    0.000    0.000 numeric.py:464(asanyarray)
        3    0.015    0.005    0.026    0.009 pairwise.py:143(euclidean_distances)
        3    0.000    0.000    0.001    0.000 pairwise.py:60(check_pairwise_arrays)
       10    0.000    0.000    0.000    0.000 shape_base.py:60(atleast_2d)
       10    0.000    0.000    0.001    0.000 validation.py:115(array2d)
        7    0.000    0.000    0.001    0.000 validation.py:128(_atleast2d_or_sparse)
        7    0.000    0.000    0.001    0.000 validation.py:157(atleast2d_or_csr)
        4    0.000    0.000    0.000    0.000 validation.py:337(check_random_state)
       16    0.000    0.000    0.001    0.000 validation.py:37(_assert_all_finite)
        6    0.000    0.000    0.000    0.000 validation.py:57(safe_asarray)
        1    0.000    0.000    0.000    0.000 validation.py:83(as_float_array)
        3    0.000    0.000    0.000    0.000 {getattr}
        2    0.000    0.000    0.000    0.000 {hasattr}
       60    0.000    0.000    0.000    0.000 {isinstance}
        4    0.000    0.000    0.000    0.000 {issubclass}
       31    0.000    0.000    0.000    0.000 {len}
        1    0.000    0.000    0.000    0.000 {max}
       10    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
       12    0.000    0.000    0.000    0.000 {method 'copy' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        6    0.000    0.000    0.000    0.000 {method 'fill' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.001    0.001 {method 'mean' of 'numpy.ndarray' objects}
        1    0.007    0.007    0.007    0.007 {method 'permutation' of 'mtrand.RandomState' objects}
        3    0.000    0.000    0.000    0.000 {method 'ravel' of 'numpy.ndarray' objects}
       23    0.005    0.000    0.005    0.000 {method 'reduce' of 'numpy.ufunc' objects}
       19    0.000    0.000    0.001    0.000 {method 'sum' of 'numpy.ndarray' objects}
        6    0.010    0.002    0.010    0.002 {numpy.core._dotblas.dot}
       49    0.000    0.000    0.000    0.000 {numpy.core.multiarray.array}
        3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.copyto}
        4    0.000    0.000    0.000    0.000 {numpy.core.multiarray.einsum}
        9    0.000    0.000    0.000    0.000 {numpy.core.multiarray.empty}
        1    0.000    0.000    0.000    0.000 {numpy.core.multiarray.zeros}
        6    0.000    0.000    0.000    0.000 {range}
        3    0.000    0.000    0.002    0.001 {sklearn.cluster._k_means._centers_dense}



In [200]:

    
cProfile.run("grouper.fit(data)")









    



         2991 function calls (2940 primitive calls) in 0.066 seconds

   Ordered by: standard name

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.066    0.066 <string>:1(<module>)
        9    0.000    0.000    0.000    0.000 <string>:8(__new__)
        1    0.000    0.000    0.000    0.000 K_Means3.py:133(_init_centroids)
        3    0.000    0.000    0.021    0.007 K_Means3.py:161(_label)
        3    0.000    0.000    0.021    0.007 K_Means3.py:341(_cu_label)
        3    0.000    0.000    0.001    0.000 K_Means3.py:398(_cu_label_kernel)
        3    0.000    0.000    0.045    0.015 K_Means3.py:563(_recompute_centroids)
        3    0.018    0.006    0.045    0.015 K_Means3.py:625(_np_recompute_centroids_good)
        1    0.000    0.000    0.066    0.066 K_Means3.py:79(fit)
        3    0.000    0.000    0.000    0.000 _methods.py:31(_sum)
       18    0.000    0.000    0.000    0.000 _methods.py:34(_prod)
       60    0.000    0.000    0.000    0.000 _methods.py:43(_count_reduce_items)
       60    0.002    0.000    0.010    0.000 _methods.py:53(_mean)
        3    0.000    0.000    0.009    0.003 api.py:116(synchronize)
        6    0.000    0.000    0.000    0.000 api.py:121(_prepare_shape_strides_dtype)
        6    0.000    0.000    0.005    0.001 api.py:150(device_array_like)
        3    0.000    0.000    0.004    0.001 api.py:23(to_device)
        6    0.000    0.000    0.004    0.001 api.py:63(device_array)
       18    0.001    0.000    0.002    0.000 arrayobj.py:46(make_array_ctype)
       18    0.000    0.000    0.000    0.000 arrayobj.py:61(c_array)
        3    0.000    0.000    0.005    0.002 arraysetops.py:96(unique)
        3    0.000    0.000    0.000    0.000 compiler.py:172(copy)
        3    0.000    0.000    0.000    0.000 compiler.py:175(configure)
        3    0.000    0.000    0.000    0.000 compiler.py:201(__getitem__)
        3    0.000    0.000    0.000    0.000 compiler.py:250(get)
        3    0.000    0.000    0.001    0.000 compiler.py:301(__call__)
        3    0.000    0.000    0.001    0.000 compiler.py:326(_kernel_call)
       12    0.000    0.000    0.000    0.000 compiler.py:377(_prepare_args)
        9    0.000    0.000    0.000    0.000 contextlib.py:12(__init__)
        9    0.000    0.000    0.000    0.000 contextlib.py:15(__enter__)
        9    0.000    0.000    0.000    0.000 contextlib.py:21(__exit__)
        9    0.000    0.000    0.000    0.000 contextlib.py:82(helper)
        6    0.000    0.000    0.000    0.000 copy.py:306(_reconstruct)
        6    0.000    0.000    0.000    0.000 copy.py:66(copy)
        6    0.000    0.000    0.000    0.000 copy_reg.py:92(__newobj__)
        9    0.000    0.000    0.000    0.000 devicearray.py:123(__del__)
        9    0.000    0.000    0.000    0.000 devicearray.py:129(_default_stream)
        9    0.000    0.000    0.000    0.000 devicearray.py:141(device_ctypes_pointer)
        3    0.000    0.000    0.000    0.000 devicearray.py:151(copy_to_device)
        6    0.000    0.000    0.001    0.000 devicearray.py:168(copy_to_host)
       12    0.000    0.000    0.000    0.000 devicearray.py:232(as_cuda_arg)
        3    0.000    0.000    0.004    0.001 devicearray.py:332(from_array_like)
       12    0.000    0.000    0.000    0.000 devicearray.py:353(auto_device)
        9    0.000    0.000    0.008    0.001 devicearray.py:56(__init__)
       33    0.000    0.000    0.001    0.000 devices.py:108(current_context)
       33    0.000    0.000    0.001    0.000 devices.py:183(get_or_create_context)
       33    0.000    0.000    0.001    0.000 devices.py:224(get_context)
        9    0.000    0.000    0.009    0.001 devices.py:236(_require_cuda_context)
        3    0.000    0.000    0.000    0.000 driver.py:1018(configure)
        3    0.000    0.000    0.000    0.000 driver.py:1035(__call__)
        3    0.000    0.000    0.000    0.000 driver.py:1070(launch_kernel)
       18    0.000    0.000    0.000    0.000 driver.py:1247(host_pointer)
        3    0.000    0.000    0.000    0.000 driver.py:1258(host_memory_extents)
        9    0.000    0.000    0.000    0.000 driver.py:1263(memory_size_from_info)
        3    0.000    0.000    0.000    0.000 driver.py:1273(host_memory_size)
       27    0.000    0.000    0.001    0.000 driver.py:1280(device_pointer)
       39    0.000    0.000    0.001    0.000 driver.py:1285(device_ctypes_pointer)
       66    0.000    0.000    0.000    0.000 driver.py:1293(is_device_memory)
       39    0.000    0.000    0.000    0.000 driver.py:1304(require_device_memory)
        9    0.000    0.000    0.000    0.000 driver.py:1311(device_memory_depends)
       12    0.000    0.000    0.001    0.000 driver.py:1321(host_to_device)
        6    0.000    0.000    0.001    0.000 driver.py:1339(device_to_host)
       94    0.012    0.000    0.012    0.000 driver.py:212(safe_cuda_api_call)
       94    0.000    0.000    0.000    0.000 driver.py:241(_check_error)
       33    0.000    0.000    0.001    0.000 driver.py:270(get_context)
        9    0.000    0.000    0.000    0.000 driver.py:291(add_trash)
        9    0.000    0.000    0.001    0.000 driver.py:294(process)
        9    0.000    0.000    0.002    0.000 driver.py:498(memalloc)
        3    0.000    0.000    0.009    0.003 driver.py:614(synchronize)
        9    0.000    0.000    0.000    0.000 driver.py:664(_make_mem_finalizer)
        9    0.000    0.000    0.000    0.000 driver.py:665(mem_finalize)
        9    0.000    0.000    0.000    0.000 driver.py:669(core)
       10    0.000    0.000    0.001    0.000 driver.py:670(cleanup)
        9    0.000    0.000    0.000    0.000 driver.py:728(__init__)
       10    0.000    0.000    0.000    0.000 driver.py:739(__del__)
        9    0.000    0.000    0.000    0.000 driver.py:746(own)
        9    0.000    0.000    0.000    0.000 driver.py:749(free)
       39    0.000    0.000    0.000    0.000 driver.py:778(device_ctypes_pointer)
        9    0.000    0.000    0.000    0.000 driver.py:838(__init__)
        9    0.000    0.000    0.000    0.000 driver.py:847(__del__)
       90    0.000    0.000    0.000    0.000 driver.py:858(__getattr__)
        9    0.000    0.000    0.000    0.000 driver.py:911(query)
        9    0.000    0.000    0.000    0.000 driver.py:925(record)
       18    0.000    0.000    0.000    0.000 dummyarray.py:104(is_contiguous)
       18    0.000    0.000    0.000    0.000 dummyarray.py:108(compute_index)
       42    0.000    0.000    0.000    0.000 dummyarray.py:109(<genexpr>)
        9    0.000    0.000    0.001    0.000 dummyarray.py:148(from_desc)
        9    0.000    0.000    0.001    0.000 dummyarray.py:157(__init__)
       21    0.000    0.000    0.000    0.000 dummyarray.py:160(<genexpr>)
       21    0.000    0.000    0.000    0.000 dummyarray.py:161(<genexpr>)
        9    0.000    0.000    0.000    0.000 dummyarray.py:167(_compute_layout)
       15    0.000    0.000    0.000    0.000 dummyarray.py:172(is_contig)
        9    0.000    0.000    0.000    0.000 dummyarray.py:184(_compute_extent)
       12    0.000    0.000    0.000    0.000 dummyarray.py:27(__init__)
       24    0.000    0.000    0.000    0.000 dummyarray.py:80(get_offset)
       18    0.000    0.000    0.001    0.000 fromnumeric.py:2259(prod)
        9    0.000    0.000    0.004    0.000 ndarray.py:130(ndarray_populate_head)
        9    0.000    0.000    0.000    0.000 ndarray.py:42(__new__)
       18    0.000    0.000    0.000    0.000 ndarray.py:69(get_stage)
        9    0.000    0.000    0.001    0.000 ndarray.py:80(allocate)
        9    0.000    0.000    0.000    0.000 ndarray.py:91(free)
        9    0.000    0.000    0.002    0.000 ndarray.py:96(write)
       63    0.000    0.000    0.000    0.000 numeric.py:464(asanyarray)
        3    0.000    0.000    0.000    0.000 numeric.py:81(zeros_like)
        1    0.000    0.000    0.000    0.000 random.py:293(sample)
        9    0.000    0.000    0.001    0.000 service.py:18(service)
       66    0.000    0.000    0.000    0.000 threadlocal.py:13(stack)
       33    0.000    0.000    0.000    0.000 threadlocal.py:29(top)
       33    0.000    0.000    0.000    0.000 threadlocal.py:33(is_empty)
       33    0.000    0.000    0.000    0.000 threadlocal.py:37(__bool__)
       33    0.000    0.000    0.000    0.000 threadlocal.py:40(__nonzero__)
        9    0.000    0.000    0.000    0.000 utils.py:142(__setitem__)
       12    0.000    0.000    0.000    0.000 {_ctypes.addressof}
       42    0.000    0.000    0.000    0.000 {_ctypes.byref}
       18    0.000    0.000    0.000    0.000 {_ctypes.sizeof}
       18    0.000    0.000    0.000    0.000 {_weakref.proxy}
       15    0.000    0.000    0.000    0.000 {built-in method __new__ of type object at 0x7eff0af34d00}
  177/126    0.000    0.000    0.000    0.000 {getattr}
        7    0.000    0.000    0.000    0.000 {hasattr}
        6    0.000    0.000    0.000    0.000 {id}
      204    0.000    0.000    0.000    0.000 {isinstance}
       60    0.000    0.000    0.000    0.000 {issubclass}
        6    0.000    0.000    0.000    0.000 {iter}
      100    0.000    0.000    0.000    0.000 {len}
        1    0.000    0.000    0.000    0.000 {math.ceil}
        1    0.000    0.000    0.000    0.000 {math.log}
        6    0.000    0.000    0.000    0.000 {method '__reduce_ex__' of 'object' objects}
       29    0.000    0.000    0.000    0.000 {method 'add' of 'set' objects}
       18    0.000    0.000    0.000    0.000 {method 'append' of 'collections.deque' objects}
       39    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        3    0.012    0.004    0.012    0.004 {method 'argsort' of 'numpy.ndarray' objects}
        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
        9    0.000    0.000    0.000    0.000 {method 'discard' of 'set' objects}
        9    0.000    0.000    0.000    0.000 {method 'extend' of 'list' objects}
        3    0.000    0.000    0.000    0.000 {method 'flatten' of 'numpy.ndarray' objects}
       24    0.000    0.000    0.000    0.000 {method 'get' of 'dict' objects}
       60    0.000    0.000    0.010    0.000 {method 'mean' of 'numpy.ndarray' objects}
       10    0.000    0.000    0.000    0.000 {method 'pop' of 'list' objects}
       18    0.000    0.000    0.000    0.000 {method 'popleft' of 'collections.deque' objects}
       20    0.000    0.000    0.000    0.000 {method 'random' of '_random.Random' objects}
       81    0.008    0.000    0.008    0.000 {method 'reduce' of 'numpy.ufunc' objects}
        3    0.004    0.001    0.004    0.001 {method 'sort' of 'numpy.ndarray' objects}
        3    0.000    0.000    0.000    0.000 {method 'sum' of 'numpy.ndarray' objects}
        6    0.000    0.000    0.000    0.000 {method 'update' of 'dict' objects}
        3    0.000    0.000    0.000    0.000 {min}
       24    0.000    0.000    0.001    0.000 {next}
       18    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_buffer}
        9    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents_info}
        3    0.000    0.000    0.000    0.000 {numba.mviewbuf.memoryview_get_extents}
       63    0.000    0.000    0.000    0.000 {numpy.core.multiarray.array}
        3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.concatenate}
        3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.copyto}
        3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.empty_like}
        3    0.000    0.000    0.000    0.000 {numpy.core.multiarray.empty}
        6    0.000    0.000    0.000    0.000 {numpy.core.multiarray.zeros}
       18    0.000    0.000    0.000    0.000 {sum}
       30    0.000    0.000    0.000    0.000 {zip}

Generate partitions, k=6,10,[4,25]



In [9]:

    
def formatPartition(partition):
    clusters=np.unique(partition)
    nclusters=clusters.size
    finalPartition=[None]*nclusters
    for c,l in clusters:
        finalPartition[c] = np.where(clusters==l)

    return finalPartition

def generatePartitions(data,npartitions,nclusters,iters=3):
    
    if type(nclusters) is list:
        clusterRange = True
        min_ncluster=nclusters[0]
        max_ncluster=nclusters[1]
    else:
        clusterRange = False
        k = nclusters
        
    partitions = list()
    
    mode = "numpy"
    for p in xrange(npartitions):
        if clusterRange:
            k = np.random.randint(min_ncluster,max_ncluster)
        
        grouper = K_Means3.K_Means(n_clusters=k,mode=mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
        grouper._centroid_mode = "index"
        grouper.fit(data)
        partitions.append(grouper.partition)
        
    return partitions
        
def generatePartitionsSKL(data,npartitions,nclusters,iters=3):
    
    if type(nclusters) is list:
        clusterRange = True
        min_ncluster=nclusters[0]
        max_ncluster=nclusters[1]
    else:
        clusterRange = False
        k = nclusters
        
    partitions = list()
    
    mode = "numpy"
    for p in xrange(npartitions):
        if clusterRange:
            k = np.random.randint(min_ncluster,max_ncluster)
        
        gSKL = KMeans_skl(n_clusters=k,n_init=1,init="random",max_iter=iters)
        gSKL.fit(data)
        partitions.append(formatPartition(gSKL.labels_))
        
    return partitions



In [11]:

    
reload(K_Means3)









    Out[11]:





<module 'MyML.cluster.K_Means3' from 'MyML/cluster/K_Means3.py'>



In [13]:

    
npartitions=30
iters=3

nclusters=10
partitions_my_10 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)
partitions_skl_10 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)

if type(nclusters) is not list:
    allGood=True
    for p in xrange(npartitions):
        if len(partitions_my_10[p]) != nclusters:
            print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_my_10[p]))
            allGood=False
        if len(partitions_skl_10[p]) != nclusters:
            print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_skl_10[p]))
            allGood=False
    if allGood:
        print 'All partitions have good number of clusters.'

nclusters=6
partitions_my_6 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)
partitions_skl_6 = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)

if type(nclusters) is not list:
    allGood=True
    for p in xrange(npartitions):
        if len(partitions_my_6[p]) != nclusters:
            print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_my_6[p]))
            allGood=False
        if len(partitions_skl_6[p]) != nclusters:
            print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_skl_6[p]))
            allGood=False
    if allGood:
        print 'All partitions have good number of clusters.'

nclusters=[4,25]
partitions_my_rand = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)
partitions_skl_rand = generatePartitions(data=data,npartitions=npartitions,nclusters=nclusters,iters=iters)

if type(nclusters) is not list:
    allGood=True
    for p in xrange(npartitions):
        if len(partitions_my_rand[p]) != nclusters:
            print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_my[p]))
            allGood=False
        if len(partitions_skl_rand[p]) != nclusters:
            print 'partition {} of partitions_my has different number of clusters:{}'.format(p,len(partitions_skl[p]))
            allGood=False
    if allGood:
        print 'All partitions have good number of clusters.'









    



All partitions have good number of clusters.
All partitions have good number of clusters.

Visualizing some partitions



In [14]:

    
figEnsemble=plt.figure(figsize=(16,12))
ax1En=figEnsemble.add_subplot(2,2,1)
ax2En=figEnsemble.add_subplot(2,2,2)
ax3En=figEnsemble.add_subplot(2,2,3)
ax4En=figEnsemble.add_subplot(2,2,4)

for c in partitions_my_10[0]:
    ax1En.plot(data[c,0],data[c,1],'.')
ax1En.set_title("Sample of one partition generated with my K-Means")

for c in partitions_my_10[1]:
    ax2En.plot(data[c,0],data[c,1],'.')
ax2En.set_title("Sample of one partition generated with my K-Means")

for c in partitions_skl_10[0]:
    ax3En.plot(data[c,0],data[c,1],'.')
ax3En.set_title("Sample of one partition generated with SKL's K-Means")

for c in partitions_skl_10[1]:
    ax4En.plot(data[c,0],data[c,1],'.')
ax4En.set_title("Sample of one partition generated with SKL's K-Means")









    Out[14]:





<matplotlib.text.Text at 0x7fc5b408fdd0>

EAC K-Means

6 clusters per partition



In [15]:

    
# generate coassoc
prot_mode="random"
assoc_mode='prot' # prot or full
nprots=nsamples # number of prototypes

partitions_used = partitions_my_6

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# cluster with my K-Means
kmeans_mode = "numpy"

grouper = K_Means3.K_Means(n_clusters=true_nclusters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
grouper._centroid_mode = "index"
grouper.fit(myEstimator._coassoc)

# cluster with SKL K-Means
gSKL = KMeans_skl(n_clusters=true_nclusters,n_init=1,init="random")
gSKL.fit(myEstimator._coassoc)

# Hungarian accuracy
myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
myAcc.score(gt,grouper.labels_,format='array')

sklAcc = determine_ci.HungarianIndex(nsamples=nsamples)
sklAcc.score(gt,gSKL.labels_,format='array')


print 'My Accuracy:\t',myAcc.accuracy
print 'SKL Accuracy:\t',sklAcc.accuracy

figEAC=plt.figure(figsize=(16,6))
ax1EAC=figEAC.add_subplot(1,2,1)
ax2EAC=figEAC.add_subplot(1,2,2)

for c in np.unique(grouper.labels_):
    clusterData=grouper.labels_==c
    ax1EAC.plot(data[clusterData,0],data[clusterData,1],'.')
ax1EAC.set_title("Final EAC partition with my K-Means")

for c in np.unique(gSKL.labels_):
    clusterData=gSKL.labels_==c
    ax2EAC.plot(data[clusterData,0],data[clusterData,1],'.')
ax2EAC.set_title("Final EAC partition with SKL's K-Means")









    



My Accuracy:	1.0
SKL Accuracy:	1.0






    



MyML/helper/partition.py:56: FutureWarning: comparison to `None` will result in an elementwise object comparison in the future.
  if clusts == None:






    Out[15]:





<matplotlib.text.Text at 0x7fc5b6770d50>

Accuracy is usually 100% in both cases (clustering from my K-Means and SciKit-Learn's). This depends on the ensemble. For some ensembles the accuracy on both is always one, for others it sometimes is not in one or both of the K-Means used (mine vs SKL).

The number of prototypes is equal to the number of samples and since there are not repeated prototypes, all the samples are being used. Above are the visualizations of the solutions.

Statistic analysis



In [17]:

    
stat_nprots=nsamples
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "skl \t",
for metric in k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100):
    print "{}\t".format(metric),
print "\nmy  \t",
for metric in k_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,iters="converge",rounds=100):
    print "{}\t".format(metric),









    



type	mean	var	max	min
skl 	1.0	0.0	1.0	1.0	
my  	1.0	0.0	1.0	1.0



In [29]:

    
nprots=[5,20,40,60,80,100,120,140,160,180,200]

results_k10=list()
for n in nprots:
    print '.',
    r=k_analysis(partitions_used,files=False,ground_truth=gt,nprots=n,rounds=100)
    results_k10.append(r)
    
mean_k10=[res[0] for res in results_k10]
var_k10=[res[1] for res in results_k10]
best_k10=[res[2] for res in results_k10]
worst_k10=[res[3] for res in results_k10]

plt.plot(mean_k10,label='mean')
plt.plot(best_k10,label='best')
plt.plot(worst_k10,label='worst')
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)
plt.title("Analysis of the influence of the number of prototypes")
plt.legend(loc='best')









    



.





    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-29-329d395e5d0e> in <module>()
      4 for n in nprots:
      5     print '.',
----> 6     r=k_analysis(partitions_used,files=False,ground_truth=gt,nprots=n,rounds=100)
      7     results_k10.append(r)
      8 

<ipython-input-7-935179233b8c> in k_analysis(partition_files, ground_truth, nprots, iters, rounds, files)
     14         grouper = K_Means3.K_Means(n_clusters=nclusters,mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
     15         grouper._centroid_mode = "iter"
---> 16         grouper.fit(estimator._coassoc)
     17 
     18         myAcc = determine_ci.HungarianIndex(nsamples=nsamples)

/home/diogoaos/QCThesis/MyML/cluster/K_Means3.py in fit(self, data)
    121 
    122             # compute new centroids
--> 123             self.centroids =  self._recompute_centroids(data,self.centroids,labels)
    124 
    125         self.labels_ = labels

/home/diogoaos/QCThesis/MyML/cluster/K_Means3.py in _recompute_centroids(self, data, centroids, labels)
    563             new_centroids = self._np_recompute_centroids_index(data,centroids,labels)
    564         elif self._centroid_mode == "iter":
--> 565             new_centroids = self._np_recompute_centroids_iter(data,centroids,labels)
    566         elif self._centroid_mode == "good":
    567             new_centroids = self._np_recompute_centroids_good(data,centroids,labels)

/home/diogoaos/QCThesis/MyML/cluster/K_Means3.py in _np_recompute_centroids_iter(self, data, centroids, labels)
    770         for n in xrange(N):
    771             new_centroids[labels[n]] += data[n]
--> 772             centroid_count[labels[n]] += 1
    773 
    774 

KeyboardInterrupt:

10 clusters per partition



In [19]:

    
partitions_used = partitions_my_10

# generate coassoc
prot_mode="random"
assoc_mode='prot' # prot or full
nprots=nsamples # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# cluster with my K-Means
kmeans_mode = "numpy"

grouper = K_Means3.K_Means(n_clusters=true_nclusters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
grouper._centroid_mode = "iter"
grouper.fit(myEstimator._coassoc)

# cluster with SKL K-Means
gSKL = KMeans_skl(n_clusters=true_nclusters,n_init=1,init="random")
gSKL.fit(myEstimator._coassoc)

# Hungarian accuracy
myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
myAcc.score(gt,grouper.labels_,format='array')

sklAcc = determine_ci.HungarianIndex(nsamples=nsamples)
sklAcc.score(gt,gSKL.labels_,format='array')


print 'My Accuracy:\t',myAcc.accuracy
print 'SKL Accuracy:\t',sklAcc.accuracy

figEAC2=plt.figure(figsize=(16,12))
ax1EAC2=figEAC2.add_subplot(2,2,1)
ax2EAC2=figEAC2.add_subplot(2,2,2)
ax3EAC2=figEAC2.add_subplot(2,2,3)

for c in np.unique(grouper.labels_):
    clusterData=grouper.labels_==c
    ax1EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax1EAC2.set_title("Final EAC partition with my K-Means")

for c in np.unique(gSKL.labels_):
    clusterData=gSKL.labels_==c
    ax2EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax2EAC2.set_title("Final EAC partition with SKL's K-Means")

nprots=[5,20,40,60,80,100,120,140,160,180,200]
results_k6=list()
for n in nprots:
    r=k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100)
    results_k6.append(r)
    
mean_k6=[res[0] for res in results_k6]
var_k6=[res[1] for res in results_k6]
best_k6=[res[2] for res in results_k6]
worst_k6=[res[3] for res in results_k6]

ax3EAC2.plot(mean_k6)
ax3EAC2.plot(best_k6)
ax3EAC2.plot(worst_k6)
ax3EAC2.plot([0, 10], [0.5, 0.5], 'k-', lw=1)
ax3EAC2.set_title("Analysis of the influence of the number of prototypes (SKL)")

print "\nStatistical analysis"
stat_nprots=nsamples
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "skl \t",
for metric in k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100):
    print "{}\t".format(metric),
print "\nmy  \t",
for metric in k_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,iters="converge",rounds=100):
    print "{}\t".format(metric),









    



My Accuracy:	1.0
SKL Accuracy:	1.0

Statistical analysis
type	mean	var	max	min
skl 	0.98525	0.0034096875	1.0	0.75	
my  	0.9925	0.00181875	1.0	0.75

Random number of clusters per partition



In [20]:

    
partitions_used = partitions_my_rand

# generate coassoc
prot_mode="random"
assoc_mode='prot' # prot or full
nprots=nsamples # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# cluster with my K-Means
kmeans_mode = "numpy"

grouper = K_Means3.K_Means(n_clusters=true_nclusters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
grouper._centroid_mode = "iter"
grouper.fit(myEstimator._coassoc)

# cluster with SKL K-Means
gSKL = KMeans_skl(n_clusters=true_nclusters,n_init=1,init="random")
gSKL.fit(myEstimator._coassoc)

# Hungarian accuracy
myAcc = determine_ci.HungarianIndex(nsamples=nsamples)
myAcc.score(gt,grouper.labels_,format='array')

sklAcc = determine_ci.HungarianIndex(nsamples=nsamples)
sklAcc.score(gt,gSKL.labels_,format='array')


print 'My Accuracy:\t',myAcc.accuracy
print 'SKL Accuracy:\t',sklAcc.accuracy

figEAC2=plt.figure(figsize=(16,12))
ax1EAC2=figEAC2.add_subplot(2,2,1)
ax2EAC2=figEAC2.add_subplot(2,2,2)
ax3EAC2=figEAC2.add_subplot(2,2,3)

for c in np.unique(grouper.labels_):
    clusterData=grouper.labels_==c
    ax1EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax1EAC2.set_title("Final EAC partition with my K-Means")

for c in np.unique(gSKL.labels_):
    clusterData=gSKL.labels_==c
    ax2EAC2.plot(data[clusterData,0],data[clusterData,1],'.')
ax2EAC2.set_title("Final EAC partition with SKL's K-Means")

nprots=[5,20,40,60,80,100,120,140,160,180,200]
results_k6=list()
for n in nprots:
    r=k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100)
    results_k6.append(r)
    
mean_k6=[res[0] for res in results_k6]
var_k6=[res[1] for res in results_k6]
best_k6=[res[2] for res in results_k6]
worst_k6=[res[3] for res in results_k6]

ax3EAC2.plot(mean_k6)
ax3EAC2.plot(best_k6)
ax3EAC2.plot(worst_k6)
ax3EAC2.plot([0, 10], [0.5, 0.5], 'k-', lw=1)
ax3EAC2.set_title("Analysis of the influence of the number of prototypes (SKL)")

print "\nStatistical analysis"
stat_nprots=nsamples
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "skl \t",
for metric in k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100):
    print "{}\t".format(metric),
print "\nmy  \t",
for metric in k_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,iters="converge",rounds=100):
    print "{}\t".format(metric),









    



My Accuracy:	0.78
SKL Accuracy:	1.0






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-20-c7997f7bef87> in <module>()
     52 results_k6=list()
     53 for n in nprots:
---> 54     r=k_skl_analysis(partitions_used,files=False,ground_truth=gt,nprots=stat_nprots,rounds=100)
     55     results_k6.append(r)
     56 

<ipython-input-7-935179233b8c> in k_skl_analysis(partition_files, ground_truth, nprots, rounds, files)
     30 
     31         estimator=eac.EAC(nsamples)
---> 32         estimator.fit(partition_files,files=files,assoc_mode='prot', prot_mode=prot_mode, nprot=nprots,build_only=True)
     33 
     34         kmeans_mode = "numpy"

/home/diogoaos/QCThesis/MyML/cluster/eac.pyc in fit(self, ensemble, files, assoc_mode, prot_mode, nprot, link, build_only)
     75                 else:
     76                         for partition in ensemble:
---> 77                                 self._update_coassoc_matrix(partition) # update co-association matrix
     78 
     79         def _create_coassoc(self,mode,nsamples,nprot=None):

/home/diogoaos/QCThesis/MyML/cluster/eac.pyc in _update_coassoc_matrix(self, clusters)
    211                 # reduced matrix
    212                 elif self._assoc_mode is "other":
--> 213                         self._update_coassoc_k(self._coassoc,clusters,self.k_labels)
    214                 elif self._assoc_mode is "knn":
    215                         self._update_coassoc_knn(self._coassoc,clusters,self.k_neighbours)

/home/diogoaos/QCThesis/MyML/cluster/eac.pyc in _update_coassoc_k(self, assoc_mat, clusters, k_labels)
    260 
    261                                 # this indexing selects the rows and columns specified by n_in_cluster and k_in_cluster
--> 262                                 assoc_mat[n_in_cluster[:,np.newaxis],k_in_cluster] += 1 # np.newaxis is alias for None
    263 
    264 

KeyboardInterrupt:



In [ ]:

    
plt.pcolor(myEstimator._coassoc)

K-Means only



In [ ]:

    
stat_nprots=nsamples
print "{}\t{}\t{}\t{}\t{}".format("type","mean","var","max","min")
print "my  \t",
for metric in stat_my_kmeans(data,true_nclusters,gt,rounds=100):
    print "{}\t".format(metric),
print "\nskl \t",
for metric in stat_skl_kmeans(data,true_nclusters,gt,rounds=100):
    print "{}\t".format(metric),

EAC K-Medoids



In [ ]:

    
import MyML.cluster.KMedoids as KMedoids

6 clusters per partition



In [ ]:

    
#%%debug
partitions_used = partitions_my_6

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=50 # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# compute diassociation from co-assoc
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc

#k-medoids
labels,medoids=KMedoids.cluster(diassoc,k=true_nclusters)

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'K-Medoids Accuracy:\t',acc.accuracy

Statistical analysis



In [ ]:

    
class acc_medoids():
    def __init__(self,data,nclusters,gt):
        self.data=data
        self.nsamples=data.shape[0]
        self.nclusters=nclusters
        self.gt=gt
        
    def run(self):
        labels,medoids=KMedoids.cluster(self.data,k=self.nclusters)
        # Hungarian accuracy
        acc = determine_ci.HungarianIndex(nsamples=self.nsamples)
        acc.score(self.gt,labels,format='array')
        return acc.accuracy
    
class acc_my_kmeans():
    def __init__(self,data,nclusters,gt):
        self.data=data
        self.nclusters=nclusters
        self.nsamples=data.shape[0]
        self.gt=gt
    def run(self):
        # cluster with SKL K-Means
        grouper = K_Means3.K_Means(n_clusters=true_nclusters,mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=iters)
        grouper._centroid_mode = "iter"
        grouper.fit(self.data)

        # Hungarian accuracy
        sklAcc = determine_ci.HungarianIndex(nsamples=self.nsamples)
        sklAcc.score(self.gt,grouper.labels_,format='array')
        
        return sklAcc.accuracy    

class acc_skl_kmeans():
    def __init__(self,data,nclusters,gt):
        self.data=data
        self.nclusters=nclusters
        self.nsamples=data.shape[0]
        self.gt=gt
    def run(self):
        # cluster with SKL K-Means
        gSKL = KMeans_skl(n_clusters=self.nclusters,n_init=1,init="random")
        gSKL.fit(self.data)

        # Hungarian accuracy
        sklAcc = determine_ci.HungarianIndex(nsamples=self.nsamples)
        sklAcc.score(self.gt,gSKL.labels_,format='array')
        
        return sklAcc.accuracy

def stat_analysis(method,rounds=20):
    rAll = np.zeros(rounds)
    for r in xrange(rounds):
        rAll[r]=method.run()
    return rAll.mean(),rAll.var(),rAll.max(),rAll.min()



In [ ]:

    
rounds=100
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc
x=acc_medoids(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmedoids\t',stat_analysis(x,rounds=rounds)

x2=acc_my_kmeans(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmeans  \t',stat_analysis(x2,rounds=rounds)

x3=acc_medoids(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmedoids  \t',stat_analysis(x3,rounds=rounds)

x4=acc_my_kmeans(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmeans    \t',stat_analysis(x4,rounds=rounds)

10 clusters per partition



In [ ]:

    
#%%debug
partitions_used = partitions_my_10

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=50 # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# compute diassociation from co-assoc
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc

#k-medoids
labels,medoids=KMedoids.cluster(diassoc,k=true_nclusters)

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'K-Medoids Accuracy:\t',acc.accuracy

Statistical analysis



In [ ]:

    
rounds=20
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc
x=acc_medoids(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmedoids\t',stat_analysis(x,rounds=rounds)

x2=acc_skl_kmeans(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmeans  \t',stat_analysis(x2,rounds=rounds)

x3=acc_medoids(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmedoids  \t',stat_analysis(x3,rounds=rounds)

x4=acc_skl_kmeans(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmeans    \t',stat_analysis(x4,rounds=rounds)

Random clusters per partition



In [ ]:

    
#%%debug
npartitions=30
nclusters=[4,25]
iters=3
partitions_used = partitions_my_rand

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=50 # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

# compute diassociation from co-assoc
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc

#k-medoids
labels,medoids=KMedoids.cluster(diassoc,k=true_nclusters)

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'K-Medoids Accuracy:\t',acc.accuracy

Statistical analysis



In [ ]:

    
rounds=20
diassoc=myEstimator._coassoc.max()-myEstimator._coassoc
x=acc_medoids(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmedoids\t',stat_analysis(x,rounds=rounds)

x2=acc_skl_kmeans(diassoc,nclusters=true_nclusters,gt=gt)
print 'diassoc kmeans  \t',stat_analysis(x2,rounds=rounds)

x3=acc_medoids(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmedoids  \t',stat_analysis(x3,rounds=rounds)

x4=acc_skl_kmeans(myEstimator._coassoc,nclusters=true_nclusters,gt=gt)
print 'assoc kmeans    \t',stat_analysis(x4,rounds=rounds)

K-Medoids only



In [ ]:

    
from sklearn.metrics.pairwise import pairwise_distances



In [ ]:

    
pairwise=pairwise_distances(data)
y=acc_medoids(pairwise,2,gt=gt)
stat_analysis(y,rounds=20)

EAC Single link



In [ ]:

    
partitions_used = partitions_my_rand

# generate coassoc
prot_mode="random"
assoc_mode='full' # prot or full
nprots=nsamples # number of prototypes

myEstimator=eac.EAC(nsamples)
myEstimator.fit(partitions_used,files=False,assoc_mode=assoc_mode, prot_mode=prot_mode, nprot=nprots,build_only=True)

# final clustering with the true number of clusters
true_nclusters = np.unique(gt).shape[0]

#k-medoids
myEstimator._apply_linkage()
labels = myEstimator._clusterFromLinkage()

# Hungarian accuracy
acc = determine_ci.HungarianIndex(nsamples=nsamples)
acc.score(gt,labels,format='array')

print 'EAC SL Accuracy:\t',acc.accuracy

Single-Link only



In [ ]:

    
from scipy.cluster import hierarchy as hie
from scipy.spatial.distance import squareform



In [ ]:

    
# pairwise distances
dists = np.zeros((nsamples,nsamples))
for i,dp in enumerate(data):
    dist = (data - dp)**2
    dist = np.sqrt(dist.sum(axis=1))
    dists[i]=dist



In [ ]:

    
#pairwise=pairwise_distances(data)
condensed_dists = squareform(dists)
Z = hie.linkage(condensed_dists,method='single')
parents=Z[-1,:2]
labels=myEstimator._buildLabels(Z=Z,parents=parents)

acc.score(gt,labels,format='array')
print "Single-Link accuracy:\t",acc.accuracy



In [ ]:

    
#generated from: http://tools.medialab.sciences-po.fr/iwanthue/
my_colors=["#D37E30",
"#6F6FD8",
"#3AA579",
"#D5337B",
"#4595B8",
"#3EA729",
"#D150D7",
"#4E6E23",
"#8F4D79",
"#D64430",
"#A1952B",
"#C15257",
"#AA5BB3",
"#6A76B0",
"#8E5723",
"#2A7464",
"#D66C9F",
"#60994E",
"#73A32D",
"#33A74F"]
my_pallete=sns.color_palette(my_colors,len(my_colors))
sns.palplot(my_pallete)
sns.set_palette(my_pallete,len(my_colors))
#marker_types=['.','^','*','h','x']
marker_types=matplotlib.markers.MarkerStyle.filled_markers



In [ ]:

    
sns.set_style("whitegrid")



In [ ]:

    
figX=sns.plt.figure(figsize=(12,90))
for i,p in enumerate(partitions_my_rand):
    ax=figX.add_subplot(15,2,i+1)
    for j,c in enumerate(p):
        ax.plot(data[c,0],data[c,1],ls=u'None',marker=marker_types[j/6],markersize=8)
        #ax.scatter(data[c,0],data[c,1],marker=marker_types[j/6],linewidths=5)
    ax.set_title("partition {}, {} clusters".format(i+1,j+1))



In [ ]: