In [ ]:
    
author: Diogo Silva
    
In [2]:
    
import numpy as np
import seaborn as snb
import matplotlib.pyplot as plt
    
In [3]:
    
%matplotlib inline
    
In [4]:
    
from MyML.cluster.K_Means3 import K_Means
import MyML.helper.partition as partMod
    
In [5]:
    
# Size of the synthetic dataset and the number of Gaussian clusters to draw.
n_samples = int(1e4)
n_features, n_clusters = 2, 6
    
In [7]:
    
# Uniform samples converted to float32, followed by a quick scatter plot
# of the first two features.
data = np.random.uniform(size=(n_samples, n_features))
data = data.astype(np.float32)
plt.plot(data[:, 0], data[:, 1], '.')
    
    Out[7]:
    
In [34]:
    
# Six 2-D Gaussian blobs with the same number of samples each.
# Floor division keeps the per-cluster size an integer under both Python 2 and
# Python 3 (plain `/` yields a float under true division, which numpy rejects
# as a size). For n_samples=10000 and n_clusters=6 this is 1666 samples per
# blob, i.e. 9996 rows in total rather than n_samples.
samples_per_cluster = n_samples // n_clusters
blob_shape = (samples_per_cluster, n_features)
g1 = np.random.normal(loc=(0,0), scale=1, size=blob_shape)
g2 = np.random.normal(loc=(10,5), scale=0.5, size=blob_shape)
g3 = np.random.normal(loc=(0,8), scale=0.1, size=blob_shape)
g4 = np.random.normal(loc=(-10,5), scale=(0.25,1), size=blob_shape)
g5 = np.random.normal(loc=(5,10), scale=0.5, size=blob_shape)
g6 = np.random.normal(loc=(10,-5), scale=0.5, size=blob_shape)
data = np.vstack((g1,g2,g3,g4,g5,g6))
# Ground truth: label k for the k-th stacked blob. Built without a loop so the
# notebook-level name `i` is not overwritten as a side effect.
gt = np.repeat(np.arange(n_clusters, dtype=np.int32), samples_per_cluster)
plt.plot(data[:,0],data[:,1],'.')
    
    Out[34]:
    
In [35]:
    
np.unique(gt)
    
    Out[35]:
In [36]:
    
# Output folder (trailing slash required: paths are built by string
# concatenation) and the file names used for the dataset and its labels.
foldername = "/home/diogoaos/QCThesis/datasets/gaussmix1e4/"
dataname, gtname = "data.csv", "ground_truth.csv"
    
In [37]:
    
# Persist the samples and their ground-truth labels as comma-separated text.
np.savetxt(foldername + dataname, data, delimiter = ",")
# Labels are integers: write them as integers (as the other integer dumps in
# this notebook do) instead of savetxt's default float notation.
np.savetxt(foldername + gtname, gt, fmt = "%d", delimiter = ",")
    
In [38]:
    
# Read the samples back from disk as float32; this replaces the in-memory `data`.
data = np.genfromtxt(foldername + dataname, dtype=np.float32, delimiter=",")
    
In [8]:
    
# Ensemble settings: number of partitions and the two cluster counts
# [sqrt(n)/2, sqrt(n)] handed to the ensemble generator.
n_partitions = 100
# Built as a real list: `map` returns a one-shot iterator under Python 3, which
# cannot be indexed or reused across calls.
ensemble_clusters = [int(np.sqrt(n_samples) / 2), int(np.sqrt(n_samples))]
generator = K_Means(cuda_mem="manual")
    
In [40]:
    
# Build the ensemble with the K-Means generator and time the call.
# NOTE(review): presumably writes the partitions under `foldername` (they are
# later read back with loadEnsembleFromFiles) — confirm against partMod.
%time partMod.generateEnsembleToFiles(foldername, data, generator, n_clusters=ensemble_clusters, npartitions=n_partitions)
    
    
In [9]:
    
# In-memory variant: build the ensemble and keep it in `ensemble`, timing the call.
%time ensemble = partMod.generateEnsemble(data, generator, n_clusters=ensemble_clusters, npartitions=n_partitions)
    
    
In [17]:
    
import MyML.cluster.eac as eac
    
In [42]:
    
reload(eac)
    
    Out[42]:
In [43]:
    
# Same dataset folder as assigned earlier in the notebook.
# NOTE(review): presumably repeated so the analysis cells can run without the
# data-generation section — confirm.
foldername = "/home/diogoaos/QCThesis/datasets/gaussmix1e4/"
    
    
In [44]:
    
# Load the ensemble from `foldername`; this replaces any `ensemble` generated in memory.
ensemble = partMod.loadEnsembleFromFiles(foldername=foldername)
    
In [18]:
    
# Cursor into `ensemble`: index of the partition drawn by the plotting cell below.
i=0
    
In [20]:
    
# Plot every cluster of partition `i` as its own series, then move the cursor
# to the next partition (wrapping to 0 after the last one), so each re-run of
# this cell shows a different partition.
for clust in ensemble[i]:
    plt.plot(data[clust, 0], data[clust, 1], '.')
i = 0 if i + 1 >= len(ensemble) else i + 1
    
    
In [194]:
    
n_samples = 0
for clust in ensemble[0]:
    n_samples += clust.size
print n_samples
    
    
In [67]:
    
# EAC object for the full co-association matrix (mat_sparse=False).
fullEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)
    
In [68]:
    
# Accumulate the ensemble into fullEAC with full association and no prototypes; timed.
%time fullEAC.fit(ensemble, assoc_mode="full", prot_mode="none")
    
    
Total time = 3.37s
In [69]:
    
full_nnz = fullEAC._coassoc.nonzero()[0].size
print "full matrix edges/vertices ratio : ", full_nnz * 1.0 / n_samples
    
    
In [161]:
    
# Reference computation: start from an all-zero n_samples x n_samples matrix
# (float64 by default) and let eac.update_coassoc_with_ensemble fill it in place; timed.
coassoc = np.zeros((n_samples,n_samples))
%time eac.update_coassoc_with_ensemble(coassoc, ensemble)
    
    
Total time = 987 ms
In [163]:
    
# Sanity check: the manually built matrix equals fullEAC's matrix element-wise.
(coassoc == fullEAC._coassoc).all()
    
    Out[163]:
In [76]:
    
# Second EAC object (mat_sparse=False), used for the prototype-based run below.
protEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)
    
In [77]:
    
# Use 10% of the samples as prototypes, selected with prot_mode="random"; timed.
n_prots = int(0.1 * n_samples)
%time protEAC.fit(ensemble, assoc_mode="full", prot_mode="random", nprot=n_prots)
    
    
In [78]:
    
protEAC_nnz = protEAC._coassoc.nonzero()[0].size
print "prot matrix edges/vertices ratio : ", full_nnz * 1.0 / n_samples
    
    
Total time = 421 ms
In [79]:
    
# Third EAC object, this time with mat_sparse=True.
sparseEAC = eac.EAC(n_samples = n_samples, mat_sparse = True)
    
In [80]:
    
# Fit the sparse variant with full association; timed for comparison with the dense runs.
%time sparseEAC.fit(ensemble, assoc_mode="full")
    
    
Total time = 13min 21s
Save the full co-association matrix as CSV.
In [115]:
    
# Write fullEAC's matrix to full_coassoc.csv as comma-separated integers; timed.
%time np.savetxt(foldername + "full_coassoc.csv", fullEAC._coassoc, fmt="%d", delimiter=",")
    
    
Save the full and prototype co-association matrices in CSR format.
In [82]:
    
from scipy.sparse import csr_matrix
    
In [116]:
    
# Convert both co-association matrices to scipy CSR matrices; each conversion is timed.
%time full_sp = csr_matrix(fullEAC._coassoc)
%time prot_sp = csr_matrix(protEAC._coassoc)
    
    
In [114]:
    
# Zero the main diagonal of both matrices in place.
# NOTE(review): the execution counts show this cell ran before the CSR
# conversion cell that precedes it in the file; on a top-to-bottom run the CSR
# matrices are built before the diagonal is cleared.
np.fill_diagonal(fullEAC._coassoc, 0)
np.fill_diagonal(protEAC._coassoc, 0)
    
In [157]:
    
# Row indices, within the top-left 1666x1666 block, of the entries equal to 100.
# NOTE(review): 1666 presumably is the per-cluster sample count and 100 the
# number of partitions (n_partitions) — confirm before changing either setting.
np.where(fullEAC._coassoc[0:1666,0:1666]==100)[0]
    
    Out[157]:
In [491]:
    
print full_sp.__str__
print prot_sp.__str__
    
    
In [117]:
    
# Dump the three CSR arrays of the full matrix as integer text files:
# indices -> *_dest, data -> *_weight, indptr -> *_fe.
for csr_file, csr_array in (("full_dest.csr", full_sp.indices),
                            ("full_weight.csr", full_sp.data),
                            ("full_fe.csr", full_sp.indptr)):
    np.savetxt(foldername + csr_file, csr_array, fmt="%d", delimiter=",")
    
In [118]:
    
# Dump the three CSR arrays of the prototype matrix as integer text files:
# indices -> *_dest, data -> *_weight, indptr -> *_fe.
for csr_file, csr_array in (("prot_dest.csr", prot_sp.indices),
                            ("prot_weight.csr", prot_sp.data),
                            ("prot_fe.csr", prot_sp.indptr)):
    np.savetxt(foldername + csr_file, csr_array, fmt="%d", delimiter=",")
    
In [ ]:
    
# Drop the in-memory CSR matrices; their arrays were written to disk by the cells above.
del full_sp, prot_sp
    
In [21]:
    
reload(eac)
    
    Out[21]:
In [51]:
    
# Rebuild and refit fullEAC from scratch (after reloading the eac module) so the
# threshold experiments below start from a fresh matrix; the fit is timed.
fullEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)
%time fullEAC.fit(ensemble, assoc_mode="full", prot_mode="none")
    
    
In [23]:
    
# First element returned by getMaxAssocs, then the value returned by getNNZAssocs.
print fullEAC.getMaxAssocs()[0]
print fullEAC.getNNZAssocs()
    
    
In [53]:
    
# pandas is imported in this cell because the notebook's `import pandas as pd`
# cell only appears further down; without it a top-to-bottom run fails here
# with a NameError.
import pandas as pd

# Empty placeholder frame with three generic columns.
resPD=pd.DataFrame(columns=["col1","col2","col3"])
    
In [52]:
    
thresholds = np.arange(0,1.01,0.05)
res = np.empty((thresholds.size, 3))
for i in range(thresholds.size):
    res[i, 0] = thresholds[i]
    fullEAC.apply_threshold(thresholds[i])
    max_assocs, max_idx = fullEAC.getMaxAssocs()
    res[i, 1] = max_assocs
    nnz_pc = fullEAC.getNNZAssocs() / 8533572.0
    res[i, 2] = nnz_pc
    print thresholds[i], max_assocs, nnz_pc
    
    
In [25]:
    
import pandas as pd
    
In [30]:
    
# Wrap the sweep results in a labelled DataFrame (one row per threshold).
resPD = pd.DataFrame(
    data=res,
    columns=["threshold", "max_assocs", "nnz percent relative to max"],
)
    
In [33]:
    
print resPD.to_latex()
    
    
In [ ]:
    
# Display the results table (`resPD8` was a typo for `resPD` and raised NameError).
resPD
    
In [35]:
    
fullEAC._coassoc
    
    Out[35]:
In [16]:
    
fullEAC._coassoc[::100,::100].shape
    
    Out[16]:
In [18]:
    
# Heat-map of the co-association matrix, subsampled to every 50th row and column.
plt.pcolor(fullEAC._coassoc[::50,::50])
    
    Out[18]:
    
In [ ]:
    
# NOTE(review): unexecuted scratch cell — pcolor is called without the data
# array it is given in the previous cell; presumably unfinished and expected to
# raise as written. Complete or remove.
plt.pcolor()
    
In [91]:
    
try:
    from numba import jit
except ImportError:
    # Without numba the helper still works as plain interpreted Python.
    def jit(func):
        return func

@jit
def outdegree_from_firstedge(firstedge, outdegree, n_edges):
    """Fill `outdegree` with the number of edges that start at each vertex.

    NOTE: this notebook defines this helper in two cells; keep both copies
    in sync.

    Parameters
    ----------
    firstedge : 1-D integer array
        firstedge[v] is the offset of vertex v's first edge in the edge list.
    outdegree : 1-D array with the same length as `firstedge`
        Output buffer, written in place.
    n_edges : int
        Total number of edges; closes the range of the last vertex.

    Returns
    -------
    None; the result is stored in `outdegree`.
    """
    n_vertices = firstedge.size
    if n_vertices == 0:
        # Nothing to fill. Also avoids indexing position -1 of an empty array,
        # which compiled code does not bounds-check.
        return
    for v in range(n_vertices - 1):
        outdegree[v] = firstedge[v + 1] - firstedge[v]
    # The last vertex owns every edge from its first edge up to n_edges.
    outdegree[n_vertices - 1] = n_edges - firstedge[n_vertices - 1]
    
In [6]:
    
print foldername
    
    
In [119]:
    
# Load the CSR arrays of the full matrix written earlier: edge destinations,
# edge weights and first-edge offsets; each read is timed.
# NOTE(review): the next cell in file order reloads the same three names from
# the prot_* files, so on a top-to-bottom run these values are overwritten.
%time dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",")
%time weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",")
%time fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",")
    
    
In [86]:
    
# Load the CSR arrays of the prototype matrix into the same names
# (dest, weight, fe), replacing whatever was loaded before.
dest = np.genfromtxt(foldername + "prot_dest.csr", dtype = np.int32, delimiter=",")
weight = np.genfromtxt(foldername + "prot_weight.csr", dtype = np.float32, delimiter=",")
fe = np.genfromtxt(foldername + "prot_fe.csr", dtype = np.int32, delimiter=",")
    
In [120]:
    
# Drop the last entry of the loaded row-pointer array (saved from CSR `indptr`)
# so `fe` holds one first-edge offset per vertex.
# NOTE(review): not idempotent — every re-run of this cell removes one more entry.
fe = fe[:-1]
    
In [121]:
    
# Allocate the out-degree buffer (same shape and dtype as `fe`) and fill it in
# place; dest.size is the total number of edges.
od = np.empty_like(fe)
outdegree_from_firstedge(fe, od, dest.size)
    
In [126]:
    
# Invert the weights so that large values become small ones.
# NOTE(review): 100 matches n_partitions and the disconnect_weight passed to the
# linkage call below; presumably this turns association counts into
# dissimilarities — confirm.
# NOTE(review): not idempotent — re-running the cell restores the original weights.
weight = 100 - weight
    
In [127]:
    
print "# edges : ", dest.size
print "# vertices : ", fe.size
print "edges/vertices ratio : ", dest.size * 1.0 / fe.size
    
    
In [10]:
    
try:
    from numba import jit
except ImportError:
    # Without numba the helper still works as plain interpreted Python.
    def jit(func):
        return func

@jit
def outdegree_from_firstedge(firstedge, outdegree, n_edges):
    """Fill `outdegree` with the number of edges that start at each vertex.

    NOTE: this notebook defines this helper in two cells; keep both copies
    in sync.

    Parameters
    ----------
    firstedge : 1-D integer array
        firstedge[v] is the offset of vertex v's first edge in the edge list.
    outdegree : 1-D array with the same length as `firstedge`
        Output buffer, written in place.
    n_edges : int
        Total number of edges; closes the range of the last vertex.

    Returns
    -------
    None; the result is stored in `outdegree`.
    """
    n_vertices = firstedge.size
    if n_vertices == 0:
        # Nothing to fill. Also avoids indexing position -1 of an empty array,
        # which compiled code does not bounds-check.
        return
    for v in range(n_vertices - 1):
        outdegree[v] = firstedge[v + 1] - firstedge[v]
    # The last vertex owns every edge from its first edge up to n_edges.
    outdegree[n_vertices - 1] = n_edges - firstedge[n_vertices - 1]
    
In [106]:
    
from numba import cuda
import MyML.graph.mst as myMST
import MyML.graph.build as graphBuild
import MyML.graph.connected_components as ccomps
import MyML.cluster.linkage as linkage
    
    
In [136]:
    
reload(linkage)
    
    Out[136]:
In [142]:
    
# Profile the linkage call on the loaded graph with %prun.
%prun linkage.sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight = 100, MAX_TPB=512)
    
    
In [140]:
    
# Same call as the profiled one, this time timed and with the result kept in `labels`.
%time labels = linkage.sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight = 100, MAX_TPB=512)
    
    
In [149]:
    
import MyML.metrics.accuracy as accuracy
    
In [145]:
    
# Read the ground-truth labels back from ground_truth.csv as int32; timed.
%time gt = np.genfromtxt(foldername + "ground_truth.csv", dtype = np.int32, delimiter=",")
    
    
In [151]:
    
# Score the predicted labels against the ground truth with HungarianIndex,
# sized by the number of ground-truth labels; the scoring call is timed.
scorer = accuracy.HungarianIndex(nsamples=gt.size)
%time scorer.score(gt, labels)
    
    
    
In [152]:
    
print scorer.accuracy