author: Diogo Silva
In [1]:
%pylab inline
#%qtconsole
In [49]:
home = %env HOME
In [50]:
cd $home/QCThesis/EAC
In [4]:
%run generatePartitions.py -d synthetic -n 100 -D 2 -C 6 -i 3 -m cuda -s syn_cem -np 20 -mc 4 -Mc 25 -dir test/
In [55]:
ls test
In [88]:
files=!ls $home/QCThesis/EAC/test
folder = home + "/QCThesis/EAC/test/"
for i, f in enumerate(files):
    files[i] = folder + f
partition_files = [f for f in files if "_partition_" in f and "syn_cem" in f]
In [89]:
partition_files
Out[89]:
In [60]:
pwd
Out[60]:
In [83]:
import eac
In [84]:
reload(eac)
Out[84]:
In [90]:
#%debug
reload(eac)
estimator=eac.EAC(100)
estimator.fit(partition_files,files=True,assoc_mode='full', prot_mode='random', nprot=None)
In [91]:
np.where(estimator._coassoc==20)[0].size
Out[91]:
In [92]:
print estimator._coassoc.shape
print estimator._coassoc
print "Diagonal"
print estimator._coassoc.diagonal()
print "Symmetric:\t",(estimator._coassoc.T == estimator._coassoc).all()
In [20]:
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
def _apply_linkage(assoc_mat, method='single'):
    """
    SciPy linkage wants a distance array in pdist (condensed) format; SciPy
    squareform converts between the square and condensed formats.
    assoc_mat : pair-wise similarity association matrix
    method    : linkage method to use; one of 'single' (default), 'complete',
                'average', 'weighted', 'centroid', 'median', 'ward'
    """
    condensed_assoc = squareform(assoc_mat)
    # convert pair-wise similarities (assoc_mat -> condensed_assoc) to dissimilarities
    condensed_diss_assoc = condensed_assoc.max() - condensed_assoc
    Z = linkage(condensed_diss_assoc, method=method)
    return Z
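As a quick sanity check of the condensed format (a sketch, not part of the original run), squareform collapses a symmetric, zero-diagonal matrix into the flat pdist layout and converts it back losslessly:
# Sketch (assumes standard SciPy behaviour): squareform round-trip on a
# toy similarity matrix with a zero diagonal.
import numpy as np
from scipy.spatial.distance import squareform
S = np.array([[0., 3., 1.],
              [3., 0., 2.],
              [1., 2., 0.]])
c = squareform(S)                 # condensed form: [3., 1., 2.]
print (squareform(c) == S).all()  # True - the round-trip recovers S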
In [56]:
assoc = estimator._coassoc
# normalize by the number of partitions (20, from -np 20 above)
coassoc = assoc / 20
# invert similarities into dissimilarities; note that _apply_linkage applies
# the same max - x inversion again internally
coassoc = coassoc.max() - coassoc
In [57]:
coassoc
Out[57]:
In [58]:
Z=_apply_linkage(coassoc)
In [48]:
import scipy.cluster.hierarchy as hie
In [64]:
for l in Z:
    print l
# merges whose two members are both previously-formed clusters (linkage ids >= n_samples = 100)
X = [c for c in Z if c[0] >= 100 and c[1] >= 100]
print "argmax\t", np.argmax(Z[:,2])
print Z[np.argmax(Z[:,2])]
In [125]:
plt.figure(figsize=(16,12))
r=hie.dendrogram(Z,leaf_rotation=90,show_contracted=True)
In [131]:
len(r['color_list'])
Out[131]:
In [123]:
dif = Z[1:,2] - Z[0:-1,2]  # lifetimes: gaps between consecutive merge heights
#[maximo,indice]=max(dif);
In [124]:
print "max lifetime: ", dif.max()
print "index: ", dif.argmax()
In [100]:
#%debug
reload(eac)
estimator=eac.EAC(100)
estimator.fit(partition_files,files=True,assoc_mode='prot', prot_mode='random', nprot=5)
In [101]:
estimator._coassoc.sum()
Out[101]:
In [21]:
#%%debug -b eac.py:190
reload(eac)
estimator=eac.EAC(100,data=data)
estimator.fit(partition_files,files=True,assoc_mode='prot', prot_mode='other', nprot=5)
In [112]:
# for each point in a, find the index of the closest point in data
b = np.zeros(a.shape[0])
for c in xrange(a.shape[0]):
    dist = data - a[c]
    dist = dist ** 2
    dist = dist.sum(axis=1)
    b[c] = np.argmin(dist)
b
Out[112]:
In [108]:
# the three centroids used by cell In [112] above (In [108] ran first;
# the cells are displayed out of execution order)
a = array([[-778.1035957 ,  728.38305131],
           [ 474.98214377,  654.43652209],
           [ -62.22709694,  637.21319263]])
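The explicit loop in In [112] can be vectorized; a sketch of an alternative (not the notebook's method) using scipy.spatial.distance.cdist:
# Sketch: the same nearest-data-point lookup, vectorized; one index per
# row of a.
from scipy.spatial.distance import cdist
b_vec = cdist(a, data, 'sqeuclidean').argmin(axis=1)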
In [115]:
estimator.k_labels
Out[115]:
In [88]:
estimator._coassoc.sum()
Out[88]:
In [7]:
datafile=None
for f in files:
    if 'syn_cem' in f and 'data' in f:
        datafile = f
print datafile
In [8]:
data = np.genfromtxt(datafile,delimiter=',')
In [251]:
from sklearn.neighbors import NearestNeighbors
In [252]:
K_neigh=6
# Minkowski distance is a generalization of Euclidean distance and is equivalent to it for p=2
neigh = NearestNeighbors(n_neighbors=K_neigh, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2)
neigh.fit(data)
Out[252]:
In [253]:
data_neighbours=neigh.kneighbors(X=data,return_distance=False)
# the closest neighbour of each point is the point itself, so drop the first column
data_neighbours = data_neighbours[:,1:]
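A quick sanity check (a sketch, not part of the original run) that the dropped first column really is each query point itself:
# Sketch: the first neighbour of each point should be the point itself
# (this can fail if the data contains duplicate points).
first = neigh.kneighbors(data, return_distance=False)[:, 0]
print (first == np.arange(data.shape[0])).all()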
In [254]:
data_neighbours
Out[254]:
In [11]:
#%debug
reload(eac)
estimator=eac.EAC(100,data=data)
estimator.fit(partition_files,files=True,assoc_mode='prot', prot_mode='knn', nprot=5)
In [12]:
estimator._coassoc.sum()
Out[12]:
In [197]:
diss_mat = estimator._coassoc
diss_labels = estimator.k_neighbours
In [13]:
from scipy.sparse import csr_matrix, csc_matrix, lil_matrix, coo_matrix
In [214]:
smat = csr_matrix((100,100))
mat = np.zeros((100,100))
In [212]:
%timeit smat[0,0]=50
%timeit mat[0,0]=50
In [208]:
# use the first two neighbour lists as example row/column index arrays
rows = diss_labels[0]
cols = diss_labels[1]
In [216]:
rows
Out[216]:
In [223]:
%timeit mat[rows[:,np.newaxis],cols]=1
%timeit smat[rows[:,np.newaxis],cols]=1
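The timings reflect the usual scipy.sparse trade-off; a sketch of the common pattern, assuming the rows/cols defined above: build incrementally in LIL format, then convert to CSR for arithmetic.
# Sketch: lil_matrix tolerates incremental writes (csr/csc raise a
# SparseEfficiencyWarning); convert once writing is done.
from scipy.sparse import lil_matrix
lmat = lil_matrix((100, 100))
lmat[rows[:, np.newaxis], cols] = 1
lmat = lmat.tocsr()  # fast arithmetic and row slicing from here on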
In [227]:
smat*2
Out[227]:
In [142]:
def _update_coassoc_knn(assoc_mat, clusters, k_neighbours):
    """
    Updates an NxK co-association matrix.
    k_neighbours is an NxK array where the k-th element of the i-th row is
    the index of the data point that is the k-th nearest neighbour of the
    i-th data point. That neighbour is the k-th prototype of the i-th point.
    """
    nclusters = len(clusters)
    for i in xrange(nclusters):
        if clusters[i].shape[0] > 1:
            # all data points in cluster - rows to select
            n_in_cluster = clusters[i]
            # update row j of matrix
            for j in n_in_cluster:
                # positions (within j's neighbour list) of the prototypes
                # that fall in the same cluster - columns to select
                k_in_cluster = np.where(np.in1d(k_neighbours[j], n_in_cluster))[0]
                assoc_mat[j, k_in_cluster] += 1
def _update_coassoc_knn_sparse(assoc_mat, clusters, k_neighbours):
    """
    Updates an NxN sparse co-association matrix.
    k_neighbours is an NxK array where the k-th element of the i-th row is
    the index of the data point that is the k-th nearest neighbour of the
    i-th data point. That neighbour is the k-th prototype of the i-th point.
    """
    nclusters = len(clusters)
    for i in xrange(nclusters):
        if clusters[i].shape[0] > 1:
            # all data points in cluster - rows to select
            n_in_cluster = clusters[i]
            # update row j of matrix
            for j in n_in_cluster:
                # data-point indices of j's prototypes that fall in the same
                # cluster - columns to select in the NxN sparse matrix
                k_in_cluster = n_in_cluster[np.in1d(n_in_cluster, k_neighbours[j])]
                assoc_mat[j, k_in_cluster] += np.ones_like(k_in_cluster)
In [10]:
def mat_match(sparse, normal, neigh):
    """True if the sparse NxN matrix agrees with the dense NxK matrix,
    mapping columns through the neighbour index table."""
    for row in xrange(sparse.shape[0]):
        cols_in_sparse = neigh[row][normal[row].astype(bool)]
        row_in_sparse = sparse[row, cols_in_sparse]
        row_in_normal = normal[row][normal[row].astype(bool)]
        if (row_in_sparse != row_in_normal).any():
            return False
    return True
In [111]:
def zero_low_tri(sparse):
    """True if the lower triangle of the matrix is all zeros."""
    r, c = sparse.shape
    for i in xrange(r):
        for j in xrange(c):
            if j >= i:
                # skip the diagonal and upper triangle
                continue
            if sparse[i, j] != 0:
                return False
    return True
In [202]:
def func1(smat):
    """
    Check that, where the matrix is not symmetric, at least one of the
    two mirrored values is 0.
    """
    a = smat.todense()
    row, col = np.where(a != a.T)
    row = np.array(row).flatten()
    col = np.array(col).flatten()
    for i in xrange(row.size):
        if a[row[i], col[i]] != 0 and a[col[i], row[i]] != 0:
            return False
    return True
In [227]:
def func2(smat):
    """
    True if the lower triangle is zero wherever the matrix is not symmetric.
    """
    a = smat.todense()
    row, col = np.where(a != a.T)
    row = np.array(row).flatten()
    col = np.array(col).flatten()
    for i in xrange(row.size):
        if row[i] > col[i] and a[row[i], col[i]] != 0:
            return False
    return True
In [226]:
def func3(smat):
    """
    True if the upper triangle is zero wherever the matrix is not symmetric.
    """
    a = smat.todense()
    row, col = np.where(a != a.T)
    row = np.array(row).flatten()
    col = np.array(col).flatten()
    for i in xrange(row.size):
        if row[i] < col[i] and a[row[i], col[i]] != 0:
            return False
    return True
In [251]:
def func3(smat):  # redefined: now moves entries instead of checking them
    """
    Move all nonzero entries to the upper triangle (in place).
    """
    rows, cols = smat.nonzero()
    for i in xrange(rows.size):
        if rows[i] > cols[i] and smat[rows[i], cols[i]] != 0:
            smat[cols[i], rows[i]] = smat[rows[i], cols[i]]
            smat[rows[i], cols[i]] = 0
In [259]:
def func4(smat):
    """
    Make the matrix symmetric (in place) by mirroring the upper triangle.
    """
    rows, cols = smat.nonzero()
    for i in xrange(rows.size):
        if rows[i] < cols[i] and smat[rows[i], cols[i]] != 0:
            smat[cols[i], rows[i]] = smat[rows[i], cols[i]]
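When the nonzeros already sit in the strict upper triangle (what func3 enforces), func4's loop can be replaced by one vectorized step; a sketch assuming scipy.sparse.triu:
# Sketch: mirror the strict upper triangle down in a single operation,
# leaving the diagonal untouched.
from scipy.sparse import triu
sym = smat + triu(smat, k=1).T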
In [157]:
clusters_a = estimator._readPartition(partition_files[0])
neigh_labels = estimator.k_neighbours
#sparse_coassoc = csc_matrix((100,100))
#sparse_coassoc = csr_matrix((100,100))
sparse_coassoc = lil_matrix((100,100))
#sparse_coassoc = coo_matrix((100,100)) # doesn't support slicing
coassoc = np.zeros((100,neigh_labels.shape[1]))
In [158]:
for f in partition_files:
    clusters_a = estimator._readPartition(f)
    _update_coassoc_knn(coassoc, clusters_a, neigh_labels)
    _update_coassoc_knn_sparse(sparse_coassoc, clusters_a, neigh_labels)
mat_match(sparse_coassoc, coassoc, neigh_labels)
Out[158]:
In [29]:
%timeit _update_coassoc_knn(coassoc,clusters_a,neigh_labels)
%timeit _update_coassoc_knn_sparse(sparse_coassoc,clusters_a,neigh_labels)
In [30]:
print "csr/np = {}".format(134/10.4)
print "csc/np = {}".format(131/10.4)
print "lil/np = {}".format(86.8/10.4)
In [31]:
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
def _apply_linkage(assoc_mat, method='single'):
    """
    SciPy linkage wants a distance array in pdist (condensed) format; SciPy
    squareform converts between the square and condensed formats.
    assoc_mat : pair-wise similarity association matrix
    method    : linkage method to use; one of 'single' (default), 'complete',
                'average', 'weighted', 'centroid', 'median', 'ward'
    """
    condensed_assoc = squareform(assoc_mat)
    # convert pair-wise similarities (assoc_mat -> condensed_assoc) to dissimilarities
    condensed_diss_assoc = condensed_assoc.max() - condensed_assoc
    Z = linkage(condensed_diss_assoc, method=method)
    return Z
In [87]:
Y = sparse_coassoc.tocsr()
# invert only the stored entries into dissimilarities (zeros stay zero)
i, j = Y.nonzero()
Y[i, j] = Y.max() - Y[i, j]
In [115]:
import scipy
In [119]:
# toy example: five 2-D points, their pairwise distances, and the
# inverted (max - d) distances
points = np.array([[5,5],[5,6],[1,1],[1,2],[-2,-2]])
dist = scipy.spatial.distance.pdist(points)
invDist = dist.max() - dist
In [125]:
Z_d=linkage(dist,method="single")
Z_id=linkage(invDist,method="complete")
In [126]:
p=scipy.cluster.hierarchy.dendrogram(Z_d)
In [127]:
p=scipy.cluster.hierarchy.dendrogram(Z_id)
They're not equivalent, since linkage always merges the pair of clusters at minimum distance, independently of the chosen metric; inverting the distances therefore changes the merge order itself, not just the scale.
In [138]:
a=np.ones(5)
In [140]:
a[:,np.newaxis]
Out[140]: