author: Diogo Silva
I'll analyze a small synthetic dataset drawn from a Gaussian mixture with 6 very well separated clusters. An ensemble of 30 partitions is built, with the number of clusters varying from partition to partition.
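For context: MyML.cluster.eac implements evidence accumulation clustering (EAC), which combines the ensemble into a co-association matrix counting how often each pair of samples is placed in the same cluster across partitions. A minimal sketch of that idea, assuming the partitions are in-memory label vectors (the EAC class used below reads them from files instead, and also supports prototype-based variants):

In [ ]:
import numpy as np

def coassoc_from_partitions(partitions, n_samples):
    # For each partition, add one vote for every pair of samples that
    # shares a cluster. `partitions` is a list of integer label vectors.
    coassoc = np.zeros((n_samples, n_samples))
    for labels in partitions:
        for k in np.unique(labels):
            members = np.where(labels == k)[0]
            coassoc[np.ix_(members, members)] += 1
    return coassoc / len(partitions)  # normalized to [0, 1]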
In [1]:
%pylab inline
#%qtconsole
In [9]:
home = %env HOME
In [11]:
cd $home/QCThesis/
In [12]:
from sklearn.cluster import KMeans
import MyML.cluster.eac as eac
reload(eac)
import MyML.cluster.K_Means3 as K_Means3
reload(K_Means3)
import MyML.metrics.accuracy as determine_ci
reload(determine_ci)
In [5]:
def k_analysis(partition_files, ground_truth, nprots, iters="converge", rounds=20):
    """EAC with a prototype co-association matrix, clustered by MyML's K-Means."""
    all_acc = list()
    for r in xrange(rounds):
        # build the (prototype-based) co-association matrix from the partition files
        prot_mode = "random"
        estimator = eac.EAC(nsamples)
        estimator.fit(partition_files, files=True, assoc_mode='prot',
                      prot_mode=prot_mode, nprot=nprots, build_only=True)

        # cluster the co-association matrix with the custom K-Means
        kmeans_mode = "numpy"
        nclusters = np.unique(ground_truth).shape[0]
        grouper = K_Means3.K_Means()
        grouper._centroid_mode = "index"
        grouper.fit(estimator._coassoc, nclusters, iters=iters, mode=kmeans_mode,
                    cuda_mem='manual', tol=1e-4, max_iters=300)

        # score the final partition against the ground truth
        ci = determine_ci.ConsistencyIndex(N=nsamples)
        accuracy = ci.score(ground_truth, grouper.labels_, format='array')
        all_acc.append(accuracy)
    return np.mean(all_acc), np.var(all_acc), np.max(all_acc), np.min(all_acc)

def k_skl_analysis(partition_files, ground_truth, nprots, iters="converge", rounds=20):
    """Same pipeline as k_analysis, but clustering with scikit-learn's KMeans."""
    all_acc = list()
    for r in xrange(rounds):
        prot_mode = "random"
        estimator = eac.EAC(nsamples)
        estimator.fit(partition_files, files=True, assoc_mode='prot',
                      prot_mode=prot_mode, nprot=nprots, build_only=True)

        nclusters = np.unique(ground_truth).shape[0]
        grouper = KMeans(n_clusters=nclusters, n_init=1, init="random")
        grouper.fit(estimator._coassoc)

        ci = determine_ci.ConsistencyIndex(N=nsamples)
        accuracy = ci.score(ground_truth, grouper.labels_, format='array')
        all_acc.append(accuracy)
    return np.mean(all_acc), np.var(all_acc), np.max(all_acc), np.min(all_acc)
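The two helpers above differ only in which K-Means implementation is run on the co-association matrix (MyML's K_Means3 versus scikit-learn's). The nprots argument selects the prototype variant: instead of the full n × n co-association matrix, associations are accumulated only between every sample and nprot randomly chosen prototype samples, giving an n × nprot matrix. A minimal sketch of that reduction, again assuming in-memory label vectors (prototype selection in the real class is controlled by prot_mode):

In [ ]:
import numpy as np

def coassoc_with_prototypes(partitions, n_samples, n_prots, rng=np.random):
    # pick n_prots prototype samples at random
    prots = rng.choice(n_samples, size=n_prots, replace=False)
    coassoc = np.zeros((n_samples, n_prots))
    for labels in partitions:
        # sample i and prototype j co-associate when they share a label
        coassoc += labels[:, None] == labels[prots][None, :]
    return coassoc / len(partitions)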
In [6]:
def stat_my_kmeans(data, nclusters, gtruth, rounds=20):
    """Accuracy statistics of MyML's K-Means applied directly to the data."""
    all_acc = list()
    for r in xrange(rounds):
        iters = "converge"
        kmeans_mode = "numpy"
        grouper = K_Means3.K_Means()
        grouper._centroid_mode = "index"
        grouper.fit(data, nclusters, iters=iters, mode=kmeans_mode,
                    cuda_mem='manual', tol=1e-4, max_iters=300)
        ci = determine_ci.ConsistencyIndex(N=nsamples)
        accuracy = ci.score(gtruth, grouper.labels_, format='array')
        all_acc.append(accuracy)
    return np.mean(all_acc), np.var(all_acc), np.max(all_acc), np.min(all_acc)

def stat_skl_kmeans(data, nclusters, gtruth, rounds=20, init='k-means++'):
    """Accuracy statistics of scikit-learn's KMeans applied directly to the data."""
    all_acc = list()
    for r in xrange(rounds):
        gSKL = KMeans(n_clusters=nclusters, n_init=1, init=init)
        gSKL.fit(data)
        ci = determine_ci.ConsistencyIndex(N=nsamples)
        accuracy = ci.score(gtruth, gSKL.labels_, format='array')
        all_acc.append(accuracy)
    return np.mean(all_acc), np.var(all_acc), np.max(all_acc), np.min(all_acc)
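The ConsistencyIndex score used throughout is an accuracy that first matches predicted cluster labels to ground-truth labels. I don't reproduce MyML's implementation here; a common formulation is accuracy under the best one-to-one label matching, sketched below with SciPy's Hungarian-algorithm solver (this is an assumption about the metric, not MyML's actual code):

In [ ]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def consistency_index(true_labels, pred_labels):
    # contingency[i, j] = samples with true label i and predicted label j
    true_ids = np.unique(true_labels)
    pred_ids = np.unique(pred_labels)
    contingency = np.array([[np.sum((true_labels == t) & (pred_labels == p))
                             for p in pred_ids] for t in true_ids])
    # maximizing matched samples = minimizing the negated counts
    row, col = linear_sum_assignment(-contingency)
    return contingency[row, col].sum() / float(len(true_labels))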
In [7]:
%run generatePartitions.py -d synthetic -n 100 -D 2 -C 6 -i 3 -m numpy -s sanity_cem_10k_ -np 30 -mc 10 -Mc 11 -dir test/
In [8]:
nsamples = 100
prefix = "sanity_cem_10k_"

files = !ls $home/QCThesis/EAC/test
folder = home + "/QCThesis/EAC/test/"
for i, f in enumerate(files):
    files[i] = folder + f

partition_files = [f for f in files if "_partition_" in f and prefix in f]
data_file = home + "/QCThesis/EAC/test/" + prefix + "_data.csv"
ground_truth_file = home + "/QCThesis/EAC/test/" + prefix + "_ground_truth.csv"
In [ ]:
data = np.genfromtxt(data_file, delimiter=',', dtype=np.float32)
ground_truth = np.genfromtxt(ground_truth_file,dtype=np.int32)
In [ ]:
nprots=100
prot_mode="random"
estimator=eac.EAC(nsamples)
estimator.fit(partition_files, files=True, assoc_mode='full', prot_mode=prot_mode, nprot=nprots, build_only=True)
kmeans_mode = "numpy"
iters="converge"
nclusters = 6
grouper = K_Means3.K_Means()
grouper._centroid_mode = "index"
grouper.fit(estimator._coassoc, nclusters, iters=iters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=300)
ci=determine_ci.ConsistencyIndex(N=nsamples)
accuracy=ci.score(ground_truth,grouper.labels_,format='array')
print accuracy
In [ ]:
k_skl_analysis(partition_files,ground_truth,nprots=5,iters="converge",rounds=100)
In [ ]:
nprots=[5,10,20,30,40,50,60,70,80,90,100]
results_k10=list()
for n in nprots:
    r = k_skl_analysis(partition_files, ground_truth, nprots=n, iters="converge", rounds=100)
    results_k10.append(r)
In [ ]:
mean_k10=[res[0] for res in results_k10]
var_k10=[res[1] for res in results_k10]
best_k10=[res[2] for res in results_k10]
worst_k10=[res[3] for res in results_k10]
In [ ]:
plt.plot(mean_k10)
plt.plot(best_k10)
plt.plot(worst_k10)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
In [ ]:
%run generatePartitions.py -d synthetic -n 100 -D 2 -C 6 -i 3 -m numpy -s sanity_cem_25k_ -np 30 -mc 25 -Mc 26 -dir test/
In [ ]:
nsamples=100
prefix="sanity_cem_25k_"
files=!ls $home/QCThesis/EAC/test
folder= home + "/QCThesis/EAC/test/"
for i, f in enumerate(files):
    files[i] = folder + f
partition_files = [f for f in files if "_partition_" in f and prefix in f]
data_file = home + "/QCThesis/EAC/test/" + prefix + "_data.csv"
ground_truth_file = home + "/QCThesis/EAC/test/" + prefix + "_ground_truth.csv"
In [ ]:
data = np.genfromtxt(data_file,delimiter=',',dtype=np.float32)
ground_truth = np.genfromtxt(ground_truth_file,dtype=np.int32)
In [ ]:
nprots=100
prot_mode="random"
estimator=eac.EAC(nsamples)
estimator.fit(partition_files,files=True,assoc_mode='full', prot_mode=prot_mode, nprot=nprots,build_only=True)
kmeans_mode = "numpy"
iters="converge"
nclusters = 6
grouper = K_Means3.K_Means()
grouper._centroid_mode = "index"
grouper.fit(estimator._coassoc, nclusters, iters=iters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=300)
ci=determine_ci.ConsistencyIndex(N=nsamples)
accuracy=ci.score(ground_truth,grouper.labels_,format='array')
print accuracy
In [ ]:
nprots=[5,10,20,30,40,50,60,70,80,90,100]
results_k25=list()
for n in nprots:
    r = k_skl_analysis(partition_files, ground_truth, nprots=n, iters="converge", rounds=100)
    results_k25.append(r)
In [ ]:
mean_k25=[res[0] for res in results_k25]
var_k25=[res[1] for res in results_k25]
best_k25=[res[2] for res in results_k25]
worst_k25=[res[3] for res in results_k25]
In [ ]:
plt.plot(mean_k25)
plt.plot(best_k25)
plt.plot(worst_k25)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
In [ ]:
k_analysis(partition_files,ground_truth,10,iters="converge",rounds=100)
In [ ]:
iters="converge"
kmeans_mode="numpy"
nclusters=6
grouper = K_Means3.K_Means()
grouper._centroid_mode = "index"
grouper.fit(data, nclusters, iters=iters, mode=kmeans_mode, cuda_mem='manual',tol=1e-4,max_iters=300)
ci=determine_ci.ConsistencyIndex(N=nsamples)
accuracy=ci.score(ground_truth,grouper.labels_,format='array')
print "iters: ",grouper.iters_
print "accuracy: ",accuracy
In [ ]:
stat_my_kmeans(data,6,ground_truth,rounds=100)
In [ ]:
stat_skl_kmeans(data,6,ground_truth,rounds=100,init="random")
In [ ]:
gSKL = KMeans(n_clusters=6,n_init=1)
gSKL.fit(data)
ci=determine_ci.ConsistencyIndex(N=nsamples)
accuracy=ci.score(ground_truth,gSKL.labels_,format='array')
print "accuracy: ",accuracy
In [ ]:
gSKL = KMeans(n_clusters=6,n_init=1)
gSKL.fit(estimator._coassoc)
ci=determine_ci.ConsistencyIndex(N=nsamples)
accuracy=ci.score(ground_truth,gSKL.labels_,format='array')
print "accuracy: ",accuracy
In [ ]:
plt.plot(data[:,0],data[:,1],'.')
In [ ]:
for c in grouper.partition:
    plt.plot(data[c, 0], data[c, 1], '.')
In [ ]:
from sklearn.datasets import make_blobs # generate gaussian mixture
nsamples=100
ndims=2
centers=6
data, ground_truth = make_blobs(n_samples=nsamples,n_features=ndims,centers=centers)
plt.plot(data[:,0],data[:,1],'.')
In [ ]:
filename_base = home + "/QCThesis/EAC/test/" + "close_data"
np.savetxt(filename_base + "_ground_truth.csv", ground_truth, delimiter=',')
np.savetxt(filename_base + "_data.csv", data, delimiter=',')
In [ ]:
%run generatePartitions.py -d test/close_data_data.csv -n 100 -D 2 -C 6 -i 3 -m numpy -s sanity_close_data_10k_ -np 30 -mc 10 -Mc 11 -dir test/
In [ ]:
nsamples=100
prefix="sanity_close_data_10k_"
files=!ls $home/QCThesis/EAC/test
folder= home + "/QCThesis/EAC/test/"
for i, f in enumerate(files):
    files[i] = folder + f
partition_files = [f for f in files if "_partition_" in f and prefix in f]
data_file = home + "/QCThesis/EAC/test/" + prefix + "_data.csv"
ground_truth_file = home + "/QCThesis/EAC/test/" + prefix + "_ground_truth.csv"
In [ ]:
nprots=[5,10,20,30,40,50,60,70,80,90,100]
results_close_k10=list()
for n in nprots:
    r = k_skl_analysis(partition_files, ground_truth, nprots=n, iters="converge", rounds=100)
    results_close_k10.append(r)
mean_close_k10=[res[0] for res in results_close_k10]
var_close_k10=[res[1] for res in results_close_k10]
best_close_k10=[res[2] for res in results_close_k10]
worst_close_k10=[res[3] for res in results_close_k10]
In [ ]:
plt.plot(mean_close_k10)
plt.plot(best_close_k10)
plt.plot(worst_close_k10)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
In [ ]:
%run generatePartitions.py -d test/close_data_data.csv -n 100 -D 2 -C 6 -i 3 -m numpy -s sanity_close_data_25k_ -np 30 -mc 25 -Mc 26 -dir test/
In [ ]:
nsamples=100
prefix="sanity_close_data_25k_"
files=!ls $home/QCThesis/EAC/test
folder= home + "/QCThesis/EAC/test/"
for i, f in enumerate(files):
    files[i] = folder + f
partition_files = [f for f in files if "_partition_" in f and prefix in f]
data_file = home + "/QCThesis/EAC/test/" + prefix + "_data.csv"
ground_truth_file = home + "/QCThesis/EAC/test/" + prefix + "_ground_truth.csv"
In [ ]:
nprots=[5,10,20,30,40,50,60,70,80,90,100]
results_close_k25=list()
for n in nprots:
    r = k_skl_analysis(partition_files, ground_truth, nprots=n, iters="converge", rounds=100)
    results_close_k25.append(r)
mean_close_k25=[res[0] for res in results_close_k25]
var_close_k25=[res[1] for res in results_close_k25]
best_close_k25=[res[2] for res in results_close_k25]
worst_close_k25=[res[3] for res in results_close_k25]
In [ ]:
plt.plot(mean_close_k25)
plt.plot(best_close_k25)
plt.plot(worst_close_k25)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
In [ ]:
mean, var, best, worst = stat_skl_kmeans(data, 6, ground_truth, rounds=100, init="random")
print "mean:\t{}\nvar:\t{}\nbest:\t{}\nworst:\t{}".format(mean, var, best, worst)
In [ ]:
plt.plot(mean_k10)
plt.plot(best_k10)
plt.plot(worst_k10)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
plt.xticks(range(len(nprots)),nprots)
plt.xlabel("# prototypes")
plt.ylabel("accuracy")
In [ ]:
plt.plot(mean_k25)
plt.plot(best_k25)
plt.plot(worst_k25)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
plt.xticks(range(len(nprots)),nprots)
plt.xlabel("# prototypes")
plt.ylabel("accuracy")
In [ ]:
plt.plot(mean_close_k10)
plt.plot(best_close_k10)
plt.plot(worst_close_k10)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
plt.xticks(range(len(nprots)),nprots)
plt.xlabel("# prototypes")
plt.ylabel("accuracy")
In [ ]:
plt.plot(mean_close_k25)
plt.plot(best_close_k25)
plt.plot(worst_close_k25)
plt.plot([0, 10], [0.5, 0.5], 'k-', lw=1)  # 0.5 accuracy reference line
plt.xticks(range(len(nprots)),nprots)
plt.xlabel("# prototypes")
plt.ylabel("accuracy")