In [1]:
import numpy as np
import seaborn as sb
import pandas as pd
import MyML.cluster.K_Means3 as myKM
import MyML.cluster.eac as eac
import MyML.helper.partition as mpart
import MyML.metrics.accuracy as acc
import MyML.cluster.linkage as linkage
In [2]:
import sklearn.datasets
from sklearn.preprocessing import normalize
In [3]:
# Root directory that the file-based dataset cells prepend to their relative file names.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
datasets_path = '/home/chiroptera/QCThesis/datasets/'
In [4]:
datasets = dict()
In [5]:
# Iris from scikit-learn: features are cast to float32, the target vector is
# stored as returned by the loader (no dtype cast, unlike the other datasets).
iris = sklearn.datasets.load_iris()
data = iris.data.astype(np.float32)
gt = iris.target
datasets['iris'] = {'data':data, 'gt':gt}
In [6]:
# Wine dataset fetched through fetch_mldata, using '~/' as the data_home directory.
# Two entries are registered: the raw features and a normalized copy.
wine = sklearn.datasets.fetch_mldata('uci-20070111 wine', data_home='~/')
data = wine.data.astype(np.float32)
# Normalization is applied along axis=0 to the original wine.data (before the
# float32 cast) and only then converted to float32.
data_norm = normalize(wine.data, axis=0).astype(np.float32)
gt = wine.target.astype(np.int32)
# Both entries share the same ground-truth array object.
datasets['wine'] = {'data':data, 'gt':gt}
datasets['wine_norm'] = {'data':data_norm, 'gt':gt}
In [7]:
# Ionosphere dataset read from a header-less CSV; the last column holds the class.
dataname = datasets_path + "ionosphere/ionosphere.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
print dataset.shape
# NOTE(review): this expression is not the last statement of the cell, so its
# value is not displayed by the notebook.
dataset.head()
# All columns except the last are features.
data = dataset.values[:,:-1].astype(np.float32)
gt = dataset.values[:,-1]
# Encode the class column in place: 'g' -> 1, 'b' -> 0, then cast to int32.
gt[gt=='g'] = 1
gt[gt=='b'] = 0
gt = gt.astype(np.int32)
datasets['ionosphere'] = {'data':data, 'gt':gt}
In [8]:
# Digits dataset bundled with scikit-learn (all 10 classes), as float32 / int32.
# NOTE(review): the 'optdigits' key written here is assigned again by the cell
# that reads optdigits/optdigits.tra, so this entry does not survive.
optdigits = sklearn.datasets.load_digits(n_class=10)
data = np.float32(optdigits.data)
gt = np.int32(optdigits.target)
datasets['optdigits'] = {'data':data, 'gt':gt}
In [9]:
# Optdigits read from the header-less CSV file optdigits.tra: every column but
# the last is a feature, the last column is the class label.
# This assignment replaces any 'optdigits' entry already present in `datasets`.
dataname = datasets_path + 'optdigits/optdigits.tra'
dataset = pd.read_csv(dataname, header=None, sep=",")
# `.values` is used instead of the deprecated `get_values()`, matching the
# ionosphere and ecg_it cells.
data = dataset.values[:, :-1]
data = data.astype(np.float32)
gt = dataset.values[:, -1]
gt = gt.astype(np.int32)
datasets['optdigits'] = {'data':data, 'gt':gt}
In [10]:
# mfeat-fou features read from a space-separated, header-less file.
dataname = datasets_path + "mfeat/mfeat-fou.asc"
dataset = pd.read_csv(dataname, header=None, sep=" ")
# `.values` is used instead of the deprecated `get_values()`, matching the
# ionosphere and ecg_it cells.
data = dataset.values.astype(np.float32)
# The file carries no label column: labels are synthesized as 10 consecutive
# runs of 200 rows (rows 0-199 -> 0, 200-399 -> 1, ...).
# NOTE(review): assumes the file has exactly 2000 rows ordered by class; any
# extra rows would keep the uninitialized values of np.empty - confirm.
gt = np.empty(dataset.shape[0], dtype=np.int32)
for i in range(10):
    gt[i*200 : i*200+200] = i
datasets['mfeat_fou'] = {'data':data, 'gt':gt}
In [11]:
# Breast-cancer dataset stored in svmlight format; the loader returns a sparse
# feature matrix and a label vector.
dataname = datasets_path + "breast-cancer/breast-cancer"
data, gt = sklearn.datasets.load_svmlight_file(dataname)
# toarray() gives a plain ndarray like every other dataset in `datasets`;
# todense() would produce an np.matrix, whose indexing and arithmetic differ.
data = data.toarray().astype(np.float32)
gt = gt.astype(np.int32)
datasets['breast_cancer'] = {'data':data, 'gt':gt}
In [12]:
# Pima Indians diabetes dataset from a header-less CSV; the last column is the
# class label. Registered twice: raw features and an axis=0 normalized copy.
dataname = datasets_path + "pima/pima-indians-diabetes.data"
pima = pd.read_csv(dataname, header=None, sep=",")
# `.values` is used instead of the deprecated `get_values()`, matching the
# ionosphere and ecg_it cells.
data = pima.values[:, :-1].astype(np.float32)
data_norm = normalize(data, axis=0).astype(np.float32)
gt = pima.values[:, -1].astype(np.int32)
# Both entries share the same ground-truth array object.
datasets['pima'] = {'data':data, 'gt':gt}
datasets['pima_norm'] = {'data':data_norm, 'gt':gt}
In [13]:
datasets['optdigits']['data'].shape
Out[13]:
In [14]:
# ISOLET dataset from a header-less CSV; the last column is the class label.
dataname = datasets_path + "isolet/isolet1-5.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
# `.values` is used instead of the deprecated `get_values()`, matching the
# ionosphere and ecg_it cells. The whole table is cast to float32 first and the
# label column is dropped afterwards, as before.
data = dataset.values.astype(np.float32)[:, :-1]
gt = dataset.values[:, -1].astype(np.int32)
datasets['isolet'] = {'data':data, 'gt':gt}
In [15]:
# ECG dataset read from CSV. The slices drop the first row and the first
# column; the last column is used as the label.
# NOTE(review): presumably the first row is a header and the first column an
# id/index - confirm against the file.
dataname = datasets_path + "ecg_it/ecg_it.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.values[1:,1:-1].astype(np.float32)
gt = dataset.values[1:,-1].astype(np.int32)
# Rows whose label equals 2 are treated as unlabeled and removed from both arrays.
#remove unlabeled
labeled_idx = gt!=2
data = data[labeled_idx]
gt = gt[labeled_idx]
In [16]:
# Boolean masks selecting the rows labeled 0 and the rows labeled 1.
gt0 = gt==0
gt1 = gt==1
In [17]:
# Keep every class-0 row but only the first 600 class-1 rows, with class 0 first.
# NOTE(review): the resulting data/gt are not stored in `datasets` by any
# visible cell, and the next cell reassigns both names - confirm whether an
# entry for this dataset was intended.
data = np.concatenate((data[gt0],data[gt1][:600]))
gt = np.concatenate((gt[gt0],gt[gt1][:600]))
In [18]:
# MNIST fetched through fetch_mldata, using '~/' as the data_home directory,
# stored as float32 features and int32 labels.
# The import below repeats one already made in an earlier cell.
import sklearn.datasets
mnist = sklearn.datasets.fetch_mldata('MNIST (original)', data_home='~/')
data = mnist.data.astype(np.float32)
gt = mnist.target.astype(np.int32)
datasets['mnist'] = {'data':data, 'gt':gt}
In [19]:
datasets.keys()
Out[19]:
In [20]:
from scipy.io import savemat
In [21]:
# Export every entry currently in `datasets` to a single MATLAB file
# 'dataset.mat' under datasets_path; each key of the dict becomes a variable.
savemat(datasets_path + 'dataset.mat', datasets)
In [22]:
import MyML.cluster.K_Means3 as myKM
import MyML.EAC.eac_new as eac
import MyML.helper.partition as part
import MyML.EAC.rules as rules
import MyML.metrics.accuracy as acc
import MyML.utils.profiling as prof
In [23]:
# List the shape of each dataset's feature matrix followed by its name.
for name, ds in datasets.iteritems():
print ds['data'].shape, '\t', name
In [24]:
# Remove MNIST from the dict so the loops that iterate over `datasets` below
# do not process it. Re-running this cell without re-adding the key raises KeyError.
del datasets['mnist']
In [25]:
# Empty results table, one row per dataset, filled by the benchmark loop that
# builds its own K-Means ensemble. The tuple assigned per row in that loop
# follows this column order.
validation_my_eac = pd.DataFrame(index=datasets.keys(), columns=['accuracy', 'n_clusts','lifetime accuracy', 'lifetime n_clusts', 'ensemble time', 'build time', 'clustering time'])
In [42]:
# Benchmark loop: for every dataset, generate a K-Means ensemble, run EAC twice
# (once with the number of clusters taken from the ground truth, once with no
# argument) and store accuracies, cluster counts and timings in validation_my_eac.
# NOTE(review): indentation was lost in this export; everything after the `for`
# line is presumably the loop body.
t = prof.Timer()
for name, ds in datasets.iteritems():
# Uncomment the two lines below to run a single dataset instead of the loop.
#name = 'iris'
#ds = datasets[name]
# Trailing comma keeps the cursor on the same line for the accuracy print below.
print '{}:'.format(name),
data = ds['data']
gt = ds['gt']
# generate ensemble with K-Means
kmGen = myKM.K_Means()
kmGen._MAX_THREADS_BLOCK = 256
kmGen._label_mode = 'numba'
# Timed section 1: ensemble generation (100 partitions, 2 iterations each,
# number of clusters chosen by rules.rule1 from the number of samples).
t.reset()
t.tic()
ensemble = part.generateEnsemble(data, kmGen, n_clusters=rules.rule1(data.shape[0]), npartitions=100, iters=2)
ensemble_time = t.tac()
# eac clustering
eacEst = eac.EAC(data.shape[0], sparse=False, condensed=True)
# Timed section 2: building the matrix from the ensemble.
t.reset()
t.tic()
eacEst.buildMatrix(ensemble)
build_time = t.tac()
# Timed section 3: final clustering, asking for as many clusters as there are
# distinct ground-truth labels.
t.reset()
t.tic()
eacEst.finalClustering(np.unique(gt).size)
clustering_time = t.tac()
# eac clustering with lifetime
eacEst2 = eac.EAC(data.shape[0], sparse=False, condensed=True)
eacEst2.buildMatrix(ensemble)
# NOTE(review): the timer is not reset here and the value returned by t.tac()
# is discarded, so this run does not contribute to any recorded time.
t.tic()
eacEst2.finalClustering()
t.tac()
# score result
mukresAcc = acc.HungarianIndex(data.shape[0])
accuracy = mukresAcc.score(gt, eacEst.labels)
accuracy_lt = mukresAcc.score(gt, eacEst2.labels)
print '{} - {}'.format(accuracy, accuracy_lt)
# Tuple order must match the column order of validation_my_eac.
validation_my_eac.loc[name] = (accuracy, np.unique(eacEst.labels).size, accuracy_lt, np.unique(eacEst2.labels).size, ensemble_time, build_time, clustering_time)
In [43]:
def load_matlab_ensemble(directory, dataset):
    """Load every MATLAB-generated K-Means partition of `dataset` found in `directory`.

    Files are matched with the pattern ``kmeans-<dataset>-*.mat`` and each one
    must contain a ``clusters_m`` matrix. Every row of that matrix becomes one
    cluster: zero entries are dropped and the remaining values are decremented
    by one (presumably converting MATLAB 1-based, zero-padded sample indices to
    0-based indices - confirm against the toolbox output).

    Parameters
    ----------
    directory : str
        Directory containing the ``.mat`` files (made absolute internally).
    dataset : str
        Dataset name embedded in the file names.

    Returns
    -------
    list of list of numpy.ndarray
        One inner list per file (in sorted file-name order), holding one int32
        index array per row of ``clusters_m``. Empty list when no file matches.
    """
    import os.path
    import scipy.io
    import glob

    ensemble_directory = os.path.abspath(directory) + os.path.sep
    pattern = ensemble_directory + 'kmeans-{}-*.mat'.format(dataset)

    ensemble = list()
    # glob returns matches in arbitrary order; sort so the ensemble order is
    # reproducible between runs and machines.
    for p_file in sorted(glob.glob(pattern)):
        # Named `clusters_m` rather than `part` to avoid shadowing the
        # notebook-level `part` module alias inside this function.
        clusters_m = scipy.io.loadmat(p_file)['clusters_m']
        py_part = list()
        for row in clusters_m:
            cluster = row[row != 0] - 1
            py_part.append(cluster.astype(np.int32))
        ensemble.append(py_part)
    return ensemble
In [33]:
# Empty results table, one row per dataset, filled by the benchmark loop that
# reuses the ensembles produced in MATLAB. It has no 'ensemble time' column
# because no ensemble is generated in that loop.
validation_ml_ensemble = pd.DataFrame(index=datasets.keys(), columns=['accuracy', 'n_clusts','lifetime accuracy', 'lifetime n_clusts', 'build time', 'clustering time'])
In [44]:
# Benchmark loop on the MATLAB-generated ensembles: for every dataset, load the
# stored partitions, run EAC twice (number of clusters from the ground truth,
# then no argument) and store accuracies, cluster counts and timings in
# validation_ml_ensemble.
# NOTE(review): indentation was lost in this export; everything after the `for`
# line is presumably the loop body.
t = prof.Timer()
ensemble_directory = '/home/chiroptera/workspace/QCThesis/EAC_toolbox/'
for name, ds in datasets.iteritems():
# Uncomment the two lines below to run a single dataset instead of the loop.
#name = 'iris'
#ds = datasets[name]
data = ds['data']
gt = ds['gt']
ensemble = load_matlab_ensemble(ensemble_directory, name)
# eac clustering
eacEst = eac.EAC(data.shape[0], sparse=False, condensed=True)
# Timed section 1: building the matrix from the ensemble.
t.reset()
t.tic()
eacEst.buildMatrix(ensemble)
build_time = t.tac()
# Timed section 2: final clustering, asking for as many clusters as there are
# distinct ground-truth labels.
t.reset()
t.tic()
eacEst.finalClustering(np.unique(gt).size)
clustering_time = t.tac()
# eac clustering with lifetime
eacEst2 = eac.EAC(data.shape[0], sparse=False, condensed=True)
eacEst2.buildMatrix(ensemble)
# NOTE(review): the timer is not reset here and the value returned by t.tac()
# is discarded, so this run does not contribute to any recorded time.
t.tic() # accumulate from with clustering
eacEst2.finalClustering()
t.tac()
# score result
mukresAcc = acc.HungarianIndex(data.shape[0])
accuracy = mukresAcc.score(gt, eacEst.labels)
accuracy_lt = mukresAcc.score(gt, eacEst2.labels)
print '{}: {} - {}'.format(name, accuracy, accuracy_lt)
# Tuple order must match the column order of validation_ml_ensemble.
validation_ml_ensemble.loc[name] = (accuracy, np.unique(eacEst.labels).size, accuracy_lt, np.unique(eacEst2.labels).size, build_time, clustering_time)
In [35]:
# Reference results pasted as text, parsed into ml_df, indexed by dataset name,
# with columns renamed to match the names used by validation_my_eac and
# validation_ml_ensemble so the frames can be compared column by column.
# NOTE(review): presumably these are the MATLAB toolbox results - confirm.
# NOTE(review): read_table is called without `sep` and the rename keys are
# multi-word column names (e.g. 'accuracy fixed'), so the literal presumably has
# to be tab-separated - confirm the tabs are still present in the real notebook.
ml_validation = """dataset accuracy lifetime n_c lifetime accuracy fixed n_c fixed ensemble time build time clustering time
iris 0.66667 2 0.74667 3 0.71809 0.037135 0.084747
wine 0.46629 4 0.42135 3 0.89522 0.038203 0.088121
wine_norm 0.52247 2 0.52809 3 0.88707 0.040153 0.088825
breast_cancer 0.60029 3 0.64714 2 5.2023 0.082844 0.26207
ionosphere 0.54701 9 0.65242 2 2.5557 0.066384 0.15149
pima 0.64583 2 0.64583 2 6.0796 1.8383 0.28582
pima_norm 0.64974 2 0.64974 2 6.1221 1.757 0.29261
mfeat_fou 0.102 6 0.2975 10 36.4247 1.3105 2.4198
optdigits 0.19958 2 0.20037 10 85.8691 2.0843 12.2125
isolet 0.038605 2 0.09773 26 1064.6 4.1708 161.2619
"""
# Python 2 StringIO module: wraps the text so pandas can read it like a file.
import StringIO
ml_buffer=StringIO.StringIO(buf=ml_validation)
ml_df = pd.read_table(ml_buffer)
ml_df = ml_df.set_index('dataset')
# Rename in place to the column names shared with the other result tables.
ml_df.rename(columns={'accuracy fixed':'accuracy',
'accuracy lifetime':'lifetime accuracy',
'n_c fixed':'n_clusts',
'n_c lifetime':'lifetime n_clusts'},inplace=True)
In [122]:
# Difference between the reference accuracies in ml_df and the accuracies
# obtained here from the same MATLAB ensembles; the cell displays the absolute
# values.
# sort_index() replaces the argument-less DataFrame.sort(), which sorted by
# index and no longer exists in current pandas; the speed-up cells already use
# sort_index().
print("error of accuracies")
accuracy_error = ml_df.sort_index()[['accuracy','lifetime accuracy']] - validation_ml_ensemble.sort_index()[['accuracy','lifetime accuracy']]
accuracy_error.apply(np.abs)
Out[122]:
In [123]:
# Emit the absolute accuracy differences as a LaTeX table (index names omitted).
print accuracy_error.apply(np.abs).to_latex(index_names=False)
In [47]:
# Element-wise check that the cluster counts in ml_df equal the ones obtained
# here from the same MATLAB ensembles.
# Both frames are sorted by index first because `==` between DataFrames requires
# identically ordered labels; sort_index() replaces the removed DataFrame.sort().
print("Number of clusters equal:")
ml_df.sort_index()[['n_clusts','lifetime n_clusts']] == validation_ml_ensemble.sort_index()[['n_clusts','lifetime n_clusts']]
Out[47]:
In [53]:
# Ratio of the times in ml_df to the times measured here on the same MATLAB
# ensembles, for the matrix build and final clustering steps. Values above 1
# mean the time measured here is smaller.
print 'speed-up from same ensemble:'
speed_ups_ml = ml_df.sort_index()[['build time','clustering time']] / validation_ml_ensemble.sort_index()[['build time','clustering time']]
speed_ups_ml
Out[53]:
In [54]:
# Emit the same-ensemble speed-up table as LaTeX (index names omitted).
print speed_ups_ml.to_latex(index_names=False)
In [131]:
# Ratio of the times in ml_df to the times of the full pipeline measured here
# (ensemble generation, matrix build, final clustering), annotated with each
# dataset's size.
# sort_index() is now used on both operands; the original mixed it with the
# removed argument-less DataFrame.sort().
print("speed-up from my ensemble:")
speed_ups_all = ml_df.sort_index()[['ensemble time','build time','clustering time']] / validation_my_eac.sort_index()[['ensemble time','build time','clustering time']]
# Create the size columns first, then fill them row by row from the datasets dict.
speed_ups_all['No. patterns'] = 0
speed_ups_all['No. features'] = 0
for i in speed_ups_all.index:
    speed_ups_all.loc[i,'No. patterns'] = datasets[i]['data'].shape[0]
    speed_ups_all.loc[i,'No. features'] = datasets[i]['data'].shape[1]
# Aligned on the index: number of clusters found in the fixed-k run.
speed_ups_all['No. classes'] = validation_my_eac['n_clusts']
speed_ups_all[['No. patterns','No. features','No. classes','ensemble time','build time','clustering time']]
Out[131]:
In [132]:
# Emit the full-pipeline speed-up table as LaTeX (index names omitted).
print speed_ups_all[['No. patterns','No. features','No. classes','ensemble time','build time','clustering time']].to_latex(index_names=False)
In [40]:
# Print the accuracy / cluster-count columns of the three result tables one
# after the other, each sorted by dataset name so rows line up.
# sort_index() replaces the removed argument-less DataFrame.sort().
print(validation_ml_ensemble.sort_index()[['accuracy','n_clusts','lifetime accuracy','lifetime n_clusts']])
print(ml_df.sort_index()[['accuracy','n_clusts','lifetime accuracy','lifetime n_clusts']])
print(validation_my_eac.sort_index()[['accuracy','n_clusts','lifetime accuracy','lifetime n_clusts']])
In [128]:
In [127]:
validation_my_eac['n_clusts']
Out[127]:
In [149]:
# Annotate each row of validation_my_eac with its dataset's number of samples
# and number of features.
validation_my_eac['n_samples'] = 0
validation_my_eac['dimension'] = 0
for i in validation_my_eac.index:
    # Single .loc[row, column] assignment on the frame itself. The original
    # chained form (frame.column.loc[row] = value) assigns through an
    # intermediate Series, which pandas does not guarantee to write back.
    validation_my_eac.loc[i, 'n_samples'] = datasets[i]['data'].shape[0]
    validation_my_eac.loc[i, 'dimension'] = datasets[i]['data'].shape[1]
validation_my_eac
In [38]:
import os.path
In [204]:
ensemble_directory = '/home/chiroptera/workspace/QCThesis/EAC_toolbox/'
In [210]:
e_iris = load_matlab_ensemble(ensemble_directory, 'iris')
In [39]:
ensemble_directory = '/home/chiroptera/workspace/QCThesis/EAC_toolbox/'
def load_matlab_ensemble(directory, dataset):
    """Read the ``kmeans-<dataset>-*.mat`` partitions stored in `directory`.

    Each file's ``clusters_m`` matrix yields one partition: per row, zeros are
    removed and the remaining entries are shifted down by one and cast to int32.
    Returns a list with one partition (list of arrays) per matching file.
    """
    import os.path
    import scipy.io
    import glob
    prefix = os.path.abspath(directory) + os.path.sep
    pattern = prefix + 'kmeans-{}-*.mat'.format(dataset)
    partitions = []
    for mat_path in glob.glob(pattern):
        rows = scipy.io.loadmat(mat_path)['clusters_m']
        # One int32 array per row of the matrix.
        partitions.append([(row[row != 0] - 1).astype(np.int32) for row in rows])
    return partitions
In [129]:
directory = '/home/chiroptera/workspace/QCThesis/EAC_toolbox/eac'
def labels_path(dataset, lifetime=True):
    """Build the file name of the stored label matrix for `dataset`.

    When `lifetime` is true the name carries the extra 'k-fixo-' token;
    otherwise only a single '-' separates 'single' from 'Stable'.
    """
    infix = '-k-fixo-' if lifetime else '-'
    return '{}-eac-kmeans-single{}Stable-combined.mat'.format(dataset, infix)
In [118]:
accuracy_from_matlab('/home/chiroptera/QCThesis/EAC_toolbox/eac','mfeat_fou')
Out[118]:
In [67]:
def _labels_from_matlab(mat_path):
    """Turn the ``clusters_m`` matrix stored in `mat_path` into a flat label vector.

    The number of samples is taken as the largest entry of the matrix. Entries
    are then shifted down by one; entries that become -1 are skipped and every
    remaining entry in row ``l`` marks a sample that receives label ``l``.
    """
    import scipy.io
    labels_mat = scipy.io.loadmat(mat_path)['clusters_m'].astype(np.int32)
    # Must be read before the shift below: the maximum of the unshifted matrix
    # is used as the number of samples.
    n_samples = labels_mat.max()
    labels = np.empty(n_samples, dtype=np.int32)
    labels_mat -= 1
    for l, c in enumerate(labels_mat):
        idx = c[c != -1]
        labels[idx] = l
    return labels


def accuracy_from_matlab(directory, dataset):
    """Score the label files stored for `dataset` against its ground truth.

    Two files located in `directory` are read, one per value of the `lifetime`
    flag of the nested `labels_path` helper, and each labeling is scored with
    acc.HungarianIndex against ``datasets[dataset]['gt']``.

    Parameters
    ----------
    directory : str
        Directory containing the ``*-Stable-combined.mat`` files.
    dataset : str
        Key of the dataset in the notebook-level `datasets` dict; also the
        prefix of the file names.

    Returns
    -------
    tuple
        ``(accuracy_fixed, accuracy_lt)``: score of the file selected with
        ``lifetime=False`` followed by the score of the file selected with
        ``lifetime=True``.
    """
    import os.path

    def labels_path(dataset, lifetime=True):
        # NOTE(review): lifetime=True selects the file whose name contains
        # 'k-fixo', while the value returned as accuracy_fixed comes from the
        # file without it - confirm that the flag is not inverted with respect
        # to the toolbox's file naming.
        labels_type = '-'
        if lifetime:
            labels_type += 'k-fixo-'
        return '{}-eac-kmeans-single{}Stable-combined.mat'.format(dataset, labels_type)

    # File selected with lifetime=False -> reported as the fixed accuracy.
    labels = _labels_from_matlab(os.path.join(directory, labels_path(dataset, False)))
    gt = datasets[dataset]['gt']
    mukresAcc = acc.HungarianIndex(labels.size)
    accuracy_fixed = mukresAcc.score(gt, labels)

    # File selected with lifetime=True -> reported as the lifetime accuracy.
    labels = _labels_from_matlab(os.path.join(directory, labels_path(dataset, True)))
    mukresAcc = acc.HungarianIndex(labels.size)
    accuracy_lt = mukresAcc.score(gt, labels)

    return accuracy_fixed, accuracy_lt
In [148]:
# accuracy_from_matlab takes (directory, dataset); calling it with only the
# dataset name raises TypeError. `directory` is the notebook-level path defined
# in the cell that also defines labels_path.
accuracy_from_matlab(directory, 'iris')
Out[148]: