In [ ]:
import numpy as np
import seaborn as sb
import pandas as pd
import MyML.cluster.K_Means3 as myKM
import MyML.cluster.eac as eac
import MyML.helper.partition as mpart
import MyML.metrics.accuracy as accuracy
import MyML.cluster.linkage as linkage
In [2]:
import sklearn.datasets
from sklearn.preprocessing import normalize
In [3]:
# Root folder containing all local dataset files.
# NOTE(review): hardcoded absolute home path — breaks on any other machine;
# consider an environment variable or a configurable path instead.
datasets_path = '/home/chiroptera/QCThesis/datasets/'
In [4]:
# Registry mapping dataset name -> {'data': float32 features, 'gt': int32 labels}.
datasets = {}
In [5]:
# Iris: 150 samples, 4 features, 3 classes (bundled with scikit-learn).
iris = sklearn.datasets.load_iris()
# Cast features to float32 for the GPU/numba K-Means used downstream.
data = np.float32(iris.data)
gt = iris.target
datasets['iris'] = {'data': data, 'gt': gt}
In [6]:
# Wine (UCI): 178 samples, 13 features, 3 classes.
# NOTE(review): fetch_mldata relies on the defunct mldata.org mirror and was
# removed from scikit-learn (0.22+); migrate to fetch_openml. TODO confirm the
# replacement exposes the same .data/.target layout.
wine = sklearn.datasets.fetch_mldata('uci-20070111 wine', data_home='~/')
data = wine.data.astype(np.float32)
# NOTE(review): normalizes the raw (float64) features and only then casts, so
# 'wine_norm' is not exactly normalize(data); presumably intentional — verify.
data_norm = normalize(wine.data, axis=0).astype(np.float32)
gt = wine.target.astype(np.int32)
datasets['wine'] = {'data':data, 'gt':gt}
datasets['wine_norm'] = {'data':data_norm, 'gt':gt}
In [7]:
dataname = datasets_path + "ionosphere/ionosphere.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
print dataset.shape
dataset.head()
data = dataset.values[:,:-1].astype(np.float32)
gt = dataset.values[:,-1]
gt[gt=='g'] = 1
gt[gt=='b'] = 0
gt = gt.astype(np.int32)
datasets['ionosphere'] = {'data':data, 'gt':gt}
In [8]:
# Digits bundled with scikit-learn (8x8 images, 1797 samples, 10 classes).
# NOTE(review): this entry is immediately overwritten by the next cell, which
# reloads 'optdigits' from the local .tra file — this cell is redundant.
optdigits = sklearn.datasets.load_digits(n_class=10)
data = np.float32(optdigits.data)
gt = np.int32(optdigits.target)
datasets['optdigits'] = {'data':data, 'gt':gt}
In [9]:
# Optdigits training split (UCI): last column holds the class label.
dataname = datasets_path + 'optdigits/optdigits.tra'
dataset = pd.read_csv(dataname, header=None, sep=",")
# .values replaces the deprecated DataFrame.get_values() (removed in
# pandas 1.0) and matches the style used elsewhere in this notebook;
# extract the array once instead of twice.
raw = dataset.values
data = raw[:,:-1].astype(np.float32)
gt = raw[:,-1].astype(np.int32)
datasets['optdigits'] = {'data':data, 'gt':gt}
In [10]:
# mfeat-fou (UCI "multiple features", Fourier coefficients): 2000 samples,
# 10 classes stored as 10 consecutive blocks of 200 samples each.
dataname = datasets_path + "mfeat/mfeat-fou.asc"
dataset = pd.read_csv(dataname, header=None, sep=" ")
# .values replaces the deprecated get_values() (removed in pandas 1.0).
data = dataset.values.astype(np.float32)
# Labels: rows i*200 .. i*200+199 belong to class i — np.repeat builds the
# same int32 array as the original manual fill loop.
gt = np.repeat(np.arange(10, dtype=np.int32), 200)
datasets['mfeat_fou'] = {'data':data, 'gt':gt}
In [11]:
# Breast cancer (LIBSVM svmlight format): sparse features densified to float32.
dataname = datasets_path + "breast-cancer/breast-cancer"
data, gt = sklearn.datasets.load_svmlight_file(dataname)
gt = gt.astype(np.int32)
data = data.todense().astype(np.float32)
datasets['breast_cancer'] = {'data': data, 'gt': gt}
In [12]:
# Pima Indians diabetes (UCI): 768 samples, 8 features, binary target.
dataname = datasets_path + "pima/pima-indians-diabetes.data"
pima = pd.read_csv(dataname, header=None, sep=",")
# .values replaces the deprecated get_values() (removed in pandas 1.0) and
# avoids materializing the underlying array twice.
raw = pima.values
data = raw[:,:-1].astype(np.float32)
# Per-feature (column-wise) L2 normalization for the '_norm' variant.
data_norm = normalize(data, axis=0).astype(np.float32)
gt = raw[:,-1].astype(np.int32)
datasets['pima'] = {'data':data, 'gt':gt}
datasets['pima_norm'] = {'data':data_norm, 'gt':gt}
In [13]:
# Sanity check: shape of the optdigits feature matrix (displayed as cell output).
datasets['optdigits']['data'].shape
Out[13]:
In [14]:
# ISOLET (UCI, spoken letter recognition): last column is the class label.
dataname = datasets_path + "isolet/isolet1-5.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
# .values replaces the deprecated get_values() (removed in pandas 1.0);
# extract the array once instead of twice. Slicing before the float32 cast
# yields the same features as the original cast-then-slice for numeric data.
raw = dataset.values
data = raw[:,:-1].astype(np.float32)
gt = raw[:,-1].astype(np.int32)
datasets['isolet'] = {'data':data, 'gt':gt}
In [15]:
# ECG-IT: first row appears to be a header and first column an index; both are
# skipped by slicing ([1:, 1:-1]) rather than via read_csv options.
dataname = datasets_path + "ecg_it/ecg_it.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.values[1:,1:-1].astype(np.float32)
gt = dataset.values[1:,-1].astype(np.int32)
#remove unlabeled
# Label value 2 marks unlabeled samples; keep only classes 0 and 1.
labeled_idx = gt!=2
data = data[labeled_idx]
gt = gt[labeled_idx]
In [16]:
# Boolean masks for the two ECG classes (used to balance the sample below).
gt0 = gt == 0
gt1 = gt == 1
In [17]:
# Balance the classes: keep all of class 0 and at most 600 samples of class 1.
data = np.concatenate((data[gt0],data[gt1][:600]))
gt = np.concatenate((gt[gt0],gt[gt1][:600]))
# NOTE(review): the balanced ECG data is never stored in `datasets`
# (datasets['ecg_it'] is never assigned), so it is excluded from the .mat
# export and the benchmark loop — presumably an oversight; confirm.
In [18]:
# MNIST: 70000 handwritten-digit samples, 10 classes.
# NOTE(review): redundant import — sklearn.datasets was already imported near
# the top of the notebook.
import sklearn.datasets
# NOTE(review): fetch_mldata depends on the defunct mldata.org mirror and was
# removed from scikit-learn 0.22+; migrate to fetch_openml('mnist_784').
# TODO confirm the replacement exposes the same .data/.target layout.
mnist = sklearn.datasets.fetch_mldata('MNIST (original)', data_home='~/')
data = mnist.data.astype(np.float32)
gt = mnist.target.astype(np.int32)
datasets['mnist'] = {'data':data, 'gt':gt}
In [19]:
# Inspect which datasets were loaded (displayed as cell output).
datasets.keys()
Out[19]:
In [20]:
from scipy.io import savemat
In [21]:
# Export everything as a single MATLAB .mat file; each top-level dict entry
# becomes a struct with 'data' and 'gt' fields.
savemat(datasets_path + 'dataset.mat', datasets)
In [22]:
import MyML.cluster.K_Means3 as myKM
import MyML.EAC.eac_new as eac
import MyML.helper.partition as part
import MyML.EAC.rules as rules
import MyML.metrics.accuracy as acc
import MyML.utils.profiling as prof
In [23]:
# Leftover convenience binding for interactive testing; the benchmark loop
# below reassigns `data`/`gt` for every dataset anyway.
data = datasets['iris']['data']
gt = datasets['iris']['gt']
In [24]:
# Overview: print sample count / dimensionality next to each dataset's name.
# (Python 2 syntax: dict.iteritems() and the print statement.)
for name, ds in datasets.iteritems():
    print ds['data'].shape, '\t', name
In [26]:
# Results table: one row per dataset; accuracy/cluster counts for both final
# clustering strategies plus the three timing measurements.
result_columns = ['accuracy', 'n_clusts', 'lifetime accuracy',
                  'lifetime n_clusts', 'ensemble time', 'build time',
                  'clustering time']
validation_my_eac = pd.DataFrame(index=datasets.keys(), columns=result_columns)
In [27]:
# Benchmark EAC (Evidence Accumulation Clustering) over every dataset:
# build a 100-partition K-Means ensemble, cluster the co-association matrix
# once with the true number of clusters and once with the lifetime criterion,
# then record Hungarian-matched accuracies and timings in `validation_my_eac`.
t = prof.Timer()
for name, ds in datasets.iteritems():
    #name = 'iris'
    #ds = datasets[name]
    data = ds['data']
    gt = ds['gt']
    # generate ensemble with K-Means
    kmGen = myKM.K_Means()
    kmGen._MAX_THREADS_BLOCK = 256  # NOTE(review): presumably a CUDA block size — confirm in MyML
    kmGen._label_mode = 'numba'     # select the numba labelling backend
    print name, data.dtype
    t.reset()
    t.tic()
    # 100 partitions, 3 K-Means iterations each; k picked by rules.rule1(n_samples).
    ensemble = part.generateEnsemble(data, kmGen, n_clusters=rules.rule1(data.shape[0]), npartitions=100, iters=3)
    ensemble_time = t.tac()
    # eac clustering
    eacEst = eac.EAC(data.shape[0], sparse=False, condensed=True)
    t.reset()
    t.tic()
    eacEst.buildMatrix(ensemble)
    build_time = t.tac()
    t.reset()
    t.tic()
    # Final clustering using the ground-truth number of clusters.
    eacEst.finalClustering(np.unique(gt).size)
    clustering_time = t.tac()
    # eac clustering with lifetime
    eacEst2 = eac.EAC(data.shape[0], sparse=False, condensed=True)
    eacEst2.buildMatrix(ensemble)
    eacEst2.finalClustering()  # no k given: lifetime criterion chooses it
    # score result
    mukresAcc = acc.HungarianIndex(data.shape[0])
    # NOTE(review): `accuracy` shadows the MyML.metrics.accuracy module
    # imported at the top of the notebook; harmless here because later cells
    # re-import it as `acc`, but worth renaming.
    accuracy = mukresAcc.score(gt, eacEst.labels)
    accuracy_lt = mukresAcc.score(gt, eacEst2.labels)
    print '{}: {} - {}'.format(name, accuracy, accuracy_lt)
    validation_my_eac.loc[name] = (accuracy, np.unique(eacEst.labels).size, accuracy_lt, np.unique(eacEst2.labels).size, ensemble_time, build_time, clustering_time)
In [ ]:
# Annotate the results table with each dataset's size and dimensionality.
validation_my_eac['n_samples'] = 0
validation_my_eac['dimension'] = 0
for i in validation_my_eac.index:
    # Use .loc[row, col] instead of the original chained assignment
    # (`validation_my_eac.n_samples.loc[i] = ...`), which writes through an
    # intermediate Series and can silently modify a copy
    # (SettingWithCopyWarning).
    n, d = datasets[i]['data'].shape
    validation_my_eac.loc[i, 'n_samples'] = n
    validation_my_eac.loc[i, 'dimension'] = d
In [ ]:
# Display the results ordered by dataset size.
# sort_values replaces DataFrame.sort(columns=...), which was deprecated in
# pandas 0.17 and removed in 0.20; default ascending order is unchanged.
validation_my_eac.sort_values(by='n_samples')