In [ ]:
import numpy as np
import seaborn as sb
import pandas as pd
import MyML.cluster.K_Means3 as myKM
import MyML.cluster.eac as eac
import MyML.helper.partition as mpart
import MyML.metrics.accuracy as accuracy
import MyML.cluster.linkage as linkage

Prepare datasets


In [2]:
import sklearn.datasets
from sklearn.preprocessing import normalize

In [3]:
datasets_path = '/home/chiroptera/QCThesis/datasets/'

In [4]:
datasets = dict()

iris


In [5]:
iris = sklearn.datasets.load_iris()
data = iris.data.astype(np.float32)
gt = iris.target

datasets['iris'] = {'data':data, 'gt':gt}

wine


In [6]:
wine = sklearn.datasets.fetch_mldata('uci-20070111 wine', data_home='~/')
data = wine.data.astype(np.float32)
data_norm = normalize(wine.data, axis=0).astype(np.float32)
gt = wine.target.astype(np.int32)

datasets['wine'] = {'data':data, 'gt':gt}
datasets['wine_norm'] = {'data':data_norm, 'gt':gt}

ionosphere


In [7]:
dataname = datasets_path + "ionosphere/ionosphere.data"

dataset = pd.read_csv(dataname, header=None, sep=",")
print dataset.shape
dataset.head()

data = dataset.values[:,:-1].astype(np.float32)
gt = dataset.values[:,-1]
gt[gt=='g'] = 1
gt[gt=='b'] = 0
gt = gt.astype(np.int32)

datasets['ionosphere'] = {'data':data, 'gt':gt}


(351, 35)

optdigits


In [8]:
optdigits = sklearn.datasets.load_digits(n_class=10)
data = np.float32(optdigits.data)
gt = np.int32(optdigits.target)

datasets['optdigits'] = {'data':data, 'gt':gt}

In [9]:
dataname = datasets_path + 'optdigits/optdigits.tra'

dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.get_values()[:,:-1]
data = data.astype(np.float32)
gt = dataset.get_values()[:,-1]
gt = gt.astype(np.int32)

datasets['optdigits'] = {'data':data, 'gt':gt}

mfeat-fou


In [10]:
dataname = datasets_path + "mfeat/mfeat-fou.asc"

dataset = pd.read_csv(dataname, header=None, sep="  ")
data = dataset.get_values().astype(np.float32)
gt = np.empty(dataset.shape[0], dtype=np.int32)
for i in range(10):
    gt[i*200 : i*200+200]=i
    
datasets['mfeat_fou'] = {'data':data, 'gt':gt}


/home/chiroptera/anaconda/lib/python2.7/site-packages/pandas/io/parsers.py:648: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators; you can avoid this warning by specifying engine='python'.
  ParserWarning)

breast-cancer


In [11]:
dataname = datasets_path + "breast-cancer/breast-cancer"

data, gt = sklearn.datasets.load_svmlight_file(dataname)
data = data.todense().astype(np.float32)
gt = gt.astype(np.int32)

datasets['breast_cancer'] = {'data':data, 'gt':gt}

pima


In [12]:
dataname = datasets_path + "pima/pima-indians-diabetes.data"

pima = pd.read_csv(dataname, header=None, sep=",")
data = pima.get_values()[:,:-1].astype(np.float32)
data_norm = normalize(data, axis=0).astype(np.float32)
gt = pima.get_values()[:,-1].astype(np.int32)
    
datasets['pima'] = {'data':data, 'gt':gt}
datasets['pima_norm'] = {'data':data_norm, 'gt':gt}

isolet


In [13]:
datasets['optdigits']['data'].shape


Out[13]:
(3823, 64)

In [14]:
dataname = datasets_path + "isolet/isolet1-5.data"

dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.get_values().astype(np.float32)[:,:-1]
gt = dataset.get_values()[:,-1].astype(np.int32)

datasets['isolet'] = {'data':data, 'gt':gt}

ECG IT


In [15]:
dataname = datasets_path + "ecg_it/ecg_it.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.values[1:,1:-1].astype(np.float32)
gt = dataset.values[1:,-1].astype(np.int32)

#remove unlabeled
labeled_idx = gt!=2
data = data[labeled_idx]
gt = gt[labeled_idx]

In [16]:
gt0 = gt==0
gt1 = gt==1

In [17]:
data = np.concatenate((data[gt0],data[gt1][:600]))
gt = np.concatenate((gt[gt0],gt[gt1][:600]))

MNIST


In [18]:
import sklearn.datasets
mnist = sklearn.datasets.fetch_mldata('MNIST (original)', data_home='~/')
data = mnist.data.astype(np.float32)
gt = mnist.target.astype(np.int32)

datasets['mnist'] = {'data':data, 'gt':gt}

In [19]:
datasets.keys()


Out[19]:
['optdigits',
 'iris',
 'breast_cancer',
 'isolet',
 'pima_norm',
 'mfeat_fou',
 'wine_norm',
 'pima',
 'ionosphere',
 'mnist',
 'wine']

In [20]:
from scipy.io import savemat

In [21]:
savemat(datasets_path + 'dataset.mat', datasets)

Process


In [22]:
import MyML.cluster.K_Means3 as myKM
import MyML.EAC.eac_new as eac
import MyML.helper.partition as part
import MyML.EAC.rules as rules
import MyML.metrics.accuracy as acc
import MyML.utils.profiling as prof

In [23]:
data = datasets['iris']['data']
gt = datasets['iris']['gt']

In [24]:
for name, ds in datasets.iteritems():
    print ds['data'].shape, '\t', name


(3823, 64) 	optdigits
(150, 4) 	iris
(683, 10) 	breast_cancer
(7797, 617) 	isolet
(768, 8) 	pima_norm
(2000, 76) 	mfeat_fou
(178, 4) 	wine_norm
(768, 8) 	pima
(351, 34) 	ionosphere
(70000, 784) 	mnist
(178, 4) 	wine

In [26]:
validation_my_eac = pd.DataFrame(index=datasets.keys(), columns=['accuracy', 'n_clusts','lifetime accuracy', 'lifetime n_clusts', 'ensemble time', 'build time', 'clustering time'])

In [27]:
t = prof.Timer()
for name, ds in datasets.iteritems():
#name = 'iris'
#ds = datasets[name]
    data = ds['data']
    gt = ds['gt']

    # generate ensemble with K-Means
    kmGen = myKM.K_Means()
    kmGen._MAX_THREADS_BLOCK = 256
    kmGen._label_mode = 'numba'

    print name, data.dtype
    
    t.reset()
    t.tic()
    ensemble = part.generateEnsemble(data, kmGen, n_clusters=rules.rule1(data.shape[0]), npartitions=100, iters=3)
    ensemble_time = t.tac()
    
    # eac clustering
    eacEst = eac.EAC(data.shape[0], sparse=False, condensed=True)
    
    t.reset()
    t.tic()
    eacEst.buildMatrix(ensemble)
    build_time = t.tac()
    
    t.reset()
    t.tic()    
    eacEst.finalClustering(np.unique(gt).size)
    clustering_time = t.tac()
    
    # eac clustering with lifetime
    eacEst2 = eac.EAC(data.shape[0], sparse=False, condensed=True)
    eacEst2.buildMatrix(ensemble)
    eacEst2.finalClustering()

    # score result
    mukresAcc = acc.HungarianIndex(data.shape[0])
    accuracy =  mukresAcc.score(gt, eacEst.labels)
    accuracy_lt = mukresAcc.score(gt, eacEst2.labels)

    print '{}: {} - {}'.format(name, accuracy, accuracy_lt)
    validation_my_eac.loc[name] = (accuracy, np.unique(eacEst.labels).size, accuracy_lt, np.unique(eacEst2.labels).size, ensemble_time, build_time, clustering_time)


optdigits float32
optdigits: 0.200627779231 - 0.199843055192
iris float32
iris: 0.973333333333 - 0.666666666667
breast_cancer float32
breast_cancer: 0.647144948755 - 0.600292825769
isolet float32
isolet: 0.0592535590612 - 0.0389893548801
pima_norm float32
pima_norm: 0.649739583333 - 0.649739583333
mfeat_fou float32
mfeat_fou: 0.1025 - 0.101
wine_norm float32
wine_norm: 0.780898876404 - 0.522471910112
pima float32
pima: 0.645833333333 - 0.494791666667
ionosphere float32
ionosphere: 0.643874643875 - 0.643874643875
mnist float32
---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-27-ecec791fc5e6> in <module>()
     23     t.reset()
     24     t.tic()
---> 25     eacEst.buildMatrix(ensemble)
     26     build_time = t.tac()
     27 

/home/chiroptera/workspace/QCThesis/MyML/EAC/eac_new.pyc in buildMatrix(self, ensemble)
    128         elif self.full:
    129             coassoc = EAC_FULL(self.n_samples, condensed=self.condensed,
--> 130                                dtype=self.assoc_dtype)
    131             coassoc.update_ensemble(ensemble)
    132             coassoc.get_degree() # get association degree and nnz

/home/chiroptera/workspace/QCThesis/MyML/EAC/full.pyc in __init__(self, n_samples, dtype, condensed, **kwargs)
     23         if self.condensed:
     24             n = sum(xrange(1, n_samples))
---> 25             self.coassoc = np.zeros(n, dtype=dtype)
     26             self.update_partition = self._update_partition_condensed
     27         else:

MemoryError: 

In [ ]:
validation_my_eac['n_samples']=0
validation_my_eac['dimension']=0
for i in validation_my_eac.index:
    validation_my_eac.n_samples.loc[i] = datasets[i]['data'].shape[0]
    validation_my_eac.dimension.loc[i] = datasets[i]['data'].shape[1]

In [ ]:
validation_my_eac.sort(columns='n_samples')