notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import seaborn as sns
import cPickle as pickle
import codecs
import skfuzzy as fuzz 
import time

from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.cluster.bicluster import SpectralCoclustering
from biclustering.biclustering import DeltaBiclustering
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score



In [2]:

    
%matplotlib inline
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})



In [3]:

    
arena_news_df = pd.read_pickle('arena_news_df.pkl')
sport_news_df = pd.read_pickle('sport_news_df.pkl')
jovem_news_df = pd.read_pickle('jovem_news_df.pkl')



In [4]:

    
labels_true = np.array(len(arena_news_df)*[0] + len(sport_news_df)*[1] + len(jovem_news_df.ix[0:99])*[2])



In [5]:

    
count_vect = CountVectorizer(encoding='UTF-8',lowercase=False, min_df=2)
X = count_vect.fit_transform(arena_news_df['all'].tolist() + sport_news_df['all'].tolist() + jovem_news_df['all'].ix[0:99].tolist())

X_train_norm_tfidf = TfidfTransformer(norm=u'l2', use_idf=True).fit_transform(X).toarray()
X_train_tfidf = TfidfTransformer(norm=False, use_idf=True).fit_transform(X).toarray()
X_train_norm = TfidfTransformer(norm=u'l2', use_idf=False).fit_transform(X).toarray()
X_train = TfidfTransformer(norm=False, use_idf=False).fit_transform(X).toarray()



In [6]:

    
print X_train.shape









    



(300, 6764)



In [7]:

    
def _big_s(x, center):
    len_x = len(x)
    total = 0

    for i in range(len_x):
        total += np.linalg.norm(x[i]-center)

    return total / len_x

def davies_bouldin_score(X, labels_pred, k_centers):
    try:
        num_clusters, _ = k_centers.shape
        big_ss = np.zeros([num_clusters], dtype=np.float64)
        d_eucs = np.zeros([num_clusters, num_clusters], dtype=np.float64)
        db = 0

        for k in range(num_clusters):
            samples_in_k_inds = np.where(labels_pred == k)[0]
            samples_in_k = X[samples_in_k_inds, :]
            big_ss[k] = _big_s(samples_in_k, k_centers[k])

        for k in range(num_clusters):
            for l in range(0, num_clusters):
                d_eucs[k, l] = np.linalg.norm(k_centers[k]-k_centers[l])

        for k in range(num_clusters):
            values = np.zeros([num_clusters-1], dtype=np.float64)
            for l in range(0, k):
                values[l] = (big_ss[k] + big_ss[l])/d_eucs[k, l]
            for l in range(k+1, num_clusters):
                values[l-1] = (big_ss[k] + big_ss[l])/d_eucs[k, l]

            db += np.max(values)
        res = db / num_clusters
    except Exception:
        return 0.0
    return res

def calculate_centroids_doc_mean(X, labels_pred, k):
    _, m = X.shape

    centroids = np.zeros((k, m))
    for k in range(k):
        samples_in_k_inds = np.where(labels_pred == k)[0]
        centroids[k, :] = X[samples_in_k_inds, :].mean(axis=0)

    return centroids

K-means



In [9]:

    
params = {
    'k' : [3],
    'X' : ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
}
with codecs.open('kmeans_news_results.csv', 'w', 'utf-8') as out_f:
    out_f.write('X,K,NMI,RAND,DAVIES\n')
    for k in params['k']:
        for data_str in params['X']:
            data = eval(data_str)

            error_best = np.inf
            for _ in xrange(10):
                estimator = KMeans(n_clusters=k, max_iter=1000, init='random')
                estimator.fit(data)

                labels_pred = estimator.labels_
                centroids = estimator.cluster_centers_
                error = estimator.inertia_

                nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                rand_score = adjusted_rand_score(labels_true, labels_pred)
                davies_score = davies_bouldin_score(data, labels_pred, centroids)
                
                out_f.write(u'%s,%s,%s,%s,%s\n' % (data_str, k, nmi_score, rand_score, davies_score))

                print 'Execution: X: %s, k: %s' % (data_str, k)
                print 'NMI score: %s' % nmi_score
                print 'Rand score: %s' % rand_score
                print 'Davies score: %s' % davies_score
                print '-----------------------------------------------\n'









    



Execution: X: X_train, k: 3
NMI score: 0.115098348432
Rand score: 0.0176825500343
Davies score: 2.61251748031
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.0332641069195
Rand score: 4.48948747226e-05
Davies score: 0.460960431793
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.108523319497
Rand score: 0.00670394416546
Davies score: 1.90916798943
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.0798583744984
Rand score: 0.00718651227011
Davies score: 2.49414972016
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.0957653737681
Rand score: 0.00533421984794
Davies score: 2.21933882234
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.109527783943
Rand score: 0.0221321805431
Davies score: 2.53806177551
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.033112114257
Rand score: -2.22229629876e-05
Davies score: 0.232497138352
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.113170808189
Rand score: 0.00698552348165
Davies score: 1.8926420882
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.137303915208
Rand score: 0.0182934266993
Davies score: 2.64448662565
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.0420993054319
Rand score: 4.54964843268e-05
Davies score: 0.87367829059
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.521913149314
Rand score: 0.418341692302
Davies score: 5.66046591908
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.750347965957
Rand score: 0.743097150851
Davies score: 5.64176124467
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.769814058826
Rand score: 0.774495153649
Davies score: 5.62771194749
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.73330539018
Rand score: 0.756211625056
Davies score: 5.69825585164
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.689268883548
Rand score: 0.735594800467
Davies score: 5.87588415932
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.732719633799
Rand score: 0.740914888059
Davies score: 5.70214289324
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.694995112014
Rand score: 0.70407471517
Davies score: 5.71884607951
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.486777063097
Rand score: 0.500197069981
Davies score: 5.98884705756
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.701340292518
Rand score: 0.713387135514
Davies score: 5.64107211961
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.7203371033
Rand score: 0.749879030277
Davies score: 5.83784292313
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0420993054319
Rand score: 4.54964843268e-05
Davies score: 0.951388926594
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0332641069195
Rand score: 4.48948747226e-05
Davies score: 0.340947886061
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0841667478165
Rand score: 0.00239135921591
Davies score: 1.80023477183
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0332641069195
Rand score: 4.48948747226e-05
Davies score: 0.366723308377
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0890316286059
Rand score: 0.0157143622587
Davies score: 3.51485324683
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0937573943451
Rand score: 0.00301095060392
Davies score: 2.22680439836
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0502101619414
Rand score: 0.000204065191032
Davies score: 1.26077676816
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0505580866971
Rand score: 0.000406774467173
Davies score: 1.13909471336
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0423578840663
Rand score: 0.000180184096344
Davies score: 1.33316380215
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0244456080903
Rand score: -2.18473216816e-05
Davies score: 1.08710358709
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.714027402401
Rand score: 0.716811988814
Davies score: 6.90606510817
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.563323170205
Rand score: 0.461384690574
Davies score: 6.73332244052
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.538867158946
Rand score: 0.441107615185
Davies score: 6.75416868347
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.742032880064
Rand score: 0.725174816915
Davies score: 6.98030207157
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.540461658542
Rand score: 0.523755187254
Davies score: 6.99990490797
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.65495429972
Rand score: 0.621644183114
Davies score: 6.92359503984
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.676197498241
Rand score: 0.627557887292
Davies score: 6.90973991489
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.455998241256
Rand score: 0.366147628476
Davies score: 6.53967149765
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.469068434148
Rand score: 0.31720717662
Davies score: 6.4008730121
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.675724644637
Rand score: 0.662385827438
Davies score: 6.98262998189
-----------------------------------------------

Fuzzy K-means



In [14]:

    
params = {
    'k' : [3],
    'X' : ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
}
with codecs.open('fuzzy_cmeans_news_results.csv', 'w', 'utf-8') as out_f:
    out_f.write('X,K,NMI,RAND,DAVIES\n')
    for k in params['k']:
        for data_str in params['X']:
            data = eval(data_str)

            error_best = np.inf
            for _ in xrange(10):
                centroids, U, _, _, errors, _, _ = fuzz.cluster.cmeans(
                    data.T,
                    k,
                    2,
                    error=0.00001,
                    maxiter=10000)
                centroids

                labels_pred = np.argmax(U, axis=0)
                error = errors[-1]

                nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                rand_score = adjusted_rand_score(labels_true, labels_pred)
                davies_score = davies_bouldin_score(data, labels_pred, centroids)
                
                out_f.write(u'%s,%s,%s,%s,%s\n' % (data_str, k, nmi_score, rand_score, davies_score))

                print 'Execution: X: %s, k: %s' % (data_str, k)
                print 'NMI score: %s' % nmi_score
                print 'Rand score: %s' % rand_score
                print 'Davies score: %s' % davies_score
                print '-----------------------------------------------\n'









    



Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train, k: 3
NMI score: 0.069433856787
Rand score: 0.0671231171273
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.527432503572
Rand score: 0.492647983065
Davies score: 27575362.6491
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.480337565486
Rand score: 0.43299487363
Davies score: 32621464.289
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.557418703485
Rand score: 0.512702862949
Davies score: 6280724.36909
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.56130143665
Rand score: 0.488247274951
Davies score: 5165070.04158
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.450628320405
Rand score: 0.441472244602
Davies score: 6247508.62737
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.54806404393
Rand score: 0.486266088456
Davies score: 5586860.60266
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.701108875584
Rand score: 0.715965685589
Davies score: 18500370.6268
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.480328184858
Rand score: 0.453963353187
Davies score: 27879737.3421
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.577266351624
Rand score: 0.579425642872
Davies score: 17851035.9671
-----------------------------------------------

Execution: X: X_train_norm, k: 3
NMI score: 0.53692441653
Rand score: 0.484244851483
Davies score: 8676807.62388
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0819042411728
Rand score: 0.0743504893537
Davies score: 18164334.7845
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0812529319706
Rand score: 0.0752099196437
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0819160957234
Rand score: 0.0723569251751
Davies score: 31618076.8668
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0812529319706
Rand score: 0.0752099196437
Davies score: 0.0
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0769984222888
Rand score: 0.0691796749166
Davies score: 49357999.2953
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0793311061859
Rand score: 0.0719171995543
Davies score: 43954293.0254
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.09150695427
Rand score: 0.0799480266991
Davies score: 38803846.8985
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0861311982625
Rand score: 0.0718300476822
Davies score: 108151122.719
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0862373821358
Rand score: 0.0765596655208
Davies score: 18323480.4529
-----------------------------------------------

Execution: X: X_train_tfidf, k: 3
NMI score: 0.0897793452998
Rand score: 0.0704326742547
Davies score: 59740492.6243
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.530106911994
Rand score: 0.457742950874
Davies score: 49300838.8036
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.5069589086
Rand score: 0.421670676631
Davies score: 6852042.93976
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.516707148
Rand score: 0.472229418727
Davies score: 48699474.3619
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.363861455637
Rand score: 0.352891817007
Davies score: 83306596.3617
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.377578980445
Rand score: 0.370451830071
Davies score: 5869604.36513
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.473557483013
Rand score: 0.435025045685
Davies score: 81465449.5166
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.379701617772
Rand score: 0.3386776096
Davies score: 84726979.5749
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.473240614459
Rand score: 0.419876976021
Davies score: 50551305.2716
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.625849210153
Rand score: 0.60492038654
Davies score: 70856173.0264
-----------------------------------------------

Execution: X: X_train_norm_tfidf, k: 3
NMI score: 0.453363633466
Rand score: 0.376575072076
Davies score: 47140093.176
-----------------------------------------------

Orthogonal Non-negative Matrix Tri-Factorization



In [ ]:

    
def onmtf(X, U, S, V):
    U = U * ((X.dot(V).dot(S.T)) / (U.dot(S).dot(V.T).dot(X.T).dot(U)))
    V = V * ((X.T.dot(U).dot(S)) / (V.dot(S.T).dot(U.T).dot(X).dot(V)))
    S = S * ((U.T.dot(X).dot(V)) / (U.T.dot(U).dot(S).dot(V.T).dot(V)))
    return U, S, V

def onm3f(X, U, S, V):
     U = U * (X.dot(V).dot(S.T)) / np.sqrt(U.dot(U.T).dot(X).dot(V).dot(S.T))
     V = V * (X.T.dot(U).dot(S)) / np.sqrt(V.dot(V.T).dot(X.T).dot(U).dot(S))
     S = S * (U.T.dot(X).dot(V)) / np.sqrt(U.T.dot(U).dot(S).dot(V.T).dot(V))
     return U, S, V

def nbvd(X, U, S, V):
     U = U * (X.dot(V).dot(S.T)) / U.dot(U.T).dot(X).dot(V).dot(S.T)
     V = V * (X.T.dot(U).dot(S)) / V.dot(V.T).dot(X.T).dot(U).dot(S)
     S = S * (U.T.dot(X).dot(V)) / U.T.dot(U).dot(S).dot(V.T).dot(V)
     return U, S, V
    
def matrix_factorization_clustering(X, k, l, factorization_func=onmtf, norm=False, num_iters=100):
    m, n = X.shape
    U = np.random.rand(m,k)
    S = np.random.rand(k,l)
    V = np.random.rand(n,l)

    if norm:
        X = Normalizer().fit_transform(X)

    error_best = np.inf

    for i in xrange(num_iters):
        U, S, V = factorization_func(X, U, S, V)
        error = np.sum((X - U.dot(S).dot(V.T)) ** 2)
        
        if error < error_best:
            U_best = U
            S_best = S
            V_best = V
            error_best = error

    Du = np.diag(np.ones(m).dot(U_best))
    Dv = np.diag(np.ones(n).dot(V_best))

    U_norm = U_best.dot( np.diag(S_best.dot(Dv).dot(np.ones(l))) )
    V_norm = V_best.dot( np.diag(np.ones(k).dot(Du).dot(S_best)) )

    rows_ind = np.argmax(U_best, axis=1)
    cols_ind = np.argmax(V_best, axis=1)

    return U_norm, S_best, V_norm, rows_ind, cols_ind, error_best


params = {
    'k' : [3],
    'l' : [2, 3, 4, 5, 6],
#     'X' : ['X_train', 'X_train_tfidf']
    'X' : ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
#     'X' : ['X_train_norm', 'X_train_norm_tfidf']
}
with codecs.open('onmtf_news_results.csv', 'w', 'utf-8') as out_f:
    out_f.write('X,K,L,NMI,RAND,DAVIES\n')
    for k in params['k']:
        for l in params['l']:
            for data_str in params['X']:
                data = eval(data_str)

                error_best = np.inf
                for _ in xrange(10):
                    init_time = time.time()
                    U, S, V, labels_pred, _, error = matrix_factorization_clustering(data, k, l, num_iters=1000)

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    rand_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, calculate_centroids_doc_mean(data, labels_pred, k))

                    end_time = time.time()
                    print end_time - init_time

                    out_f.write(u'%s,%s,%s,%s,%s,%s\n' % (data_str, k, l, nmi_score, rand_score, davies_score))

                    print 'Execution: X: %s, k: %s' % (data_str, k)
                    print 'Algo error: %s' % error
                    print 'NMI score: %s' % nmi_score
                    print 'Rand score: %s' % rand_score
                    print 'Davies score: %s' % davies_score
                    print '-----------------------------------------------\n'









    



880.380249023
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.105763880717
Rand score: 0.112250589335
Davies score: 6.60967741576
-----------------------------------------------

961.614548206
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.00955425382046
Rand score: -0.000504490234317
Davies score: 3.60430111693
-----------------------------------------------

988.1425879
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.112166021861
Rand score: 0.0930987431886
Davies score: 7.26283227383
-----------------------------------------------

2016.82614207
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.0114363896112
Rand score: 0.000152143507417
Davies score: 3.95778514682
-----------------------------------------------

956.24012804
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.074377378081
Rand score: 0.0806982433705
Davies score: 7.25519125519
-----------------------------------------------

1102.90737486
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.0114723704368
Rand score: 3.55065596593e-05
Davies score: 4.59394979827
-----------------------------------------------

5369.40552998
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.122576021036
Rand score: 0.0955287790951
Davies score: 6.74944508687
-----------------------------------------------

2185.76451111
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.0690631774661
Rand score: 0.0523875238293
Davies score: 7.89910932922
-----------------------------------------------

800.545524836
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.18211731532
Rand score: 0.168388052164
Davies score: 6.18708472597
-----------------------------------------------

798.458115101
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.099706659096
Rand score: 0.101180431965
Davies score: 6.61228548261
-----------------------------------------------

799.084643126
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.523975002091
Rand score: 0.480404905785
Davies score: 7.13041405214
-----------------------------------------------

870.879578114
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.53546365998
Rand score: 0.41783426376
Davies score: 6.01841241755
-----------------------------------------------

899.173555851
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.504160091159
Rand score: 0.451108777411
Davies score: 8.1733467923
-----------------------------------------------

822.334868908
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.613912814192
Rand score: 0.646067872305
Davies score: 6.3207704238
-----------------------------------------------

824.606380939
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.554770180275
Rand score: 0.492391109252
Davies score: 6.33138256682
-----------------------------------------------

823.832136869
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.502765836651
Rand score: 0.449383409719
Davies score: 8.16141401588
-----------------------------------------------

873.731344938
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.464053056662
Rand score: 0.402838225734
Davies score: 7.7935827313
-----------------------------------------------

828.096446991
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.549768598539
Rand score: 0.426986013799
Davies score: 5.89456639739
-----------------------------------------------

824.221236944
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.486067248303
Rand score: 0.440589540582
Davies score: 8.21492336316
-----------------------------------------------

824.994168997
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.46220323797
Rand score: 0.397744598058
Davies score: 8.18204845577
-----------------------------------------------

821.209376097
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0165378481869
Rand score: 0.000598250160109
Davies score: 5.26099313244
-----------------------------------------------

819.415800095
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0126688866782
Rand score: -0.00026560069794
Davies score: 3.29601494815
-----------------------------------------------

822.370894909
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.00452864853373
Rand score: -0.000444820943217
Davies score: 4.13717064516
-----------------------------------------------

818.924009085
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0102948375057
Rand score: -0.000172439946866
Davies score: 2.92783211148
-----------------------------------------------

819.258091927
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.00831990019078
Rand score: -0.000443257303298
Davies score: 3.53098323455
-----------------------------------------------

819.707537174
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.012654859657
Rand score: -0.000330694852543
Davies score: 3.21577925374
-----------------------------------------------

819.679028034
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0414848822314
Rand score: 0.0170982304323
Davies score: 8.18872721087
-----------------------------------------------

818.433753967
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0530110052105
Rand score: 0.028040646911
Davies score: 8.28145168057
-----------------------------------------------

823.967661142
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0631866596296
Rand score: 0.0689663332184
Davies score: 7.85840126935
-----------------------------------------------

822.460726976
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0476501096218
Rand score: 0.0214216530342
Davies score: 7.64899332097
-----------------------------------------------

829.599044085
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.229607696764
Rand score: 0.16677325654
Davies score: 8.65808000174
-----------------------------------------------

826.484740019
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.194659103643
Rand score: 0.0482034566717
Davies score: 4.06597191453
-----------------------------------------------

819.279025078
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.532533823106
Rand score: 0.526554002715
Davies score: 7.71328052907
-----------------------------------------------

819.939161777
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.52561507586
Rand score: 0.495813995883
Davies score: 7.89315969186
-----------------------------------------------

806.629768133
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.219404482379
Rand score: 0.0545395922418
Davies score: 4.36973321238
-----------------------------------------------

803.114602089
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.157219731085
Rand score: 0.0510994607359
Davies score: 9.30830562388
-----------------------------------------------

802.834872961
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.556311425934
Rand score: 0.413686874779
Davies score: 7.28263389115
-----------------------------------------------

803.69725585
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.464795848148
Rand score: 0.411220888128
Davies score: 8.47553755804
-----------------------------------------------

803.512454987
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.522694560526
Rand score: 0.47803246848
Davies score: 8.03112265769
-----------------------------------------------

827.470321894
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.48635175012
Rand score: 0.441979394905
Davies score: 9.51496924696
-----------------------------------------------

1000.57694197
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.102590744011
Rand score: 0.0220781357711
Davies score: 4.19185073884
-----------------------------------------------

1002.63803315
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.353541068463
Rand score: 0.326511461842
Davies score: 5.07352824857
-----------------------------------------------

935.658689976
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.102590744011
Rand score: 0.0220781357711
Davies score: 4.19185073884
-----------------------------------------------

826.592447042
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.393156447125
Rand score: 0.386675093746
Davies score: 5.24043687555
-----------------------------------------------

944.304490089
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.353541068463
Rand score: 0.326511461842
Davies score: 5.07352824857
-----------------------------------------------

949.196790934
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.286096827615
Rand score: 0.259482565826
Davies score: 5.33938361576
-----------------------------------------------

893.187283039
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.353541068463
Rand score: 0.326511461842
Davies score: 5.07352824857
-----------------------------------------------

972.244926929
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.13723492398
Rand score: 0.0645768676799
Davies score: 4.56700066142
-----------------------------------------------

886.650958061
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.34685865253
Rand score: 0.322926327942
Davies score: 5.07425885276
-----------------------------------------------

797.797188044
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.385042104238
Rand score: 0.371667544699
Davies score: 5.16977355596
-----------------------------------------------

915.363599062
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.577096664924
Rand score: 0.472515446739
Davies score: 4.88894537685
-----------------------------------------------

1060.65218997
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.577096664924
Rand score: 0.472515446739
Davies score: 4.88894537685
-----------------------------------------------

1010.36751103
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.646590524328
Rand score: 0.643330449211
Davies score: 5.62192693246
-----------------------------------------------

1062.57292986
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.577096664924
Rand score: 0.472515446739
Davies score: 4.88894537685
-----------------------------------------------

1018.08974099
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.577096664924
Rand score: 0.472515446739
Davies score: 4.88894537685
-----------------------------------------------

1039.17000985
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.577096664924
Rand score: 0.472515446739
Davies score: 4.88894537685
-----------------------------------------------

930.906779051
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.646590524328
Rand score: 0.643330449211
Davies score: 5.62192693246
-----------------------------------------------

887.551667213
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.609579351527
Rand score: 0.515359592747
Davies score: 4.97063203058
-----------------------------------------------

800.674169064
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.574503553678
Rand score: 0.468793003227
Davies score: 4.93193955798
-----------------------------------------------

799.360057116
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.681616360542
Rand score: 0.700077750324
Davies score: 5.65436581231
-----------------------------------------------

801.476296186
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0209536700704
Rand score: 0.000141799506046
Davies score: 3.1507038417
-----------------------------------------------

801.618233919
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0156369343336
Rand score: -0.000140862245134
Davies score: 3.04236504756
-----------------------------------------------

799.961426973
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0209536700704
Rand score: 0.000141799506046
Davies score: 3.1507038417
-----------------------------------------------

799.421875
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.442981549585
Rand score: 0.41413148666
Davies score: 5.86800922718
-----------------------------------------------

800.390614986
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.474117492536
Rand score: 0.434005293211
Davies score: 5.77095139497
-----------------------------------------------

799.649295807
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.4868504364
Rand score: 0.450876044177
Davies score: 6.11381993368
-----------------------------------------------

800.787438869
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0209536700704
Rand score: 0.000141799506046
Davies score: 3.1507038417
-----------------------------------------------

800.3811481
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.476584882674
Rand score: 0.423713256314
Davies score: 6.11118696071
-----------------------------------------------

799.479124069
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0919166331182
Rand score: 0.0175186088213
Davies score: 4.3713550308
-----------------------------------------------

800.04998517
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0919166331182
Rand score: 0.0175186088213
Davies score: 4.3713550308
-----------------------------------------------

800.046504974
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.510286640106
Rand score: 0.422781676512
Davies score: 6.13252275605
-----------------------------------------------

800.571629047
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.681939452481
Rand score: 0.574337050647
Davies score: 6.40195257874
-----------------------------------------------

799.103754044
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.686850219425
Rand score: 0.668101581113
Davies score: 6.76332553712
-----------------------------------------------

800.508813143
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.681939452481
Rand score: 0.574337050647
Davies score: 6.40195257874
-----------------------------------------------

799.210839033
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.702434673569
Rand score: 0.588868573855
Davies score: 6.42883217082
-----------------------------------------------

800.358686924
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.513408295655
Rand score: 0.421250182276
Davies score: 6.0354438518
-----------------------------------------------

799.896157026
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.695176412437
Rand score: 0.681527662795
Davies score: 6.76207924676
-----------------------------------------------

800.804448128
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.62121627848
Rand score: 0.507414371756
Davies score: 5.44259319795
-----------------------------------------------

801.254323006
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.577764811789
Rand score: 0.530369321869
Davies score: 6.99129451294
-----------------------------------------------

805.160863876
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.739880265736
Rand score: 0.747731518739
Davies score: 6.82911112788
-----------------------------------------------

810.197912931
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.403965506798
Rand score: 0.392230137565
Davies score: 5.17283644595
-----------------------------------------------

803.635970116
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.353541068463
Rand score: 0.326511461842
Davies score: 5.07352824857
-----------------------------------------------

802.501942873
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.389593963467
Rand score: 0.381128684653
Davies score: 5.1624701394
-----------------------------------------------

803.274652004
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.177046488247
Rand score: 0.117888162389
Davies score: 5.14559610565
-----------------------------------------------

805.6863451
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.353541068463
Rand score: 0.326511461842
Davies score: 5.07352824857
-----------------------------------------------

803.577783108
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.0215306592295
Rand score: 0.000434257670008
Davies score: 3.06156508496
-----------------------------------------------

801.292065144
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.0870616607828
Rand score: 0.0287542512393
Davies score: 4.67375932228
-----------------------------------------------

800.100991011
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.350848678233
Rand score: 0.325125756654
Davies score: 5.11929943741
-----------------------------------------------

798.713268995
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.528313133884
Rand score: 0.480479986483
Davies score: 4.96382088404
-----------------------------------------------

797.596824884
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.147513465114
Rand score: 0.0819286708103
Davies score: 5.11759007483
-----------------------------------------------

797.666160107
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.584376325394
Rand score: 0.479630776401
Davies score: 4.91425624196
-----------------------------------------------

805.704232931
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.681957688463
Rand score: 0.688134461053
Davies score: 5.70579490279
-----------------------------------------------

801.2458601
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.724645587107
Rand score: 0.748491409193
Davies score: 5.69444439139
-----------------------------------------------

808.681071997
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.574503553678
Rand score: 0.468793003227
Davies score: 4.93193955798
-----------------------------------------------

797.845577955
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.69364602905
Rand score: 0.701187937455
Davies score: 5.7078293332
-----------------------------------------------

797.926918983
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.69870664287
Rand score: 0.703740129213
Davies score: 5.70241791056
-----------------------------------------------

797.452886105
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.606451559217
Rand score: 0.507429920415
Davies score: 5.00447898674
-----------------------------------------------

799.160154104
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.729567517508
Rand score: 0.734982682056
Davies score: 5.66617515282
-----------------------------------------------

800.483223915
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.584376325394
Rand score: 0.479630776401
Davies score: 4.91425624196
-----------------------------------------------

797.819401979
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.724645587107
Rand score: 0.748491409193
Davies score: 5.69444439139
-----------------------------------------------

799.743741035
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.4868504364
Rand score: 0.450876044177
Davies score: 6.11381993368
-----------------------------------------------

796.404560089
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.105980852831
Rand score: 0.0276051074495
Davies score: 4.73359709791
-----------------------------------------------

798.697227001
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0167632208662
Rand score: -7.70506138984e-05
Davies score: 2.82053041558
-----------------------------------------------

797.179314137
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.044366588328
Rand score: 0.000831856567733
Davies score: 3.39411538173
-----------------------------------------------

798.768301964
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.346744702159
Rand score: 0.315446431834
Davies score: 6.42710155066
-----------------------------------------------

800.902426958
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0167632208662
Rand score: -7.70506138984e-05
Davies score: 2.82053041558
-----------------------------------------------

802.591784
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0919166331182
Rand score: 0.0175186088213
Davies score: 4.3713550308
-----------------------------------------------

801.386595964
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.044366588328
Rand score: 0.000831856567733
Davies score: 3.39411538173
-----------------------------------------------

802.169070005
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0919166331182
Rand score: 0.0175186088213
Davies score: 4.3713550308
-----------------------------------------------

800.4772861
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0912715740058
Rand score: 0.0180976742783
Davies score: 4.33098906157
-----------------------------------------------

803.692107916
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.524305859013
Rand score: 0.385570702271
Davies score: 5.56908110648
-----------------------------------------------

803.353945017
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.6083511767
Rand score: 0.536117798647
Davies score: 6.71590528441
-----------------------------------------------

801.546469212
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.702934557581
Rand score: 0.638490094831
Davies score: 6.70521842259
-----------------------------------------------

802.53918004
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.684084991719
Rand score: 0.580157106604
Davies score: 6.40242918551
-----------------------------------------------

801.560765028
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.581731482948
Rand score: 0.488763869804
Davies score: 5.98043455345
-----------------------------------------------

801.591675997
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.627004716702
Rand score: 0.497451701338
Davies score: 5.49743999805
-----------------------------------------------

802.881339073
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.544799121353
Rand score: 0.445790639497
Davies score: 6.13322697559
-----------------------------------------------

847.753988028
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.486676823486
Rand score: 0.345531177482
Davies score: 6.40254960486
-----------------------------------------------

994.667692184
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.630744939611
Rand score: 0.594120567894
Davies score: 6.83339871412
-----------------------------------------------

959.980799913
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.54270846311
Rand score: 0.449656242725
Davies score: 6.06993728846
-----------------------------------------------

968.462424994
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.197132517431
Rand score: 0.123926802869
Davies score: 5.34281916861
-----------------------------------------------

954.961694956
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.522110049291
Rand score: 0.475789634229
Davies score: 4.99972345072
-----------------------------------------------

867.81651783
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.214893012585
Rand score: 0.163232040743
Davies score: 5.25469499636
-----------------------------------------------

849.603387833
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.423346173765
Rand score: 0.403757648521
Davies score: 5.34847080991
-----------------------------------------------

932.458371878
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.367225109573
Rand score: 0.354862531505
Davies score: 5.16958816466
-----------------------------------------------

1028.0962131
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.427671811064
Rand score: 0.407700929827
Davies score: 5.34969033568
-----------------------------------------------

810.572029114
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.412731113751
Rand score: 0.397649087493
Davies score: 5.11043731124
-----------------------------------------------

813.070983887
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.348900898874
Rand score: 0.328692753515
Davies score: 5.08891001527
-----------------------------------------------

810.845068932
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.511856169753
Rand score: 0.469943694756
Davies score: 5.07093217767
-----------------------------------------------

810.389551878
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.277693883368
Rand score: 0.246630748776
Davies score: 5.2730094364
-----------------------------------------------

809.550765991
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.565184486222
Rand score: 0.422928548393
Davies score: 5.48305849946
-----------------------------------------------

808.517294884
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.728669317422
Rand score: 0.750108953351
Davies score: 5.69418696627
-----------------------------------------------

809.532860994
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.71625443052
Rand score: 0.710724986876
Davies score: 5.64037314288
-----------------------------------------------

822.702867031
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.577096664924
Rand score: 0.472515446739
Davies score: 4.88894537685
-----------------------------------------------

926.166749954
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.732145095932
Rand score: 0.759387066269
Davies score: 5.68955551123
-----------------------------------------------

916.224526167
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.537355379796
Rand score: 0.454709837017
Davies score: 5.21090235168
-----------------------------------------------

920.220302105
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.738705004468
Rand score: 0.759674936023
Davies score: 5.66509259047
-----------------------------------------------

910.577459097
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.732145095932
Rand score: 0.759387066269
Davies score: 5.68955551123
-----------------------------------------------

933.10690403
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.699062375722
Rand score: 0.688962185368
Davies score: 5.60548390223
-----------------------------------------------

995.103133917
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.693263846877
Rand score: 0.700910517697
Davies score: 5.67752991342
-----------------------------------------------

1002.91439009
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0167632208662
Rand score: -7.70506138984e-05
Davies score: 2.82053041558
-----------------------------------------------

1353.45869398
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.411964326991
Rand score: 0.399445076525
Davies score: 5.95056109148
-----------------------------------------------

1461.84668207
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.423978449968
Rand score: 0.396653846559
Davies score: 6.33233084932
-----------------------------------------------

1103.69100213
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0263929345497
Rand score: 0.000520400009594
Davies score: 3.26038507647
-----------------------------------------------

1030.02932405
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.110673797573
Rand score: 0.0301316696488
Davies score: 4.78806490899
-----------------------------------------------

1203.19901013
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0263929345497
Rand score: 0.000520400009594
Davies score: 3.26038507647
-----------------------------------------------

944.727695942
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0167632208662
Rand score: -7.70506138984e-05
Davies score: 2.82053041558
-----------------------------------------------

927.492874861
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0167632208662
Rand score: -7.70506138984e-05
Davies score: 2.82053041558
-----------------------------------------------

900.209417105
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.29470546303
Rand score: 0.212253079613
Davies score: 6.02846011927
-----------------------------------------------

921.907628059
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.326579011211
Rand score: 0.345609852291
Davies score: 6.9627597528
-----------------------------------------------

962.88514185
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.684084991719
Rand score: 0.580157106604
Davies score: 6.40242918551
-----------------------------------------------

915.950886965
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.755887838426
Rand score: 0.778111571637
Davies score: 6.86005564424
-----------------------------------------------

938.746544838
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.227194220915
Rand score: 0.0621337390109
Davies score: 4.68045268654
-----------------------------------------------

905.432644844
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.771854201885
Rand score: 0.788485295343
Davies score: 6.82906106592
-----------------------------------------------

919.945120096
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.685397537293
Rand score: 0.582768524908
Davies score: 6.42244181758
-----------------------------------------------

912.692452908
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.675900639982
Rand score: 0.589253512938
Davies score: 6.55749784737
-----------------------------------------------

920.252959967
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.683302727343
Rand score: 0.577031479823
Davies score: 6.421459557
-----------------------------------------------

987.917103052
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.22271246263
Rand score: 0.0586260413498
Davies score: 4.62247977924
-----------------------------------------------

960.501576185
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.684447366352
Rand score: 0.648317143204
Davies score: 6.78370768669
-----------------------------------------------

922.474231005
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.241777883167
Rand score: 0.0572530286616
Davies score: 4.35950645973
-----------------------------------------------

919.64618516
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.513708919479
Rand score: 0.469267747903
Davies score: 4.95592261957
-----------------------------------------------

904.801797867
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.147193293568
Rand score: 0.0728785543716
Davies score: 4.58883608166
-----------------------------------------------

918.952564955
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.344056565402
Rand score: 0.328279278579
Davies score: 5.3888716391
-----------------------------------------------

994.218925953
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.350848678233
Rand score: 0.325125756654
Davies score: 5.11929943741
-----------------------------------------------

807.496315002
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.157880209141
Rand score: 0.0878104095529
Davies score: 4.82982310656
-----------------------------------------------

806.176246166
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.354083802297
Rand score: 0.337375589519
Davies score: 5.09793203294
-----------------------------------------------

808.172565937
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.432563378313
Rand score: 0.41207076311
Davies score: 5.33791885327
-----------------------------------------------

808.32937789
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.189584899658
Rand score: 0.133144109501
Davies score: 5.13012474885
-----------------------------------------------

803.985204935
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.430044754665
Rand score: 0.409832731687
Davies score: 5.34199727263
-----------------------------------------------

808.361351967
Execution: X: X_train, k: 3
Algo error: inf
NMI score: 0.190524669038
Rand score: 0.103901016525
Davies score: 5.14394783981
-----------------------------------------------

806.012857914
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.566231571916
Rand score: 0.426158410391
Davies score: 5.36648961154
-----------------------------------------------

806.00150013
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.710838491549
Rand score: 0.715170076294
Davies score: 5.66995504659
-----------------------------------------------

807.022404194
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.741835559395
Rand score: 0.740085911598
Davies score: 5.66794117944
-----------------------------------------------

806.204664946
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.712279615219
Rand score: 0.739861694428
Davies score: 5.69486137613
-----------------------------------------------

808.17866087
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.73003519323
Rand score: 0.738958740884
Davies score: 5.67749847501
-----------------------------------------------

805.203550816
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.720324196225
Rand score: 0.723866195133
Davies score: 5.65268498104
-----------------------------------------------

808.80629611
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.574503553678
Rand score: 0.468793003227
Davies score: 4.93193955798
-----------------------------------------------

807.13172698
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.615622314935
Rand score: 0.51674297894
Davies score: 4.93019886509
-----------------------------------------------

807.609624863
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.577096664924
Rand score: 0.472515446739
Davies score: 4.88894537685
-----------------------------------------------

806.673099995
Execution: X: X_train_norm, k: 3
Algo error: inf
NMI score: 0.738705004468
Rand score: 0.759674936023
Davies score: 5.66509259047
-----------------------------------------------

809.76267314
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.0167632208662
Rand score: -7.70506138984e-05
Davies score: 2.82053041558
-----------------------------------------------

805.213690996
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.105980852831
Rand score: 0.0276051074495
Davies score: 4.73359709791
-----------------------------------------------

805.095651865
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.450859089402
Rand score: 0.414668513201
Davies score: 6.25585161867
-----------------------------------------------

806.118098021
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.114034989609
Rand score: 0.0309114877112
Davies score: 4.72050050203
-----------------------------------------------

804.578929901
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.427092987623
Rand score: 0.403062753786
Davies score: 6.3289527311
-----------------------------------------------

803.923925161
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.110673797573
Rand score: 0.0301316696488
Davies score: 4.78806490899
-----------------------------------------------

805.953124046
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.110673797573
Rand score: 0.0301316696488
Davies score: 4.78806490899
-----------------------------------------------

803.726119041
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.442669552639
Rand score: 0.407811149955
Davies score: 6.27461900794
-----------------------------------------------

807.045888901
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.110673797573
Rand score: 0.0301316696488
Davies score: 4.78806490899
-----------------------------------------------

805.090794086
Execution: X: X_train_tfidf, k: 3
Algo error: inf
NMI score: 0.175055628553
Rand score: 0.0657978074465
Davies score: 4.4429021003
-----------------------------------------------

809.555768013
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.454963575854
Rand score: 0.337087895094
Davies score: 6.33672662309
-----------------------------------------------

804.585672855
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.588589113606
Rand score: 0.534841001837
Davies score: 6.74795948717
-----------------------------------------------

851.635176897
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.54270846311
Rand score: 0.449656242725
Davies score: 6.06993728846
-----------------------------------------------

821.803653002
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.691754152745
Rand score: 0.612654562838
Davies score: 6.6570291567
-----------------------------------------------

803.371346235
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.704092110368
Rand score: 0.685222829449
Davies score: 6.74629494447
-----------------------------------------------

805.222362041
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.202314819925
Rand score: 0.0566860316715
Davies score: 4.36670126979
-----------------------------------------------

865.617736101
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.607382561563
Rand score: 0.498919286905
Davies score: 5.94337354939
-----------------------------------------------

807.675718069
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.240341381688
Rand score: 0.0601513626121
Davies score: 4.14790044789
-----------------------------------------------

805.264490843
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.760306904644
Rand score: 0.779649700979
Davies score: 6.85104852656
-----------------------------------------------

869.981280804
Execution: X: X_train_norm_tfidf, k: 3
Algo error: inf
NMI score: 0.23440134192
Rand score: 0.0607570648456
Davies score: 3.79743296521
-----------------------------------------------

Fast Non-negative Matrix Tri-Factorization



In [ ]:

    
def fnmtf(X, k, l, num_iter=1000, norm=False):
    m, n = X.shape

    def weights_initialization(X, n, m, k):
        shuffle_inds = np.random.permutation(n)
        cluster_end_ind = 0
        for i in xrange(k):
            cluster_init_ind = cluster_end_ind
            cluster_end_ind = round((i + 1) * n / k)
            X[shuffle_inds[cluster_init_ind : cluster_end_ind], i] = 1
        return X
    
    U = weights_initialization(np.zeros((m, k)), m, n, k)
    S = np.random.rand(k,l)
    V = weights_initialization(np.zeros((n, l)), n, m, l)

    error_best = np.inf
    error = error_best

    if norm:
        X = Normalizer().fit_transform(X)
    for _ in xrange(num_iter):
        S = np.linalg.pinv(U.T.dot(U)).dot(U.T).dot(X).dot(V).dot(np.linalg.pinv(V.T.dot(V)))

        # solve subproblem to update V
        U_tilde = U.dot(S)
        V_new = np.zeros(n*l).reshape(n, l)
        for j in xrange(n):
            errors = np.zeros(l)
            for col_clust_ind in xrange(l):
                errors[col_clust_ind] = ((X[:][:, j] - U_tilde[:][:, col_clust_ind])**2).sum()
            ind = np.argmin(errors)
            V_new[j][ind] = 1
        V = V_new

#         while np.linalg.det(V.T.dot(V)) <= 0:
#             if np.isnan( np.sum(V) ):
#                 break

#             erros = (X - U.dot(S).dot(V.T)) ** 2
#             erros = np.sum(erros.dot(V), axis=0) / np.sum(V, axis=0)
#             erros[np.where(np.sum(V, axis=0) <= 1)] = -np.inf
#             quantidade = np.sum(V, axis=0)
#             indexMin = np.argmin(quantidade)
#             indexMax = np.argmax(erros)
#             indexes = np.nonzero(V[:, indexMax])[0]
#             end = len(indexes)
#             indexes_p = np.random.permutation(end)
#             V[indexes[indexes_p[0:np.floor(end/2.0)]], indexMax] = 0.0
#             V[indexes[indexes_p[0:np.floor(end/2.0)]], indexMin] = 1.0

        # solve subproblem to update U
        V_tilde = S.dot(V.T)
        U_new = np.zeros(m*k).reshape(m, k)
        for i in xrange(m):
            errors = np.zeros(k)
            for row_clust_ind in xrange(k):
                errors[row_clust_ind] = ((X[i][:] - V_tilde[row_clust_ind][:])**2).sum()
            ind = np.argmin(errors)
            U_new[i][ind] = 1
        U = U_new

#         while np.linalg.det(U.T.dot(U)) <= 0:
#             if np.isnan( np.sum(U) ):
#                 break

#             erros = (X - U.dot(V_tilde)) ** 2
#             erros = np.sum(U.T.dot(erros), axis=1) / np.sum(U, axis=0)
#             erros[np.where(np.sum(U, axis=0) <= 1)] = -np.inf
#             quantidade = np.sum(U, axis=0)
#             indexMin = np.argmin(quantidade)
#             indexMax = np.argmax(erros)
#             indexes = np.nonzero(U[:, indexMax])[0]

#             end = len(indexes)
#             indexes_p = np.random.permutation(end)
#             U[indexes[indexes_p[0:np.floor(end/2.0)]], indexMax] = 0.0
#             U[indexes[indexes_p[0:np.floor(end/2.0)]], indexMin] = 1.0

        error_ant = error
#         print error_ant
        error = np.sum((X - U.dot(S).dot(V.T)) ** 2)

        if error < error_best:
            U_best = U
            S_best = S
            V_best = V
            error_best = error

#         if np.abs(error - error_ant) <= 0.000001:
#             break

    rows_ind = np.argmax(U, axis=1)
    cols_ind = np.argmax(V, axis=1)

    return U, S, V, rows_ind, cols_ind, error


params = {
    'k' : [3],
    'l' : [2, 3, 4, 5, 6],
    'X' : ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
#     'X' : ['X_train', 'X_train_tfidf']
}

with codecs.open('nmtf_bin_news_results.csv', 'w', 'utf-8') as out_f:
    out_f.write('X,K,L,NMI,RAND,DAVIES\n')
    for k in params['k']:
        for l in params['l']:
            for data_str in params['X']:
                data = eval(data_str)

                error_best = np.inf
                for _ in xrange(10):
                    init_time = time.time()
                    U, S, V, labels_pred, _, error = fnmtf(data, k, l)

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    rand_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, calculate_centroids_doc_mean(data, labels_pred, k))

#                     if error < error_best:
#                         error_best = error
#                         nmi_score_best = nmi_score
#                         rand_score_best = rand_score
#                         davies_score_best = davies_score
#                         labels_pred_best = labels_pred

                    end_time = time.time()
                    print end_time - init_time

                    out_f.write(u'%s,%s,%s,%s,%s,%s\n' % (data_str, k, l, nmi_score, rand_score, davies_score))

                    print 'Execution: X: %s, k: %s, l: %s' % (data_str, k, l)
                    print 'Algo error: %s' % error
                    print 'NMI score: %s' % nmi_score
                    print 'Rand score: %s' % rand_score
                    print 'Davies score: %s' % davies_score
                    print '-----------------------------------------------\n'









    



171.905370951
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.143414
NMI score: 0.0558855682487
Rand score: 0.0352532594765
Davies score: 5.04271402634
-----------------------------------------------

170.441447973
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.4328
NMI score: 0.0587342988573
Rand score: 0.0382849986737
Davies score: 5.04166124195
-----------------------------------------------

170.962042093
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.143414
NMI score: 0.0558855682487
Rand score: 0.0352532594765
Davies score: 5.04271402634
-----------------------------------------------

170.195071936
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.143414
NMI score: 0.0558855682487
Rand score: 0.0352532594765
Davies score: 5.04271402634
-----------------------------------------------

170.566911936
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.4328
NMI score: 0.0587342988573
Rand score: 0.0382849986737
Davies score: 5.04166124195
-----------------------------------------------

177.716650009
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.143414
NMI score: 0.0558855682487
Rand score: 0.0352532594765
Davies score: 5.04271402634
-----------------------------------------------

193.750792027
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.143414
NMI score: 0.0558855682487
Rand score: 0.0352532594765
Davies score: 5.04271402634
-----------------------------------------------

185.568675995
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.143414
NMI score: 0.0558855682487
Rand score: 0.0352532594765
Davies score: 5.04271402634
-----------------------------------------------

184.650127888
Execution: X: X_train, k: 3, l: 2
Algo error: 212713.4328
NMI score: 0.0587342988573
Rand score: 0.0382849986737
Davies score: 5.04166124195
-----------------------------------------------

184.346590042

Fast Overlapping Non-negative Matrix Tri-Factorization



In [ ]:

    
def matrix_factorization_overlapping_bin(X, k, l, num_iters=1000):
    def weights_initialization(X, n, m, k):
        shuffle_inds = np.random.permutation(n)
        cluster_end_ind = 0
        for i in xrange(k):
            cluster_init_ind = cluster_end_ind
            cluster_end_ind = round((i + 1) * n / k)
            X[shuffle_inds[cluster_init_ind : cluster_end_ind], i] = 1
        return X

    def calculate_block_matrix(X, F, G, S, k, l):
        for i in xrange(k):
            for j in xrange(l):
                S[i, j] = np.mean(X[F[:, i] == 1][:, G[i][:, j] == 1])
        where_are_NaNs = np.isnan(S)
        S[where_are_NaNs] = 0
        return S

    n, m = X.shape

    error_best = np.inf
    error = np.inf

    F = weights_initialization(np.zeros((n, k)), n, m, k)

    G = []
    for i in xrange(k):
        G.append( weights_initialization(np.zeros((m, l)), m, n, l) )

    S = np.random.rand(k, l)

    for iter_ind in xrange(num_iters):
        S = calculate_block_matrix(X, F, G, S, k, l)

        # Update G
        for i in xrange(k):
            F_t = F[F[:, i] == 1, :].dot(S)
            X_t = X[F[:, i] == 1, :]
            G[i] = np.zeros((m, l))
            for j in xrange(m):
                clust_len, _ = X_t.shape
                diff = F_t - X_t[:, j].reshape(clust_len, 1).dot(np.ones(l).reshape(1, l))
                errors = np.diag(diff.T.dot(diff))
                minV = np.min(errors)
                index = np.where(errors <= minV)[0]
                G[i][j, index[np.random.randint(len(index))]] = 1

#             while np.linalg.det(G[i].T.dot(G[i])) <= 0:
#                 erros = (X_t - F_t.dot(G[i].T)) ** 2
#                 erros = np.sum(erros.dot(G[i]), axis=0) / np.sum(G[i], axis=0)
#                 erros[np.where(np.sum(G[i], axis=0) <= 1)] = -np.inf
#                 quantidade = np.sum(G[i], axis=0)
#                 indexMin = np.argmin(quantidade)
#                 indexMax = np.argmax(erros)
#                 indexes = np.nonzero(G[i][:, indexMax])[0]

#                 end = len(indexes)
#                 indexes_p = np.random.permutation(end)
#                 G[i][indexes[indexes_p[0:np.floor(end/2.0)]], indexMax] = 0.0
#                 G[i][indexes[indexes_p[0:np.floor(end/2.0)]], indexMin] = 1.0

#         S = calculate_block_matrix(X, F, G, S, k, l)

        G_t = np.zeros((k, m))
        for i in xrange(k):
            G_t[i, :] = S[i, :].dot(G[i].T)

        F = np.zeros((n, k))
        for j in xrange(n):
            diff = G_t - np.ones(k).reshape(k, 1).dot(X[j, :].reshape(1, m))
            errors = np.diag(diff.dot(diff.T))
            minV = np.min(errors)
            index = np.where(errors <= minV)[0]
            F[j, index[np.random.randint(len(index))]] = 1

#         while np.linalg.det(F.T.dot(F)) <= 0:
#             erros = (X - F.dot(G_t)) ** 2
#             erros = np.sum(F.T.dot(erros), axis=1) / np.sum(F, axis=0)
#             erros[np.where(np.sum(F, axis=0) <= 1)] = -np.inf
#             quantidade = np.sum(F, axis=0)
#             indexMin = np.argmin(quantidade)
#             indexMax = np.argmax(erros)
#             indexes = np.nonzero(F[:, indexMax])[0]

#             end = len(indexes)
#             indexes_p = np.random.permutation(end)
#             F[indexes[indexes_p[0:np.floor(end/2.0)]], indexMax] = 0.0
#             F[indexes[indexes_p[0:np.floor(end/2.0)]], indexMin] = 1.0

        error_ant = error
        error = np.sum((X - F.dot(G_t))**2)
#         print error

        if error < error_best:
            error_best = error
            F_best = F
            S_best = S
            G_best = G
            G_t_best = G_t

#         if np.abs(error - error_ant) <= 0.000001:
#             break

    rows_ind = np.argmax(F_best, axis=1)
    reconstruction = F_best.dot(G_t_best)

    return F, S, G, G_t, rows_ind, error_best, reconstruction


params = {
    'k' : [3],
    'l' : [2, 3, 4, 5, 6],
    'X' : ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
}

with codecs.open('ovnmtf_bin_news_results.csv', 'w', 'utf-8') as out_f:
    out_f.write('X,K,L,NMI,RAND,DAVIES\n')
    for k in params['k']:
        for l in params['l']:
            for data_str in params['X']:
                data = eval(data_str)

                error_best = np.inf
                for _ in xrange(10):
                    init_time = time.time()
                    U, S, V, V_t, labels_pred, error, _ = matrix_factorization_overlapping_bin(data, k, l)

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    rand_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, calculate_centroids_doc_mean(data, labels_pred, k))

#                     if error < error_best:
#                         error_best = error
#                         nmi_score_best = nmi_score
#                         rand_score_best = rand_score
#                         davies_score_best = davies_score
#                         labels_pred_best = labels_pred

                    end_time = time.time()
                    print end_time - init_time

                    out_f.write(u'%s,%s,%s,%s,%s,%s\n' % (data_str, k, l, nmi_score, rand_score, davies_score))

                    print 'Execution: X: %s, k: %s, l: %s' % (data_str, k, l)
                    print 'Algo error: %s' % error
                    print 'NMI score: %s' % nmi_score
                    print 'Rand score: %s' % rand_score
                    print 'Davies score: %s' % davies_score
                print '-----------------------------------------------\n'

Overlapping Non-Negative Matrix Tri-Factorization



In [ ]:

    
def is_any_clust_empty(U_bin):
    n, k = U_bin.shape
    return np.count_nonzero(np.sum(U_bin, axis=0)) != k

def overlapping_matrix_factorization_coclustering(X, k, l, norm=False, num_iters=100):
    n, m = X.shape
    U = np.random.rand(n, k)
    S = np.random.rand(k, l)
    V = []
    for i in xrange(k):
        V.append(np.random.rand(m, l))

    Ii = np.zeros((k, k))
    Ij = np.zeros((k, k))

    error_best = np.inf
    
    if norm:
        X = Normalizer().fit_transform(X)

    V_tilde = np.zeros((k, m))
    for i in xrange(k):
        Ii[i, i] = 1
        V_tilde += Ii.dot(S).dot(V[i].T)
        Ii[i, i] = 0
    error = np.sum((X - U.dot(V_tilde)) ** 2)

    for _ in xrange(num_iters):
        # Update U
        new_U_pos = np.zeros((n, k))
        new_U_neg = np.zeros((n, k))
        for i in xrange(k):
            Ii[i, i] = 1
            for j in xrange(k):
                Ij[j, j] = 1
                new_U_pos += U.dot(Ii).dot(S).dot(V[i].T).dot(V[j]).dot(S.T).dot(Ij)
                Ij[j, j] = 0
            new_U_neg += X.dot(V[i]).dot(S.T).dot(Ii)
            Ii[i, i] = 0
        U = U * (new_U_neg / new_U_pos)

        # Compute V'
        V_tilde = np.zeros((k, m))
        for i in xrange(k):
            Ii[i, i] = 1
            V_tilde += Ii.dot(S).dot(V[i].T)
            Ii[i, i] = 0

        # Update Vi
        for i in xrange(k):
            new_V_pos = np.zeros((m, l))
            new_V_neg = np.zeros((m, l))
            Ii[i, i] = 1
            for j in xrange(k):
                Ij[j, j] = 1

                new_V_pos += V[j].dot(S.T).dot(Ij).dot(U.T).dot(U).dot(Ii).dot(S)

                Ij[j, j] = 0

            new_V_neg += X.T.dot(U).dot(Ii).dot(S)

            Ii[i, i] = 0
            V[i] = V[i] * (new_V_neg / new_V_pos)

        # Recompute V'
        V_tilde = np.zeros((k, m))
        for i in xrange(k):
            Ii[i, i] = 1
            V_tilde += Ii.dot(S).dot(V[i].T)
            Ii[i, i] = 0
            
        new_S_pos = np.zeros((k, l))
        new_S_neg = np.zeros((k, l))
        for i in xrange(k):
            Ii[i, i] = 1
            for j in xrange(k):
                Ij[j, j] = 1
                new_S_pos += Ii.dot(U.T).dot(U).dot(Ij).dot(S).dot(V[j].T).dot(V[i])
                Ij[j, j] = 0
            new_S_neg += Ii.dot(U.T).dot(X).dot(V[i])
            Ii[i, i] = 0
        S = S * (new_S_neg / new_S_pos)

#         import pdb; pdb.set_trace()

        V_tilde = np.zeros((k, m))
        for i in xrange(k):
            Ii[i, i] = 1
            V_tilde += Ii.dot(S).dot(V[i].T)
            Ii[i, i] = 0

        error_ant = error
        error = np.sum((X - U.dot(V_tilde))**2)
#         print errorV_t

        if error < error_best:
            error_best = error
            U_best = U
            S_best = S
            V_best = V

        if np.abs(error - error_ant) <= 0.000001:
            break


    rows_ind = np.argmax(U_best, axis=1)
    reconstruction = U_best.dot(V_tilde)

    return U_best, S_best, V_best, V_tilde, rows_ind, error_best, reconstruction


params = {
    'k' : [3],
    'l' : [2, 3, 4, 5, 6],
    'X' : ['X_train', 'X_train_norm', 'X_train_tfidf', 'X_train_norm_tfidf']
}

with codecs.open('ovnmtf_news_results.csv', 'w', 'utf-8') as out_f:
    out_f.write('X,K,L,NMI,RAND,DAVIES\n')
    for k in params['k']:
        for l in params['l']:
            for data_str in params['X']:
                data = eval(data_str)

                error_best = np.inf
                for _ in xrange(10):
                    init_time = time.time()
                    U, S, V, V_t, labels_pred, error, _ = overlapping_matrix_factorization_coclustering(data, k, l)

                    nmi_score = normalized_mutual_info_score(labels_true, labels_pred)
                    rand_score = adjusted_rand_score(labels_true, labels_pred)
                    davies_score = davies_bouldin_score(data, labels_pred, calculate_centroids_doc_mean(data, labels_pred, k))

#                     if error < error_best:
#                         error_best = error
#                         nmi_score_best = nmi_score
#                         rand_score_best = rand_score
#                         davies_score_best = davies_score
#                         labels_pred_best = labels_pred

                    end_time = time.time()
                    print end_time - init_time

                out_f.write(u'%s,%s,%s,%s,%s,%s\n' % (data_str, k, l, nmi_score, rand_score, davies_score))

                print 'Execution: X: %s, k: %s, l: %s' % (data_str, k, l)
                print 'Algo error: %s' % error
                print 'NMI score: %s' % nmi_score
                print 'Rand score: %s' % rand_score
                print 'Davies score: %s' % davies_score
                print '-----------------------------------------------\n'



In [ ]: