In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import codecs

from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import silhouette_score

In [21]:
sys.setrecursionlimit(1000000000)

In [22]:
%matplotlib inline
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (16, 7)})

In [23]:
arena_news_stem_df = pd.read_pickle('arena_news_stem_df.pkl')
sport_news_stem_df = pd.read_pickle('sport_news_stem_df.pkl')
jovem_news_stem_df = pd.read_pickle('jovem_news_stem_df.pkl')
arena_news_df = pd.read_pickle('arena_news_df.pkl')
sport_news_df = pd.read_pickle('sport_news_df.pkl')
jovem_news_df = pd.read_pickle('jovem_news_df.pkl')

In [24]:
labels = np.array(len(arena_news_df)*[1] + len(sport_news_df)*[0])

In [25]:
count_vect = CountVectorizer(encoding='UTF-8',lowercase=False, min_df=2)
X = count_vect.fit_transform(arena_news_df['all'].tolist() + sport_news_df['all'].tolist())

X_train_norm_tfidf = TfidfTransformer(norm=u'l2', use_idf=True).fit_transform(X)
X_train_norm = TfidfTransformer(norm=u'l2', use_idf=False).fit_transform(X)

In [38]:
def to_csv(some_list, file_name, header):
    def to_str(st):
        if isinstance(st, str) or isinstance(st, unicode):
            return st
        else:
            return str(st)
    with codecs.open(file_name, 'w', 'utf-8') as file_stream:
        if len(header) != 0:
            file_stream.write(u','.join(header) + '\n')
        for item in some_list:
            if isinstance(item, tuple) or isinstance(item, list):
                line = u','.join(map(to_str, item)) + '\n'
            else:
                line = item + '\n'
            file_stream.write(line)
    file_stream.close()

In [28]:
def _big_s(x, center):
    len_x = len(x)
    total = 0

    for i in range(len_x):
        total += np.linalg.norm(x[i]-center)

    return total / len_x

def davies_bouldin_score(X, labels_pred, k_centers):
    num_clusters, _ = k_centers.shape
    big_ss = np.zeros([num_clusters], dtype=np.float64)
    d_eucs = np.zeros([num_clusters, num_clusters], dtype=np.float64)
    db = 0

    for k in range(num_clusters):
        samples_in_k_inds = np.where(labels_pred == k)[0]
        samples_in_k = X[samples_in_k_inds, :]
        big_ss[k] = _big_s(samples_in_k, k_centers[k])

    for k in range(num_clusters):
        for l in range(0, num_clusters):
            d_eucs[k, l] = np.linalg.norm(k_centers[k]-k_centers[l])

    for k in range(num_clusters):
        values = np.zeros([num_clusters-1], dtype=np.float64)
        for l in range(0, k):
            values[l] = (big_ss[k] + big_ss[l])/d_eucs[k, l]
        for l in range(k+1, num_clusters):
            values[l-1] = (big_ss[k] + big_ss[l])/d_eucs[k, l]

        db += np.max(values)
    res = db / num_clusters
    return res

def calculate_centroids_doc_mean(X, labels_pred, k):
    _, m = X.shape

    centroids = np.zeros((k, m))
    for k in range(k):
        samples_in_k_inds = np.where(labels_pred == k)[0]
        centroids[k, :] = X[samples_in_k_inds, :].mean(axis=0)

    return centroids

In [18]:
def onmtf(X, U, S, V):
    U = U * ((X.dot(V).dot(S.T)) / (U.dot(S).dot(V.T).dot(X.T).dot(U)))
    V = V * ((X.T.dot(U).dot(S)) / (V.dot(S.T).dot(U.T).dot(X).dot(V)))
    S = S * ((U.T.dot(X).dot(V)) / (U.T.dot(U).dot(S).dot(V.T).dot(V)))
    return U, S, V

def onm3f(X, U, S, V):
     U = U * (X.dot(V).dot(S.T)) / np.sqrt(U.dot(U.T).dot(X).dot(V).dot(S.T))
     V = V * (X.T.dot(U).dot(S)) / np.sqrt(V.dot(V.T).dot(X.T).dot(U).dot(S))
     S = S * (U.T.dot(X).dot(V)) / np.sqrt(U.T.dot(U).dot(S).dot(V.T).dot(V))
     return U, S, V

def nbvd(X, U, S, V):
     U = U * (X.dot(V).dot(S.T)) / U.dot(U.T).dot(X).dot(V).dot(S.T)
     V = V * (X.T.dot(U).dot(S)) / V.dot(V.T).dot(X.T).dot(U).dot(S)
     S = S * (U.T.dot(X).dot(V)) / U.T.dot(U).dot(S).dot(V.T).dot(V)
     return U, S, V
    
def matrix_factorization_clustering(X, k, l, factorization_func=onmtf, norm=False, num_iters=100):
    m, n = X.shape
    U = np.random.rand(m,k)
    S = np.random.rand(k,l)
    V = np.random.rand(n,l)

    if norm:
        X = normalize(X)

    for i in xrange(num_iters):
        U, S, V = factorization_func(X, U, S, V)
        error = np.sum((X - U.dot(S).dot(V.T)) ** 2)
        print error

    Du = np.diag(np.ones(m).dot(U))
    Dv = np.diag(np.ones(n).dot(V))

    U = U.dot( np.diag(S.dot(Dv).dot(np.ones(l))) )
    V = V.dot( np.diag(np.ones(k).dot(Du).dot(S)) )

    rows_ind = np.argmax(U, axis=1)
    cols_ind = np.argmax(V, axis=1)

    return U, S, V, rows_ind, cols_ind, error

In [29]:
def matrix_factorization_overlapping_bin(X, k, l, num_iters=50):
    def weights_initialization(X, n, m, k):
        shuffle_inds = np.random.permutation(n)
        cluster_end_ind = 0
        for i in xrange(k):
            cluster_init_ind = cluster_end_ind
            cluster_end_ind = round((i + 1) * n / k)
            X[shuffle_inds[cluster_init_ind : cluster_end_ind], i] = 1
        return X

    def calculate_block_matrix(X, F, G, S, k, l):
        for i in xrange(k):
            for j in xrange(l):
                S[i, j] = np.mean(X[F[:, i] == 1][:, G[i][:, j] == 1])
        where_are_NaNs = np.isnan(S)
        S[where_are_NaNs] = 0
        return S

    n, m = X.shape

    F = weights_initialization(np.zeros((n, k)), n, m, k)

    G = []
    for i in xrange(k):
        G.append( weights_initialization(np.zeros((m, l)), m, n, l) )

    S = np.random.rand(k, l)

    for iter_ind in xrange(num_iters):
        S = calculate_block_matrix(X, F, G, S, k, l)

        for i in xrange(k):
            F_t = F[F[:, i] == 1, :].dot(S)
            X_t = X[F[:, i] == 1, :]
            G[i] = np.zeros((m, l))
            for j in xrange(m):
                clust_len, _ = X_t.shape
                diff = F_t - X_t[:, j].reshape(clust_len, 1).dot(np.ones(l).reshape(1, l))
                errors = np.diag(diff.T.dot(diff))
                minV = np.min(errors)
                index = np.where(errors <= minV)[0]
                G[i][j, index[np.random.randint(len(index))]] = 1

#         S = calculate_block_matrix(X, F, G, S, k, l)

        G_t = np.zeros((k, m))
        for i in xrange(k):
            G_t[i, :] = S[i, :].dot(G[i].T)

        F = np.zeros((n, k))
        for j in xrange(n):
            diff = G_t - np.ones(k).reshape(k, 1).dot(X[j, :].reshape(1, m))
            errors = np.diag(diff.dot(diff.T))
            minV = np.min(errors)
            index = np.where(errors <= minV)[0]
            F[j, index[np.random.randint(len(index))]] = 1

    G_t = np.zeros((k, m))
    for i in xrange(k):
        G_t[i, :] = S[i, :].dot(G[i].T)

    error = np.sum((X - F.dot(G_t))**2)

    rows_ind = np.argmax(F, axis=1)

    return F, S, G, G_t, rows_ind, error

In [10]:
def fnmtf(X, k, l, num_iter=100, norm=False):
    m, n = X.shape

    U = np.random.rand(m,k)
    S = np.random.rand(k,l)
    V = np.random.rand(n,l)

    if norm:
        X = preprocessing.normalize(X)
    for i in xrange(num_iter):
        S = np.linalg.pinv(U.T.dot(U)).dot(U.T).dot(X).dot(V).dot(np.linalg.pinv(V.T.dot(V)))

        # solve subproblem to update V
        U_tilde = U.dot(S)
        V_new = np.zeros(n*l).reshape(n, l)
        for j in xrange(n):
            errors = np.zeros(l)
            for col_clust_ind in xrange(l):
                errors[col_clust_ind] = ((X[:][:, j] - U_tilde[:][:, col_clust_ind])**2).sum()
            ind = np.argmin(errors)
            V_new[j][ind] = 1

        # solve subproblem to update U
        V_tilde = S.dot(V.T)
        U_new = np.zeros(m*k).reshape(m, k)
        for i in xrange(m):
            errors = np.zeros(k)
            for row_clust_ind in xrange(k):
                errors[row_clust_ind] = ((X[i][:] - V_tilde[row_clust_ind][:])**2).sum()
            ind = np.argmin(errors)
            U_new[i][ind] = 1

        U = U_new
        V = V_new

    rows_ind = np.argmax(U, axis=1)
    cols_ind = np.argmax(V, axis=1)

    return U, S, V, rows_ind, cols_ind

In [30]:
def rand_score(labels_true, labels_pred):
    return 'Rand score: %s' % adjusted_rand_score(labels_true, labels_pred)

def sil_score(X, labels_pred):
    score = silhouette_score(X, labels_pred)
    return 'Silhouette score: %s' % score

def db_score(X, labels_pred, k_centers):
    return 'Davies-Bouldin index: %s' % davies_bouldin_score(X, labels_pred, k_centers)

In [37]:
best = 0.0
for _ in xrange(5):
    U_t, S_t, V_t, rows_ind_t, cols_ind_t, error = matrix_factorization_clustering(X_train_norm.toarray(), 2, 2, onmtf, num_iters=100)
    rand_sc = adjusted_rand_score(labels, rows_ind_t)
    if error > best:
        best = error
        U = U_t
        S = S_t
        V = V_t
        rows_ind = rows_ind_t
        cols_ind = cols_ind_t
    print 'tf norm: %s' % rand_score(labels, rows_ind_t)
    print 'tf norm: %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm: %s' % db_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
    print ''


186.741902488
185.833675238
185.693099312
185.599353019
185.504889692
185.400892837
185.278171558
185.130951472
184.955041599
184.749201875
184.518247024
184.271349346
184.019132961
183.767489715
183.515800414
183.259073162
182.995380559
182.732632633
182.485481309
182.261222987
182.065534839
181.902966969
181.777389669
181.684672559
181.608874339
181.556693989
181.515404887
181.477668871
181.443751737
181.410835458
181.374380653
181.337757057
181.305300997
181.278666031
181.257805636
181.242828064
181.233019546
181.227244761
181.224478377
181.223449215
181.223238902
181.223325046
181.223490169
181.223551021
181.223428336
181.222837786
181.222294998
181.221791873
181.221250517
181.220532554
181.219650353
181.21846176
181.217018767
181.215302307
181.213477324
181.211638334
181.210011816
181.20859071
181.207438903
181.206455295
181.20567136
181.204984018
181.204429572
181.203915105
181.203485686
181.203062493
181.202706059
181.202348684
181.202049461
181.201738799
181.201473597
181.201190042
181.200952764
181.200702126
181.200489178
181.200244924
181.200030924
181.199791715
181.199578372
181.199323324
181.19907779
181.198788143
181.198517398
181.198215554
181.197939429
181.197638353
181.197371149
181.197091267
181.196854998
181.196612245
181.196412503
181.196201801
181.196026191
181.195835704
181.195682562
181.195522287
181.195403241
181.195276296
181.195185251
181.195081034
tf norm: Rand score: 0.00724243706001
tf norm: Silhouette score: 0.0189158619069
tf norm: Davies-Bouldin index: 2.35589445583

186.772867501
185.852045586
185.611020487
185.415637093
185.204819691
184.951278114
184.640439311
184.265475799
183.829442047
183.345300302
182.831730885
182.301853354
181.760528101
181.231762064
180.760125804
180.367137349
180.043265668
179.772187749
179.516506991
179.256085896
179.049808755
178.905729617
178.813191431
178.75136274
178.69637601
178.638013067
178.59229569
178.561429214
178.541403024
178.525496819
178.512447185
178.500902712
178.489512917
178.479766195
178.472729328
178.467816344
178.464143911
178.46102521
178.458241799
178.455653966
178.453328383
178.451232599
178.449469644
178.447969194
178.446744714
178.445660322
178.444689158
178.443727183
178.442969888
178.442318114
178.441763185
178.441212658
178.440726087
178.440234665
178.439788668
178.439325142
178.438918063
178.438515198
178.438168844
178.437817702
178.43751859
178.437218612
178.436975681
178.436738091
178.436559066
178.436384275
178.436261913
178.43613686
178.436055439
178.435964232
178.435909604
178.435840726
178.435804284
178.435751335
178.435727843
178.435685887
178.435670661
178.435635516
178.435625142
178.435594159
178.435586674
178.435558369
178.435552621
178.435526017
178.435521171
178.435495535
178.435490981
178.435465798
178.435461128
178.435436064
178.435431019
178.435405854
178.435400254
178.435374818
178.435368515
178.435342706
178.43533575
178.435309903
178.435303036
178.435278223
tf norm: Rand score: 0.979999505051
tf norm: Silhouette score: 0.0318769388602
tf norm: Davies-Bouldin index: 5.16034645983

187.328979466
185.90373774
185.787445029
185.744631483
185.728609205
185.717060797
185.706638233
185.695726992
185.683798234
185.670389899
185.655110832
185.637489046
185.616961238
185.592820718
185.564164626
185.529830253
185.488302784
185.437608215
185.375177728
185.297709963
185.201065062
185.080296433
184.930007203
184.745319598
184.523700842
184.267480118
183.986021919
183.69573464
183.416510831
183.165585656
182.953025636
182.781958696
182.649191007
182.545684473
182.460761719
182.386584861
182.318890996
182.254496185
182.190722968
182.125459484
182.058471337
181.991090433
181.922582765
181.850595916
181.772728619
181.68709031
181.59198139
181.485900821
181.366162083
181.229646808
181.082554819
180.939949558
180.806381969
180.673321484
180.532216151
180.385083896
180.252160765
180.138233078
180.027164516
179.893709633
179.710194968
179.488960182
179.289023564
179.157613749
179.070393072
178.992452021
178.924995146
178.885378983
178.846893884
178.796055095
178.757448658
178.722304196
178.696335578
178.677363066
178.659682038
178.646162201
178.633247746
178.621677081
178.612780668
178.602858792
178.595356554
178.58925247
178.583771782
178.578591596
178.573926255
178.570729623
178.568402741
178.56628786
178.564437261
178.563140531
178.562016932
178.561005261
178.560001714
178.558762753
178.556151958
178.546490911
178.516657105
178.487655662
178.474967021
178.469603609
tf norm: Rand score: 0.960200080404
tf norm: Silhouette score: 0.0316191461355
tf norm: Davies-Bouldin index: 5.17468909427

186.828423148
185.835340994
185.765346143
185.73745533
185.721429931
185.70677221
185.690840421
185.672430381
185.650424359
185.623683863
185.590917927
185.550597435
185.500987849
185.440109516
185.365800103
185.275757154
185.167701586
185.039573384
184.889652166
184.71642724
184.518293451
184.293251012
184.038906482
183.753139438
183.436306545
183.094422753
182.739165971
182.383864534
182.041691083
181.731462076
181.470296646
181.270821567
181.137558925
181.053163165
180.996887654
180.956224658
180.924524225
180.898348693
180.874310169
180.850181686
180.82557923
180.800820587
180.776467006
180.752692339
180.731927563
180.715962504
180.7021293
180.688457004
180.673950175
180.659247703
180.645645036
180.633589119
180.622923495
180.613352852
180.604640369
180.596593458
180.589065813
180.58195272
180.575106329
180.568268183
180.560989874
180.552752207
180.543424292
180.533733913
180.524742548
180.516916124
180.510105652
180.504003357
180.498451308
180.493348905
180.488529076
180.483805944
180.478912137
180.474067309
180.469660428
180.46546051
180.461349356
180.457340601
180.453420204
180.449404542
180.44508289
180.440459404
180.435282239
180.429817004
180.424291509
180.418523068
180.412476395
180.406140982
180.399523385
180.392724545
180.385869883
180.379010486
180.372160882
180.365299482
180.358473876
180.351757794
180.345142616
180.33855268
180.3322108
180.326207733
tf norm: Rand score: 0.119946074664
tf norm: Silhouette score: 0.0229397984962
tf norm: Davies-Bouldin index: 4.49059085823

187.101715699
185.865155518
185.755140559
185.69197375
185.643771761
185.587136945
185.511126703
185.404734833
185.249422302
185.019364932
184.679013486
184.201180376
183.611619792
182.997144032
182.422814414
181.936772964
181.594674308
181.399631819
181.300082338
181.250467386
181.224875759
181.209843084
181.20032294
181.19508422
181.192914542
181.192289714
181.192279884
181.192406949
181.192492081
181.192494959
181.192478023
181.192469848
181.192461384
181.192424072
181.192401284
181.192430601
181.192515307
181.192613965
181.192706631
181.192780125
181.192839559
181.1928816
181.192913745
181.192932953
181.192946472
181.192950785
181.192952766
181.19294831
181.192943938
181.192934927
181.192927533
181.192916479
181.192907905
181.192896081
181.192887169
181.192875091
181.192866181
181.19285413
181.192845559
181.192833957
181.192826201
181.192815494
181.192808925
181.192799399
181.192794236
181.192786042
181.192782375
181.192775538
181.192773338
181.192767792
181.192766966
181.192762596
181.19276302
181.192759695
181.192761231
181.19275881
181.192761315
181.192759652
181.192762979
181.192761929
181.192765931
181.192765349
181.192769885
181.192769635
181.192774574
181.192774534
181.192779765
181.19277984
181.19278528
181.192785403
181.192791
181.192791135
181.192796864
181.192796997
181.192802846
181.19280297
181.192808931
181.192809036
181.192815092
181.19281516
tf norm: Rand score: 0.00724243706001
tf norm: Silhouette score: 0.0189158619069
tf norm: Davies-Bouldin index: 2.35589445583


In [32]:
def print_hist(U):
    _, k = U.shape
    U_norm = U[:, 0] / np.sum(U, axis=1)

    plt.title('U norm 0')
    plt.hist(U_norm, bins=100)
    plt.show()
print_hist(U)



In [33]:
def norm(U):
    _, k = U.shape
    return U / np.tile(np.sum(U, axis=1).T, (k,1)).T
def pairplot(U):
    sns.pairplot(pd.DataFrame(norm(U)))
pairplot(U)



In [34]:
print_hist(V)



In [35]:
pairplot(V)



In [45]:
print S
np.savetxt('onmtf_2x2_S.csv', S, delimiter=",")


[[ 0.01647651  0.00017802]
 [ 0.00107611  0.01018368]]

In [41]:
def top_k(arr, k, axis=0):
    top_inds = np.argsort(arr, axis=axis)[-k:]
    top_vals = np.sort(arr, axis=axis)[-k:]
    return top_inds, top_vals

def reverse(arr):
    return arr[::-1]

def top_k_words_term_cluster(vec, X, count_vect, k):
    sum_per_word = np.sum(X, axis=0)
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    correspondence_vals = [sum_per_word[i] for i in top_inds]

    top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))

    return top_pairs

V_norm = norm(V)
for i in xrange(2):
    print 'Top words for term cluster %s:' % i
    top_pairs = top_k_words_term_cluster(V[:, i], X_train_norm.toarray(), count_vect, 9999999)
    for w, v_value, tf_value in top_pairs[0:30]:
        print w, v_value, tf_value
    to_csv(top_pairs, 'onmtf_2x2_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
    print

# print 'Top words for middle cluster:'
# inds = np.where((V_norm[:, 0] >= 0.25) & (V_norm[:, 0] <= 0.75))[0]
# normalize = np.vectorize(lambda x: 1.0 - x if x < 0.5 else x, otypes=[np.float])
# new_V0 = normalize(V_norm[inds, 0])
# top_k_words_term_cluster(new_V0, X_train_norm.toarray(), count_vect, 30)


Top words for term cluster 0:
jogo 19.3527267394 17.409134665
jogos 15.6353496953 14.0184074837
playstation 6.35047727335 6.50286252979
feira 6.07173171202 6.63423662207
novo 5.79390446145 6.4397883303
equipe 5.55833915879 5.38402375813
minutos 5.2089078997 4.59140755144
xbox 5.08179498493 5.93517551309
dia 5.07335915865 5.32093077415
game 5.00866710716 5.40202523592
time 5.00674944051 5.21085736446
dois 4.83620090059 4.46835594418
real 4.81827687176 4.27962701698
apenas 4.51116813321 4.59242923649
gol 4.50542691225 3.91517632453
anos 4.45772025754 4.91571457583
tempo 4.44838081005 4.3141804196
brasileiro 4.38930985302 4.87632646275
console 4.36396050274 4.56307259533
jogador 4.3587845406 4.68028023127
final 4.34828341839 4.9373205725
ano 4.29161837095 4.75079310834
título 4.21163516811 4.31037511355
monetáriointerno 4.20360363283 4.81840908599
segundo 4.18209501695 4.64121947369
jogadores 4.1802252545 4.30228612023
paulo 4.1686529716 4.23639183759
games 4.03979737147 5.41506727974
partida 4.03762862007 3.73445248798
brasil 3.93566269372 4.75295749281

Top words for term cluster 1:
of 5.79936658361 6.66858864083
league 4.32280307865 3.75349938375
legends 4.28481307777 3.23393553266
riot 1.49326731098 1.27386780829
personagens 1.32248734024 1.5857127324
games 1.17938511064 5.41506727974
game 0.711428335873 5.40202523592
brasil 0.689588317028 4.75295749281
internacional 0.591851635421 0.873139796412
público 0.505442957967 1.63408328901
jogadores 0.487558805196 4.30228612023
american 0.453841085006 0.302258792403
express 0.453841085006 0.302258792403
desafio 0.45034133851 0.759196482236
servidor 0.408939934446 0.352430110856
executivo 0.404064650856 1.31299292066
cnb 0.399370180311 0.313660872869
cartões 0.37025631986 0.671351601022
maior 0.357078521244 2.46204157517
thrones 0.349344395211 0.275194704958
segundo 0.335632274021 4.64121947369
world 0.330014231984 1.16506836627
corpo 0.326499074854 0.60474390695
horas 0.325924262511 1.23511834154
acordo 0.316535505079 1.61184386755
final 0.313408294338 4.9373205725
times 0.308515737354 1.33205732189
débito 0.305689056195 0.205602867272
xfire 0.302236330882 0.204908953959
artigo 0.298703955624 0.548352600459


In [42]:
best_sil = 1e-10
best_davies = 1e10
for down_lim in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
    for up_lim in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
        new_labels_pred = rows_ind.copy()
        inds = np.where((norm(U)[:,0] >= down_lim) & (norm(U)[:,0] <= up_lim))[0]
        
        if len(inds) == 0:
            continue
        
        print 'Num elems in new cluster: %s' % len(inds)
        new_labels_pred[inds] = 2

        print 'cluster from ' + str(down_lim) + ' to ' + str(up_lim)
        print sil_score(X_train_norm, new_labels_pred)
        print db_score(X_train_norm.toarray(), new_labels_pred, calculate_centroids_doc_mean(X_train_norm.toarray(), new_labels_pred, 3))
        print ''
        davies = davies_bouldin_score(X_train_norm.toarray(), new_labels_pred, calculate_centroids_doc_mean(X_train_norm.toarray(), new_labels_pred, 3))
        sil = silhouette_score(X_train_norm, new_labels_pred)
        if sil > best_sil and len(inds) is not 0:
            best_sil = sil
        if davies < best_davies and len(inds) is not 0:
            best_davies = davies
#         print '|[%s, %s]| %s | %s |' % (down_lim, up_lim, davies, sil)
print 'Best sil score: %s' % best_sil
print 'Best davies score: %s' % best_davies


Num elems in new cluster: 1
cluster from 0.05 to 0.15
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.05 to 0.2
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.05 to 0.25
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.05 to 0.3
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.05 to 0.4
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 2
cluster from 0.05 to 0.5
Silhouette score: 0.00408375773937
Davies-Bouldin index: 2.080683275

Num elems in new cluster: 2
cluster from 0.05 to 0.6
Silhouette score: 0.00408375773937
Davies-Bouldin index: 2.080683275

Num elems in new cluster: 4
cluster from 0.05 to 0.7
Silhouette score: 0.00878431186025
Davies-Bouldin index: 2.84835797569

Num elems in new cluster: 7
cluster from 0.05 to 0.75
Silhouette score: 0.0105867710153
Davies-Bouldin index: 3.51123649784

Num elems in new cluster: 8
cluster from 0.05 to 0.8
Silhouette score: 0.0113055864956
Davies-Bouldin index: 3.64726759647

Num elems in new cluster: 11
cluster from 0.05 to 0.85
Silhouette score: 0.0113864136422
Davies-Bouldin index: 4.00029018315

Num elems in new cluster: 14
cluster from 0.05 to 0.9
Silhouette score: 0.0105436112202
Davies-Bouldin index: 4.12443512946

Num elems in new cluster: 22
cluster from 0.05 to 0.95
Silhouette score: 0.0122892943569
Davies-Bouldin index: 4.59497307099

Num elems in new cluster: 1
cluster from 0.1 to 0.15
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.1 to 0.2
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.1 to 0.25
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.1 to 0.3
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 1
cluster from 0.1 to 0.4
Silhouette score: -0.000181088765606
Davies-Bouldin index: 1.86861073407

Num elems in new cluster: 2
cluster from 0.1 to 0.5
Silhouette score: 0.00408375773937
Davies-Bouldin index: 2.080683275

Num elems in new cluster: 2
cluster from 0.1 to 0.6
Silhouette score: 0.00408375773937
Davies-Bouldin index: 2.080683275

Num elems in new cluster: 4
cluster from 0.1 to 0.7
Silhouette score: 0.00878431186025
Davies-Bouldin index: 2.84835797569

Num elems in new cluster: 7
cluster from 0.1 to 0.75
Silhouette score: 0.0105867710153
Davies-Bouldin index: 3.51123649784

Num elems in new cluster: 8
cluster from 0.1 to 0.8
Silhouette score: 0.0113055864956
Davies-Bouldin index: 3.64726759647

Num elems in new cluster: 11
cluster from 0.1 to 0.85
Silhouette score: 0.0113864136422
Davies-Bouldin index: 4.00029018315

Num elems in new cluster: 14
cluster from 0.1 to 0.9
Silhouette score: 0.0105436112202
Davies-Bouldin index: 4.12443512946

Num elems in new cluster: 22
cluster from 0.1 to 0.95
Silhouette score: 0.0122892943569
Davies-Bouldin index: 4.59497307099

Num elems in new cluster: 1
cluster from 0.15 to 0.5
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 1
cluster from 0.15 to 0.6
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 3
cluster from 0.15 to 0.7
Silhouette score: 0.011547597204
Davies-Bouldin index: 2.72957773717

Num elems in new cluster: 6
cluster from 0.15 to 0.75
Silhouette score: 0.0126870088805
Davies-Bouldin index: 3.43002103713

Num elems in new cluster: 7
cluster from 0.15 to 0.8
Silhouette score: 0.0132416203456
Davies-Bouldin index: 3.57610608777

Num elems in new cluster: 10
cluster from 0.15 to 0.85
Silhouette score: 0.0126714508254
Davies-Bouldin index: 3.95304426267

Num elems in new cluster: 13
cluster from 0.15 to 0.9
Silhouette score: 0.0117163142616
Davies-Bouldin index: 4.07122428504

Num elems in new cluster: 21
cluster from 0.15 to 0.95
Silhouette score: 0.013900961473
Davies-Bouldin index: 4.56908653041

Num elems in new cluster: 1
cluster from 0.2 to 0.5
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 1
cluster from 0.2 to 0.6
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 3
cluster from 0.2 to 0.7
Silhouette score: 0.011547597204
Davies-Bouldin index: 2.72957773717

Num elems in new cluster: 6
cluster from 0.2 to 0.75
Silhouette score: 0.0126870088805
Davies-Bouldin index: 3.43002103713

Num elems in new cluster: 7
cluster from 0.2 to 0.8
Silhouette score: 0.0132416203456
Davies-Bouldin index: 3.57610608777

Num elems in new cluster: 10
cluster from 0.2 to 0.85
Silhouette score: 0.0126714508254
Davies-Bouldin index: 3.95304426267

Num elems in new cluster: 13
cluster from 0.2 to 0.9
Silhouette score: 0.0117163142616
Davies-Bouldin index: 4.07122428504

Num elems in new cluster: 21
cluster from 0.2 to 0.95
Silhouette score: 0.013900961473
Davies-Bouldin index: 4.56908653041

Num elems in new cluster: 1
cluster from 0.25 to 0.5
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 1
cluster from 0.25 to 0.6
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 3
cluster from 0.25 to 0.7
Silhouette score: 0.011547597204
Davies-Bouldin index: 2.72957773717

Num elems in new cluster: 6
cluster from 0.25 to 0.75
Silhouette score: 0.0126870088805
Davies-Bouldin index: 3.43002103713

Num elems in new cluster: 7
cluster from 0.25 to 0.8
Silhouette score: 0.0132416203456
Davies-Bouldin index: 3.57610608777

Num elems in new cluster: 10
cluster from 0.25 to 0.85
Silhouette score: 0.0126714508254
Davies-Bouldin index: 3.95304426267

Num elems in new cluster: 13
cluster from 0.25 to 0.9
Silhouette score: 0.0117163142616
Davies-Bouldin index: 4.07122428504

Num elems in new cluster: 21
cluster from 0.25 to 0.95
Silhouette score: 0.013900961473
Davies-Bouldin index: 4.56908653041

Num elems in new cluster: 1
cluster from 0.3 to 0.5
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 1
cluster from 0.3 to 0.6
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 3
cluster from 0.3 to 0.7
Silhouette score: 0.011547597204
Davies-Bouldin index: 2.72957773717

Num elems in new cluster: 6
cluster from 0.3 to 0.75
Silhouette score: 0.0126870088805
Davies-Bouldin index: 3.43002103713

Num elems in new cluster: 7
cluster from 0.3 to 0.8
Silhouette score: 0.0132416203456
Davies-Bouldin index: 3.57610608777

Num elems in new cluster: 10
cluster from 0.3 to 0.85
Silhouette score: 0.0126714508254
Davies-Bouldin index: 3.95304426267

Num elems in new cluster: 13
cluster from 0.3 to 0.9
Silhouette score: 0.0117163142616
Davies-Bouldin index: 4.07122428504

Num elems in new cluster: 21
cluster from 0.3 to 0.95
Silhouette score: 0.013900961473
Davies-Bouldin index: 4.56908653041

Num elems in new cluster: 1
cluster from 0.4 to 0.5
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 1
cluster from 0.4 to 0.6
Silhouette score: 0.00587472951567
Davies-Bouldin index: 1.7871408732

Num elems in new cluster: 3
cluster from 0.4 to 0.7
Silhouette score: 0.011547597204
Davies-Bouldin index: 2.72957773717

Num elems in new cluster: 6
cluster from 0.4 to 0.75
Silhouette score: 0.0126870088805
Davies-Bouldin index: 3.43002103713

Num elems in new cluster: 7
cluster from 0.4 to 0.8
Silhouette score: 0.0132416203456
Davies-Bouldin index: 3.57610608777

Num elems in new cluster: 10
cluster from 0.4 to 0.85
Silhouette score: 0.0126714508254
Davies-Bouldin index: 3.95304426267

Num elems in new cluster: 13
cluster from 0.4 to 0.9
Silhouette score: 0.0117163142616
Davies-Bouldin index: 4.07122428504

Num elems in new cluster: 21
cluster from 0.4 to 0.95
Silhouette score: 0.013900961473
Davies-Bouldin index: 4.56908653041

Num elems in new cluster: 2
cluster from 0.5 to 0.7
Silhouette score: 0.0123776875452
Davies-Bouldin index: 2.3969361808

Num elems in new cluster: 5
cluster from 0.5 to 0.75
Silhouette score: 0.0128588249467
Davies-Bouldin index: 3.27529754625

Num elems in new cluster: 6
cluster from 0.5 to 0.8
Silhouette score: 0.0136080734369
Davies-Bouldin index: 3.4435522058

Num elems in new cluster: 9
cluster from 0.5 to 0.85
Silhouette score: 0.0128785320611
Davies-Bouldin index: 3.85988129691

Num elems in new cluster: 12
cluster from 0.5 to 0.9
Silhouette score: 0.0118835459197
Davies-Bouldin index: 3.99217812436

Num elems in new cluster: 20
cluster from 0.5 to 0.95
Silhouette score: 0.0141756494454
Davies-Bouldin index: 4.55916494997

Num elems in new cluster: 2
cluster from 0.6 to 0.7
Silhouette score: 0.0123776875452
Davies-Bouldin index: 2.3969361808

Num elems in new cluster: 5
cluster from 0.6 to 0.75
Silhouette score: 0.0128588249467
Davies-Bouldin index: 3.27529754625

Num elems in new cluster: 6
cluster from 0.6 to 0.8
Silhouette score: 0.0136080734369
Davies-Bouldin index: 3.4435522058

Num elems in new cluster: 9
cluster from 0.6 to 0.85
Silhouette score: 0.0128785320611
Davies-Bouldin index: 3.85988129691

Num elems in new cluster: 12
cluster from 0.6 to 0.9
Silhouette score: 0.0118835459197
Davies-Bouldin index: 3.99217812436

Num elems in new cluster: 20
cluster from 0.6 to 0.95
Silhouette score: 0.0141756494454
Davies-Bouldin index: 4.55916494997

Num elems in new cluster: 3
cluster from 0.7 to 0.75
Silhouette score: 0.0111085660065
Davies-Bouldin index: 2.71518866653

Num elems in new cluster: 4
cluster from 0.7 to 0.8
Silhouette score: 0.0122065771896
Davies-Bouldin index: 2.92759852347

Num elems in new cluster: 7
cluster from 0.7 to 0.85
Silhouette score: 0.0115275826632
Davies-Bouldin index: 3.51851231682

Num elems in new cluster: 10
cluster from 0.7 to 0.9
Silhouette score: 0.01066081014
Davies-Bouldin index: 3.70215196813

Num elems in new cluster: 18
cluster from 0.7 to 0.95
Silhouette score: 0.0138116545588
Davies-Bouldin index: 4.33329119339

Num elems in new cluster: 1
cluster from 0.75 to 0.8
Silhouette score: 0.00929856759788
Davies-Bouldin index: 1.89607324469

Num elems in new cluster: 4
cluster from 0.75 to 0.85
Silhouette score: 0.0095150065893
Davies-Bouldin index: 3.01052494399

Num elems in new cluster: 7
cluster from 0.75 to 0.9
Silhouette score: 0.008230206729
Davies-Bouldin index: 3.4182346417

Num elems in new cluster: 15
cluster from 0.75 to 0.95
Silhouette score: 0.012961456065
Davies-Bouldin index: 4.1657688389

Num elems in new cluster: 3
cluster from 0.8 to 0.85
Silhouette score: 0.0081605724865
Davies-Bouldin index: 2.75112459222

Num elems in new cluster: 6
cluster from 0.8 to 0.9
Silhouette score: 0.00693509758834
Davies-Bouldin index: 3.29125533608

Num elems in new cluster: 14
cluster from 0.8 to 0.95
Silhouette score: 0.0118411448878
Davies-Bouldin index: 4.25382508837

Num elems in new cluster: 3
cluster from 0.85 to 0.9
Silhouette score: 0.00277291510283
Davies-Bouldin index: 2.70880700951

Num elems in new cluster: 11
cluster from 0.85 to 0.95
Silhouette score: 0.0109170044704
Davies-Bouldin index: 4.11020690428

Num elems in new cluster: 8
cluster from 0.9 to 0.95
Silhouette score: 0.0122151147055
Davies-Bouldin index: 3.77745783953

Best sil score: 0.0141756494454
Best davies score: 1.7871408732
/System/Library/Frameworks/Python.framework/Versions/2.7/Extras/lib/python/numpy/core/_methods.py:55: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)
Cut Davies Silhouette
[0.05, 0.1] 5.57285157009 0.0109045112115
[0.05, 0.15] 5.71182779465 0.0119656192241
[0.05, 0.2] 5.94127110479 0.0130751820341
[0.05, 0.25] 6.05298958519 0.0165451762916
[0.05, 0.3] 6.19170173754 0.0165380440629
[0.05, 0.4] 6.32245257561 0.0168367278657
[0.05, 0.5] 6.26966353994 0.0167316193288
[0.05, 0.6] 6.40807743382 0.016857356078
[0.05, 0.7] 6.4455544414 0.0172006814584
[0.05, 0.75] 6.55786721717 0.0172403307824
[0.05, 0.8] 6.56712481723 0.0171239228272
[0.05, 0.85] 6.64891705295 0.0172794978116
[0.05, 0.9] 6.65734610999 0.0175368723638
[0.05, 0.95] 6.43470626189 0.0185023158634
[0.1, 0.15] 3.74938336864 0.0200320140581
[0.1, 0.2] 4.95423215656 0.0122445704199
[0.1, 0.25] 5.24774207861 0.0189533279511
[0.1, 0.3] 5.4775435174 0.0184107763245
[0.1, 0.4] 5.64061493743 0.0188122752387
[0.1, 0.5] 5.70074585431 0.0180512491603
[0.1, 0.6] 5.89695453958 0.0183086502181
[0.1, 0.7] 5.99123959399 0.0187545346923
[0.1, 0.75] 6.16413919406 0.0189229292149
[0.1, 0.8] 6.21459398725 0.0187118565694
[0.1, 0.85] 6.36202785265 0.0193215518523
[0.1, 0.9] 6.38301325195 0.0198651799831
[0.1, 0.95] 6.27753852895 0.0214916194895
[0.15, 0.2] 4.86700812091 0.0105439794849
[0.15, 0.25] 5.11926762452 0.0185657354485
[0.15, 0.3] 5.37908380917 0.0180149889451
[0.15, 0.4] 5.52561494797 0.0185036943009
[0.15, 0.5] 5.5918038188 0.0176895051053
[0.15, 0.6] 5.8429610543 0.0179170884173
[0.15, 0.7] 5.94529904844 0.0184202374396
[0.15, 0.75] 6.1260691135 0.0186400603245
[0.15, 0.8] 6.18021555466 0.0184370429761
[0.15, 0.85] 6.34353934871 0.0191550753882
[0.15, 0.9] 6.38099084216 0.0197350762573
[0.15, 0.95] 6.27056371935 0.0214928456367
[0.2, 0.25] 4.59715750094 0.0258680002387
[0.2, 0.3] 4.80974711933 0.0231307008972
[0.2, 0.4] 4.95393026335 0.0226034756124
[0.2, 0.5] 5.10066417371 0.0204652333684
[0.2, 0.6] 5.40739503099 0.0200695117979
[0.2, 0.7] 5.56437964818 0.0205194066316
[0.2, 0.75] 5.80028313436 0.0207765470706
[0.2, 0.8] 5.92046824713 0.0203277378932
[0.2, 0.85] 6.28634021964 0.0211087702694
[0.2, 0.9] 6.34304665175 0.0218428354181
[0.2, 0.95] 6.18437325622 0.0237885594771
[0.25, 0.3] 4.22985981229 0.0135897730311
[0.25, 0.4] 4.61305561043 0.0164386610319
[0.25, 0.5] 4.76145280384 0.0130370501955
[0.25, 0.6] 4.89577281732 0.015318260869
[0.25, 0.7] 4.96585257909 0.0168170647366
[0.25, 0.75] 5.28245473331 0.0178483728072
[0.25, 0.8] 5.49589674625 0.0174806253808
[0.25, 0.85] 6.01445885302 0.0196945682697
[0.25, 0.9] 6.11307966306 0.0208421309285
[0.25, 0.95] 6.00289644331 0.0236550415501
[0.3, 0.4] 4.18386814443 0.0177595898038
[0.3, 0.5] 4.42658568564 0.0124694379783
[0.3, 0.6] 4.6461742013 0.0160554383447
[0.3, 0.7] 4.78111868642 0.017754651206
[0.3, 0.75] 4.94213034874 0.0188523697376
[0.3, 0.8] 5.11398707258 0.0183449197684
[0.3, 0.85] 5.77885390779 0.0205368208888
[0.3, 0.9] 5.90212574162 0.021749591286
[0.3, 0.95] 5.86506247584 0.0245266195333
[0.4, 0.5] 3.75873542708 -0.00205947886867
[0.4, 0.6] 4.56282447318 0.0122479569878
[0.4, 0.7] 4.72491632079 0.0158173158729
[0.4, 0.75] 4.91515970836 0.0177338628519
[0.4, 0.8] 5.03705345485 0.0173025687048
[0.4, 0.85] 5.6129703548 0.0204270263155
[0.4, 0.9] 5.81215490242 0.0216658358824
[0.4, 0.95] 5.76329923942 0.0248179289182
[0.5, 0.6] 4.40042799038 0.0163728994807
[0.5, 0.7] 4.60320172881 0.0191154799039
[0.5, 0.75] 4.81979472845 0.0203328470184
[0.5, 0.8] 4.94125752597 0.0194989514475
[0.5, 0.85] 5.43863505151 0.0218455272703
[0.5, 0.9] 5.63343994389 0.022917219329
[0.5, 0.95] 5.65737212181 0.0256510868347
[0.6, 0.7] 4.19429867276 0.0207153509952
[0.6, 0.75] 4.5572477842 0.0219253916023
[0.6, 0.8] 4.71365451732 0.0203517172815
[0.6, 0.85] 5.25603888145 0.0221018824178
[0.6, 0.9] 5.5575547228 0.023039679348
[0.6, 0.95] 5.63857058938 0.0253660684729
[0.7, 0.75] 4.16919175321 0.0218116628449
[0.7, 0.8] 4.41973575661 0.0196048379769
[0.7, 0.85] 4.98999126621 0.0221993642761
[0.7, 0.9] 5.33295096418 0.0232778373442
[0.7, 0.95] 5.55578778726 0.0253850390862
[0.75, 0.8] 3.75847932539 0.013086650073
[0.75, 0.85] 4.8387599148 0.0219691396746
[0.75, 0.9] 5.06389873879 0.0233348449246
[0.75, 0.95] 5.42952171046 0.0255965371499
[0.8, 0.85] 4.80277799744 0.022240997867
[0.8, 0.9] 5.01524051316 0.0236135156815
[0.8, 0.95] 5.38384596029 0.0257300306828
[0.85, 0.9] 4.453963228 0.0227579527234
[0.85, 0.95] 5.21634727289 0.0240909201119
[0.9, 0.95] 5.07232266265 0.022501379554
  • Best sil score: 0.0258680002387
  • Best davies score: 3.74938336864

In [49]:
rows_inds_v1 = rows_ind.copy()
inds = np.where((norm(U)[:,0] >= 0.25) & (norm(U)[:,0] <= 0.75))[0]
rows_inds_v1[inds] = 2
rows_inds_v1


Out[49]:
array([0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [51]:
def top_k_words(vec, V, count_vect, k):
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    top_V_vals0 = [V[i, 0] for i in top_inds]
    top_V_vals1 = [V[i, 1] for i in top_inds]
    top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_vals))
    return top_pairs

for i in range(3):
    clust_inds = np.where(rows_inds_v1 == i)[0]
    sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
    print 'Top words for document cluster %s' % i
    top_pairs = top_k_words(sum_per_word, V, count_vect, 99999999)
    for word, v0_val, v1_val, tf_val in top_pairs[0:30]:
        print word, v0_val, v1_val, tf_val
    print
    to_csv(top_pairs, 'onmtf_2x2_doc_cluster_%s_words_top.csv' % i,
           ['word', 'V cluster 0 value', 'V cluster 1 value', 'tf norm value'])


Top words for document cluster 0
jogo 19.3527267394 1.87040747251e-05 16.1345006713
jogos 15.6353496953 1.54556249408e-23 13.7376085609
feira 6.07173171202 8.43601036208e-54 6.58016281503
playstation 6.35047727335 9.26734952961e-84 6.50286252979
novo 5.79390446145 3.34444164084e-25 6.2847611163
xbox 5.08179498493 4.02883885549e-45 5.93517551309
equipe 5.55833915879 8.9052424836e-19 5.13623838346
dia 5.07335915865 3.48601629221e-23 5.0844037314
anos 4.45772025754 4.26177288793e-48 4.87810570511
time 5.00674944051 0.111621677805 4.72817539153
brasileiro 4.38930985302 4.64107023359e-05 4.62058505134
game 5.00866710716 0.711428335873 4.59798879455
console 4.36396050274 7.26881670789e-34 4.56307259533
minutos 5.2089078997 5.05699831963e-75 4.5284134726
jogador 4.3587845406 1.11231646213e-39 4.51859855812
monetáriointerno 4.20360363283 8.70711420166e-05 4.39775037509
apenas 4.51116813321 1.45033620722e-08 4.39400299907
ano 4.29161837095 0.0741962269436 4.34858225763
dois 4.83620090059 1.74647660812e-28 4.23853229794
games 4.03979737147 1.17938511064 4.23125607384
final 4.34828341839 0.313408294338 4.20030301106
real 4.81827687176 9.98675099274e-52 4.19956294008
tempo 4.44838081005 4.21454055913e-37 4.19650747198
sony 3.9137459447 1.97619133916e-60 4.15619803355
segundo 4.18209501695 0.335632274021 4.14510109671
título 4.21163516811 3.22179918625e-31 4.12536426416
melhor 3.81526278733 1.05101279239e-16 3.9570933128
gol 4.50542691225 2.88735243741e-115 3.91517632453
brasil 3.93566269372 0.689588317028 3.90483081613
wii 3.82603791767 1.49389828645e-20 3.89686450689

Top words for document cluster 1
of 0.0717638578618 5.79936658361 3.44276503519
league 2.28266192086e-24 4.32280307865 2.87449080261
legends 2.12316642719e-33 4.28481307777 2.87449080261
personagens 0.225381163764 1.32248734024 0.924459259724
riot 1.69347727936e-25 1.49326731098 0.860873729451
games 4.03979737147 1.17938511064 0.627269494193
jogadores 4.1802252545 0.487558805196 0.537470823541
brasil 3.93566269372 0.689588317028 0.532893869374
game 5.00866710716 0.711428335873 0.519443722384
jogo 19.3527267394 1.87040747251e-05 0.466715596016
internacional 0.222644822546 0.591851635421 0.463425978424
segundo 4.18209501695 0.335632274021 0.433124298149
final 4.34828341839 0.313408294338 0.404494962627
desafio 0.260446946096 0.45034133851 0.365578286034
time 5.00674944051 0.111621677805 0.356693815259
temporada 2.49492117547 0.278320209837 0.35473394334
público 0.989515542722 0.505442957967 0.344167696829
cnb 7.33575108626e-40 0.399370180311 0.313660872869
american 1.57463354417e-42 0.453841085006 0.302258792403
express 1.14875519701e-42 0.453841085006 0.302258792403
milhões 1.68779444263 0.256910231326 0.296086552958
servidor 1.22735309815e-23 0.408939934446 0.289807681748
maior 2.14106603093 0.357078521244 0.286394734842
cartões 0.309962906073 0.37025631986 0.272367355351
campeonato 3.19917468066 0.0919783086263 0.269953391514
horas 0.927603815988 0.325924262511 0.263262095056
corpo 0.201995509548 0.326499074854 0.244883980922
world 0.678114248516 0.330014231984 0.24385891083
outubro 1.31151386297 0.262199637816 0.232090625721
videogames 1.35421820917 0.298070628482 0.216279248498

Top words for document cluster 2
of 0.0717638578618 5.79936658361 1.07168062821
jogo 19.3527267394 1.87040747251e-05 0.807918397675
the 3.49034659586 0.112794388662 0.684944476706
games 4.03979737147 1.17938511064 0.556541711709
league 2.28266192086e-24 4.32280307865 0.495516677649
and 0.777872332796 0.119105484923 0.49421510753
executivo 0.720441127876 0.404064650856 0.45
riot 1.69347727936e-25 1.49326731098 0.412994078835
company 0.193395912757 0.189903826372 0.400320384513
heroes 0.278603732718 0.240939329999 0.400320384513
iphone 0.501278669361 0.137800933821 0.377964473009
lançamento 2.24857238605 0.0469112405855 0.372278852426
paper 0.197949926043 0.0966305146265 0.363749492919
super 1.69032765678 0.0387084066556 0.345516677649
times 0.882093122913 0.308515737354 0.345516677649
final 4.34828341839 0.313408294338 0.332522598814
pen 0.185476391574 0.0897845193082 0.331133089266
wigan 0.135160828947 0.0656829165795 0.329292779969
device 0.138485995246 0.0810758895456 0.323925308056
brasil 3.93566269372 0.689588317028 0.315232807305
paulo 4.1686529716 1.39986127489e-08 0.3
empresa 1.43680388098 0.291446854541 0.293058155737
multiplayer 1.26404515108 0.0933964553421 0.286116311475
game 5.00866710716 0.711428335873 0.284592718992
legends 2.12316642719e-33 4.28481307777 0.27598815767
knights 0.157211254226 0.0716655894479 0.264906471413
edition 0.250148708054 0.110901017397 0.264906471413
monetáriointerno 4.20360363283 8.70711420166e-05 0.264906471413
companhia 1.54846415483 0.0837559251406 0.25
menos 1.80481270611 0.0898674469755 0.235752417659


In [60]:
best = 1e10
for _ in xrange(5):
    U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 3, 2, onmtf, num_iters=200)
    try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
    except:
        continue
    if err < best:
        best = err
        U_v2 = U_t
        S_v2 = S_t
        V_v2 = V_t
        rows_ind_v2 = rows_ind_t
        cols_ind_v2 = cols_ind_t

    print 'tf norm (3 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (3 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (3 clusters): %s' % dav_sc
    print ''


186.788029434
185.817753824
185.752988219
185.720605916
185.705301466
185.689326538
185.671233592
185.648633445
185.619951241
185.582805482
185.534095274
185.469286864
185.381833963
185.262094355
185.095854988
184.862370802
184.532235802
184.06652597
183.421616105
182.570813614
181.553035469
180.512382797
179.648646149
179.073448824
178.744195357
178.578415919
178.503513646
178.469384649
178.45321872
178.444949346
178.440765692
178.439196849
178.439076056
178.43951161
178.440044583
178.440472007
178.44075518
178.440879882
178.440897817
178.440869929
178.440841154
178.440813146
178.440798586
178.440784551
178.440774774
178.440754836
178.440731105
178.440691243
178.440642738
178.440574407
178.440495347
178.440395587
178.440285292
178.440155389
178.440017167
178.439861617
178.439699973
178.439525049
178.439351091
178.439170737
178.438994791
178.438812932
178.438635295
178.438452042
178.438273753
178.438090875
178.437913997
178.437733615
178.437560161
178.437384143
178.437215801
178.437045685
178.436883859
178.436720966
178.436566903
178.436412441
178.436267318
178.436122452
178.43598741
178.435853244
178.435729312
178.435606756
178.43549467
178.435384247
178.435284286
178.435186023
178.435097973
178.435011438
178.434934673
178.43485907
178.43479266
178.434726951
178.434669779
178.434612797
178.434563676
178.434514234
178.434472004
178.434428984
178.434392589
178.434355008
178.434323556
178.434290606
178.434263371
178.434234379
178.434210718
178.434185051
178.434164345
178.434141401
178.434123083
178.434102334
178.434085924
178.434066935
178.434052045
178.434034469
178.434020798
178.43400437
178.433991695
178.433976225
178.43396439
178.433949755
178.433938668
178.4339248
178.43391442
178.433901295
178.433891617
178.433879243
178.433870286
178.433858687
178.433850478
178.433839681
178.433832245
178.433822267
178.433815617
178.43380646
178.43380059
178.433792238
178.433787122
178.433779538
178.43377513
178.43376826
178.4337645
178.433758277
178.433755093
178.433749438
178.433746752
178.433741584
178.433739312
178.433734548
178.433732609
178.433728166
178.433726481
178.433722281
178.433720775
178.433716746
178.433715349
178.433711424
178.433710073
178.43370619
178.433704827
178.433700933
178.433699507
178.43369555
178.433694016
178.433689953
178.433688269
178.433684062
178.433682193
178.433677808
178.433675722
178.433671128
178.433668798
178.43366397
178.433661373
178.433656289
178.433653404
178.433648046
178.433644857
178.433639208
178.433635703
178.433629752
178.433625919
178.433619657
178.433615489
178.43360891
178.433604403
178.433597504
178.433592658
178.433585438
178.433580253
178.433572716
178.433567198
178.433559349
178.433553505
178.433545353
178.433539192
178.433530748
178.433524283
178.433515561
178.433508807
178.433499821
tf norm (3 clusters): Rand score: 0.970148507425
tf norm (3 clusters): Silhouette score: 0.013394055043
tf norm (3 clusters): 0.919065990395

186.997802523
185.819626206
185.501857913
185.210290967
184.885893545
184.513943564
184.10291113
183.684724762
183.297797007
182.97151208
182.718887047
182.539070729
182.417603724
182.336707114
182.280121985
182.237701453
182.203122738
182.17344645
182.146096311
182.120226332
182.094899291
182.069903678
182.044651937
182.019101371
181.992806428
181.965616724
181.937029192
181.906852329
181.87476217
181.840625626
181.804098739
181.764790121
181.722111294
181.675288625
181.623369088
181.565112762
181.499116806
181.423511487
181.336740225
181.238584518
181.130116298
181.011504902
180.882304572
180.742958125
180.595033502
180.444243186
180.299449507
180.161858729
180.024492561
179.873397411
179.699035778
179.51349018
179.358794356
179.248031953
179.17389114
179.125824741
179.089786825
179.044355211
179.001371063
178.931190756
178.840412547
178.79556699
178.76351565
178.719810209
178.694995745
178.680711098
178.672083355
178.66683621
178.661375073
178.655513878
178.645086505
178.63575703
178.631155176
178.625957181
178.620603599
178.616473181
178.611106827
178.607608712
178.605072717
178.600729974
178.592031653
178.580776278
178.569937415
178.565400216
178.549007953
178.533310312
178.527135376
178.521780773
178.51615247
178.510646155
178.504550326
178.499922338
178.495708067
178.491694877
178.488309784
178.485079649
178.482093941
178.479293701
178.476646196
178.47432982
178.472205396
178.470271775
178.468565667
178.467100404
178.465823718
178.464728227
178.463683687
178.46274189
178.461896377
178.460959248
178.460018053
178.459275088
178.458669557
178.458215962
178.457793403
178.457389971
178.456973614
178.456596237
178.456239552
178.45591556
178.455580003
178.455246255
178.454874922
178.454482274
178.454031451
178.453547262
178.453034671
178.452547622
178.452065833
178.451601768
178.45112482
178.450665568
178.450205216
178.449753852
178.449285008
178.448860976
178.448469894
178.448118
178.447777829
178.447467375
178.447168331
178.446900464
178.446639805
178.446399211
178.446156168
178.445932022
178.445712377
178.445505931
178.445290695
178.445073729
178.44482936
178.444593983
178.444365782
178.4441505
178.443931345
178.443723512
178.443512819
178.44331046
178.443097599
178.442880491
178.442639543
178.442387515
178.442125713
178.441890956
178.441689575
178.441538802
178.441418443
178.44132721
178.441241553
178.441166007
178.441083829
178.441002712
178.440908017
178.440808751
178.440693551
178.440575794
178.440449344
178.440330056
178.440211128
178.440104834
178.439999688
178.439903261
178.439802911
178.439706861
178.439602269
178.439495555
178.439374537
178.439248417
178.439109545
178.438971505
178.438830627
178.438702157
178.438581706
178.438481005
178.438391315
178.438319851
178.438255217
178.438203265
178.438153217
178.438111375
tf norm (3 clusters): Rand score: 0.951046944683
tf norm (3 clusters): Silhouette score: 0.00816378871819
tf norm (3 clusters): 5.19042645696

186.722923963
185.773521675
185.676112776
185.617812444
185.561319106
185.493061168
185.405807644
185.28995224
185.134521685
184.924826307
184.647396063
184.297506408
183.891538674
183.467987564
183.071924489
182.734175293
182.461728697
182.243539058
182.064828988
181.914152112
181.781452912
181.658257826
181.537433836
181.413325444
181.280812838
181.134544352
180.970308155
180.785871104
180.58138124
180.359034253
180.122926422
179.880449298
179.644724065
179.434727788
179.268562139
179.147565022
179.058512678
178.985788791
178.918329557
178.860273356
178.814299819
178.765981453
178.727756463
178.70791798
178.695410476
178.681429258
178.668999628
178.663772864
178.661818073
178.660425365
178.658375166
178.653445127
178.645469258
178.640456231
178.636171253
178.627276522
178.617970878
178.614130611
178.612103407
178.609390326
178.604982445
178.599972212
178.596570323
178.594727969
178.592333177
178.583575229
178.567475093
178.558881164
178.554502398
178.549684727
178.535278344
178.515428847
178.504044335
178.495195724
178.488345793
178.484542302
178.482710759
178.481685382
178.480935904
178.480248926
178.479516976
178.478026012
178.47680532
178.475939775
178.475045559
178.474106995
178.473123996
178.47208229
178.470983726
178.469810403
178.468557213
178.467209336
178.465796429
178.464303849
178.462740498
178.461097876
178.459399625
178.457647396
178.455872033
178.454076919
178.452302074
178.450563842
178.44889905
178.447307378
178.44581953
178.444427877
178.44315329
178.441977701
178.440915244
178.439944433
178.439077969
178.438292382
178.43759765
178.436968906
178.436415979
178.435915189
178.435476924
178.435078781
178.43473192
178.434415166
178.434140205
178.43388716
178.433666251
178.433447889
178.433202188
178.432885192
178.43266078
178.432514664
178.43240506
178.432300521
178.432208541
178.432104662
178.432004689
178.431920442
178.431859749
178.431799936
178.431752063
178.431701418
178.431659806
178.431610225
178.431564361
178.43151873
178.431487791
178.431453826
178.431427753
178.431396035
178.431371143
178.431339487
178.431311077
178.431267832
178.431220576
178.431171776
178.431145837
178.431124661
178.431114518
178.431100687
178.431094717
178.431083974
178.431080487
178.431071827
178.431070039
178.431062799
178.431062156
178.431055866
178.431055975
178.431050299
178.43105087
178.431045545
178.431046336
178.431041124
178.431041897
178.43103654
178.431037009
178.43103118
178.431030972
178.431024248
178.431022903
178.431014808
178.431011886
178.431002087
178.430997474
178.430986181
178.430980489
178.430968702
178.430963177
178.430952139
178.430947757
178.430938002
178.43093483
178.430926057
178.430923583
178.430915203
178.430912851
178.430904346
178.430901661
178.43089262
178.430889208
178.43087922
178.430874637
178.430863229
tf norm (3 clusters): Rand score: 0.961398574269
tf norm (3 clusters): Silhouette score: 0.0178990324695
tf norm (3 clusters): 5.17250473517

186.68001584
185.778924475
185.606165974
185.468186394
185.285643976
185.0305308
184.665914593
184.166860588
183.531078566
182.801080179
182.049843722
181.328393803
180.670909025
180.104585885
179.634451502
179.259440693
178.981067787
178.790165464
178.66637998
178.586590022
178.533961399
178.498072669
178.472874563
178.454566033
178.440808633
178.429991721
178.421185281
178.413760538
178.407373361
178.401742437
178.396739974
178.392214744
178.388128195
178.384395383
178.381014582
178.377920073
178.375116548
178.37254198
178.37020291
178.368043154
178.366074802
178.364249558
178.362584544
178.361036938
178.359626011
178.358312174
178.357115674
178.355999335
178.354983944
178.354034289
178.353171598
178.352362394
178.351628378
178.350937605
178.350312255
178.34972167
178.349188466
178.348682986
178.348228224
178.347795275
178.347407478
178.347036475
178.346705945
178.346387915
178.346106409
178.345833698
178.345594165
178.345360198
178.345156546
178.344955613
178.344782525
178.344609622
178.344462436
178.344313184
178.344187836
178.344058441
178.343951435
178.343838656
178.343747029
178.343648138
178.343569418
178.343482151
178.343414303
178.343336803
178.34327816
178.343208903
178.3431581
178.343095826
178.343051727
178.342995379
178.342957023
178.342905692
178.342872263
178.342825205
178.342796109
178.342752903
178.342727979
178.342688698
178.342668213
178.342633132
178.342617247
178.342586285
178.342574717
178.342547419
178.342539659
178.342515486
178.342511034
178.342489496
178.342487906
178.342468565
178.342469438
178.342451896
178.342454874
178.342438773
178.342443541
178.342428572
178.342434864
178.342420765
178.342428367
178.342414928
178.342423676
178.342410736
178.342420516
178.342407959
178.342418698
178.342406449
178.342418112
178.342406127
178.342418706
178.342406963
178.34242047
178.342408961
178.34242342
178.342412147
178.342427591
178.342416567
178.342433042
178.342422288
178.342439848
178.342429394
178.342448103
178.342437987
178.342457914
178.342448178
178.342469398
178.342460088
178.34248268
178.342473844
178.342497887
178.342489572
178.342515147
178.342507398
178.342534582
178.342527438
178.342556302
178.342549797
178.342580405
178.342574559
178.342606965
178.342601792
178.342636042
178.342631546
178.34266768
178.34266386
178.342701914
178.342698764
178.342738773
178.342736287
178.342778285
178.342776456
178.342820481
178.342819304
178.342865397
178.34286487
178.342913075
178.342913198
178.342963558
178.342964325
178.34301688
178.343018282
178.343073067
178.343075088
178.343132137
178.343134758
178.343194102
178.343197303
178.343258971
178.343262725
178.343326742
178.34333102
178.343397406
178.34340217
178.343470942
178.343476148
178.343547315
178.34355291
178.343626472
178.343632394
178.343708344
178.343714518
tf norm (3 clusters): Rand score: 0.702022140535
tf norm (3 clusters): Silhouette score: 0.0170413505214
tf norm (3 clusters): 4.59279755076

186.747594206
185.861466582
185.747838861
185.692480174
185.647827544
185.60101934
185.545583657
185.476816385
185.389442876
185.277636943
185.134917945
184.955336867
184.735155544
184.475439117
184.183939167
183.87518519
183.567323961
183.270811234
182.981546244
182.68747423
182.377377256
182.046729933
181.703028216
181.365418399
181.051575229
180.768764549
180.516290965
180.293855606
180.105353244
179.927122624
179.716836843
179.503832943
179.319304843
179.170688709
179.048573825
178.9376558
178.838701173
178.762080545
178.713445326
178.682037853
178.653108114
178.614059704
178.576691922
178.555014614
178.527540167
178.506979584
178.494978693
178.485894081
178.478754965
178.473402557
178.468334818
178.464135015
178.460943341
178.457916155
178.455864353
178.45427457
178.452674912
178.451576729
178.450581872
178.449509382
178.448744884
178.448193191
178.447747676
178.447193568
178.446428163
178.445672369
178.445114108
178.444700395
178.444363468
178.444102291
178.443851365
178.44358967
178.443306176
178.443022695
178.442733185
178.442449775
178.442157729
178.441870442
178.441584067
178.441309687
178.441033148
178.440758538
178.440463047
178.440141545
178.439772337
178.439366457
178.438940403
178.438542657
178.438192357
178.437903241
178.437656187
178.437445628
178.437251736
178.437074811
178.436901819
178.436738313
178.436575122
178.436419358
178.436262359
178.436111013
178.435956938
178.435806443
178.435650548
178.435495506
178.435333684
178.435172818
178.435006484
178.434841995
178.434672488
178.43450517
178.434333751
178.434165824
178.433995715
178.433831227
178.433666952
178.433510376
178.433355506
178.433208797
178.433063418
178.432925047
178.432786645
178.432653957
178.43252032
178.432391733
178.432261746
178.432136327
178.432009024
178.431885818
178.431760447
178.431639036
178.431515425
178.431395712
178.431273695
178.431155396
178.431034579
178.430917274
178.43079728
178.430680677
178.430561317
178.430445417
178.430327005
178.430212483
178.430095862
178.429983437
178.429869022
178.429758818
178.429646534
178.429538389
178.429428059
178.429321829
178.429213348
178.429108975
178.429002331
178.428899865
178.42879517
178.428694796
178.428592311
178.428494362
178.428394468
178.428299352
178.428202448
178.428110535
178.428016941
178.427928502
178.427838442
178.427753669
178.427667308
178.427586357
178.427503846
178.42742687
178.427348361
178.42727551
178.427201145
178.427132557
178.427062456
178.426998239
178.426932497
178.426872733
178.426811412
178.426756145
178.426699264
178.426648491
178.426596018
178.426549686
178.426501545
178.426459564
178.426415652
178.426377911
178.426338108
178.426304483
178.42626865
178.426238987
178.426206949
178.426181053
178.426152587
178.426130214
178.426105058
178.426085935
178.426063808
178.426047657
tf norm (3 clusters): Rand score: 0.898548029909
tf norm (3 clusters): Silhouette score: 0.0169536573097
tf norm (3 clusters): 5.77317010321


In [62]:
pairplot(U_v2)



In [63]:
pairplot(V_v2)



In [64]:
def top_k_words_term_cluster(vec, X, count_vect, k):
    sum_per_word = np.sum(X, axis=0)
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    correspondence_vals = [sum_per_word[i] for i in top_inds]

    top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))

    return top_pairs

V_norm_v2 = norm(V_v2)
for i in xrange(2):
    print 'Top words for term cluster %s:' % i
    top_pairs = top_k_words_term_cluster(V_v2[:, i], X_train_norm.toarray(), count_vect, 9999999)
    for w, v_value, tf_value in top_pairs[0:30]:
        print w, v_value, tf_value
    to_csv(top_pairs, 'onmtf_3x2_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
    print

# print 'Top words for middle cluster:'
# inds = np.where((V_v2[:, 0] >= 0.1) & (V_v2[:, 0] <= 0.9))[0]
# normalize = np.vectorize(lambda x: 1.0 - x if x < 0.5 else x, otypes=[np.float])
# new_V0_v2 = normalize(V_norm_v2[inds, 0])
# print_top_k_words_term_cluster(new_V0_v2, X_train_norm.toarray(), count_vect, 30)


Top words for term cluster 0:
minutos 6.0219654876 4.59140755144
time 5.56443795011 5.21085736446
gol 5.44564709792 3.91517632453
equipe 5.43591353655 5.38402375813
feira 5.0468342517 6.63423662207
real 4.8959571162 4.27962701698
final 4.45302276018 4.9373205725
partida 4.35378666593 3.73445248798
anos 4.33175831968 4.91571457583
jogador 4.29710639867 4.68028023127
técnico 4.00316589002 3.56462451803
bola 4.00275093828 2.91680857213
breno 3.72058594244 3.34439260137
madrid 3.71996290024 2.75754288117
tempo 3.71033144923 4.3141804196
segundo 3.6439592259 4.64121947369
brasileiro 3.63295121434 4.87632646275
gols 3.62339025999 2.69286933272
espanha 3.61766825663 2.9656773629
paulo 3.59209003717 4.23639183759
casa 3.5807565964 3.40081430181
barcelona 3.57183286696 2.79061228958
campeonato 3.54549201074 3.42909922129
clube 3.53009271783 3.15377741762
dois 3.49856164701 4.46835594418
euro 3.38295209229 3.4981714608
atacante 3.37084143631 2.88981319147
vitória 3.19991256454 3.13423114506
espanhol 3.09922458064 2.41562175014
título 3.07270699814 4.31037511355

Top words for term cluster 1:
jogos 15.6021053817 14.0184074837
jogo 15.1486973689 17.409134665
playstation 7.03905261552 6.50286252979
of 6.59900252646 6.66858864083
xbox 5.88443728237 5.93517551309
game 5.73830268997 5.40202523592
games 5.49220582005 5.41506727974
novo 5.13592376453 6.4397883303
console 4.90395524015 4.56307259533
wii 4.49740361187 3.89686450689
sony 4.35586512114 4.15619803355
the 3.95270818645 4.10855529243
brasil 3.89121630274 4.75295749281
nintendo 3.56303369844 3.27248181662
league 2.91691472804 3.75349938375
legends 2.89624919289 3.23393553266
one 2.76835602248 2.70936450188
lançado 2.7524348663 2.68510123945
ps 2.69766094543 2.46754638865
apenas 2.68871590737 4.59242923649
além 2.67367868207 3.65357593645
usmonetáriointerno 2.61202017289 2.62675989352
arena 2.59903788163 3.02027939557
nova 2.57055086397 3.08765693943
pessoas 2.48887070041 2.5572955834
lançamento 2.45671343124 2.57362895081
jogadores 2.31307344785 4.30228612023
versão 2.27686426579 1.95832779588
microsoft 2.227869462 2.27360974308
mil 2.10094441582 2.67438462566


In [65]:
print S_v2
np.savetxt('onmtf_3x2_S.csv', S_v2, delimiter=",")


[[  5.79564771e-13   1.10211683e-02]
 [  1.15484454e-02   5.41819523e-07]
 [  3.40425870e-03   5.84742474e-03]]

In [76]:
def top_k_words(vec, V, count_vect, k):
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    top_V_vals0 = [V[i, 0] for i in top_inds]
    top_V_vals1 = [V[i, 1] for i in top_inds]
    top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_vals))
    return top_pairs

for i in range(3):
    clust_inds = np.where(rows_ind_v2 == i)[0]
    sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
    print 'Top words for document cluster %s' % i
    top_pairs = top_k_words(sum_per_word, V_v2, count_vect, 99999999)
    for word, v0_val, v1_val, tf_val in top_pairs[0:30]:
        print word, v0_val, v1_val, tf_val
    print
    to_csv(top_pairs, 'onmtf_3x2_doc_cluster_%s_words_top.csv' % i,
           ['word', 'V cluster 0 value', 'V cluster 1 value', 'tf norm value'])


Top words for document cluster 0
jogos 5.2628358465e-37 15.6021053817 9.76519927085
jogo 2.46034446107 15.1486973689 9.59369235669
playstation 1.55471293537e-137 7.03905261552 5.62782221729
of 6.11592831828e-53 6.59900252646 5.52687950299
xbox 2.07979873171e-150 5.88443728237 5.2046688083
novo 0.111285297763 5.13592376453 4.13033570305
wii 9.28801863374e-161 4.49740361187 3.86319063428
console 1.02870520391e-119 4.90395524015 3.635277187
sony 1.98897820995e-128 4.35586512114 3.55059854352
games 1.26106308682e-53 5.49220582005 3.44803608228
game 2.48202446885e-51 5.73830268997 3.37042744254
nintendo 5.14765332522e-133 3.56303369844 3.15125677327
the 1.60676425646e-77 3.95270818645 2.98369183214
lançado 3.03880564275e-47 2.7524348663 2.39865415406
legends 8.48662844341e-25 2.89624919289 2.32200005374
one 1.33822556732e-154 2.76835602248 2.30525013338
league 0.141365629411 2.91691472804 2.23854348135
ps 1.15788756457e-132 2.69766094543 2.09249974015
nova 1.88454541216e-09 2.57055086397 1.95358144755
brasil 0.483438180965 3.89121630274 1.94945446305
usmonetáriointerno 4.04921863925e-46 2.61202017289 1.9440557514
lançamento 3.09611360301e-68 2.45671343124 1.93669732095
além 0.499201417365 2.67367868207 1.89820950661
arena 7.53737117076e-16 2.59903788163 1.86248741022
microsoft 5.65288697557e-128 2.227869462 1.80337020087
versão 5.85243382071e-114 2.27686426579 1.66240768624
jogadores 2.04672267472 2.31307344785 1.6351666895
site 1.1176304107e-06 2.08928994309 1.54737799716
apenas 1.53279906253 2.68871590737 1.54562096249
controle 3.97058352662e-97 1.87738858027 1.47889151455

Top words for document cluster 1
feira 5.0468342517 0.629990309184 4.42044569798
time 5.56443795011 3.20026751796e-41 4.37390358506
equipe 5.43591353655 2.34136747667e-07 4.26289447131
minutos 6.0219654876 1.09691490057e-91 4.21996069091
anos 4.33175831968 9.94326584683e-09 3.93070656528
gol 5.44564709792 6.54540727217e-138 3.85918707343
jogador 4.29710639867 5.38618819254e-10 3.74544501911
jogo 2.46034446107 15.1486973689 3.73355485596
brasileiro 3.63295121434 0.534168208006 3.61670352756
final 4.45302276018 0.085599619888 3.5722733372
real 4.8959571162 1.00967517653e-20 3.48278546656
ufc 2.18540878353 2.45111986224e-23 3.43490393195
técnico 4.00316589002 1.83667406033e-88 3.38888595131
breno 3.72058594244 1.1327669369e-255 3.34439260137
euro 3.38295209229 4.93691650123e-132 3.27025030788
partida 4.35378666593 4.08743901553e-44 3.17134787851
futebol 2.9961265318 7.17447389518e-73 3.14471826693
campeão 2.80742519866 1.79914745874e-48 3.13847171673
segundo 3.6439592259 0.704858377625 3.02222133762
clube 3.53009271783 2.63173642422e-72 2.98580966434
casa 3.5807565964 5.07952052719e-46 2.94803563109
vitória 3.19991256454 3.35895355187e-65 2.91065019625
tempo 3.71033144923 0.490034624597 2.89856169208
espanha 3.61766825663 2.41810020467e-99 2.88823467934
paulo 3.59209003717 0.385118225588 2.87693032411
campeonato 3.54549201074 1.45340764933e-36 2.87123342784
atacante 3.37084143631 2.01762405068e-93 2.85642416331
bola 4.00275093828 8.90354727146e-130 2.82743029287
título 3.07270699814 0.79763466804 2.82727360066
dois 3.49856164701 1.02035038905 2.70202442624

Top words for document cluster 2
jogo 2.46034446107 15.1486973689 4.08188745235
jogos 5.2628358465e-37 15.6021053817 2.96834334455
brasil 0.483438180965 3.89121630274 1.99578640958
game 2.48202446885e-51 5.73830268997 1.98907346783
dia 2.56541330706 2.09057274979 1.97618757408
games 1.26106308682e-53 5.49220582005 1.96703119746
monetáriointerno 2.12594621178 1.83878728696 1.41221471973
evento 0.278220411531 1.66472387367 1.3407942637
apenas 1.53279906253 2.68871590737 1.33513363299
ano 2.11452298506 1.95530622151 1.23954449477
flappy 0.0460718434821 0.885224065858 1.20950203913
feira 5.0468342517 0.629990309184 1.20601812583
mil 0.282521937841 2.10094441582 1.16985899043
of 6.11592831828e-53 6.59900252646 1.14170913783
league 0.141365629411 2.91691472804 1.140931153
paulo 3.59209003717 0.385118225588 1.12115232925
novo 0.111285297763 5.13592376453 1.11069544411
companhia 0.0896722119515 1.43299992898 1.1105265618
pessoas 0.00289913438819 2.48887070041 1.10389223686
final 4.45302276018 0.085599619888 1.07834036268
durante 1.27707262388 1.58148477782 1.07486122842
the 1.60676425646e-77 3.95270818645 1.06878624488
empresa 8.59829720815e-08 1.58975030899 0.980759596562
console 1.02870520391e-119 4.90395524015 0.927795408332
legends 8.48662844341e-25 2.89624919289 0.911935478922
playstation 1.55471293537e-137 7.03905261552 0.875040312498
bird 0.0336498729462 0.647706799824 0.841834376893
melhor 1.86939477366 1.68453968117 0.834733989655
dois 3.49856164701 1.02035038905 0.82573960649
milhões 1.73174345177 0.174279799875 0.817192811836


In [61]:
print 'Num elems in cluster 0: %s' % np.sum(rows_ind_v2 == 0)
print 'Num elems in cluster 1: %s' % np.sum(rows_ind_v2 == 1)
print 'Num elems in cluster 2: %s' % np.sum(rows_ind_v2 == 2)
print rows_ind_v2
print np.where(rows_ind_v2 == 0)

for url in sport_news_df.ix[np.where(rows_ind_v2 == 0)[0]-100].url:
    print url


Num elems in cluster 0: 70
Num elems in cluster 1: 93
Num elems in cluster 2: 37
[2 2 2 2 2 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 2 0 2 0 2 0 2 0 0
 2 0 2 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0
 2 0 2 0 2 0 0 0 2 2 0 0 0 0 0 0 2 0 2 0 2 2 0 0 2 2 1 1 1 1 1 1 2 1 1 2 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1]
(array([ 5,  6,  8,  9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
       24, 27, 29, 31, 33, 35, 36, 38, 40, 41, 42, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 68,
       69, 70, 71, 73, 75, 77, 79, 80, 81, 84, 85, 86, 87, 88, 89, 91, 93,
       96, 97]),)
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan

In [67]:
inds = np.where(rows_ind_v2 == 0)[0]
plt.hist(norm(U)[inds, 0], bins=50)

inds = np.where(rows_ind_v2 == 2)[0]
plt.hist(norm(U)[inds, 0], bins=50)

inds = np.where(rows_ind_v2 == 1)[0]
plt.hist(norm(U)[inds, 0], bins=50)


Out[67]:
(array([  1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   1.,   0.,   0.,   0.,  91.]),
 array([ 0.65752947,  0.66437888,  0.67122829,  0.6780777 ,  0.68492711,
         0.69177652,  0.69862593,  0.70547534,  0.71232476,  0.71917417,
         0.72602358,  0.73287299,  0.7397224 ,  0.74657181,  0.75342122,
         0.76027063,  0.76712004,  0.77396945,  0.78081886,  0.78766827,
         0.79451768,  0.80136709,  0.8082165 ,  0.81506591,  0.82191532,
         0.82876474,  0.83561415,  0.84246356,  0.84931297,  0.85616238,
         0.86301179,  0.8698612 ,  0.87671061,  0.88356002,  0.89040943,
         0.89725884,  0.90410825,  0.91095766,  0.91780707,  0.92465648,
         0.93150589,  0.9383553 ,  0.94520472,  0.95205413,  0.95890354,
         0.96575295,  0.97260236,  0.97945177,  0.98630118,  0.99315059,  1.        ]),
 <a list of 50 Patch objects>)

In [68]:
def overlap(a, b, k, l):
    clust_a = a == k
    clust_b = b == l
    
    inds = []
    sum_all = 0
    sum_equals = 0
    for i, elem in enumerate(clust_a):
        if (clust_a[i] == False and clust_b[i] == False):
            continue
        elif (clust_a[i] == True and clust_b[i] == False):
            sum_all += 1
        elif (clust_a[i] == False and clust_b[i] == True):
            sum_all += 1
        elif (clust_a[i] == True and clust_b[i] == True):
            sum_equals += 1
            sum_all += 1
            inds.append(i)
    return np.array(inds), float(sum_equals) / sum_all
print 'Do they overlap on cluster 2?'
inds, overlap_rate = overlap(rows_inds_v1, rows_ind_v2, 2, 0)
print '%.2f' % overlap_rate
print inds


Do they overlap on cluster 2?
0.04
[27 59 97]

In [69]:
best = 1e10
for _ in xrange(5):
    U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 2, 3, onmtf, num_iters=100)
    try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
    except:
        continue
    if err < best:
        best = err
        U_v7 = U_t
        S_v7 = S_t
        V_v7 = V_t
        rows_ind_v7 = rows_ind_t
        cols_ind_v7 = cols_ind_t

    print 'tf norm (3 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (3 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (3 clusters): %s' % dav_sc
    print ''


186.628842362
185.79631509
185.708244154
185.665093932
185.625836634
185.579267021
185.519140637
185.438692309
185.328877636
185.176991896
184.965410239
184.671612817
184.271500749
183.749707182
183.115343919
182.408253973
181.681094201
180.981038504
180.350987228
179.817061248
179.394245821
179.09355749
178.89161793
178.761509513
178.673304848
178.607822801
178.556147548
178.513804115
178.478156165
178.447913278
178.422312545
178.400799512
178.383029037
178.368651158
178.357114085
178.347774613
178.340071151
178.333582685
178.328016179
178.323167048
178.318895193
178.315099006
178.311708865
178.308669648
178.305942254
178.303489616
178.301283664
178.299294066
178.297498487
178.295871884
178.294397447
178.293055375
178.291833704
178.290716753
178.289695848
178.288758
178.287896593
178.287100469
178.28636429
178.285678089
178.285037457
178.284434179
178.283866489
178.283329819
178.282824672
178.282346317
178.281893221
178.281460315
178.28104874
178.280658057
178.280292285
178.279950703
178.279634472
178.279339889
178.279066232
178.278809465
178.278569329
178.278342597
178.278129729
178.277928087
178.277738575
178.277558846
178.277390044
178.277229937
178.277079809
178.276937445
178.276804225
178.276677908
178.276559963
178.276448104
178.276343893
178.276244999
178.276153084
178.276065775
178.275984849
178.2759079
178.275836826
178.2757692
178.275707041
178.275647904
tf norm (3 clusters): Rand score: 0.979999505051
tf norm (3 clusters): Silhouette score: 0.0318769388602
tf norm (3 clusters): 5.16034645983

186.722473828
185.778519809
185.665866132
185.601670206
185.534319864
185.45539106
185.360743883
185.247994569
185.114772682
184.958778649
184.780236953
184.582600576
184.37344269
184.160513241
183.949016781
183.741330992
183.5367673
183.334122999
183.136129946
182.946606438
182.764907916
182.588147268
182.416389896
182.251023592
182.09460112
181.953424174
181.828372604
181.715761928
181.614801102
181.520120937
181.425852332
181.329476966
181.230345405
181.125255349
181.010558861
180.880913125
180.734830303
180.577107698
180.410794793
180.237084103
180.055575048
179.881922237
179.720336957
179.569563936
179.416917596
179.293626696
179.197333819
179.127594725
179.066680076
178.972553476
178.917444398
178.852899448
178.791963058
178.739381869
178.688393406
178.663377759
178.644589852
178.631416123
178.618400135
178.607350743
178.597791568
178.586721585
178.576348185
178.566830841
178.55839259
178.550172189
178.543569829
178.536922641
178.531348538
178.525071871
178.518427627
178.512959096
178.508381163
178.504546037
178.50082549
178.497079198
178.493131593
178.489314469
178.485525725
178.481836196
178.478120115
178.474381065
178.470862537
178.467484042
178.464273507
178.46121504
178.458333013
178.455599392
178.452984861
178.450473876
178.448093313
178.4458287
178.443643908
178.441514049
178.439466453
178.437504522
178.435640636
178.43388213
178.432236006
178.430684294
tf norm (3 clusters): Rand score: 0.979999505051
tf norm (3 clusters): Silhouette score: 0.0318769388602
tf norm (3 clusters): 5.16034645983

186.71214445
185.761972926
185.511353925
185.286148026
185.019936355
184.681880638
184.244250574
183.686916252
183.006062389
182.227400083
181.410347661
180.657171606
180.059875177
179.614208635
179.270478047
179.039234993
178.895320915
178.80359496
178.733611926
178.666444593
178.617788018
178.58911903
178.549828623
178.502258156
178.47414025
178.455464802
178.440404043
178.429600547
178.420787032
178.413155823
178.406191511
178.399485092
178.392972731
178.38664151
178.380506254
178.37454493
178.368788386
178.363228333
178.357913813
178.35284061
178.348058443
178.343547652
178.339341008
178.335401267
178.331750594
178.328343541
178.325198637
178.322268407
178.319571903
178.317062689
178.314762264
178.312626002
178.310678255
178.308875429
178.307244347
178.305741163
178.304394808
178.303160186
178.302068362
178.301072528
178.30020624
178.299420943
178.298753115
178.298152576
178.297658937
178.297220369
178.296879497
178.296582768
178.29637566
178.296202912
178.29611263
178.296047839
178.296058977
178.296087362
178.296185483
178.296293028
178.296464332
178.296637621
178.296868969
178.297095374
178.297374597
178.297642645
178.297958921
178.298258617
178.298602695
178.298925636
178.29928986
178.299629214
178.300007514
178.300358051
178.300745983
178.301104042
178.301498568
178.301861612
178.302260549
178.302626734
178.303028551
178.303396683
178.30380048
178.304169936
tf norm (3 clusters): Rand score: 1.0
tf norm (3 clusters): Silhouette score: 0.0320253728178
tf norm (3 clusters): 5.15495327438

186.703683358
185.826050834
185.753145941
185.7242476
185.709013152
185.692662063
185.67276414
185.646421283
185.611074275
185.562904929
185.496564946
185.404049953
185.273610496
185.088302178
184.825324659
184.458219355
183.965461878
183.345477544
182.627591253
181.871698146
181.158724488
180.537657073
179.999878009
179.532152158
179.157954653
178.901327943
178.735188293
178.628195629
178.560705825
178.515122539
178.480891402
178.454315598
178.433184726
178.415544699
178.400501966
178.387608196
178.376549181
178.367087624
178.359010309
178.352140132
178.346307834
178.341375906
178.337210723
178.333709325
178.330768753
178.328314148
178.326265766
178.324569651
178.323163881
178.322011332
178.321064405
178.320299054
178.319678357
178.319187905
178.318798717
178.318503921
178.318281172
178.31813023
178.318034626
178.317999446
178.318011988
178.318079603
178.318189979
178.318349741
178.318545185
178.318781763
178.319045043
178.319340429
178.319653931
178.319991739
178.320340819
178.320708358
178.321082317
178.321470793
178.321862622
178.322266638
178.322672354
178.323089093
178.323506767
178.323934872
178.324363419
178.3248018
178.325239912
178.325686908
178.326132526
178.326585692
178.327036061
178.327492428
178.32794447
178.328400945
178.32885163
178.329305288
178.329751827
178.330200017
178.330639881
178.331080169
178.331510987
178.331941017
178.332360415
178.332777758
tf norm (3 clusters): Rand score: 1.0
tf norm (3 clusters): Silhouette score: 0.0320253728178
tf norm (3 clusters): 5.15495327438

186.54656346
185.825911325
185.739297132
185.697463697
185.662857676
185.627861483
185.589436004
185.545570093
185.494376427
185.433814431
185.361587903
185.275121787
185.171645655
185.04847124
184.903611161
184.736837784
184.550881529
184.351780719
184.147508652
183.945460366
183.750218788
183.562626025
183.380970953
183.203967331
183.032725263
182.869860335
182.717617788
182.575446177
182.43947695
182.304445325
182.166373167
182.02388374
181.878659066
181.734317304
181.595320471
181.465161021
181.344462331
181.232009917
181.12774753
181.032271186
180.943837054
180.858766635
180.770621405
180.677152884
180.585448404
180.505738535
180.429085113
180.342501975
180.263471761
180.19204835
180.123972305
180.058786212
179.996008271
179.929880739
179.861118182
179.792945702
179.725876814
179.658925883
179.591744618
179.522820376
179.452560919
179.387664412
179.331166667
179.284201464
179.242728417
179.195494153
179.132381169
179.071528615
179.028644348
178.993639791
178.965579982
178.943835395
178.923293708
178.898293196
178.87392719
178.858451598
178.841574867
178.806570668
178.752652453
178.706891568
178.666418953
178.638578733
178.623294515
178.613468627
178.604814619
178.593257068
178.581091841
178.572909794
178.566700054
178.560294262
178.55432946
178.549230783
178.544852834
178.541018803
178.537509259
178.534584173
178.532198241
178.530090019
178.527922279
178.526298189
tf norm (3 clusters): Rand score: 0.940601785143
tf norm (3 clusters): Silhouette score: 0.0314864490528
tf norm (3 clusters): 5.18119769325


In [77]:
print S_v7
np.savetxt('onmtf_2x3_S.csv', S_v7, delimiter=",")


[[  1.08225705e-02   5.15201850e-03   4.14419833e-08]
 [  3.01304199e-09   5.54741882e-03   1.12429512e-02]]

In [78]:
# for i in range(2):
#     clust_inds = np.where(rows_ind_v7 == i)[0]
#     sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
#     print 'Top words for document cluster %s' % i
#     print_top_k_words(sum_per_word, count_vect, 20)
#     print
    
def top_k_words(vec, V, count_vect, k):
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    top_V_vals0 = [V[i, 0] for i in top_inds]
    top_V_vals1 = [V[i, 1] for i in top_inds]
    top_V_vals2 = [V[i, 2] for i in top_inds]
    top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_V_vals2, top_vals))
    return top_pairs

for i in range(2):
    clust_inds = np.where(rows_ind_v7 == i)[0]
    sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
    print 'Top words for document cluster %s' % i
    top_pairs = top_k_words(sum_per_word, V_v7, count_vect, 99999999)
    for word, v0_val, v1_val, v2_val, tf_val in top_pairs[0:30]:
        print word, v0_val, v1_val, v2_val, tf_val
    print
    to_csv(top_pairs, 'onmtf_2x3_doc_cluster_%s_words_top.csv' % i,
           ['word', 'V cluster 0 value', 'V cluster 1 value', 'V cluster 2 value', 'tf norm value'])


Top words for document cluster 0
time 4.9622283799 0.378989790529 3.04798988879e-18 4.75908187705
feira 2.84450333476 3.29197728789 1.25739102022e-09 4.53428093549
jogo 2.68012064069e-06 9.54459434012 9.13416718244 4.38957008659
equipe 3.88079422469 1.8548944296 7.08322956207e-10 4.29628349948
minutos 5.56732702575 0.0489029883654 1.87802106317e-32 4.28575586041
anos 3.0926080342 1.44630027839 2.91421864861e-11 3.98431219202
final 3.1173720191 1.76499255857 2.22444662667e-11 3.89212463999
jogador 3.1794197887 1.34777739113 2.41349868302e-18 3.86582870218
gol 5.09483149625 0.0030771716432 1.19014739596e-76 3.85918707343
ufc 1.97701600985 0.19934226238 9.66643021222e-28 3.81179311267
brasileiro 2.17189685808 2.29128914356 9.50006850351e-07 3.80965457051
técnico 3.77545717555 0.00445384446199 8.7458156854e-54 3.52244206397
euro 3.15211446617 0.00475976392425 6.71010940812e-68 3.4981714608
real 3.89946048651 1.12692603074 1.11850041269e-15 3.48278546656
partida 3.87646147421 0.405267249422 1.85554327952e-24 3.44447485976
breno 3.3662150688 9.89795820333e-05 7.84883721347e-228 3.34439260137
campeonato 3.16035325763 0.302856140361 1.03512329397e-23 3.26537799981
futebol 2.78945398828 0.0187049205324 1.92951659505e-44 3.26424112787
campeão 2.63985765534 0.0274319249179 2.18497444632e-40 3.19254552377
vitória 3.00153523226 0.0267532045051 1.54065194688e-38 3.09937333634
paulo 2.07542720605 2.17197767624 0.0060087214577 3.09565350225
segundo 1.90649976273 2.75518170319 1.78486037504e-07 3.05561036578
casa 2.9817865189 0.511973307606 1.69592306006e-21 3.03549846629
clube 3.29838519169 0.0388670385497 7.70559590301e-46 2.98580966434
tempo 2.19945012837 2.3486343232 8.00462757099e-09 2.95216731882
dois 1.54571538307 3.30517530788 7.36465656534e-08 2.90913657548
atacante 3.14763983409 0.00574225274448 5.44300692635e-52 2.88981319147
espanha 3.36135431052 0.0210533893077 4.98011340321e-44 2.88823467934
bola 3.74841879791 0.00161901775242 1.47968752021e-67 2.86081932103
torcedores 2.47960692028 0.0319823014175 1.16612595732e-40 2.83342620539

Top words for document cluster 1
jogo 2.68012064069e-06 9.54459434012 9.13416718244 13.0195645784
jogos 3.11248814024e-11 1.13835447363 14.0723621839 12.3527084592
playstation 8.1144992067e-89 0.000606272995035 6.69477820728 6.50286252979
of 5.33397708151e-24 0.154141843443 6.26184191321 6.26876796242
xbox 1.64585691337e-71 0.0034771749824 5.53203904197 5.93517551309
games 2.1628806307e-33 0.044172782179 5.34484711933 5.30691966565
game 5.80821038709e-26 0.138511322183 5.44416405436 5.29370574087
novo 1.36497172186e-12 1.94372471811 3.68621235585 4.93748565046
console 2.51639313062e-71 0.00242813212507 4.68013911133 4.56307259533
sony 3.86554293494e-99 0.000665932664377 4.18105503615 4.15619803355
the 3.1971336725e-46 0.00681655712366 3.79535345439 3.97710024088
wii 2.47849557463e-127 0.000474785947737 4.26013059841 3.89686450689
brasil 4.83185936799e-12 1.7817864104 2.82030790455 3.64522678256
nintendo 2.6019474302e-101 0.000398670067579 3.38367328845 3.27248181662
legends 4.26383827264e-17 0.284827933227 2.62965032264 2.9094926904
league 1.42820289352e-11 1.00726122554 2.22284437159 2.88202536911
apenas 0.000311845471797 3.87553318254 0.642405814523 2.82897655817
one 1.00867986741e-89 0.000825498140131 2.6111407896 2.70936450188
dia 0.0792794347216 4.87469799074 0.000223176398547 2.64115950042
lançado 4.44176174204e-18 0.0126580465512 2.61856195238 2.55713526763
arena 1.94977648292e-05 0.615352307254 2.0723052044 2.54513971209
lançamento 4.89795391293e-23 0.00368736874335 2.32488873506 2.51527656234
além 4.3520807164e-05 1.88676773706 1.5178546423 2.50919014505
nova 1.76487640065e-11 0.745154944246 1.97018867342 2.46810975723
ps 3.80301912276e-63 0.00183090132057 2.54683138775 2.46754638865
jogadores 0.0193956755296 4.57310833793 0.0531595846244 2.35449784007
usmonetáriointerno 2.81393999657e-16 0.199722514241 2.34457380631 2.28475499306
ano 0.02682799524 4.27995646724 0.000933228857422 2.27425390894
microsoft 5.26707714908e-62 0.00265684806942 2.10109560893 2.27360974308
monetáriointerno 0.408337299158 3.49747655623 0.260219087762 2.23204477036


In [79]:
# V_norm_v7 = norm(V_v7)
# print 'Top words for term cluster 0:'
# print_top_k_words_term_cluster(V_v7[:, 0], X_train_norm.toarray(), count_vect, 30)
# print

# print 'Top words for term cluster 1:'
# print_top_k_words_term_cluster(V_v7[:, 1], X_train_norm.toarray(), count_vect, 30)
# print

# print 'Top words for term cluster 3:'
# print_top_k_words_term_cluster(V_v7[:, 2], X_train_norm.toarray(), count_vect, 30)
# print
def top_k_words_term_cluster(vec, X, count_vect, k):
    sum_per_word = np.sum(X, axis=0)
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    correspondence_vals = [sum_per_word[i] for i in top_inds]

    top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))

    return top_pairs

V_norm_v7 = norm(V_v7)
for i in xrange(3):
    print 'Top words for term cluster %s:' % i
    top_pairs = top_k_words_term_cluster(V_v7[:, i], X_train_norm.toarray(), count_vect, 9999999)
    for w, v_value, tf_value in top_pairs[0:30]:
        print w, v_value, tf_value
    to_csv(top_pairs, 'onmtf_2x3_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
    print


Top words for term cluster 0:
minutos 5.56732702575 4.59140755144
gol 5.09483149625 3.91517632453
time 4.9622283799 5.21085736446
real 3.89946048651 4.27962701698
equipe 3.88079422469 5.38402375813
partida 3.87646147421 3.73445248798
técnico 3.77545717555 3.56462451803
bola 3.74841879791 2.91680857213
madrid 3.48318607264 2.75754288117
gols 3.40497413538 2.69286933272
breno 3.3662150688 3.34439260137
espanha 3.36135431052 2.9656773629
barcelona 3.35500283792 2.79061228958
clube 3.29838519169 3.15377741762
jogador 3.1794197887 4.68028023127
campeonato 3.16035325763 3.42909922129
euro 3.15211446617 3.4981714608
atacante 3.14763983409 2.88981319147
final 3.1173720191 4.9373205725
anos 3.0926080342 4.91571457583
vitória 3.00153523226 3.13423114506
casa 2.9817865189 3.40081430181
espanhol 2.89649088263 2.41562175014
feira 2.84450333476 6.63423662207
futebol 2.78945398828 3.301483074
campeão 2.63985765534 3.27410639528
pontos 2.51667821799 2.27903941545
temporada 2.48282444604 2.8250591232
torcedores 2.47960692028 2.83342620539
bayern 2.42824659509 2.19845209453

Top words for term cluster 1:
jogo 9.54459434012 17.409134665
dia 4.87469799074 5.32093077415
jogadores 4.57310833793 4.30228612023
ano 4.27995646724 4.75079310834
apenas 3.87553318254 4.59242923649
melhor 3.76493318512 4.04988727448
monetáriointerno 3.49747655623 4.81840908599
dois 3.30517530788 4.46835594418
feira 3.29197728789 6.63423662207
mundo 3.00132543142 3.30879150899
durante 2.95626465854 3.25896696005
segundo 2.75518170319 4.64121947369
jogar 2.6559383761 2.80796312875
título 2.61367436319 4.31037511355
bem 2.46299286964 3.10801094029
maior 2.35284163632 2.46204157517
tempo 2.3486343232 4.3141804196
brasileiro 2.29128914356 4.87632646275
paulo 2.17197767624 4.23639183759
conta 2.09149075688 2.40958508998
novo 1.94372471811 6.4397883303
além 1.88676773706 3.65357593645
apesar 1.87147708349 2.15949278802
equipe 1.8548944296 5.38402375813
brasil 1.7817864104 4.75295749281
final 1.76499255857 4.9373205725
dentro 1.71492630944 1.84299175991
quatro 1.68575654173 2.41344692675
dias 1.64307618773 1.8005169387
afirmou 1.60389896375 2.39849152024

Top words for term cluster 2:
jogos 14.0723621839 14.0184074837
jogo 9.13416718244 17.409134665
playstation 6.69477820728 6.50286252979
of 6.26184191321 6.66858864083
xbox 5.53203904197 5.93517551309
game 5.44416405436 5.40202523592
games 5.34484711933 5.41506727974
console 4.68013911133 4.56307259533
wii 4.26013059841 3.89686450689
sony 4.18105503615 4.15619803355
the 3.79535345439 4.10855529243
novo 3.68621235585 6.4397883303
nintendo 3.38367328845 3.27248181662
brasil 2.82030790455 4.75295749281
legends 2.62965032264 3.23393553266
lançado 2.61856195238 2.68510123945
one 2.6111407896 2.70936450188
ps 2.54683138775 2.46754638865
usmonetáriointerno 2.34457380631 2.62675989352
lançamento 2.32488873506 2.57362895081
league 2.22284437159 3.75349938375
versão 2.14351357292 1.95832779588
microsoft 2.10109560893 2.27360974308
arena 2.0723052044 3.02027939557
pessoas 1.98129752092 2.5572955834
nova 1.97018867342 3.08765693943
controle 1.78403500519 1.94460395871
indústria 1.76063926812 1.8052180076
estúdio 1.74814320546 1.65571896577
videogames 1.66921381695 1.7592797551


In [80]:
best = 1e10
for _ in xrange(5):
    U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 3, 3, onmtf, num_iters=100)
    try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
    except:
        continue
    if err < best:
        best = err
        U_v6 = U_t
        S_v6 = S_t
        V_v6 = V_t
        rows_ind_v6 = rows_ind_t
        cols_ind_v6 = cols_ind_t

    print 'tf norm (3 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (3 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (3 clusters): %s' % dav_sc
    print ''


186.750051587
185.823986833
185.708864868
185.64825276
185.589197903
185.511041838
185.401041158
185.242579096
185.014534556
184.694177712
184.26351882
183.720742984
183.080446867
182.363931258
181.594134194
180.803326003
180.031871672
179.316772443
178.692237997
178.17975804
177.772635893
177.449660901
177.181691351
176.942372211
176.712693683
176.49372603
176.288804785
176.095775482
175.913224503
175.730193757
175.533342815
175.329373828
175.141724848
174.985152222
174.876244005
174.810731754
174.770346128
174.737666325
174.709031496
174.673345915
174.639468887
174.620693649
174.607084154
174.595389252
174.585691158
174.577529812
174.571056225
174.565281004
174.559912149
174.55482571
174.54994803
174.545264837
174.540980076
174.537209093
174.533923461
174.531020327
174.528464908
174.526228875
174.524275418
174.522527619
174.520935865
174.519441785
174.518018181
174.516642689
174.515337347
174.514131276
174.513076217
174.512190058
174.511476639
174.510905363
174.510450896
174.510075791
174.509763528
174.509493637
174.509265312
174.509069528
174.508910725
174.508781142
174.508683505
174.508608139
174.5085559
174.508516633
174.508491225
174.508470557
174.508456576
174.508441619
174.508428742
174.508411448
174.508393581
174.508369405
174.508343232
174.508309763
174.508273555
174.508229522
174.50818236
174.508127184
174.508069079
174.508003834
174.507937538
174.507867079
tf norm (3 clusters): Rand score: 0.915246119546
tf norm (3 clusters): Silhouette score: 0.0328628798219
tf norm (3 clusters): 1.91685250342

186.584732139
185.806865213
185.743796481
185.720172221
185.702389155
185.683568611
185.661162657
185.633109829
185.597005525
185.549520017
185.485832223
185.398750422
185.277438227
185.105693953
184.860273547
184.511226459
184.028756212
183.400732335
182.651038841
181.833059102
181.006109547
180.236134761
179.620050564
179.204059231
178.940950452
178.76357657
178.636909417
178.557023746
178.51216307
178.480107639
178.453579757
178.440404056
178.430977106
178.421277974
178.409408523
178.393662645
178.371881249
178.340919059
178.29598552
178.229992806
178.132554287
177.9889613
177.779988621
177.485050175
177.092572506
176.617033403
176.106167182
175.61909448
175.203914025
174.911713797
174.738919148
174.63473891
174.567006271
174.520018884
174.486098195
174.460836358
174.441306887
174.425644163
174.412683177
174.401785309
174.392532244
174.384639346
174.377822184
174.371869871
174.366582172
174.361824192
174.357468094
174.353438827
174.349664747
174.346112441
174.342743448
174.339541249
174.336478941
174.333542613
174.330709896
174.327967464
174.325297396
174.322686981
174.320122165
174.31759089
174.315084138
174.312592249
174.310111412
174.307633029
174.305157115
174.302676328
174.30019529
174.297708414
174.295224671
174.2927396
174.290265611
174.287798478
174.28535368
174.282928552
174.280546166
174.278212676
174.275964779
174.273814255
174.271796095
174.269909978
tf norm (3 clusters): Rand score: 0.734036996152
tf norm (3 clusters): Silhouette score: 0.0299619168063
tf norm (3 clusters): 4.54435764902

186.575395264
185.755275204
185.650063377
185.572608886
185.492725877
185.397194847
185.278454509
185.126209776
184.92810259
184.668830169
184.332307637
183.906176124
183.389188795
182.798302836
182.171038077
181.558635933
181.008097273
180.543925819
180.164795268
179.852420416
179.579413889
179.321044363
179.072803508
178.84424991
178.639802085
178.456635149
178.288916171
178.129800165
177.970452179
177.803385169
177.624174739
177.432237191
177.230747765
177.025870256
176.825766522
176.638511689
176.468431286
176.316333558
176.181883321
176.065361907
175.967016042
175.884244762
175.813316326
175.751325551
175.696365746
175.647511737
175.602599167
175.55957995
175.517723045
175.47726046
175.43842037
175.401307337
175.365841169
175.33100276
175.295422466
175.257600674
175.215773251
175.168464986
175.113740525
175.048577884
174.969136044
174.875047284
174.777899611
174.695066382
174.628814949
174.57165409
174.526534507
174.495884789
174.476203349
174.460313298
174.440615273
174.418036037
174.39549211
174.37046894
174.351708227
174.339067667
174.329444925
174.320680877
174.312374782
174.303665968
174.292414351
174.276439632
174.256216117
174.237887505
174.226157097
174.216722663
174.206398492
174.19869508
174.189069009
174.177591246
174.169146515
174.160186867
174.152983142
174.147200829
174.138916961
174.128626073
174.124354842
174.122596096
174.120803368
174.117956127
tf norm (3 clusters): Rand score: 0.898548029909
tf norm (3 clusters): Silhouette score: 0.0337816662309
tf norm (3 clusters): 2.28640586854

186.65261133
185.754776661
185.621104647
185.527809281
185.424684611
185.294416756
185.123913086
184.901965951
184.622719264
184.290102085
183.922144112
183.548829404
183.194900291
182.85991372
182.518359424
182.131626806
181.654203095
181.037346346
180.247275835
179.305711606
178.301413921
177.325494561
176.459519191
175.824829431
175.417299523
175.123508805
174.897794033
174.722216572
174.571904498
174.437687405
174.326394929
174.23869434
174.171730643
174.123510873
174.088463462
174.063751882
174.046231898
174.03452126
174.026842415
174.020232851
174.013868576
174.008914266
174.005235308
174.001837655
173.999522066
173.997983303
173.99648894
173.99469319
173.992885866
173.991568902
173.990507955
173.989546934
173.98863061
173.987842084
173.98718392
173.986658286
173.986206131
173.985857444
173.985582757
173.985379391
173.985182533
173.984947221
173.984618099
173.984313734
173.984135415
173.98414217
173.984216085
173.984279929
173.984278188
173.98427818
173.984307955
173.984372311
173.98441758
173.984441132
173.984392416
173.98428005
173.984121563
173.983941463
173.983778496
173.98366841
173.983551199
173.983432889
173.983289563
173.983146144
173.982982176
173.982821624
173.9826415
173.982464544
173.982265847
173.982066737
173.981839747
173.981605101
173.981334075
173.981047059
173.980714783
173.98035842
173.979949207
173.979510341
173.979014844
173.978489452
tf norm (3 clusters): Rand score: 0.917652301634
tf norm (3 clusters): Silhouette score: 0.0339322821421
tf norm (3 clusters): 5.04568772902

186.701630191
185.774729514
185.617684229
185.498868908
185.363340208
185.182914966
184.93022873
184.576482984
184.101169374
183.510916576
182.834956132
182.082810833
181.256377485
180.39425588
179.494557279
178.570592522
177.710412176
176.985361063
176.404225184
175.95053269
175.60192295
175.330493334
175.124953465
174.979460403
174.87461892
174.796248947
174.739021534
174.695673814
174.659468467
174.626255183
174.59581795
174.569021369
174.545676185
174.525501511
174.507933399
174.49197907
174.47671565
174.461073199
174.445675394
174.431871368
174.419148612
174.407225876
174.395916337
174.385220078
174.375072851
174.365356583
174.356018087
174.347061431
174.338528783
174.330444281
174.32285892
174.315826735
174.30939539
174.303576329
174.298362758
174.293693394
174.289488839
174.285626552
174.282023982
174.27865338
174.275578247
174.272840609
174.270465437
174.268463833
174.266831648
174.265544084
174.264642899
174.264180642
174.264195028
174.264707245
174.265669
174.266915802
174.268288419
174.269633667
174.270865176
174.271938968
174.272823093
174.273496929
174.27398587
174.274314601
174.274503089
174.27453962
174.274432238
174.274205556
174.273856784
174.273268531
174.272177942
174.270108117
174.266533012
174.261698911
174.257366795
174.255014786
174.254399076
174.25467697
174.255211617
174.255533159
174.255228554
174.253771694
174.250402526
174.245809188
tf norm (3 clusters): Rand score: 0.751769829396
tf norm (3 clusters): Silhouette score: 0.0309669776999
tf norm (3 clusters): 4.42744109328


In [81]:
print S_v6
np.savetxt('onmtf_3x3_S.csv', S_v6, delimiter=",")


[[  9.64594493e-03   1.60029736e-05   7.54742212e-04]
 [  3.50749335e-04   2.89333310e-04   9.90929164e-03]
 [  1.95980724e-04   6.81620508e-03   4.98074508e-04]]

In [82]:
print 'Num elems in cluster 0: %s' % np.sum(rows_ind_v6 == 0)
print 'Num elems in cluster 1: %s' % np.sum(rows_ind_v6 == 1)
print 'Num elems in cluster 2: %s' % np.sum(rows_ind_v6 == 2)
print rows_ind_v6
print np.where(rows_ind_v6 == 1)

for url in arena_news_df.ix[np.where(rows_ind_v6 == 1)[0]].url:
    print url


Num elems in cluster 0: 100
Num elems in cluster 1: 91
Num elems in cluster 2: 9
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1
 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
       57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
       74, 75, 76, 77, 78, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93,
       94, 95, 96, 97, 98, 99]),)
http://arena.ig.com.br/2014-10-09/os-campeonatos-da-brasil-game-show.html
http://arena.ig.com.br/2014-09-09/conheca-o-jogo-usado-para-lancar-o-iphone-6.html
http://arena.ig.com.br/2014-09-25/avaliacao-do-fifa-15-por-um-jogador-fanatico.html
http://arena.ig.com.br/2014-09-11/paul-mccartney-escreveu-musica-para-destiny-sem-interesses-financeiros.html
http://arena.ig.com.br/especiais/infograficos/2014-06-06/infografico-historia-evolucao-dos-controles-joystick-gamepad.html
http://arena.ig.com.br/2012-11-29/a-industria-fantasma-em-busca-do-peixonauta-dos-games.html
http://arena.ig.com.br/2012-07-24/festival-big-trara-co-fundador-da-bungie-a-sp-veja-novidades.html
http://arena.ig.com.br/2014-01-17/um-papo-com-amora-towerfall-miniboss-e-uma-bagagem-cheia-de-novas-experiencias.html
http://arena.ig.com.br/2013-05-24/aprovado-no-greenlight-contrast-chega-para-pc-xbla-e-psn-ainda-este-ano.html
http://arena.ig.com.br/2013-02-21/jogos-da-psn-comprados-no-ps3-nao-poderao-ser-transferidos-para-o-ps4.html
http://arena.ig.com.br/2014-01-03/top-10-jogos-do-corraini-em-2013.html
http://arena.ig.com.br/2012-04-26/carta-aberta-da-acigames-desvia-das-acusacoes-e-mantem-duvidas.html
http://arena.ig.com.br/2012-11-27/1reasonwhy-os-inumeros-motivos-pelos-quais-ainda-existe-sexismo-nos-games.html
http://arena.ig.com.br/jogos/videogames/2013-11-13/playstation-4-as-vantagens-da-playstation-network-e-ps-plus.html
http://arena.ig.com.br/jogos/videogames/2013-11-13/xbox-one-os-acessorios-do-xbox-one.html
http://arena.ig.com.br/jogos/videogames/2013-11-13/xbox-one-entenda-a-live-do-one.html
http://arena.ig.com.br/jogos/videogames/2012-12-13/wii-u-a-evolucao-do-wii-na-era-dos-tablets-e-smartphones.html
http://arena.ig.com.br/jogos/videogames/2012-12-13/wii-u-jogos-para-todos-os-gostos-e-bolsos.html
http://arena.ig.com.br/jogos/videogames/2012-12-13/wii-u-entenda-o-miiverse-a-nintendo-network-e-os-principais-recursos-online.html
http://arena.ig.com.br/jogos/videogames/2012-12-13/wii-u-respostas-para-as-perguntas-mais-frequentes.html
http://arena.ig.com.br/2013-12-30/top-10-jogos-do-rique-de-2013.html
http://arena.ig.com.br/2013-12-27/top-10-jogos-do-teixeira-em-2013.html
http://arena.ig.com.br/2013-12-24/top-10-jogos-do-heitor-em-2013.html
http://arena.ig.com.br/2013-09-05/xbox-one-pode-vir-a-ter-retro-compatibilidade-atraves-da-nuvem.html
http://arena.ig.com.br/2013-06-11/playstation-plus-sera-necessaria-para-multiplayer-no-ps4.html
http://arena.ig.com.br/2012-12-03/papo--yo-unmechanical-e-out-there-somewhere-foram-grandes-vencedores-do-big.html
http://arena.ig.com.br/2014-02-12/se-o-voo-de-flappy-bird-nao-foi-genuino-mudancas-sao-necessarias.html
http://arena.ig.com.br/2014-01-07/brasil-marca-presenca-na-premiacao-igf-2014-com-towerfall-ascension.html
http://arena.ig.com.br/2012-08-21/bossa-studios-os-brasileiros-por-tras-do-premiado-estudio-de-londres-parte-1.html
http://arena.ig.com.br/2012-07-13/artigo-uma-mulher-uma-causa-nobre-e-um-exercito-de-haters.html
http://arena.ig.com.br/2014-01-20/estudo-mostra-que-booth-babes-sao-prejudiciais-ao-sucesso-de-empresas-em-eventos.html
http://arena.ig.com.br/2013-01-29/os-melhores-jogos-brasileiros-da-global-game-jam-2013-para-voce-jogar.html
http://arena.ig.com.br/2013-09-23/simultaneamente-xbox-one-suporta-ate-quatro-aplicativos-e-seis-usuarios.html
http://arena.ig.com.br/2013-08-28/com-corte-de-us-50-versao-deluxe-do-wii-u-passa-a-custar-us-299-em-setembro.html
http://arena.ig.com.br/2013-05-17/new-super-luigi-u-sera-lancado-em-versao-digital-e-em-caixa.html
http://arena.ig.com.br/2012-09-04/por-que-um-evento-de-games-lgbt-e-importante-para-a-cultura-gamer-como-um-todo.html
http://arena.ig.com.br/2012-09-03/artigo-voce-pode-criar-jogos-que-empresas-nao-podem.html
http://arena.ig.com.br/2014-02-10/crie-seu-proprio-flappy-bird-com-este-gerador-de-flappy-birds.html
http://arena.ig.com.br/2013-04-01/jogo-brasileiro-tribot-ja-disponivel-para-ios-chega-tambem-ao-android.html
http://arena.ig.com.br/2012-08-21/bossa-studios-os-brasileiros-por-tras-do-premiado-estudio-de-londres-parte-2.html
http://arena.ig.com.br/2012-06-28/dez-livros-de-games-que-voce-precisa-ler.html
http://arena.ig.com.br/2012-10-17/nova-leva-de-jogos-e-aprovada-no-steam-greenlight.html
http://arena.ig.com.br/2013-02-21/executivo-da-sony-confirma-presenca-do-playstation-4-na-bgs-2013.html
http://arena.ig.com.br/2013-02-21/veja-apresentacao-completa-do-playstation-4.html
http://arena.ig.com.br/2013-08-20/sony-corta-preco-do-ps-vita-em-r-120.html
http://arena.ig.com.br/2013-06-11/knack-killzone-shadowfall-e-drive-club-estarao-disponiveis-no-lancamento-do-ps4.html
http://arena.ig.com.br/2013-08-01/criadores-do-classico-para-pc-outcast-anunciam-novo-estudio.html
http://arena.ig.com.br/2013-06-10/killer-instinct-ganha-nova-versao-exclusiva-para-xbox-one.html
http://arena.ig.com.br/2013-06-10/microsoft-anuncia-versao-compacta-do-xbox-360.html
http://arena.ig.com.br/2013-06-10/xbox-live-abandonara-microsoft-points-e-passara-a-usar-dinheiro-real.html
http://arena.ig.com.br/2013-05-28/veja-o-trailer-de-lancamento-de-sonic-lost-world.html
http://arena.ig.com.br/2013-07-31/wii-u-vendeu-apenas-160-mil-unidades-nos-ultimos-tres-meses.html
http://arena.ig.com.br/2013-03-01/conheca-os-novos-minigames-de-game--wario-para-wii-u.html
http://arena.ig.com.br/2013-06-10/knights-of-pen-and-paper-+1-edition-chega-dia-18-de-junho.html
http://arena.ig.com.br/2012-10-26/wii-u-oficial-nao-estara-disponivel-no-brasil-antes-de-2013.html
http://arena.ig.com.br/2013-02-14/nintendo-prepara-new-super-luigi-u-dlc-para-new-super-mario-bros-u.html
http://arena.ig.com.br/2013-05-17/nintendo-anuncia-sonic-lost-world-e-sonic--mario-winter-games.html
http://arena.ig.com.br/2013-04-02/rockstar-revela-capa-oficial-de-grand-theft-auto-v.html
http://arena.ig.com.br/2012-08-23/jogo-brasileiro-concorre-com-dyad-botanicula-e-outros-no-indiecade.html
http://arena.ig.com.br/analises/2012-06-21/indie-game-the-movie.html
http://arena.ig.com.br/2013-01-21/atari-inicia-processo-para-declarar-falencia.html
http://arena.ig.com.br/2012-11-21/jetpack-joyride-chega-ao-psp-ps-vita-e-ps3-pela-playstation-network.html
http://arena.ig.com.br/2012-10-29/conheca-os-finalistas-da-premiacao-do-big-festival-que-acontece-em-sp.html
http://arena.ig.com.br/2013-02-21/infamous-second-son-e-novo-jogo-da-serie-para-o-playstation-4.html
http://arena.ig.com.br/2012-04-25/jogue-de-graca-jelly-escape-o-novo-jogo-brasileiro-de-plataforma.html
http://arena.ig.com.br/2012-11-07/simulador-de-tratores-agricolas-vende-mais-que-medal-of-honor-warfighter.html
http://arena.ig.com.br/2012-09-17/anuncio-de-indicacao-de-moacyr-ao-conselho-nacional-de-cultura-foi-precipitado.html
http://arena.ig.com.br/2013-06-14/e3-2013-conferencia-da-ea-traz-continuacoes-e-surpresas-inusitadas-assista.html
http://arena.ig.com.br/2012-11-27/conheca-os-quatro-jogos-indie-criados-durante-o-festival-big-em-sp.html
http://arena.ig.com.br/2013-02-21/negando-rumores-sony-avisa-que-playstation-4-aceitara-jogos-usados.html
http://arena.ig.com.br/2012-12-05/jogo-brasileiro-skyrise-runner-chega-de-graca-ao-ios.html
http://arena.ig.com.br/2012-08-03/ea-entra-em-acao-contra-a-zynga.html
http://arena.ig.com.br/2012-08-15/amc-the-walking-dead-social-game-e-muito-jogo-social-e-pouco-walking-dead.html
http://arena.ig.com.br/2013-04-01/veterana-da-industria-renuncia-ao-igda-apos-festa-com-dancarinas-seminuas-na-gdc.html
http://arena.ig.com.br/2013-10-17/arena-discute-preco-de-r-3999-do-playstation-4.html
http://arena.ig.com.br/2013-02-21/sony-afirma-estar-em-contato-com-desenvolvedores-brasileiros.html
http://arena.ig.com.br/2013-02-21/quase-150-produtoras-ja-trabalham-em-jogos-para-o-playstation-4.html
http://arena.ig.com.br/2013-02-21/drive-club-e-novo-jogo-dos-criadores-de-motorstorm-para-playstation-4.html
http://arena.ig.com.br/2013-02-21/novo-titulo-da-capcom-deep-down-remete-a-dragons-dogma.html
http://arena.ig.com.br/2013-02-21/contrariando-expectativas-blizzard-anuncia-diablo-iii-para-consoles.html
http://arena.ig.com.br/2012-06-04/xbox-smartglass-integra-console-mobile-e-internet.html
http://arena.ig.com.br/2013-02-21/engine-dos-criadores-de-heavy-rain-para-ps4-tem-recursos-de-filmes-em-cg.html
http://arena.ig.com.br/2013-06-10/below-e-jogo-dos-criadores-de-sword--sworcery-para-xbox-one.html
http://arena.ig.com.br/analises/2012-04-20/analise-resident-evil-operation-raccoon-city.html
http://arena.ig.com.br/2013-02-14/imagem-de-suposto-controle-de-playstation-4-cai-na-internet.html
http://arena.ig.com.br/2013-02-18/no-novo-playstation-retrocompatibilidade-sera-via-streaming-diz-jornal.html
http://arena.ig.com.br/2013-04-30/conferencia-da-microsoft-na-e3-acontece-em-10-de-junho.html
http://arena.ig.com.br/2013-05-21/halo-vai-virar-serie-de-tv-pelas-maos-de-steven-spielberg.html
http://arena.ig.com.br/2012-05-07/thq-anuncia-company-of-heroes-2.html
http://arena.ig.com.br/2013-03-26/square-enix-anuncia-saida-de-ceo-e-prejuizo-de-r-277-milhoes-para-ano-fiscal.html
http://arena.ig.com.br/2013-03-19/presidente-da-ea-renuncia-ao-cargo.html

In [83]:
# for i in range(3):
#     clust_inds = np.where(rows_ind_v6 == i)[0]
#     sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
#     print 'Top words for document cluster %s' % i
#     print_top_k_words(sum_per_word, count_vect, 20)
#     print

def top_k_words(vec, V, count_vect, k):
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    top_V_vals0 = [V[i, 0] for i in top_inds]
    top_V_vals1 = [V[i, 1] for i in top_inds]
    top_V_vals2 = [V[i, 2] for i in top_inds]
    top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_V_vals2, top_vals))
    return top_pairs

for i in range(3):
    clust_inds = np.where(rows_ind_v6 == i)[0]
    sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
    print 'Top words for document cluster %s' % i
    top_pairs = top_k_words(sum_per_word, V_v6, count_vect, 99999999)
    for word, v0_val, v1_val, v2_val, tf_val in top_pairs[0:30]:
        print word, v0_val, v1_val, v2_val, tf_val
    print
    to_csv(top_pairs, 'onmtf_3x3_doc_cluster_%s_words_top.csv' % i,
           ['word', 'V cluster 0 value', 'V cluster 1 value', 'V cluster 2 value', 'tf norm value'])


Top words for document cluster 0
time 5.3490666498 0.208939169622 1.92855571961e-32 4.59686045592
feira 4.65459896169 7.60631043955e-45 1.08509025054 4.48020712845
jogo 1.69084664201 0.000114616167071 15.1011497435 4.38957008659
equipe 5.19876215793 0.000686774110747 0.00341105021503 4.29628349948
minutos 5.86925247263 2.76376134729e-71 3.969959487e-29 4.28575586041
anos 4.14056433179 5.37180947668e-37 0.015439981415 3.98431219202
jogador 4.15522433393 2.24388653384e-25 0.000371302105999 3.86582870218
gol 5.33363757159 5.57427691117e-123 6.51751170721e-48 3.85918707343
ufc 2.19096609625 2.25028536016e-54 9.35040292728e-21 3.81179311267
brasileiro 3.4972683835 0.0968838616214 0.448544084224 3.80965457051
final 4.17271655114 0.431452665599 1.84951879815e-10 3.67582941182
técnico 3.94282655569 8.1048055353e-113 7.4181951332e-42 3.52244206397
euro 3.31063623643 1.37877612795e-92 4.36377592434e-42 3.4981714608
real 4.77066245151 2.18492678698e-37 6.75588793299e-11 3.48278546656
breno 3.57179712027 4.05400290416e-148 3.22226143595e-51 3.34439260137
partida 4.25770107094 5.06745047407e-06 7.49425986133e-26 3.33632724567
futebol 2.94784151135 7.5677327156e-70 6.50398754181e-38 3.26424112787
campeão 2.77368072204 5.90566149697e-10 1.2600068064e-38 3.13847171673
campeonato 3.43378072393 0.167888666537 2.13786166399e-34 3.10315657868
vitória 3.16074187952 9.20881516454e-62 3.02550769681e-34 3.09937333634
paulo 3.34764499261 0.0592585669471 0.457619615888 3.09565350225
segundo 3.44206600867 0.46139406809 0.351536614439 3.05561036578
clube 3.48087970499 4.23667644844e-99 1.25287748144e-35 2.98580966434
casa 3.44952614184 2.46549458706e-17 4.45016655375e-16 2.98142465925
tempo 3.49876036865 1.3921262168e-25 0.69315037761 2.95216731882
dois 3.24453803467 1.1751441574e-18 1.21533960882 2.90913657548
atacante 3.31635129254 5.26537103488e-114 2.3125906927e-41 2.88981319147
espanha 3.53718166993 4.35087834869e-99 4.17751327198e-37 2.88823467934
bola 3.92353658497 4.54689065844e-122 8.47547829576e-46 2.86081932103
torcedores 2.61738174809 4.07523121335e-77 1.77444412572e-36 2.83342620539

Top words for document cluster 1
jogo 1.69084664201 0.000114616167071 15.1011497435 12.4528489824
jogos 2.77326049808e-16 5.71518577066e-30 15.7093973236 12.1697587474
playstation 4.45355175474e-59 2.12567686612e-68 7.46996417843 6.50286252979
xbox 5.61559443475e-54 1.23114681097e-52 6.10700305481 5.93517551309
novo 0.0120648465521 9.66118390854e-29 5.29396901849 4.84545251529
game 1.36708916845e-23 0.585147941016 5.03232145292 4.77426201849
games 6.47243760128e-29 0.962277638116 4.32694930842 4.58779778555
console 1.04727071821e-51 1.69178493893e-49 5.07036578824 4.56307259533
sony 1.36460064576e-63 1.35860482943e-65 4.59925713227 4.15619803355
the 1.06738264728e-35 7.1191426533e-06 3.79531235464 3.93506710571
wii 1.46695188407e-59 2.17975345724e-35 4.55241962027 3.89686450689
nintendo 1.80910726731e-55 8.12465880375e-31 3.61488986237 3.27248181662
of 9.79980875625e-26 5.74788180255 0.168745142641 3.00044576949
brasil 0.231752293768 0.664269383233 3.17693127861 2.91640672022
apenas 1.3612365808 8.8643311215e-06 2.72526720618 2.71061439765
one 2.39447552438e-57 6.99890238202e-59 2.8939990985 2.70936450188
dia 2.21795981289 5.7292142712e-17 2.29158737058 2.59115950042
lançado 4.76866195906e-23 1.49423406624e-08 2.7426755461 2.55713526763
ps 7.51789500386e-55 5.62414568204e-50 2.80186317044 2.46754638865
nova 5.49141461919e-05 4.04127533805e-07 2.55818903179 2.42592730317
lançamento 1.73167320776e-28 0.00123671367806 2.41226486106 2.41527656234
arena 4.91871011562e-11 6.91160520491e-16 2.59416954762 2.41438642228
além 0.441942524702 0.0816659148698 2.55030375088 2.30803474673
microsoft 1.18128968042e-49 3.44325784712e-55 2.30844122637 2.27360974308
usmonetáriointerno 2.17274704096e-20 0.0988233094367 2.45752811228 2.23028152199
feira 4.65459896169 7.60631043955e-45 1.08509025054 2.09995568658
pessoas 2.92081999448e-05 1.32813387008e-22 2.50570923318 2.08812061648
monetáriointerno 1.95987481846 0.0592529201784 1.74797450518 2.07629253087
ano 1.9290952519 0.161329017614 1.76833171243 2.03879734583
melhor 1.64307053542 3.38429950846e-10 1.79620841 2.01041037661

Top words for document cluster 2
of 9.79980875625e-26 5.74788180255 0.168745142641 3.59276503519
legends 2.39199463654e-33 4.30404973846 3.94083571518e-54 3.02449080261
league 1.51786563168e-15 4.35778865719 1.66795198113e-56 3.02449080261
riot 2.09251606375e-23 1.50009152445 1.58206403325e-53 1.21087372945
personagens 1.34134264021e-23 1.33048707283 0.241852395671 0.924459259724
games 6.47243760128e-29 0.962277638116 4.32694930842 0.827269494193
brasil 0.231752293768 0.664269383233 3.17693127861 0.782893869374
jogo 1.69084664201 0.000114616167071 15.1011497435 0.566715596016
executivo 4.7763661983e-09 0.396370073941 0.675898782834 0.555991220209
jogadores 1.90292547296 0.561430717293 1.72701180568 0.537470823541
game 1.36708916845e-23 0.585147941016 5.03232145292 0.519443722384
internacional 0.283340070082 0.587492844938 4.08081062238e-16 0.463425978424
final 4.17271655114 0.431452665599 1.84951879815e-10 0.454494962627
segundo 3.44206600867 0.46139406809 0.351536614439 0.433124298149
público 0.0684316538284 0.501763980403 0.754622356248 0.394167696829
desafio 0.248881479785 0.454490637626 1.5186634672e-09 0.365578286034
time 5.3490666498 0.208939169622 1.92855571961e-32 0.356693815259
temporada 2.7362703805 0.327528519457 1.93176910888e-37 0.35473394334
paulo 3.34764499261 0.0592585669471 0.457619615888 0.351917413165
cnb 1.63570250168e-18 0.396889485001 2.48640399837e-75 0.313660872869
express 2.81392654284e-52 0.458363427502 3.80623833432e-71 0.302258792403
american 4.78287333215e-51 0.458363427502 2.52804795968e-72 0.302258792403
empresa 1.87395580001e-08 0.289330294733 1.2762315676 0.300980077311
milhões 1.59337211549 0.313112246116 0.000114106352962 0.296086552958
servidor 4.14877486028e-44 0.40992093923 2.0989188571e-32 0.289807681748
ano 1.9290952519 0.161329017614 1.76833171243 0.289530370155
maior 1.24994200605 0.394452317285 0.638598366291 0.286394734842
cartões 0.390001220108 0.376608455434 6.74965309627e-31 0.272367355351
campeonato 3.43378072393 0.167888666537 2.13786166399e-34 0.269953391514
horas 0.0911625337788 0.33502388465 0.687888584098 0.263262095056


In [84]:
# V_norm_v6 = norm(V_v6)
# print 'Top words for term cluster 0:'
# print_top_k_words_term_cluster(V_v6[:, 0], X_train_norm.toarray(), count_vect, 30)
# print

# print 'Top words for term cluster 1:'
# print_top_k_words_term_cluster(V_v6[:, 1], X_train_norm.toarray(), count_vect, 30)
# print

# print 'Top words for term cluster 3:'
# print_top_k_words_term_cluster(V_v6[:, 2], X_train_norm.toarray(), count_vect, 30)
# print
def top_k_words_term_cluster(vec, X, count_vect, k):
    sum_per_word = np.sum(X, axis=0)
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]
    correspondence_vals = [sum_per_word[i] for i in top_inds]

    top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))

    return top_pairs

V_norm_v6 = norm(V_v6)
for i in xrange(3):
    print 'Top words for term cluster %s:' % i
    top_pairs = top_k_words_term_cluster(V_v6[:, i], X_train_norm.toarray(), count_vect, 9999999)
    for w, v_value, tf_value in top_pairs[0:30]:
        print w, v_value, tf_value
    to_csv(top_pairs, 'onmtf_3x3_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
    print


Top words for term cluster 0:
minutos 5.86925247263 4.59140755144
time 5.3490666498 5.21085736446
gol 5.33363757159 3.91517632453
equipe 5.19876215793 5.38402375813
real 4.77066245151 4.27962701698
feira 4.65459896169 6.63423662207
partida 4.25770107094 3.73445248798
final 4.17271655114 4.9373205725
jogador 4.15522433393 4.68028023127
anos 4.14056433179 4.91571457583
técnico 3.94282655569 3.56462451803
bola 3.92353658497 2.91680857213
madrid 3.65962586907 2.75754288117
breno 3.57179712027 3.34439260137
gols 3.56922723254 2.69286933272
espanha 3.53718166993 2.9656773629
barcelona 3.51303539717 2.79061228958
tempo 3.49876036865 4.3141804196
brasileiro 3.4972683835 4.87632646275
clube 3.48087970499 3.15377741762
casa 3.44952614184 3.40081430181
segundo 3.44206600867 4.64121947369
campeonato 3.43378072393 3.42909922129
paulo 3.34764499261 4.23639183759
atacante 3.31635129254 2.88981319147
euro 3.31063623643 3.4981714608
dois 3.24453803467 4.46835594418
vitória 3.16074187952 3.13423114506
espanhol 3.04059979765 2.41562175014
futebol 2.94784151135 3.301483074

Top words for term cluster 1:
of 5.74788180255 6.66858864083
league 4.35778865719 3.75349938375
legends 4.30404973846 3.23393553266
riot 1.50009152445 1.27386780829
personagens 1.33048707283 1.5857127324
games 0.962277638116 5.41506727974
brasil 0.664269383233 4.75295749281
internacional 0.587492844938 0.873139796412
game 0.585147941016 5.40202523592
jogadores 0.561430717293 4.30228612023
público 0.501763980403 1.63408328901
segundo 0.46139406809 4.64121947369
american 0.458363427502 0.302258792403
express 0.458363427502 0.302258792403
desafio 0.454490637626 0.759196482236
final 0.431452665599 4.9373205725
servidor 0.40992093923 0.352430110856
cnb 0.396889485001 0.313660872869
executivo 0.396370073941 1.31299292066
maior 0.394452317285 2.46204157517
cartões 0.376608455434 0.671351601022
thrones 0.351107654601 0.275194704958
corpo 0.3359326112 0.60474390695
horas 0.33502388465 1.23511834154
acordo 0.332961625101 1.61184386755
times 0.328740908197 1.33205732189
temporada 0.327528519457 2.8250591232
milhões 0.313112246116 2.36864290847
world 0.311979288805 1.16506836627
débito 0.308772569176 0.205602867272

Top words for term cluster 2:
jogos 15.7093973236 14.0184074837
jogo 15.1011497435 17.409134665
playstation 7.46996417843 6.50286252979
xbox 6.10700305481 5.93517551309
novo 5.29396901849 6.4397883303
console 5.07036578824 4.56307259533
game 5.03232145292 5.40202523592
sony 4.59925713227 4.15619803355
wii 4.55241962027 3.89686450689
games 4.32694930842 5.41506727974
the 3.79531235464 4.10855529243
nintendo 3.61488986237 3.27248181662
brasil 3.17693127861 4.75295749281
one 2.8939990985 2.70936450188
ps 2.80186317044 2.46754638865
lançado 2.7426755461 2.68510123945
apenas 2.72526720618 4.59242923649
arena 2.59416954762 3.02027939557
nova 2.55818903179 3.08765693943
além 2.55030375088 3.65357593645
pessoas 2.50570923318 2.5572955834
usmonetáriointerno 2.45752811228 2.62675989352
lançamento 2.41226486106 2.57362895081
microsoft 2.30844122637 2.27360974308
dia 2.29158737058 5.32093077415
versão 2.24484286493 1.95832779588
mil 1.93051236836 2.67438462566
controle 1.92009143519 1.94460395871
site 1.90970623381 2.47725142415
estúdio 1.88165384195 1.65571896577


In [195]:
best = 1e1000
for _ in xrange(5):
    U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 2, 6, onmtf, num_iters=100)
    try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
    except:
        continue
    if err < best:
        best = err
        U_v8 = U_t
        S_v8 = S_t
        V_v8 = V_t
        rows_ind_v8 = rows_ind_t
        cols_ind_v8 = cols_ind_t

    print 'tf norm (3 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (3 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (3 clusters): %s' % dav_sc
    print ''


199.995888398
30584765.5334
199.980710871
31857910.8798
199.980408996
32477867.1875
199.979869101
33835307.205
199.978596575
36860845.4386
199.976220781
41053307.3486
199.97385157
44202215.2593
199.972385732
45971486.8297
199.971613461
46921880.3051
199.971208621
47421400.0233
199.971012859
47649810.1281
199.970925914
47751352.886
199.970883906
47802984.6097
199.970860038
47834429.5611
199.970843951
47856572.4179
199.970832114
47873108.7241
199.970823026
47885832.3465
199.970815905
47895748.1729
199.970810273
47903530.4171
199.970805776
47909695.8615
199.970802144
47914642.3155
199.970799174
47918662.8723
199.970796719
47921971.1214
199.970794669
47924723.6923
199.970792942
47927037.1414
199.970791477
47928999.7326
199.970790224
47930679.323
199.970789145
47932128.5925
199.970788208
47933388.5916
199.970787392
47934491.255
199.970786675
47935461.2757
199.970786043
47936317.5705
199.970785486
47937074.4814
199.970784995
47937742.8087
199.970784562
47938330.7318
199.970784183
47938844.637
199.970783854
47939289.8371
199.97078357
47939671.1464
199.970783329
47939993.2812
199.970783127
47940261.074
199.97078296
47940479.5262
199.970782826
47940653.7456
199.970782719
47940788.8273
199.970782638
47940889.728
199.970782578
47940961.1676
199.970782537
47941007.5692
199.970782511
47941033.0372
199.970782499
47941041.3582
199.970782498
47941036.0135
199.970782505
47941020.1926
tf norm (3 clusters): Rand score: 0.979999505051
tf norm (3 clusters): Silhouette score: 0.0318769388602
tf norm (3 clusters): 5.16034645983

199.995980809
34359747.167
199.98178682
35437637.4199
199.981633688
35737839.9978
199.981455956
36178316.9494
199.981157992
36965500.0621
199.980605476
38469563.4853
199.9795559
41204430.0378
199.977820844
45007078.7958
199.975842141
48353817.107
199.974435052
50440924.4664
199.973591258
51683287.8034
199.973123513
52396818.8717
199.972810718
52906053.2536
199.972656031
53107682.7949
199.972577668
53206546.6746
199.972548841
53244804.2731
199.97253137
53274610.4117
199.972515882
53303087.5788
199.972500852
53331374.2614
199.9724862
53359066.8497
199.972472076
53385677.9415
199.972458748
53410574.8065
199.972446481
53433236.4238
199.972435468
53453341.2285
199.972425802
53470784.9921
199.972417476
53485660.8273
199.972410402
53498201.7201
199.972404437
53508720.8864
199.972399414
53517561.5313
199.972395158
53525056.2395
199.972391512
53531499.7306
199.972388339
53537132.8487
199.972385535
53542133.8265
199.972383026
53546616.4709
199.972380771
53550638.0543
199.972378754
53554218.6613
199.972376971
53557366.9066
199.972375413
53560100.6978
199.972374067
53562454.7307
199.97237291
53564476.0737
199.972371914
53566215.2251
199.972371054
53567718.7456
199.972370307
53569025.4983
199.972369655
53570165.8661
199.972369084
53571162.6672
199.972368584
53572032.7587
199.972368147
53572788.7214
199.972367768
53573440.3103
199.972367442
53573995.5315
199.972367165
53574461.336
tf norm (3 clusters): Rand score: 0.960200080404
tf norm (3 clusters): Silhouette score: 0.0317375860201
tf norm (3 clusters): 5.16753857692

199.995385103
24219024.0405
199.978481842
25321769.8689
199.978308044
25534332.3754
199.97809141
25867017.3208
199.977705431
26491374.3951
199.97697703
27650622.409
199.975713282
29477125.6027
199.973927589
31711510.8084
199.971995518
33735149.7322
199.970550353
35062507.8035
199.96962373
35977490.4623
199.968897684
36783160.634
199.968267055
37455666.6464
199.967845442
37831851.3757
199.967658135
37974696.8524
199.967590601
38023362.0392
199.967562821
38044477.9923
199.967547906
38057559.1715
199.967537771
38067509.6541
199.967529491
38076297.5305
199.967522333
38084035.1205
199.967516287
38090642.3554
199.96751116
38096323.0897
199.967506686
38101331.9869
199.967502667
38105831.8864
199.96749903
38109854.5376
199.967495792
38113377.962
199.967492964
38116424.5261
199.967490507
38119062.2368
199.96748836
38121366.9444
199.967486463
38123408.5705
199.967484761
38125248.6234
199.967483206
38126939.4623
199.967481755
38128525.9183
199.967480373
38130044.3388
199.967479034
38131519.0959
199.967477724
38132962.8334
199.967476439
38134381.1188
199.967475174
38135777.9697
199.967473926
38137159.0102
199.967472692
38138531.4113
199.967471465
38139901.9841
199.967470241
38141275.284
199.967469018
38142652.7136
199.967467798
38144032.6357
199.967466582
38145411.085
199.967465375
38146782.7
199.96746418
38148141.6485
199.967463004
38149482.422
199.967461848
38150800.4113
tf norm (3 clusters): Rand score: 0.979999505051
tf norm (3 clusters): Silhouette score: 0.0318769388602
tf norm (3 clusters): 5.16034645983

199.995890541
30268830.2446
199.980532329
31514960.8254
199.980149055
32178251.5522
199.979676054
33169329.5935
199.978951727
34582899.2688
199.978018736
36148698.3079
199.977079855
37533059.6482
199.976230478
38802032.512
199.975385138
40120097.2818
199.974526867
41489596.8624
199.973660997
42889447.9536
199.972813812
44177552.29
199.972083734
45251936.604
199.971483099
46050296.1595
199.971112726
46479701.451
199.970941025
46652444.8366
199.970868447
46735850.2579
199.970832333
46783257.6508
199.970810615
46813226.0369
199.970797068
46831709.3974
199.970788347
46843775.9038
199.970782048
46852800.0586
199.970776897
46860414.3551
199.970772319
46867324.3762
199.970768057
46873845.1644
199.970763985
46880132.5078
199.970760037
46886278.124
199.970756152
46892408.7272
199.970752224
46898709.1057
199.970748138
46905316.2667
199.970743866
46912165.4654
199.970739523
46919003.2282
199.970735264
46925649.5236
199.970731135
46932112.3006
199.970727097
46938459.0314
199.970723116
46944726.8654
199.970719181
46950920.9787
199.970715294
46957032.9928
199.970711463
46963053.1304
199.97070769
46968975.6424
199.97070398
46974800.5291
199.970700329
46980533.4452
199.970696732
46986184.6901
199.970693184
46991767.6435
199.970689674
46997296.6067
199.970686195
47002783.6739
199.970682741
47008234.6544
199.970679312
47013645.5638
199.970675914
47019002.5117
199.970672558
47024286.5845
tf norm (3 clusters): Rand score: 0.979999505051
tf norm (3 clusters): Silhouette score: 0.0318769388602
tf norm (3 clusters): 5.16034645983

199.99533981
23051906.5782
199.977881051
23786304.4604
199.977771784
23902455.0646
199.977631927
24105012.9351
199.977351674
24539897.5388
199.976726645
25554451.8728
199.975279899
27837147.0517
199.97244393
31458896.7033
199.968988676
34620016.532
199.967049213
35705004.5206
199.966633486
35885550.3459
199.966564238
35900614.6072
199.966560654
35890141.5095
199.966565142
35881597.9349
199.966568589
35876937.957
199.966570495
35874608.4233
199.966571622
35873321.1907
199.966572417
35872481.9054
199.966573026
35871877.9986
199.966573466
35871469.9581
199.96657371
35871279.6234
199.966573728
35871336.583
199.966573508
35871656.5111
199.966573054
35872230.6889
199.966572382
35873041.8616
199.966571497
35874085.7558
199.9665704
35875364.9405
199.966569092
35876876.7133
199.96656758
35878612.0386
199.966565872
35880561.2217
199.966563976
35882716.3887
199.966561897
35885068.7879
199.966559645
35887606.3236
199.966557233
35890313.3216
199.966554676
35893171.4961
199.966551992
35896161.3206
199.966549198
35899263.5539
199.966546311
35902460.7959
199.966543344
35905738.9075
199.966540307
35909088.0774
199.966537207
35912503.3188
199.966534046
35915984.2456
199.96653082
35919534.1132
199.966527525
35923158.2476
199.966524155
35926862.0898
199.966520706
35930649.1321
199.966517177
35934519.0194
199.966513568
35938466.0528
199.96650989
35942478.2643
199.966506157
35946537.147
tf norm (3 clusters): Rand score: 0.979999505051
tf norm (3 clusters): Silhouette score: 0.0318769388602
tf norm (3 clusters): 5.16034645983


In [196]:
S_v8


Out[196]:
array([[  1.48050495e+00,   6.20737393e-04,   6.21261309e+00,
          3.01197767e+00,   7.98542506e+00,   5.78253851e+00],
       [  1.01302217e+01,   7.57536505e+00,   1.53444256e+00,
          1.21718146e-08,   1.27218932e-01,   1.73766868e-03]])

In [197]:
for i in range(2):
    clust_inds = np.where(rows_ind_v8 == i)[0]
    sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
    print 'Top words for document cluster %s' % i
    print_top_k_words(sum_per_word, count_vect, 20)
    print


Top words for document cluster 0
time - 4.75908187705
feira - 4.53428093549
jogo - 4.38957008659
equipe - 4.29628349948
minutos - 4.28575586041
anos - 3.98431219202
final - 3.89212463999
jogador - 3.86582870218
gol - 3.85918707343
ufc - 3.81179311267
brasileiro - 3.80965457051
técnico - 3.52244206397
euro - 3.4981714608
real - 3.48278546656
partida - 3.44447485976
breno - 3.34439260137
campeonato - 3.26537799981
futebol - 3.26424112787
campeão - 3.19254552377
vitória - 3.09937333634

Top words for document cluster 1
jogo - 13.0195645784
jogos - 12.3527084592
playstation - 6.50286252979
of - 6.26876796242
xbox - 5.93517551309
games - 5.30691966565
game - 5.29370574087
novo - 4.93748565046
console - 4.56307259533
sony - 4.15619803355
the - 3.97710024088
wii - 3.89686450689
brasil - 3.64522678256
nintendo - 3.27248181662
legends - 2.9094926904
league - 2.88202536911
apenas - 2.82897655817
one - 2.70936450188
dia - 2.64115950042
lançado - 2.55713526763


In [198]:
V_norm_v8 = norm(V_v8)
for l in xrange(6):
    print 'Top words for term cluster %s:' % l
    print_top_k_words_term_cluster(V_v8[:, l], X_train_norm.toarray(), count_vect, 30)
    print


Top words for term cluster 0:
jogo - 17726.4851177 - 17.409134665
jogos - 13979.3946256 - 14.0184074837
novo - 6725.96447288 - 6.4397883303
game - 6512.16690078 - 5.40202523592
playstation - 5114.53270594 - 6.50286252979
the - 5068.97001411 - 4.10855529243
brasil - 4183.55893834 - 4.75295749281
games - 4140.75629541 - 5.41506727974
apenas - 3774.99656302 - 4.59242923649
league - 3611.0351212 - 3.75349938375
of - 3145.3711387 - 6.66858864083
pessoas - 3124.53704712 - 2.5572955834
jogadores - 3072.53398899 - 4.30228612023
legends - 2983.2374014 - 3.23393553266
mil - 2961.01777508 - 2.67438462566
dia - 2852.9273413 - 5.32093077415
lançamento - 2813.5486133 - 2.57362895081
nova - 2604.81630397 - 3.08765693943
ps - 2577.16787817 - 2.46754638865
sony - 2524.69689224 - 4.15619803355
melhor - 2437.81448081 - 4.04988727448
wii - 2352.02890568 - 3.89686450689
durante - 2328.8697535 - 3.25896696005
xbox - 2282.27938936 - 5.93517551309
usmonetáriointerno - 2085.92572837 - 2.62675989352
evento - 2038.4273557 - 2.39152043826
brasileiros - 2029.91266825 - 1.48360480136
chega - 1978.16837315 - 1.93271757459
monetáriointerno - 1953.94504477 - 4.81840908599
nintendo - 1905.38617615 - 3.27248181662

Top words for term cluster 1:
jogos - 6471.14746058 - 14.0184074837
of - 5385.96998867 - 6.66858864083
xbox - 5080.7037985 - 5.93517551309
console - 4890.21016833 - 4.56307259533
playstation - 3955.9553924 - 6.50286252979
jogo - 3506.3508334 - 17.409134665
wii - 3352.97049753 - 3.89686450689
games - 3141.02507799 - 5.41506727974
sony - 3043.89363182 - 4.15619803355
nintendo - 2642.97279485 - 3.27248181662
arena - 2611.30443003 - 3.02027939557
lançado - 2224.39855299 - 2.68510123945
videogames - 1915.26072849 - 1.7592797551
one - 1899.51259058 - 2.70936450188
personagens - 1798.49031685 - 1.5857127324
super - 1790.64657755 - 1.98627762576
além - 1767.0857506 - 3.65357593645
microsoft - 1763.47456855 - 2.27360974308
versão - 1646.71139684 - 1.95832779588
mundo - 1629.25624985 - 3.30879150899
indústria - 1500.16208712 - 1.8052180076
usmonetáriointerno - 1307.63615233 - 2.62675989352
festival - 1296.52178748 - 1.14575356691
game - 1249.53592688 - 5.40202523592
site - 1235.76523551 - 2.47725142415
new - 1235.01151578 - 1.51447469942
brasil - 1198.52334748 - 4.75295749281
multiplayer - 1174.26665345 - 1.17173750492
online - 1134.58667143 - 1.50827909898
precisa - 1129.86452608 - 1.17197822425

Top words for term cluster 2:
jogador - 4290.01661115 - 4.68028023127
segundo - 4161.31053549 - 4.64121947369
tempo - 4077.04395183 - 4.3141804196
feira - 3550.02513449 - 6.63423662207
anos - 3322.39145002 - 4.91571457583
primeira - 2870.71639509 - 2.87580869334
monetáriointerno - 2656.28066659 - 4.81840908599
final - 2500.64403452 - 4.9373205725
dia - 2498.36416788 - 5.32093077415
time - 2315.53913957 - 5.21085736446
brasileiro - 2247.25094653 - 4.87632646275
paulo - 2098.20314565 - 4.23639183759
ano - 1966.74250936 - 4.75079310834
vitória - 1965.13345807 - 3.13423114506
real - 1954.97458939 - 4.27962701698
fez - 1919.65098424 - 2.61110358594
equipe - 1880.97312388 - 5.38402375813
torneio - 1879.21980916 - 2.52315652071
jogadores - 1803.40128675 - 4.30228612023
atacante - 1682.72110205 - 2.88981319147
vai - 1664.03831604 - 3.01190358314
quarta - 1600.95818613 - 3.00284722489
breno - 1532.38575126 - 3.34439260137
técnico - 1519.94536372 - 3.56462451803
meio - 1515.15564179 - 2.82656856003
quatro - 1512.17987834 - 2.41344692675
título - 1504.10832097 - 4.31037511355
dois - 1424.40114408 - 4.46835594418
campeonato - 1415.78954225 - 3.42909922129
europa - 1374.95728937 - 1.51346298599

Top words for term cluster 3:
breno - 1486.72636836 - 3.34439260137
espanhol - 1007.94343206 - 2.41562175014
espanha - 914.891590788 - 2.9656773629
gol - 694.856717808 - 3.91517632453
campeonato - 669.163577608 - 3.42909922129
madrid - 668.524510717 - 2.75754288117
estádio - 668.07686212 - 1.15845870874
alemanha - 633.233704806 - 2.34179717313
madri - 554.702269843 - 0.543984960914
rodadas - 554.573915089 - 0.777091869336
vitória - 496.977969067 - 3.13423114506
técnico - 486.215624166 - 3.56462451803
pontos - 462.274698693 - 2.27903941545
marcou - 458.736105362 - 1.32220399532
minutos - 438.313073014 - 4.59140755144
polônia - 435.580298616 - 1.35457112127
clube - 434.779760786 - 3.15377741762
gols - 426.687132707 - 2.69286933272
barcelona - 402.740190972 - 2.79061228958
cara - 401.224729804 - 0.925179597292
partida - 390.771002124 - 3.73445248798
argentino - 370.054563357 - 1.21561454948
messi - 366.919044054 - 1.74789023524
bola - 353.906521073 - 2.91680857213
united - 347.972080508 - 0.865460146362
inglês - 319.002693576 - 1.48853476236
diego - 314.70722185 - 0.469211309918
artilheiro - 307.838748786 - 0.703227444938
iraizoz - 301.652610725 - 0.221575969299
ataque - 301.039301723 - 1.05764234031

Top words for term cluster 4:
minutos - 4860.1455449 - 4.59140755144
time - 4305.94487754 - 5.21085736446
equipe - 3763.23966657 - 5.38402375813
real - 3133.93379321 - 4.27962701698
barcelona - 3067.34635337 - 2.79061228958
bola - 3061.56275814 - 2.91680857213
casa - 2991.47124893 - 3.40081430181
espanhol - 2662.31854693 - 2.41562175014
feira - 2590.07888079 - 6.63423662207
final - 2462.05648836 - 4.9373205725
campeão - 2421.48792131 - 3.27410639528
torcedores - 2387.65363421 - 2.83342620539
seleção - 2345.81066406 - 2.6180042938
madrid - 2336.18970909 - 2.75754288117
técnico - 2326.23482943 - 3.56462451803
anos - 2227.73891107 - 4.91571457583
alemanha - 2119.70808191 - 2.34179717313
brasileiro - 2119.453268 - 4.87632646275
sábado - 2089.13110192 - 2.06620034769
grupo - 2078.91983815 - 2.7649274719
ronaldo - 2055.43175086 - 1.5086905886
gols - 2005.42751582 - 2.69286933272
liga - 1990.47281055 - 2.53157804296
clube - 1975.29555807 - 3.15377741762
comente - 1898.80138609 - 2.33907203032
guardiola - 1873.52576454 - 1.81756412301
munique - 1873.49147768 - 2.10275852657
gol - 1821.40463969 - 3.91517632453
dois - 1812.27867881 - 4.46835594418
temporada - 1806.81300121 - 2.8250591232

Top words for term cluster 5:
gol - 3729.77239217 - 3.91517632453
partida - 3070.7757185 - 3.73445248798
espanha - 2877.07846406 - 2.9656773629
euro - 2041.22858647 - 3.4981714608
minutos - 1980.99519108 - 4.59140755144
futebol - 1945.82734854 - 3.301483074
título - 1779.23548506 - 4.31037511355
clube - 1689.12778602 - 3.15377741762
pontos - 1685.64505528 - 2.27903941545
messi - 1648.454283 - 1.74789023524
campeonato - 1467.80180043 - 3.42909922129
atacante - 1413.6891308 - 2.88981319147
gols - 1360.6557873 - 2.69286933272
bayern - 1331.34047854 - 2.19845209453
jogador - 1311.40422073 - 4.68028023127
cinco - 1255.61993196 - 1.22143794555
goleiro - 1216.5305456 - 1.26493364393
torcida - 1213.38984282 - 1.41509273528
confira - 1131.69459806 - 2.34917808533
real - 1083.9830574 - 4.27962701698
classificação - 1027.25514222 - 0.915601660082
munique - 999.254192227 - 2.10275852657
liga - 984.942080442 - 2.53157804296
grupo - 943.216726106 - 2.7649274719
chegou - 924.81766054 - 1.79053389634
dois - 900.424812557 - 4.46835594418
equipe - 890.518526647 - 5.38402375813
campo - 863.708582417 - 1.83184225111
estádio - 855.132266751 - 1.15845870874
rio - 822.227652576 - 2.14572051893


In [138]:
best = 1e10
for _ in xrange(5):
    U_t, S_t, V_t, V_t_t, rows_ind_t, error = matrix_factorization_overlapping_bin(X_train_norm.toarray(), 2, 2, num_iters=100)
    try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
    except:
        continue
    if error < best:
        best = error
        U_v3 = U_t
        S_v3 = S_t
        V_v3 = V_t
        V_t_v3 = V_t_t
        rows_ind_v3 = rows_ind_t

    print 'tf norm (2 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (2 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (2 clusters): %s' % dav_sc
    print ''


tf norm (3 clusters): Rand score: 0.704151382448
tf norm (3 clusters): Silhouette score: 0.0293431281723
tf norm (3 clusters): 5.2907157964

tf norm (3 clusters): Rand score: 0.000410251253819
tf norm (3 clusters): Silhouette score: 0.00870302972305
tf norm (3 clusters): 3.68778616632

tf norm (3 clusters): Rand score: 0.827238885682
tf norm (3 clusters): Silhouette score: 0.0286723769638
tf norm (3 clusters): 5.37117320592

tf norm (3 clusters): Rand score: -0.00019704344349
tf norm (3 clusters): Silhouette score: 0.0151481673327
tf norm (3 clusters): 3.08164548957

tf norm (3 clusters): Rand score: 0.979999505051
tf norm (3 clusters): Silhouette score: 0.031862446603
tf norm (3 clusters): 5.16923685599


In [139]:
print_hist(U_v3)



In [140]:
print rows_ind_v3


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

In [161]:
for k in range(2):
    print 'Top words for term cluster 0 and doc cluster %s:' % k
    print_top_k_words_term_cluster(V_v3[k][:, 0], X_train_norm.toarray(), count_vect, 30)
    print

    print 'Top words for term cluster 1 and doc cluster %s:' % k
    print_top_k_words_term_cluster(V_v3[k][:, 1], X_train_norm.toarray(), count_vect, 30)
    print


Top words for term cluster 0 and doc cluster 0:
quinta - 1.0 - 1.43675534573
técnico - 1.0 - 3.56462451803
melhor - 1.0 - 4.04988727448
meio - 1.0 - 2.82656856003
meia - 1.0 - 1.29533757942
casa - 1.0 - 3.40081430181
três - 1.0 - 2.96444738924
atleta - 1.0 - 1.84469966099
caso - 1.0 - 1.8529715666
atlético - 1.0 - 1.33512941856
conta - 1.0 - 2.40958508998
ex - 1.0 - 1.64155721116
desde - 1.0 - 1.67547953777
frente - 1.0 - 1.38076539741
quarta - 1.0 - 3.00284722489
gols - 1.0 - 2.69286933272
goleiro - 1.0 - 1.26493364393
neste - 1.0 - 1.95089030972
título - 1.0 - 4.31037511355
quatro - 1.0 - 2.41344692675
santos - 1.0 - 1.08655608931
gol - 1.0 - 3.91517632453
luta - 1.0 - 1.29955925782
apenas - 1.0 - 4.59242923649
partida - 1.0 - 3.73445248798
bayern - 1.0 - 2.19845209453
deixe - 1.0 - 2.16156875952
estádio - 1.0 - 1.15845870874
bem - 1.0 - 3.10801094029
cinturão - 1.0 - 1.33854908097

Top words for term cluster 1 and doc cluster 0:
útil - 1.0 - 0.0462847202446
empregos - 1.0 - 0.0556148656014
empates - 1.0 - 0.0532025133399
empatou - 1.0 - 0.177640714617
empecilho - 1.0 - 0.0408680825388
empolgado - 1.0 - 0.0713187822608
empolgação - 1.0 - 0.107752060152
empregados - 1.0 - 0.089727635149
empresa - 1.0 - 1.89752592837
encerrar - 1.0 - 0.102108056479
empresas - 1.0 - 0.709464330345
encaixa - 1.0 - 0.0640038649736
encaixam - 1.0 - 0.0718259123974
encaixar - 1.0 - 0.0560241625212
encara - 1.0 - 0.389380669422
encarar - 1.0 - 0.192402295547
empate - 1.0 - 0.973060775729
empatar - 1.0 - 0.0484143012609
empata - 1.0 - 0.111773099733
emoções - 1.0 - 0.10133432604
emociona - 1.0 - 0.0862420642795
emissora - 1.0 - 0.113273615639
emerson - 1.0 - 0.23822612835
emergente - 1.0 - 0.0604943724526
embora - 1.0 - 0.839051525207
embate - 1.0 - 0.141364280703
embalo - 1.0 - 0.0554224170341
elogiou - 1.0 - 0.0972433276653
elogios - 1.0 - 0.309298226098
elogiar - 1.0 - 0.0908041178936

Top words for term cluster 0 and doc cluster 1:
útil - 1.0 - 0.0462847202446
elogiado - 1.0 - 0.109549597526
elevado - 1.0 - 0.0399220193959
elimina - 1.0 - 0.0969023819587
eliminada - 1.0 - 0.0710717559958
eliminar - 1.0 - 0.304578578942
eliminatórias - 1.0 - 1.15430289699
eliminação - 1.0 - 0.668606385737
elogiar - 1.0 - 0.0908041178936
eletrônico - 1.0 - 0.235120580411
elogios - 1.0 - 0.309298226098
elogiou - 1.0 - 0.0972433276653
embalo - 1.0 - 0.0554224170341
embate - 1.0 - 0.141364280703
embora - 1.0 - 0.839051525207
emergente - 1.0 - 0.0604943724526
eletrônicos - 1.0 - 0.139512966878
elenco - 1.0 - 0.649160395082
edmund - 1.0 - 0.112014971302
eficiente - 1.0 - 0.0901837448111
educação - 1.0 - 0.0549934351556
efe - 1.0 - 0.302387314116
efeito - 1.0 - 0.40585045197
efeitos - 1.0 - 0.155631616006
efetiva - 1.0 - 0.0559866268251
effect - 1.0 - 0.0996596197502
eficientes - 1.0 - 0.0499447788355
elementos - 1.0 - 0.169376211224
eficiência - 1.0 - 0.076544548035
eis - 1.0 - 0.0941765471576

Top words for term cluster 1 and doc cluster 1:
monetáriointerno - 1.0 - 4.81840908599
dia - 1.0 - 5.32093077415
xbox - 1.0 - 5.93517551309
lançamento - 1.0 - 2.57362895081
of - 1.0 - 6.66858864083
games - 1.0 - 5.41506727974
novo - 1.0 - 6.4397883303
mil - 1.0 - 2.67438462566
usmonetáriointerno - 1.0 - 2.62675989352
apenas - 1.0 - 4.59242923649
lançado - 1.0 - 2.68510123945
site - 1.0 - 2.47725142415
além - 1.0 - 3.65357593645
ano - 1.0 - 4.75079310834
console - 1.0 - 4.56307259533
microsoft - 1.0 - 2.27360974308
melhor - 1.0 - 4.04988727448
legends - 1.0 - 3.23393553266
nova - 1.0 - 3.08765693943
nintendo - 1.0 - 3.27248181662
game - 1.0 - 5.40202523592
playstation - 1.0 - 6.50286252979
jogos - 1.0 - 14.0184074837
ps - 1.0 - 2.46754638865
wii - 1.0 - 3.89686450689
durante - 1.0 - 3.25896696005
jogadores - 1.0 - 4.30228612023
versão - 1.0 - 1.95832779588
pessoas - 1.0 - 2.5572955834
sony - 1.0 - 4.15619803355


In [144]:
def print_top_k_words(vec, count_vect, k):
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]

    top_pairs = reverse(zip(top_words, top_vals))
    for word, value in top_pairs:
        print '%s - %s' % (word, value)

for i in range(2):
    clust_inds = np.where(rows_ind_v3 == i)[0]
    sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
    print 'Top words for document cluster %s' % i
    print_top_k_words(sum_per_word, count_vect, 20)
    print


Top words for document cluster 0
time - 4.59686045592
feira - 4.48020712845
jogo - 4.38957008659
equipe - 4.29628349948
minutos - 4.28575586041
anos - 3.98431219202
gol - 3.85918707343
ufc - 3.81179311267
brasileiro - 3.80965457051
jogador - 3.75606444219
técnico - 3.52244206397
euro - 3.4981714608
real - 3.48278546656
final - 3.45630089184
breno - 3.34439260137
futebol - 3.26424112787
partida - 3.22656298568
campeão - 3.13847171673
campeonato - 3.10315657868
paulo - 3.09565350225

Top words for document cluster 1
jogo - 13.0195645784
jogos - 12.3527084592
of - 6.59321080468
playstation - 6.50286252979
xbox - 5.93517551309
games - 5.41506727974
game - 5.29370574087
novo - 4.93748565046
console - 4.56307259533
sony - 4.15619803355
the - 3.97710024088
wii - 3.89686450689
brasil - 3.6993005896
league - 3.42599673135
nintendo - 3.27248181662
legends - 3.23393553266
apenas - 2.82897655817
one - 2.70936450188
dia - 2.69523330746
lançado - 2.55713526763


In [145]:
best = 1e10
for _ in xrange(5):
    U_t, S_t, V_t, V_t_t, rows_ind_t, error = matrix_factorization_overlapping_bin(X_train_norm.toarray(), 3, 2, num_iters=100)
    try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 3))
    except:
        continue
    if error < best:
        best = error
        U_v4 = U_t
        S_v4 = S_t
        V_v4 = V_t
        V_t_v4 = V_t_t
        rows_ind_v4 = rows_ind_t

    print 'tf norm (2 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (2 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (2 clusters): %s' % dav_sc
    print ''


tf norm (2 clusters): Rand score: 0.911626468863
tf norm (2 clusters): Silhouette score: 0.0263247804924
tf norm (2 clusters): 4.67652655528

tf norm (2 clusters): Rand score: 0.97014853758
tf norm (2 clusters): Silhouette score: 0.0275417678727
tf norm (2 clusters): 3.75016662057

tf norm (2 clusters): Rand score: 0.767871887193
tf norm (2 clusters): Silhouette score: 0.02801993876
tf norm (2 clusters): 5.32959729825

tf norm (2 clusters): Rand score: 0.734036996152
tf norm (2 clusters): Silhouette score: 0.0271920028126
tf norm (2 clusters): 5.29653960277

tf norm (2 clusters): Rand score: 0.721365663243
tf norm (2 clusters): Silhouette score: 0.0220293220506
tf norm (2 clusters): 4.68667761473


In [199]:
S_v4


Out[199]:
array([[ 0.0234421 ,  0.00131716],
       [ 0.03638007,  0.00135596],
       [ 0.00124627,  0.02218215]])

In [147]:
pairplot(U_v4)



In [200]:
for k in range(3):
    print 'Top words for term cluster 0 and doc cluster %s:' % k
    print_top_k_words_term_cluster(V_v4[k][:, 0], X_train_norm.toarray(), count_vect, 50)
    print

    print 'Top words for term cluster 1 and doc cluster %s:' % k
    print_top_k_words_term_cluster(V_v4[k][:, 1], X_train_norm.toarray(), count_vect, 50)
    print


Top words for term cluster 0 and doc cluster 0:
esquerdo - 1.0 - 0.651365168984
final - 1.0 - 4.9373205725
frente - 1.0 - 1.38076539741
técnico - 1.0 - 3.56462451803
frança - 1.0 - 0.95542966429
três - 1.0 - 2.96444738924
partida - 1.0 - 3.73445248798
apenas - 1.0 - 4.59242923649
apesar - 1.0 - 2.15949278802
competição - 1.0 - 2.05481368611
lesão - 1.0 - 1.42526313009
renan - 1.0 - 0.889024737333
flamengo - 1.0 - 0.991778089951
argentino - 1.0 - 1.21561454948
comente - 1.0 - 2.33907203032
liga - 1.0 - 2.53157804296
dois - 1.0 - 4.46835594418
ucrânia - 1.0 - 0.913929173483
fim - 1.0 - 1.91502163193
domingo - 1.0 - 0.928393334169
artilharia - 1.0 - 0.669287812052
artilheiro - 1.0 - 0.703227444938
ficou - 1.0 - 1.14806484749
torneio - 1.0 - 2.52315652071
fez - 1.0 - 2.61110358594
torcida - 1.0 - 1.41509273528
resultado - 1.0 - 1.22373105365
torcedores - 1.0 - 2.83342620539
logo - 1.0 - 1.39549395
duas - 1.0 - 2.24847688731
duelo - 1.0 - 1.50055422439
clássico - 1.0 - 1.01844508683
título - 1.0 - 4.31037511355
ufc - 1.0 - 3.87983449441
clube - 1.0 - 3.15377741762
jogar - 1.0 - 2.80796312875
irlanda - 1.0 - 0.725632569366
quarta - 1.0 - 3.00284722489
aldo - 1.0 - 1.6600459664
gol - 1.0 - 3.91517632453
alemanha - 1.0 - 2.34179717313
quatro - 1.0 - 2.41344692675
vezes - 1.0 - 1.66104357838
itália - 1.0 - 1.51631925379
conseguiu - 1.0 - 1.05727974402
venceu - 1.0 - 1.13849745601
joelho - 1.0 - 0.759925921354
perdeu - 1.0 - 1.40080111144
jogador - 1.0 - 4.68028023127
jogadores - 1.0 - 4.30228612023

Top words for term cluster 1 and doc cluster 0:
útil - 1.0 - 0.0462847202446
encaixam - 1.0 - 0.0718259123974
empolgação - 1.0 - 0.107752060152
empregados - 1.0 - 0.089727635149
empregos - 1.0 - 0.0556148656014
empresa - 1.0 - 1.89752592837
empresas - 1.0 - 0.709464330345
encaixa - 1.0 - 0.0640038649736
encaixar - 1.0 - 0.0560241625212
encontrada - 1.0 - 0.0723917101107
encara - 1.0 - 0.389380669422
encarar - 1.0 - 0.192402295547
encerrado - 1.0 - 0.118382193495
encerrar - 1.0 - 0.102108056479
encerrou - 1.0 - 0.111111111111
encheu - 1.0 - 0.0484143012609
empolgado - 1.0 - 0.0713187822608
empecilho - 1.0 - 0.0408680825388
empatou - 1.0 - 0.177640714617
empates - 1.0 - 0.0532025133399
empatar - 1.0 - 0.0484143012609
empata - 1.0 - 0.111773099733
emoções - 1.0 - 0.10133432604
emociona - 1.0 - 0.0862420642795
emissora - 1.0 - 0.113273615639
emerson - 1.0 - 0.23822612835
emergente - 1.0 - 0.0604943724526
embora - 1.0 - 0.839051525207
embate - 1.0 - 0.141364280703
embalo - 1.0 - 0.0554224170341
elogiou - 1.0 - 0.0972433276653
elogios - 1.0 - 0.309298226098
elogiar - 1.0 - 0.0908041178936
encontra - 1.0 - 0.0683846302886
encontram - 1.0 - 0.0918329516921
entra - 1.0 - 0.346768958475
ensaiar - 1.0 - 0.0487432682874
english - 1.0 - 0.128840490088
engraçado - 1.0 - 0.0464960612324
enix - 1.0 - 0.353828050187
ennis - 1.0 - 0.457438880846
enorme - 1.0 - 0.130361130923
enquete - 1.0 - 0.412593506
ensina - 1.0 - 0.0954523025321
encontrar - 1.0 - 0.302330610605
entanto - 1.0 - 0.980735378703
entenda - 1.0 - 0.158587172625
entender - 1.0 - 0.410012696086
entendimento - 1.0 - 0.0514990706269
entertainment - 1.0 - 0.644950286811

Top words for term cluster 0 and doc cluster 1:
monetáriointerno - 1.0 - 4.81840908599
dia - 1.0 - 5.32093077415
xbox - 1.0 - 5.93517551309
lançamento - 1.0 - 2.57362895081
of - 1.0 - 6.66858864083
games - 1.0 - 5.41506727974
novo - 1.0 - 6.4397883303
mil - 1.0 - 2.67438462566
usmonetáriointerno - 1.0 - 2.62675989352
apenas - 1.0 - 4.59242923649
lançado - 1.0 - 2.68510123945
site - 1.0 - 2.47725142415
além - 1.0 - 3.65357593645
ano - 1.0 - 4.75079310834
console - 1.0 - 4.56307259533
microsoft - 1.0 - 2.27360974308
melhor - 1.0 - 4.04988727448
legends - 1.0 - 3.23393553266
nova - 1.0 - 3.08765693943
nintendo - 1.0 - 3.27248181662
game - 1.0 - 5.40202523592
playstation - 1.0 - 6.50286252979
jogos - 1.0 - 14.0184074837
ps - 1.0 - 2.46754638865
wii - 1.0 - 3.89686450689
durante - 1.0 - 3.25896696005
jogadores - 1.0 - 4.30228612023
versão - 1.0 - 1.95832779588
pessoas - 1.0 - 2.5572955834
sony - 1.0 - 4.15619803355
the - 1.0 - 4.10855529243
feira - 1.0 - 6.63423662207
one - 1.0 - 2.70936450188
super - 1.0 - 1.98627762576
brasil - 1.0 - 4.75295749281
arena - 1.0 - 3.02027939557
league - 1.0 - 3.75349938375
controle - 1.0 - 1.94460395871
jogo - 1.0 - 17.409134665
elementos - 0.0 - 0.169376211224
útil - 0.0 - 0.0462847202446
electronic - 0.0 - 0.604217202679
elemento - 0.0 - 0.0809966417187
eleito - 0.0 - 0.152320895711
eletrônico - 0.0 - 0.235120580411
el - 0.0 - 0.0517578160265
eis - 0.0 - 0.0941765471576
eficiência - 0.0 - 0.076544548035
eficientes - 0.0 - 0.0499447788355
eficiente - 0.0 - 0.0901837448111

Top words for term cluster 1 and doc cluster 1:
útil - 1.0 - 0.0462847202446
elogiado - 1.0 - 0.109549597526
elevado - 1.0 - 0.0399220193959
elimina - 1.0 - 0.0969023819587
eliminada - 1.0 - 0.0710717559958
eliminar - 1.0 - 0.304578578942
eliminatórias - 1.0 - 1.15430289699
eliminação - 1.0 - 0.668606385737
elogiar - 1.0 - 0.0908041178936
eletrônico - 1.0 - 0.235120580411
elogios - 1.0 - 0.309298226098
elogiou - 1.0 - 0.0972433276653
embalo - 1.0 - 0.0554224170341
embate - 1.0 - 0.141364280703
embora - 1.0 - 0.839051525207
emergente - 1.0 - 0.0604943724526
eletrônicos - 1.0 - 0.139512966878
elenco - 1.0 - 0.649160395082
edmund - 1.0 - 0.112014971302
eficiente - 1.0 - 0.0901837448111
educação - 1.0 - 0.0549934351556
efe - 1.0 - 0.302387314116
efeito - 1.0 - 0.40585045197
efeitos - 1.0 - 0.155631616006
efetiva - 1.0 - 0.0559866268251
effect - 1.0 - 0.0996596197502
eficientes - 1.0 - 0.0499447788355
elementos - 1.0 - 0.169376211224
eficiência - 1.0 - 0.076544548035
eis - 1.0 - 0.0941765471576
el - 1.0 - 0.0517578160265
electronic - 1.0 - 0.604217202679
eleito - 1.0 - 0.152320895711
elemento - 1.0 - 0.0809966417187
emerson - 1.0 - 0.23822612835
emissora - 1.0 - 0.113273615639
emociona - 1.0 - 0.0862420642795
encontram - 1.0 - 0.0918329516921
encerrado - 1.0 - 0.118382193495
encerrar - 1.0 - 0.102108056479
encerrou - 1.0 - 0.111111111111
encheu - 1.0 - 0.0484143012609
encontra - 1.0 - 0.0683846302886
encontrada - 1.0 - 0.0723917101107
encontrar - 1.0 - 0.302330610605
emoções - 1.0 - 0.10133432604
encontraram - 1.0 - 0.0566217052284
encontrava - 1.0 - 0.0647752874563
encontro - 1.0 - 0.381804992499
encontrou - 1.0 - 0.178992762892

Top words for term cluster 0 and doc cluster 2:
útil - 1.0 - 0.0462847202446
empolgação - 1.0 - 0.107752060152
empatar - 1.0 - 0.0484143012609
empate - 1.0 - 0.973060775729
empates - 1.0 - 0.0532025133399
empatou - 1.0 - 0.177640714617
empecilho - 1.0 - 0.0408680825388
empolgado - 1.0 - 0.0713187822608
empregados - 1.0 - 0.089727635149
encarar - 1.0 - 0.192402295547
empregos - 1.0 - 0.0556148656014
empresa - 1.0 - 1.89752592837
empresas - 1.0 - 0.709464330345
encaixa - 1.0 - 0.0640038649736
encaixam - 1.0 - 0.0718259123974
encaixar - 1.0 - 0.0560241625212
empata - 1.0 - 0.111773099733
emoções - 1.0 - 0.10133432604
emociona - 1.0 - 0.0862420642795
emissora - 1.0 - 0.113273615639
emerson - 1.0 - 0.23822612835
emergente - 1.0 - 0.0604943724526
embora - 1.0 - 0.839051525207
embate - 1.0 - 0.141364280703
embalo - 1.0 - 0.0554224170341
elogiou - 1.0 - 0.0972433276653
elogios - 1.0 - 0.309298226098
elogiar - 1.0 - 0.0908041178936
elogiado - 1.0 - 0.109549597526
eliminação - 1.0 - 0.668606385737
eliminatórias - 1.0 - 1.15430289699
eliminar - 1.0 - 0.304578578942
eliminada - 1.0 - 0.0710717559958
encara - 1.0 - 0.389380669422
encerrado - 1.0 - 0.118382193495
elevado - 1.0 - 0.0399220193959
english - 1.0 - 0.128840490088
enfrentaram - 1.0 - 0.0837962446039
enfrentaria - 1.0 - 0.087784686332
enfrentará - 1.0 - 0.133399113438
enfrentarão - 1.0 - 0.102794578758
enfrentou - 1.0 - 0.148993205134
engine - 1.0 - 0.399722831088
engraçado - 1.0 - 0.0464960612324
encerrar - 1.0 - 0.102108056479
enix - 1.0 - 0.353828050187
ennis - 1.0 - 0.457438880846
enorme - 1.0 - 0.130361130923
enquete - 1.0 - 0.412593506
ensaiar - 1.0 - 0.0487432682874

Top words for term cluster 1 and doc cluster 2:
bayern - 1.0 - 2.19845209453
city - 1.0 - 1.4274434691
vitor - 1.0 - 1.09375280711
passado - 1.0 - 1.49438620437
afirmou - 1.0 - 2.39849152024
técnico - 1.0 - 3.56462451803
cinturão - 1.0 - 1.33854908097
título - 1.0 - 4.31037511355
ex - 1.0 - 1.64155721116
melhor - 1.0 - 4.04988727448
basquete - 1.0 - 0.833181299854
pênaltis - 1.0 - 0.89743702228
meio - 1.0 - 2.82656856003
casa - 1.0 - 3.40081430181
ufc - 1.0 - 3.87983449441
time - 1.0 - 5.21085736446
quarta - 1.0 - 3.00284722489
futebol - 1.0 - 3.301483074
conta - 1.0 - 2.40958508998
vitória - 1.0 - 3.13423114506
luis - 1.0 - 0.855823986831
cleveland - 1.0 - 0.712210849285
guardiola - 1.0 - 1.81756412301
heat - 1.0 - 0.984955457179
edgar - 1.0 - 0.790972601321
partida - 1.0 - 3.73445248798
disputa - 1.0 - 0.91332766438
euro - 1.0 - 3.4981714608
deixe - 1.0 - 2.16156875952
feira - 1.0 - 6.63423662207
belfort - 1.0 - 1.47534264153
entrevista - 1.0 - 1.67739351422
nba - 1.0 - 1.70179563753
segundo - 1.0 - 4.64121947369
volta - 1.0 - 1.31788524872
três - 1.0 - 2.96444738924
apenas - 1.0 - 4.59242923649
atacante - 1.0 - 2.88981319147
rio - 1.0 - 2.14572051893
contrato - 1.0 - 0.929718496685
barcelona - 1.0 - 2.79061228958
atleta - 1.0 - 1.84469966099
sábado - 1.0 - 2.06620034769
atletas - 1.0 - 1.00743717288
manchester - 1.0 - 0.89499617971
julgamento - 1.0 - 0.839836699457
jogador - 1.0 - 4.68028023127
jogadores - 1.0 - 4.30228612023
vai - 1.0 - 3.01190358314
real - 1.0 - 4.27962701698


In [157]:
print 'Num elems in cluster 0: %s' % np.sum(rows_ind_v4 == 0)
print 'Num elems in cluster 1: %s' % np.sum(rows_ind_v4 == 1)
print 'Num elems in cluster 2: %s' % np.sum(rows_ind_v4 == 2)
print rows_ind_v4
print np.where(rows_ind_v4 == 2)

for url in sport_news_df.ix[np.where(rows_ind_v4 == 2)[0]-100].url:
    print url


Num elems in cluster 0: 47
Num elems in cluster 1: 101
Num elems in cluster 2: 52
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 2 1 2 0
 0 0 0 0 0 0 2 2 2 2 2 2 0 0 0 0 0 2 0 0 0 0 0 0 2 2 0 2 2 2 2 2 2 2 0 0 0
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 2 2 2 2 2 0 0 2 2 2 0 0 2 2 2 0 0 2 2 0
 0 0 0 2 0 0 2 0 0 2 0 2 0 2 2]
(array([107, 109, 117, 118, 119, 120, 121, 122, 128, 135, 136, 138, 139,
       140, 141, 142, 143, 144, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 163, 165, 166, 167, 168, 169, 172,
       173, 174, 177, 178, 179, 182, 183, 188, 191, 194, 196, 198, 199]),)
http://esporte.ig.com.br/maisesportes/vela/2014-10-11/com-festa-sete-barcos-partem-em-direcao-a-cidade-do-cabo-na-largada-da-volvo.html
http://esporte.ig.com.br/basquete/2014-10-11/rio-volta-a-receber-a-nba-e-fas-formam-filas-para-comprar-produtos-da-liga.html
http://esporte.ig.com.br/futebol/2012-06-20/drogba-assina-com-time-chines-e-reencontrara-anelka.html
http://esporte.ig.com.br/futebol/2012-04-20/corinthians-vai-alem-das-provocacoes-e-usa-bambi-no-papel-de-lea.html
http://esporte.ig.com.br/futebol/2012-05-24/para-rodrigo-caetano-libertadores-e-um-sonho-adiado-no-flu.html
http://esporte.ig.com.br/futebol/2012-05-24/flu-tenta-superar-trauma-para-nao-repetir-brasileirao-de-2008.html
http://esporte.ig.com.br/futebol/2012-04-27/novo-tecnico-do-barca-superou-tumor-e-levou-dedada-no-olho-de-mo.html
http://esporte.ig.com.br/futebol/2012-05-31/lesao-muscular-tira-o-ingles-frank-lampard-da-euro-2012.html
http://esporte.ig.com.br/lutas/2012-07-19/mutante-comemora-elogios-de-chefe-do-ufc-e-sonha-em-lutar-ao-lado-de-belfort.html
http://esporte.ig.com.br/basquete/2014-10-10/fa-do-cavaliers-viajou-de-joao-pessoa-e-gastou-r-6-mil-para-ver-o-time-no-rio.html
http://esporte.ig.com.br/basquete/2014-10-10/genio-do-basquete-tecnico-conquista-elenco-do-cavaliers-com-ideias-proprias.html
http://esporte.ig.com.br/futebol/2012-07-04/zagueiro-breno-e-condenado-a-3-anos-e-9-meses-de-prisao-na-alemanha.html
http://esporte.ig.com.br/futebol/2012-12-20/breno-e-registrado-no-bid-e-aguarda-justica-para-defender-sao-paulo.html
http://esporte.ig.com.br/basquete/2014-10-10/apos-ano-ausente-ala-do-heat-diz-nunca-ter-duvidado-de-nova-chance-na-nba.html
http://esporte.ig.com.br/basquete/2014-10-08/jovens-de-favela-no-rio-tem-aula-inesquecivel-de-basquete-com-campeao-da-nba.html
http://esporte.ig.com.br/basquete/2014-10-09/elogiado-por-lebron-novato-do-miami-nega-frustracao-por-nao-jogar-com-ele.html
http://esporte.ig.com.br/futebol/2014-09-30/apos-reuniao-com-cbf-arbitros-descartam-greve-no-brasileirao.html
http://esporte.ig.com.br/futebol/2012-04-23/leao-reclama-de-arbitros-superstars-e-exalta-jogadores-marrentos.html
http://esporte.ig.com.br/futebol/2012-06-28/para-esposa-breno-estava-possuido-pelo-satanas-no-dia-do-incendio.html
http://esporte.ig.com.br/futebol/2012-07-03/acusado-de-incendiar-sua-casa-breno-tomava-uma-garrafa-de-whisky-por-dia.html
http://esporte.ig.com.br/futebol/2013-08-19/breno-deixa-a-prisao-volta-ao-bayern-e-se-considera-uma-pessoa-diferente.html
http://esporte.ig.com.br/basquete/2014-10-09/garoto-de-11-anos-ganha-autografo-de-lebron-e-promete-nunca-mais-lavar-a-camisa.html
http://esporte.ig.com.br/futebol/2014-09-26/arbitros-se-rebelam-contra-criticas-e-ameacam-parar-brasileirao.html
http://esporte.ig.com.br/futebol/2012-06-13/em-julgamento-breno-mantem-silencio-sobre-incendio-na-propria-casa.html
http://esporte.ig.com.br/futebol/2014-09-26/chefe-de-arbitragem-da-cbf-rechaca-profissionalizacao-e-critica-tecnologia.html
http://esporte.ig.com.br/futebol/2012-04-24/ribery-afirma-que-duelo-contra-o-real-madrid-sera-o-jogo-da-sua.html
http://esporte.ig.com.br/futebol/2012-06-13/comeca-na-alemanha-julgamento-de-breno-por-incendio-em-sua-casa.html
http://esporte.ig.com.br/automobilismo/2014-03-19/aos-83-anos-ecclestone-admite-que-pensa-em-aposentadoria.html
http://esporte.ig.com.br/lutas/2012-09-05/anderson-silva-diz-que-ha-boiolas-no-ufc-e-que-usava-vestido-na-infancia.html
http://esporte.ig.com.br/lutas/2012-09-05/ufc-lanca-trailer-da-luta-entre-vitor-belfort-e-jon-jones-assista.html
http://esporte.ig.com.br/lutas/2012-09-03/jose-aldo-cai-da-moto-mas-nao-sofre-ferimentos-graves-e-luta-no-ufc-rio.html
http://esporte.ig.com.br/lutas/2012-08-31/casas-de-aposta-reduzem-favoritismo-de-jose-aldo-no-ufc-rio.html
http://esporte.ig.com.br/lutas/2012-08-29/clone-de-anderson-silva-da-show-e-vence-3-seguida-no-mma.html
http://esporte.ig.com.br/lutas/2012-09-04/arianny-celeste-ring-girl-do-ufc-insiste-em-querer-alavancar-carreira-musical.html
http://esporte.ig.com.br/lutas/2012-08-31/rival-de-aldo-se-machuca-e-edgar-sera-o-adversario-do-brasileiro-no-ufc-rio-3.html
http://esporte.ig.com.br/futebol/2012-05-04/torcida-do-real-madrid-quer-que-diretoria-venda-kaka-aponta-enqu.html
http://esporte.ig.com.br/lutas/2012-08-25/jon-jones-encara-belfort-com-favoritismo-historico-nas-casas-de-apostas.html
http://esporte.ig.com.br/lutas/2012-08-27/chael-sonnen-esquece-rusga-e-declara-torcida-por-belfort-contra-jon-jones.html
http://esporte.ig.com.br/futebol/2012-04-19/na-tv-luis-fabiano-tira-duvida-sobre-sexo-antes-dos-jogos-e-se-a.html
http://esporte.ig.com.br/futebol/2012-05-31/campeonato-ingles-e-o-que-gera-mais-receitas-na-europa-aponta-es.html
http://esporte.ig.com.br/futebol/2012-05-20/apos-titulo-capitao-terry-pede-permanencia-de-di-matteo-no-chels.html
http://esporte.ig.com.br/futebol/2012-05-15/city-e-tevez-se-desculpam-por-cartaz-que-ridicularizava-ferguson.html
http://esporte.ig.com.br/futebol/2012-05-26/beckham-aceita-reducao-no-salario-e-renova-contrato-com-o-galaxy.html
http://esporte.ig.com.br/futebol/2012-05-15/milan-desmente-problemas-entre-ibrahimovic-e-allegri.html
http://esporte.ig.com.br/futebol/2012-04-23/lahm-diz-que-briga-entre-robben-e-ribery-ja-foi-superada.html
http://esporte.ig.com.br/futebol/2012-06-27/cr7-se-diz-feliz-com-rendimento-e-diz-que-espanha-teve-sorte.html
http://esporte.ig.com.br/futebol/2012-06-27/com-erro-na-memoria-sergio-ramos-diz-que-nao-teve-medo-de-bater-penalti.html
http://esporte.ig.com.br/futebol/2012-06-20/anuncio-feito-por-rooney-no-twitter-e-banido-por-reguladores-britanicos.html
http://esporte.ig.com.br/futebol/2012-04-20/manchester-united-segue-no-topo-do-ranking-de-clubes-mais-valios.html
http://esporte.ig.com.br/futebol/2012-05-25/guardiola-avisa-que-espera-ser-seduzido-para-acertar-com-novo-cl.html
http://esporte.ig.com.br/futebol/2012-04-28/era-guardiola-vira-fenomeno-pop-com-idolatria-frases-de-efeito-e.html
http://esporte.ig.com.br/futebol/2012-06-13/qual-o-jogador-mais-feio-da-euro-2012-confira-os-candidatos-e-vote-na-enquete.html

In [158]:
inds = np.where(rows_ind_v4 == 0)[0]
plt.hist(norm(U_v4)[inds, 0], bins=50)

inds = np.where(rows_ind_v4 == 2)[0]
plt.hist(norm(U_v4)[inds, 0], bins=50)

inds = np.where(rows_ind_v4 == 1)[0]
plt.hist(norm(U_v4)[inds, 0], bins=50)


Out[158]:
(array([   0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,  101.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
           0.,    0.,    0.,    0.,    0.]),
 array([-0.5 , -0.48, -0.46, -0.44, -0.42, -0.4 , -0.38, -0.36, -0.34,
        -0.32, -0.3 , -0.28, -0.26, -0.24, -0.22, -0.2 , -0.18, -0.16,
        -0.14, -0.12, -0.1 , -0.08, -0.06, -0.04, -0.02,  0.  ,  0.02,
         0.04,  0.06,  0.08,  0.1 ,  0.12,  0.14,  0.16,  0.18,  0.2 ,
         0.22,  0.24,  0.26,  0.28,  0.3 ,  0.32,  0.34,  0.36,  0.38,
         0.4 ,  0.42,  0.44,  0.46,  0.48,  0.5 ]),
 <a list of 50 Patch objects>)

In [159]:
def print_top_k_words(vec, count_vect, k):
    top_inds, top_vals = top_k(vec, k)
    words = count_vect.get_feature_names()
    top_words = [words[i] for i in top_inds]

    top_pairs = reverse(zip(top_words, top_vals))
    for word, value in top_pairs:
        print '%s - %s' % (word, value)

for i in range(3):
    clust_inds = np.where(rows_ind_v4 == i)[0]
    sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
    print 'Top words for document cluster %s' % i
    print_top_k_words(sum_per_word, count_vect, 20)
    print


Top words for document cluster 0
minutos - 4.05430795115
gol - 3.47540763664
jogo - 3.00859499708
gols - 2.58109919032
bola - 2.56517677888
euro - 2.50224519934
time - 2.49417849529
partida - 2.42022462049
tempo - 2.3805084019
real - 2.22332618913
feira - 2.15254516838
equipe - 2.08617661635
dois - 2.07200269657
pontos - 2.0671860148
segundo - 2.04539924898
rodada - 2.00205504069
grupo - 1.89237620602
espanha - 1.88008972379
vitória - 1.85527390016
espanhol - 1.79768171206

Top words for document cluster 1
jogo - 13.0195645784
jogos - 12.3527084592
of - 6.59321080468
playstation - 6.50286252979
xbox - 5.93517551309
games - 5.41506727974
game - 5.29370574087
novo - 4.93748565046
console - 4.56307259533
sony - 4.15619803355
the - 3.97710024088
wii - 3.89686450689
brasil - 3.6993005896
league - 3.42599673135
nintendo - 3.27248181662
legends - 3.23393553266
apenas - 2.82897655817
one - 2.70936450188
dia - 2.69523330746
lançado - 2.55713526763

Top words for document cluster 2
breno - 3.34439260137
anos - 3.21123386288
ufc - 3.09502559271
jogador - 2.87592086829
brasileiro - 2.56340162511
monetáriointerno - 2.34715958238
feira - 2.32766196006
clube - 2.2752233151
futebol - 2.24095143242
equipe - 2.21010688312
time - 2.10268196063
técnico - 2.00246846241
casa - 1.94309360546
torcedores - 1.91960094743
ano - 1.88186712042
campeão - 1.86394929638
rio - 1.85681058996
paulo - 1.85100326146
bayern - 1.84795808388
munique - 1.82609542912


In [ ]: