In [40]:
import pandas as pd
import numpy as np
import seaborn as sns
import sys
import codecs
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize  # used by matrix_factorization_clustering(..., norm=True)
from sklearn import preprocessing  # used by fnmtf(..., norm=True)
In [21]:
sys.setrecursionlimit(1000000000)
In [22]:
%matplotlib inline
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (16, 7)})
In [23]:
arena_news_stem_df = pd.read_pickle('arena_news_stem_df.pkl')
sport_news_stem_df = pd.read_pickle('sport_news_stem_df.pkl')
jovem_news_stem_df = pd.read_pickle('jovem_news_stem_df.pkl')
arena_news_df = pd.read_pickle('arena_news_df.pkl')
sport_news_df = pd.read_pickle('sport_news_df.pkl')
jovem_news_df = pd.read_pickle('jovem_news_df.pkl')
In [24]:
labels = np.array(len(arena_news_df)*[1] + len(sport_news_df)*[0])
In [25]:
count_vect = CountVectorizer(encoding='UTF-8',lowercase=False, min_df=2)
X = count_vect.fit_transform(arena_news_df['all'].tolist() + sport_news_df['all'].tolist())
X_train_norm_tfidf = TfidfTransformer(norm=u'l2', use_idf=True).fit_transform(X)
X_train_norm = TfidfTransformer(norm=u'l2', use_idf=False).fit_transform(X)
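Everything below works on X_train_norm; as a quick sanity check (a minimal sketch, not part of the original pipeline), with norm='l2' and use_idf=False every document that contains at least one vocabulary term should be a unit-length vector of relative term frequencies:
# Count rows whose L2 norm deviates from 1 (only empty documents, if any, should show up).
row_norms = np.sqrt(np.asarray(X_train_norm.multiply(X_train_norm).sum(axis=1)).ravel())
print(np.sum(np.abs(row_norms - 1.0) > 1e-6))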
In [38]:
def to_csv(some_list, file_name, header):
    def to_str(st):
        # leave strings/unicode untouched, stringify everything else
        if isinstance(st, (str, unicode)):
            return st
        else:
            return str(st)
    with codecs.open(file_name, 'w', 'utf-8') as file_stream:
        if len(header) != 0:
            file_stream.write(u','.join(header) + '\n')
        for item in some_list:
            if isinstance(item, (tuple, list)):
                line = u','.join(map(to_str, item)) + '\n'
            else:
                line = item + '\n'
            file_stream.write(line)
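A minimal usage sketch of to_csv (the file name and rows below are just examples): it expects a list of tuples or lists plus a header list.
# Illustrative only: write two rows and read the file back with pandas.
to_csv([(u'exemplo', 0.5, 3), (u'gol', 0.25, 7)],
       'to_csv_sanity_check.csv', ['word', 'score', 'count'])
print(pd.read_csv('to_csv_sanity_check.csv'))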
In [28]:
def _big_s(x, center):
len_x = len(x)
total = 0
for i in range(len_x):
total += np.linalg.norm(x[i]-center)
return total / len_x
def davies_bouldin_score(X, labels_pred, k_centers):
num_clusters, _ = k_centers.shape
big_ss = np.zeros([num_clusters], dtype=np.float64)
d_eucs = np.zeros([num_clusters, num_clusters], dtype=np.float64)
db = 0
for k in range(num_clusters):
samples_in_k_inds = np.where(labels_pred == k)[0]
samples_in_k = X[samples_in_k_inds, :]
big_ss[k] = _big_s(samples_in_k, k_centers[k])
for k in range(num_clusters):
for l in range(0, num_clusters):
d_eucs[k, l] = np.linalg.norm(k_centers[k]-k_centers[l])
for k in range(num_clusters):
values = np.zeros([num_clusters-1], dtype=np.float64)
for l in range(0, k):
values[l] = (big_ss[k] + big_ss[l])/d_eucs[k, l]
for l in range(k+1, num_clusters):
values[l-1] = (big_ss[k] + big_ss[l])/d_eucs[k, l]
db += np.max(values)
res = db / num_clusters
return res
def calculate_centroids_doc_mean(X, labels_pred, k):
    _, m = X.shape
    centroids = np.zeros((k, m))
    for clust in range(k):  # avoid shadowing the parameter k
        samples_in_clust_inds = np.where(labels_pred == clust)[0]
        centroids[clust, :] = X[samples_in_clust_inds, :].mean(axis=0)
    return centroids
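The Davies-Bouldin index averages, over all clusters, the worst-case ratio (S_k + S_l) / d(c_k, c_l) of within-cluster scatter to centroid distance, so lower is better. A minimal sanity check on synthetic blobs (toy names, not corpus data):
# Two well-separated Gaussian blobs should give a Davies-Bouldin value well below 1.
np.random.seed(0)
toy_X = np.vstack([np.random.randn(50, 5), np.random.randn(50, 5) + 10.0])
toy_labels = np.array([0] * 50 + [1] * 50)
toy_centers = calculate_centroids_doc_mean(toy_X, toy_labels, 2)
print(davies_bouldin_score(toy_X, toy_labels, toy_centers))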
In [18]:
def onmtf(X, U, S, V):
U = U * ((X.dot(V).dot(S.T)) / (U.dot(S).dot(V.T).dot(X.T).dot(U)))
V = V * ((X.T.dot(U).dot(S)) / (V.dot(S.T).dot(U.T).dot(X).dot(V)))
S = S * ((U.T.dot(X).dot(V)) / (U.T.dot(U).dot(S).dot(V.T).dot(V)))
return U, S, V
def onm3f(X, U, S, V):
U = U * (X.dot(V).dot(S.T)) / np.sqrt(U.dot(U.T).dot(X).dot(V).dot(S.T))
V = V * (X.T.dot(U).dot(S)) / np.sqrt(V.dot(V.T).dot(X.T).dot(U).dot(S))
S = S * (U.T.dot(X).dot(V)) / np.sqrt(U.T.dot(U).dot(S).dot(V.T).dot(V))
return U, S, V
def nbvd(X, U, S, V):
U = U * (X.dot(V).dot(S.T)) / U.dot(U.T).dot(X).dot(V).dot(S.T)
V = V * (X.T.dot(U).dot(S)) / V.dot(V.T).dot(X.T).dot(U).dot(S)
S = S * (U.T.dot(X).dot(V)) / U.T.dot(U).dot(S).dot(V.T).dot(V)
return U, S, V
def matrix_factorization_clustering(X, k, l, factorization_func=onmtf, norm=False, num_iters=100):
m, n = X.shape
U = np.random.rand(m,k)
S = np.random.rand(k,l)
V = np.random.rand(n,l)
if norm:
X = normalize(X)
for i in xrange(num_iters):
U, S, V = factorization_func(X, U, S, V)
error = np.sum((X - U.dot(S).dot(V.T)) ** 2)
print error
Du = np.diag(np.ones(m).dot(U))
Dv = np.diag(np.ones(n).dot(V))
U = U.dot( np.diag(S.dot(Dv).dot(np.ones(l))) )
V = V.dot( np.diag(np.ones(k).dot(Du).dot(S)) )
rows_ind = np.argmax(U, axis=1)
cols_ind = np.argmax(V, axis=1)
return U, S, V, rows_ind, cols_ind, error
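onmtf, onm3f and nbvd are multiplicative-update variants for the nonnegative tri-factorization X ≈ U S Vᵀ, with U holding soft document-cluster memberships and V soft term-cluster memberships; matrix_factorization_clustering rescales U and V and takes the per-row argmax as hard assignments. A minimal sketch on a toy block matrix (names are illustrative, the outcome depends on the random initialization, and the call prints the reconstruction error at every iteration):
# Toy 2x2 block structure: rows 0-9 use terms 0-4, rows 10-19 use terms 5-9.
np.random.seed(1)
toy = np.zeros((20, 10))
toy[:10, :5] = 1.0 + 0.1 * np.random.rand(10, 5)
toy[10:, 5:] = 1.0 + 0.1 * np.random.rand(10, 5)
U_toy, S_toy, V_toy, row_toy, col_toy, err_toy = matrix_factorization_clustering(
    toy, 2, 2, onmtf, num_iters=50)
print(row_toy)  # the first 10 rows and the last 10 rows should land in different clusters
print(col_toy)  # terms 0-4 vs. terms 5-9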
In [29]:
def matrix_factorization_overlapping_bin(X, k, l, num_iters=50):
def weights_initialization(X, n, m, k):
shuffle_inds = np.random.permutation(n)
cluster_end_ind = 0
for i in xrange(k):
cluster_init_ind = cluster_end_ind
            cluster_end_ind = int(round((i + 1) * n / float(k)))
X[shuffle_inds[cluster_init_ind : cluster_end_ind], i] = 1
return X
def calculate_block_matrix(X, F, G, S, k, l):
for i in xrange(k):
for j in xrange(l):
S[i, j] = np.mean(X[F[:, i] == 1][:, G[i][:, j] == 1])
where_are_NaNs = np.isnan(S)
S[where_are_NaNs] = 0
return S
n, m = X.shape
F = weights_initialization(np.zeros((n, k)), n, m, k)
G = []
for i in xrange(k):
G.append( weights_initialization(np.zeros((m, l)), m, n, l) )
S = np.random.rand(k, l)
for iter_ind in xrange(num_iters):
S = calculate_block_matrix(X, F, G, S, k, l)
for i in xrange(k):
F_t = F[F[:, i] == 1, :].dot(S)
X_t = X[F[:, i] == 1, :]
G[i] = np.zeros((m, l))
for j in xrange(m):
clust_len, _ = X_t.shape
diff = F_t - X_t[:, j].reshape(clust_len, 1).dot(np.ones(l).reshape(1, l))
errors = np.diag(diff.T.dot(diff))
minV = np.min(errors)
index = np.where(errors <= minV)[0]
G[i][j, index[np.random.randint(len(index))]] = 1
# S = calculate_block_matrix(X, F, G, S, k, l)
G_t = np.zeros((k, m))
for i in xrange(k):
G_t[i, :] = S[i, :].dot(G[i].T)
F = np.zeros((n, k))
for j in xrange(n):
diff = G_t - np.ones(k).reshape(k, 1).dot(X[j, :].reshape(1, m))
errors = np.diag(diff.dot(diff.T))
minV = np.min(errors)
index = np.where(errors <= minV)[0]
F[j, index[np.random.randint(len(index))]] = 1
G_t = np.zeros((k, m))
for i in xrange(k):
G_t[i, :] = S[i, :].dot(G[i].T)
error = np.sum((X - F.dot(G_t))**2)
rows_ind = np.argmax(F, axis=1)
return F, S, G, G_t, rows_ind, error
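In this binary variant F is a hard document-cluster indicator, each document cluster i has its own binary term-cluster indicator G[i], and S holds the mean value of each (document cluster, term cluster) block, so the reconstruction is F.dot(G_t) with G_t[i, :] = S[i, :].dot(G[i].T). A minimal sketch on the same kind of toy block matrix (illustrative names; the hard assignments depend on the random initialization):
# Two clean blocks, two document clusters, two term clusters per document cluster.
np.random.seed(2)
toy = np.zeros((20, 10))
toy[:10, :5] = 1.0
toy[10:, 5:] = 1.0
F_toy, S_toy, G_toy, Gt_toy, row_toy, err_toy = matrix_factorization_overlapping_bin(
    toy, 2, 2, num_iters=20)
print(row_toy)  # hard document-cluster labels
print(err_toy)  # squared reconstruction error, ideally close to 0 for this toy case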
In [10]:
def fnmtf(X, k, l, num_iter=100, norm=False):
m, n = X.shape
U = np.random.rand(m,k)
S = np.random.rand(k,l)
V = np.random.rand(n,l)
if norm:
X = preprocessing.normalize(X)
for i in xrange(num_iter):
S = np.linalg.pinv(U.T.dot(U)).dot(U.T).dot(X).dot(V).dot(np.linalg.pinv(V.T.dot(V)))
# solve subproblem to update V
U_tilde = U.dot(S)
V_new = np.zeros(n*l).reshape(n, l)
for j in xrange(n):
errors = np.zeros(l)
for col_clust_ind in xrange(l):
errors[col_clust_ind] = ((X[:][:, j] - U_tilde[:][:, col_clust_ind])**2).sum()
ind = np.argmin(errors)
V_new[j][ind] = 1
# solve subproblem to update U
V_tilde = S.dot(V.T)
U_new = np.zeros(m*k).reshape(m, k)
for i in xrange(m):
errors = np.zeros(k)
for row_clust_ind in xrange(k):
errors[row_clust_ind] = ((X[i][:] - V_tilde[row_clust_ind][:])**2).sum()
ind = np.argmin(errors)
U_new[i][ind] = 1
U = U_new
V = V_new
rows_ind = np.argmax(U, axis=1)
cols_ind = np.argmax(V, axis=1)
return U, S, V, rows_ind, cols_ind
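fnmtf is the "fast" variant: S is solved in closed form with pseudo-inverses, while rows and columns are reassigned to clusters by hard argmin. It is not used in the cells below, but a toy call (illustrative names only) looks like this:
# fnmtf returns hard row/column cluster indicators directly.
np.random.seed(3)
toy = np.zeros((20, 10))
toy[:10, :5] = 1.0
toy[10:, 5:] = 1.0
U_toy, S_toy, V_toy, row_toy, col_toy = fnmtf(toy, 2, 2, num_iter=30)
print(row_toy)
print(col_toy)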
In [30]:
def rand_score(labels_true, labels_pred):
return 'Rand score: %s' % adjusted_rand_score(labels_true, labels_pred)
def sil_score(X, labels_pred):
score = silhouette_score(X, labels_pred)
return 'Silhouette score: %s' % score
def db_score(X, labels_pred, k_centers):
return 'Davies-Bouldin index: %s' % davies_bouldin_score(X, labels_pred, k_centers)
In [37]:
best = 0.0
for _ in xrange(5):
U_t, S_t, V_t, rows_ind_t, cols_ind_t, error = matrix_factorization_clustering(X_train_norm.toarray(), 2, 2, onmtf, num_iters=100)
    rand_sc = adjusted_rand_score(labels, rows_ind_t)
    if rand_sc > best:  # keep the run with the highest adjusted Rand score
        best = rand_sc
U = U_t
S = S_t
V = V_t
rows_ind = rows_ind_t
cols_ind = cols_ind_t
print 'tf norm: %s' % rand_score(labels, rows_ind_t)
print 'tf norm: %s' % sil_score(X_train_norm, rows_ind_t)
print 'tf norm: %s' % db_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
print ''
In [32]:
def print_hist(U):
_, k = U.shape
U_norm = U[:, 0] / np.sum(U, axis=1)
plt.title('U norm 0')
plt.hist(U_norm, bins=100)
plt.show()
print_hist(U)
In [33]:
def norm(U):
_, k = U.shape
return U / np.tile(np.sum(U, axis=1).T, (k,1)).T
def pairplot(U):
sns.pairplot(pd.DataFrame(norm(U)))
pairplot(U)
In [34]:
print_hist(V)
In [35]:
pairplot(V)
In [45]:
print S
np.savetxt('onmtf_2x2_S.csv', S, delimiter=",")
In [41]:
def top_k(arr, k, axis=0):
top_inds = np.argsort(arr, axis=axis)[-k:]
top_vals = np.sort(arr, axis=axis)[-k:]
return top_inds, top_vals
def reverse(arr):
return arr[::-1]
def top_k_words_term_cluster(vec, X, count_vect, k):
sum_per_word = np.sum(X, axis=0)
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
correspondence_vals = [sum_per_word[i] for i in top_inds]
top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))
return top_pairs
V_norm = norm(V)
for i in xrange(2):
print 'Top words for term cluster %s:' % i
top_pairs = top_k_words_term_cluster(V[:, i], X_train_norm.toarray(), count_vect, 9999999)
for w, v_value, tf_value in top_pairs[0:30]:
print w, v_value, tf_value
to_csv(top_pairs, 'onmtf_2x2_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
print
# print 'Top words for middle cluster:'
# inds = np.where((V_norm[:, 0] >= 0.25) & (V_norm[:, 0] <= 0.75))[0]
# normalize = np.vectorize(lambda x: 1.0 - x if x < 0.5 else x, otypes=[np.float])
# new_V0 = normalize(V_norm[inds, 0])
# top_k_words_term_cluster(new_V0, X_train_norm.toarray(), count_vect, 30)
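Note that top_k returns the k largest entries in ascending order (np.argsort sorts ascending and the slice keeps the tail), which is why reverse() is applied before printing. A quick check on a toy vector:
# Expected: indices [0, 3, 1] and values [0.2, 0.7, 0.9], i.e. the three largest entries, smallest first.
toy_vec = np.array([0.2, 0.9, 0.1, 0.7])
print(top_k(toy_vec, 3))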
In [42]:
best_sil = 1e-10
best_davies = 1e10
for down_lim in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
for up_lim in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]:
new_labels_pred = rows_ind.copy()
inds = np.where((norm(U)[:,0] >= down_lim) & (norm(U)[:,0] <= up_lim))[0]
if len(inds) == 0:
continue
print 'Num elems in new cluster: %s' % len(inds)
new_labels_pred[inds] = 2
print 'cluster from ' + str(down_lim) + ' to ' + str(up_lim)
print sil_score(X_train_norm, new_labels_pred)
print db_score(X_train_norm.toarray(), new_labels_pred, calculate_centroids_doc_mean(X_train_norm.toarray(), new_labels_pred, 3))
print ''
davies = davies_bouldin_score(X_train_norm.toarray(), new_labels_pred, calculate_centroids_doc_mean(X_train_norm.toarray(), new_labels_pred, 3))
sil = silhouette_score(X_train_norm, new_labels_pred)
        if sil > best_sil:  # inds is guaranteed non-empty here (see the continue above)
            best_sil = sil
        if davies < best_davies:
            best_davies = davies
# print '|[%s, %s]| %s | %s |' % (down_lim, up_lim, davies, sil)
print 'Best sil score: %s' % best_sil
print 'Best davies score: %s' % best_davies
Cut | Davies | Silhouette |
---|---|---|
[0.05, 0.1] | 5.57285157009 | 0.0109045112115 |
[0.05, 0.15] | 5.71182779465 | 0.0119656192241 |
[0.05, 0.2] | 5.94127110479 | 0.0130751820341 |
[0.05, 0.25] | 6.05298958519 | 0.0165451762916 |
[0.05, 0.3] | 6.19170173754 | 0.0165380440629 |
[0.05, 0.4] | 6.32245257561 | 0.0168367278657 |
[0.05, 0.5] | 6.26966353994 | 0.0167316193288 |
[0.05, 0.6] | 6.40807743382 | 0.016857356078 |
[0.05, 0.7] | 6.4455544414 | 0.0172006814584 |
[0.05, 0.75] | 6.55786721717 | 0.0172403307824 |
[0.05, 0.8] | 6.56712481723 | 0.0171239228272 |
[0.05, 0.85] | 6.64891705295 | 0.0172794978116 |
[0.05, 0.9] | 6.65734610999 | 0.0175368723638 |
[0.05, 0.95] | 6.43470626189 | 0.0185023158634 |
[0.1, 0.15] | 3.74938336864 | 0.0200320140581 |
[0.1, 0.2] | 4.95423215656 | 0.0122445704199 |
[0.1, 0.25] | 5.24774207861 | 0.0189533279511 |
[0.1, 0.3] | 5.4775435174 | 0.0184107763245 |
[0.1, 0.4] | 5.64061493743 | 0.0188122752387 |
[0.1, 0.5] | 5.70074585431 | 0.0180512491603 |
[0.1, 0.6] | 5.89695453958 | 0.0183086502181 |
[0.1, 0.7] | 5.99123959399 | 0.0187545346923 |
[0.1, 0.75] | 6.16413919406 | 0.0189229292149 |
[0.1, 0.8] | 6.21459398725 | 0.0187118565694 |
[0.1, 0.85] | 6.36202785265 | 0.0193215518523 |
[0.1, 0.9] | 6.38301325195 | 0.0198651799831 |
[0.1, 0.95] | 6.27753852895 | 0.0214916194895 |
[0.15, 0.2] | 4.86700812091 | 0.0105439794849 |
[0.15, 0.25] | 5.11926762452 | 0.0185657354485 |
[0.15, 0.3] | 5.37908380917 | 0.0180149889451 |
[0.15, 0.4] | 5.52561494797 | 0.0185036943009 |
[0.15, 0.5] | 5.5918038188 | 0.0176895051053 |
[0.15, 0.6] | 5.8429610543 | 0.0179170884173 |
[0.15, 0.7] | 5.94529904844 | 0.0184202374396 |
[0.15, 0.75] | 6.1260691135 | 0.0186400603245 |
[0.15, 0.8] | 6.18021555466 | 0.0184370429761 |
[0.15, 0.85] | 6.34353934871 | 0.0191550753882 |
[0.15, 0.9] | 6.38099084216 | 0.0197350762573 |
[0.15, 0.95] | 6.27056371935 | 0.0214928456367 |
[0.2, 0.25] | 4.59715750094 | 0.0258680002387 |
[0.2, 0.3] | 4.80974711933 | 0.0231307008972 |
[0.2, 0.4] | 4.95393026335 | 0.0226034756124 |
[0.2, 0.5] | 5.10066417371 | 0.0204652333684 |
[0.2, 0.6] | 5.40739503099 | 0.0200695117979 |
[0.2, 0.7] | 5.56437964818 | 0.0205194066316 |
[0.2, 0.75] | 5.80028313436 | 0.0207765470706 |
[0.2, 0.8] | 5.92046824713 | 0.0203277378932 |
[0.2, 0.85] | 6.28634021964 | 0.0211087702694 |
[0.2, 0.9] | 6.34304665175 | 0.0218428354181 |
[0.2, 0.95] | 6.18437325622 | 0.0237885594771 |
[0.25, 0.3] | 4.22985981229 | 0.0135897730311 |
[0.25, 0.4] | 4.61305561043 | 0.0164386610319 |
[0.25, 0.5] | 4.76145280384 | 0.0130370501955 |
[0.25, 0.6] | 4.89577281732 | 0.015318260869 |
[0.25, 0.7] | 4.96585257909 | 0.0168170647366 |
[0.25, 0.75] | 5.28245473331 | 0.0178483728072 |
[0.25, 0.8] | 5.49589674625 | 0.0174806253808 |
[0.25, 0.85] | 6.01445885302 | 0.0196945682697 |
[0.25, 0.9] | 6.11307966306 | 0.0208421309285 |
[0.25, 0.95] | 6.00289644331 | 0.0236550415501 |
[0.3, 0.4] | 4.18386814443 | 0.0177595898038 |
[0.3, 0.5] | 4.42658568564 | 0.0124694379783 |
[0.3, 0.6] | 4.6461742013 | 0.0160554383447 |
[0.3, 0.7] | 4.78111868642 | 0.017754651206 |
[0.3, 0.75] | 4.94213034874 | 0.0188523697376 |
[0.3, 0.8] | 5.11398707258 | 0.0183449197684 |
[0.3, 0.85] | 5.77885390779 | 0.0205368208888 |
[0.3, 0.9] | 5.90212574162 | 0.021749591286 |
[0.3, 0.95] | 5.86506247584 | 0.0245266195333 |
[0.4, 0.5] | 3.75873542708 | -0.00205947886867 |
[0.4, 0.6] | 4.56282447318 | 0.0122479569878 |
[0.4, 0.7] | 4.72491632079 | 0.0158173158729 |
[0.4, 0.75] | 4.91515970836 | 0.0177338628519 |
[0.4, 0.8] | 5.03705345485 | 0.0173025687048 |
[0.4, 0.85] | 5.6129703548 | 0.0204270263155 |
[0.4, 0.9] | 5.81215490242 | 0.0216658358824 |
[0.4, 0.95] | 5.76329923942 | 0.0248179289182 |
[0.5, 0.6] | 4.40042799038 | 0.0163728994807 |
[0.5, 0.7] | 4.60320172881 | 0.0191154799039 |
[0.5, 0.75] | 4.81979472845 | 0.0203328470184 |
[0.5, 0.8] | 4.94125752597 | 0.0194989514475 |
[0.5, 0.85] | 5.43863505151 | 0.0218455272703 |
[0.5, 0.9] | 5.63343994389 | 0.022917219329 |
[0.5, 0.95] | 5.65737212181 | 0.0256510868347 |
[0.6, 0.7] | 4.19429867276 | 0.0207153509952 |
[0.6, 0.75] | 4.5572477842 | 0.0219253916023 |
[0.6, 0.8] | 4.71365451732 | 0.0203517172815 |
[0.6, 0.85] | 5.25603888145 | 0.0221018824178 |
[0.6, 0.9] | 5.5575547228 | 0.023039679348 |
[0.6, 0.95] | 5.63857058938 | 0.0253660684729 |
[0.7, 0.75] | 4.16919175321 | 0.0218116628449 |
[0.7, 0.8] | 4.41973575661 | 0.0196048379769 |
[0.7, 0.85] | 4.98999126621 | 0.0221993642761 |
[0.7, 0.9] | 5.33295096418 | 0.0232778373442 |
[0.7, 0.95] | 5.55578778726 | 0.0253850390862 |
[0.75, 0.8] | 3.75847932539 | 0.013086650073 |
[0.75, 0.85] | 4.8387599148 | 0.0219691396746 |
[0.75, 0.9] | 5.06389873879 | 0.0233348449246 |
[0.75, 0.95] | 5.42952171046 | 0.0255965371499 |
[0.8, 0.85] | 4.80277799744 | 0.022240997867 |
[0.8, 0.9] | 5.01524051316 | 0.0236135156815 |
[0.8, 0.95] | 5.38384596029 | 0.0257300306828 |
[0.85, 0.9] | 4.453963228 | 0.0227579527234 |
[0.85, 0.95] | 5.21634727289 | 0.0240909201119 |
[0.9, 0.95] | 5.07232266265 | 0.022501379554 |
In [49]:
rows_inds_v1 = rows_ind.copy()
inds = np.where((norm(U)[:,0] >= 0.25) & (norm(U)[:,0] <= 0.75))[0]
rows_inds_v1[inds] = 2
rows_inds_v1
Out[49]:
In [51]:
def top_k_words(vec, V, count_vect, k):
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
top_V_vals0 = [V[i, 0] for i in top_inds]
top_V_vals1 = [V[i, 1] for i in top_inds]
top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_vals))
return top_pairs
for i in range(3):
clust_inds = np.where(rows_inds_v1 == i)[0]
sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
print 'Top words for document cluster %s' % i
top_pairs = top_k_words(sum_per_word, V, count_vect, 99999999)
for word, v0_val, v1_val, tf_val in top_pairs[0:30]:
print word, v0_val, v1_val, tf_val
print
to_csv(top_pairs, 'onmtf_2x2_doc_cluster_%s_words_top.csv' % i,
['word', 'V cluster 0 value', 'V cluster 1 value', 'tf norm value'])
In [60]:
best = 1e10
for _ in xrange(5):
U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 3, 2, onmtf, num_iters=200)
try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 3))
except:
continue
if err < best:
best = err
U_v2 = U_t
S_v2 = S_t
V_v2 = V_t
rows_ind_v2 = rows_ind_t
cols_ind_v2 = cols_ind_t
print 'tf norm (3 clusters): %s' % rand_score(labels, rows_ind_t)
print 'tf norm (3 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
print 'tf norm (3 clusters): %s' % dav_sc
print ''
In [62]:
pairplot(U_v2)
In [63]:
pairplot(V_v2)
In [64]:
def top_k_words_term_cluster(vec, X, count_vect, k):
sum_per_word = np.sum(X, axis=0)
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
correspondence_vals = [sum_per_word[i] for i in top_inds]
top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))
return top_pairs
V_norm_v2 = norm(V_v2)
for i in xrange(2):
print 'Top words for term cluster %s:' % i
top_pairs = top_k_words_term_cluster(V_v2[:, i], X_train_norm.toarray(), count_vect, 9999999)
for w, v_value, tf_value in top_pairs[0:30]:
print w, v_value, tf_value
to_csv(top_pairs, 'onmtf_3x2_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
print
# print 'Top words for middle cluster:'
# inds = np.where((V_v2[:, 0] >= 0.1) & (V_v2[:, 0] <= 0.9))[0]
# normalize = np.vectorize(lambda x: 1.0 - x if x < 0.5 else x, otypes=[np.float])
# new_V0_v2 = normalize(V_norm_v2[inds, 0])
# print_top_k_words_term_cluster(new_V0_v2, X_train_norm.toarray(), count_vect, 30)
In [65]:
print S_v2
np.savetxt('onmtf_3x2_S.csv', S_v2, delimiter=",")
In [76]:
def top_k_words(vec, V, count_vect, k):
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
top_V_vals0 = [V[i, 0] for i in top_inds]
top_V_vals1 = [V[i, 1] for i in top_inds]
top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_vals))
return top_pairs
for i in range(3):
clust_inds = np.where(rows_ind_v2 == i)[0]
sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
print 'Top words for document cluster %s' % i
top_pairs = top_k_words(sum_per_word, V_v2, count_vect, 99999999)
for word, v0_val, v1_val, tf_val in top_pairs[0:30]:
print word, v0_val, v1_val, tf_val
print
to_csv(top_pairs, 'onmtf_3x2_doc_cluster_%s_words_top.csv' % i,
['word', 'V cluster 0 value', 'V cluster 1 value', 'tf norm value'])
In [61]:
print 'Num elems in cluster 0: %s' % np.sum(rows_ind_v2 == 0)
print 'Num elems in cluster 1: %s' % np.sum(rows_ind_v2 == 1)
print 'Num elems in cluster 2: %s' % np.sum(rows_ind_v2 == 2)
print rows_ind_v2
print np.where(rows_ind_v2 == 0)
for url in sport_news_df.ix[np.where(rows_ind_v2 == 0)[0]-100].url:
print url
In [67]:
inds = np.where(rows_ind_v2 == 0)[0]
plt.hist(norm(U)[inds, 0], bins=50)
inds = np.where(rows_ind_v2 == 2)[0]
plt.hist(norm(U)[inds, 0], bins=50)
inds = np.where(rows_ind_v2 == 1)[0]
plt.hist(norm(U)[inds, 0], bins=50)
Out[67]:
In [68]:
def overlap(a, b, k, l):
    # Compare cluster k of labelling a with cluster l of labelling b:
    # return the indices that fall in both, plus intersection-over-union.
    clust_a = a == k
    clust_b = b == l
    inds = []
    sum_all = 0
    sum_equals = 0
    for i in xrange(len(clust_a)):
        if not clust_a[i] and not clust_b[i]:
            continue
        sum_all += 1
        if clust_a[i] and clust_b[i]:
            sum_equals += 1
            inds.append(i)
    return np.array(inds), float(sum_equals) / sum_all
print 'Do they overlap on cluster 2?'
inds, overlap_rate = overlap(rows_inds_v1, rows_ind_v2, 2, 0)
print '%.2f' % overlap_rate
print inds
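overlap() compares cluster k of one labelling with cluster l of another: it returns the indices that belong to both and a Jaccard-style rate, intersection over union. A toy check (illustrative labels):
# Positions 0-2 are in cluster 2 of toy_a; positions 0, 1 and 3 are in cluster 0 of toy_b.
# Union = 4 positions, intersection = 2, so the rate is 0.5.
toy_a = np.array([2, 2, 2, 0, 0])
toy_b = np.array([0, 0, 1, 0, 1])
inds_toy, rate_toy = overlap(toy_a, toy_b, 2, 0)
print(inds_toy)   # [0 1]
print(rate_toy)   # 0.5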
In [69]:
best = 1e10
for _ in xrange(5):
U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 2, 3, onmtf, num_iters=100)
try:
dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
except:
continue
if err < best:
best = err
U_v7 = U_t
S_v7 = S_t
V_v7 = V_t
rows_ind_v7 = rows_ind_t
cols_ind_v7 = cols_ind_t
    print 'tf norm (2 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (2 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (2 clusters): %s' % dav_sc
print ''
In [77]:
print S_v7
np.savetxt('onmtf_2x3_S.csv', S_v7, delimiter=",")
In [78]:
# for i in range(2):
# clust_inds = np.where(rows_ind_v7 == i)[0]
# sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
# print 'Top words for document cluster %s' % i
# print_top_k_words(sum_per_word, count_vect, 20)
# print
def top_k_words(vec, V, count_vect, k):
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
top_V_vals0 = [V[i, 0] for i in top_inds]
top_V_vals1 = [V[i, 1] for i in top_inds]
top_V_vals2 = [V[i, 2] for i in top_inds]
top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_V_vals2, top_vals))
return top_pairs
for i in range(2):
clust_inds = np.where(rows_ind_v7 == i)[0]
sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
print 'Top words for document cluster %s' % i
top_pairs = top_k_words(sum_per_word, V_v7, count_vect, 99999999)
for word, v0_val, v1_val, v2_val, tf_val in top_pairs[0:30]:
print word, v0_val, v1_val, v2_val, tf_val
print
to_csv(top_pairs, 'onmtf_2x3_doc_cluster_%s_words_top.csv' % i,
['word', 'V cluster 0 value', 'V cluster 1 value', 'V cluster 2 value', 'tf norm value'])
In [79]:
# V_norm_v7 = norm(V_v7)
# print 'Top words for term cluster 0:'
# print_top_k_words_term_cluster(V_v7[:, 0], X_train_norm.toarray(), count_vect, 30)
# print
# print 'Top words for term cluster 1:'
# print_top_k_words_term_cluster(V_v7[:, 1], X_train_norm.toarray(), count_vect, 30)
# print
# print 'Top words for term cluster 3:'
# print_top_k_words_term_cluster(V_v7[:, 2], X_train_norm.toarray(), count_vect, 30)
# print
def top_k_words_term_cluster(vec, X, count_vect, k):
sum_per_word = np.sum(X, axis=0)
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
correspondence_vals = [sum_per_word[i] for i in top_inds]
top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))
return top_pairs
V_norm_v7 = norm(V_v7)
for i in xrange(3):
print 'Top words for term cluster %s:' % i
top_pairs = top_k_words_term_cluster(V_v7[:, i], X_train_norm.toarray(), count_vect, 9999999)
for w, v_value, tf_value in top_pairs[0:30]:
print w, v_value, tf_value
to_csv(top_pairs, 'onmtf_2x3_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
print
In [80]:
best = 1e10
for _ in xrange(5):
U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 3, 3, onmtf, num_iters=100)
try:
        dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 3))
except:
continue
if err < best:
best = err
U_v6 = U_t
S_v6 = S_t
V_v6 = V_t
rows_ind_v6 = rows_ind_t
cols_ind_v6 = cols_ind_t
print 'tf norm (3 clusters): %s' % rand_score(labels, rows_ind_t)
print 'tf norm (3 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
print 'tf norm (3 clusters): %s' % dav_sc
print ''
In [81]:
print S_v6
np.savetxt('onmtf_3x3_S.csv', S_v6, delimiter=",")
In [82]:
print 'Num elems in cluster 0: %s' % np.sum(rows_ind_v6 == 0)
print 'Num elems in cluster 1: %s' % np.sum(rows_ind_v6 == 1)
print 'Num elems in cluster 2: %s' % np.sum(rows_ind_v6 == 2)
print rows_ind_v6
print np.where(rows_ind_v6 == 1)
for url in arena_news_df.ix[np.where(rows_ind_v6 == 1)[0]].url:
print url
In [83]:
# for i in range(3):
# clust_inds = np.where(rows_ind_v6 == i)[0]
# sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
# print 'Top words for document cluster %s' % i
# print_top_k_words(sum_per_word, count_vect, 20)
# print
def top_k_words(vec, V, count_vect, k):
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
top_V_vals0 = [V[i, 0] for i in top_inds]
top_V_vals1 = [V[i, 1] for i in top_inds]
top_V_vals2 = [V[i, 2] for i in top_inds]
top_pairs = reverse(zip(top_words, top_V_vals0, top_V_vals1, top_V_vals2, top_vals))
return top_pairs
for i in range(3):
clust_inds = np.where(rows_ind_v6 == i)[0]
sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
print 'Top words for document cluster %s' % i
top_pairs = top_k_words(sum_per_word, V_v6, count_vect, 99999999)
for word, v0_val, v1_val, v2_val, tf_val in top_pairs[0:30]:
print word, v0_val, v1_val, v2_val, tf_val
print
to_csv(top_pairs, 'onmtf_3x3_doc_cluster_%s_words_top.csv' % i,
['word', 'V cluster 0 value', 'V cluster 1 value', 'V cluster 2 value', 'tf norm value'])
In [84]:
# V_norm_v6 = norm(V_v6)
# print 'Top words for term cluster 0:'
# print_top_k_words_term_cluster(V_v6[:, 0], X_train_norm.toarray(), count_vect, 30)
# print
# print 'Top words for term cluster 1:'
# print_top_k_words_term_cluster(V_v6[:, 1], X_train_norm.toarray(), count_vect, 30)
# print
# print 'Top words for term cluster 3:'
# print_top_k_words_term_cluster(V_v6[:, 2], X_train_norm.toarray(), count_vect, 30)
# print
def top_k_words_term_cluster(vec, X, count_vect, k):
sum_per_word = np.sum(X, axis=0)
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
correspondence_vals = [sum_per_word[i] for i in top_inds]
top_pairs = reverse(zip(top_words, top_vals, correspondence_vals))
return top_pairs
V_norm_v6 = norm(V_v6)
for i in xrange(3):
print 'Top words for term cluster %s:' % i
top_pairs = top_k_words_term_cluster(V_v6[:, i], X_train_norm.toarray(), count_vect, 9999999)
for w, v_value, tf_value in top_pairs[0:30]:
print w, v_value, tf_value
to_csv(top_pairs, 'onmtf_3x3_V%s_words_top.csv' % i, ['word', 'V cluster %s value' % i, 'tf norm value'])
print
In [195]:
best = np.inf
for _ in xrange(5):
U_t, S_t, V_t, rows_ind_t, cols_ind_t, err = matrix_factorization_clustering(X_train_norm.toarray(), 2, 6, onmtf, num_iters=100)
try:
dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
except:
continue
if err < best:
best = err
U_v8 = U_t
S_v8 = S_t
V_v8 = V_t
rows_ind_v8 = rows_ind_t
cols_ind_v8 = cols_ind_t
    print 'tf norm (2 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (2 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (2 clusters): %s' % dav_sc
print ''
In [196]:
S_v8
Out[196]:
In [197]:
for i in range(2):
clust_inds = np.where(rows_ind_v8 == i)[0]
sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
print 'Top words for document cluster %s' % i
print_top_k_words(sum_per_word, count_vect, 20)
print
In [198]:
V_norm_v8 = norm(V_v8)
for l in xrange(6):
print 'Top words for term cluster %s:' % l
print_top_k_words_term_cluster(V_v8[:, l], X_train_norm.toarray(), count_vect, 30)
print
In [138]:
best = 1e10
for _ in xrange(5):
U_t, S_t, V_t, V_t_t, rows_ind_t, error = matrix_factorization_overlapping_bin(X_train_norm.toarray(), 2, 2, num_iters=100)
try:
dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 2))
except:
continue
if error < best:
best = error
U_v3 = U_t
S_v3 = S_t
V_v3 = V_t
V_t_v3 = V_t_t
rows_ind_v3 = rows_ind_t
print 'tf norm (2 clusters): %s' % rand_score(labels, rows_ind_t)
print 'tf norm (2 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
print 'tf norm (2 clusters): %s' % dav_sc
print ''
In [139]:
print_hist(U_v3)
In [140]:
print rows_ind_v3
In [161]:
for k in range(2):
print 'Top words for term cluster 0 and doc cluster %s:' % k
print_top_k_words_term_cluster(V_v3[k][:, 0], X_train_norm.toarray(), count_vect, 30)
print
print 'Top words for term cluster 1 and doc cluster %s:' % k
print_top_k_words_term_cluster(V_v3[k][:, 1], X_train_norm.toarray(), count_vect, 30)
print
In [144]:
def print_top_k_words(vec, count_vect, k):
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
top_pairs = reverse(zip(top_words, top_vals))
for word, value in top_pairs:
print '%s - %s' % (word, value)
for i in range(2):
clust_inds = np.where(rows_ind_v3 == i)[0]
sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
print 'Top words for document cluster %s' % i
print_top_k_words(sum_per_word, count_vect, 20)
print
In [145]:
best = 1e10
for _ in xrange(5):
U_t, S_t, V_t, V_t_t, rows_ind_t, error = matrix_factorization_overlapping_bin(X_train_norm.toarray(), 3, 2, num_iters=100)
try:
dav_sc = davies_bouldin_score(X_train_norm.toarray(), rows_ind_t, calculate_centroids_doc_mean(X_train_norm, rows_ind_t, 3))
except:
continue
if error < best:
best = error
U_v4 = U_t
S_v4 = S_t
V_v4 = V_t
V_t_v4 = V_t_t
rows_ind_v4 = rows_ind_t
    print 'tf norm (3 clusters): %s' % rand_score(labels, rows_ind_t)
    print 'tf norm (3 clusters): %s' % sil_score(X_train_norm, rows_ind_t)
    print 'tf norm (3 clusters): %s' % dav_sc
print ''
In [199]:
S_v4
Out[199]:
In [147]:
pairplot(U_v4)
In [200]:
for k in range(3):
print 'Top words for term cluster 0 and doc cluster %s:' % k
print_top_k_words_term_cluster(V_v4[k][:, 0], X_train_norm.toarray(), count_vect, 50)
print
print 'Top words for term cluster 1 and doc cluster %s:' % k
print_top_k_words_term_cluster(V_v4[k][:, 1], X_train_norm.toarray(), count_vect, 50)
print
In [157]:
print 'Num elems in cluster 0: %s' % np.sum(rows_ind_v4 == 0)
print 'Num elems in cluster 1: %s' % np.sum(rows_ind_v4 == 1)
print 'Num elems in cluster 2: %s' % np.sum(rows_ind_v4 == 2)
print rows_ind_v4
print np.where(rows_ind_v4 == 2)
for url in sport_news_df.ix[np.where(rows_ind_v4 == 2)[0]-100].url:
print url
In [158]:
inds = np.where(rows_ind_v4 == 0)[0]
plt.hist(norm(U_v4)[inds, 0], bins=50)
inds = np.where(rows_ind_v4 == 2)[0]
plt.hist(norm(U_v4)[inds, 0], bins=50)
inds = np.where(rows_ind_v4 == 1)[0]
plt.hist(norm(U_v4)[inds, 0], bins=50)
Out[158]:
In [159]:
def print_top_k_words(vec, count_vect, k):
top_inds, top_vals = top_k(vec, k)
words = count_vect.get_feature_names()
top_words = [words[i] for i in top_inds]
top_pairs = reverse(zip(top_words, top_vals))
for word, value in top_pairs:
print '%s - %s' % (word, value)
for i in range(3):
clust_inds = np.where(rows_ind_v4 == i)[0]
sum_per_word = np.sum(X_train_norm.toarray()[clust_inds, :], axis=0)
print 'Top words for document cluster %s' % i
print_top_k_words(sum_per_word, count_vect, 20)
print
In [ ]: