In [1]:
using JSON, ProgressMeter, JLD, LightGraphs
This notebook implements the DualNMF algorithm from this paper: Community Detection in Political Twitter Networks using Nonnegative Matrix Factorization Methods.
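For reference, the objective being minimized (reconstructed from the obj and update functions further down, so treat the notation as shorthand rather than the paper's exact formulation) is

$$\min_{U \ge 0,\; W \ge 0}\;\; \|X - U W^\top\|_F^2 \;+\; \alpha\,\operatorname{tr}(U^\top L_u U) \;+\; \beta\,\operatorname{tr}(W^\top L_w W),$$

where $X$ is the user-word matrix, $L_u$ is the Laplacian of the user graph, and $L_w$ is the Laplacian of the word-similarity graph built below. The multiplicative updates for $U$ and $W$ use the split of each Laplacian into its positive and negative parts.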
Prerequisites:
In [18]:
user_word = JLD.load("/media/henripal/hd1/data/new_mat.jld", "new_mat");
graph = JLD.load("/media/henripal/hd1/data/graph.jld", "graph");
In [19]:
size(user_word,2)
Out[19]:
In [20]:
function sparse_similarity(m::SparseMatrixCSC)::SparseMatrixCSC
    normalized_user_word = spzeros(size(m)...)
    norms = [norm(m[:,i]) for i in 1:size(m,2)]
    @showprogress for (col, s) in enumerate(norms)
        s == 0 && continue # skip empty columns: a zero-norm column cannot be normalized
        normalized_user_word[:,col] = m[:,col]/s
    end
    return normalized_user_word' * normalized_user_word
end
# this builds a LightGraphs graph from a similarity matrix
function build_graph_from_similarity(similarity::Matrix, cutoff::Float64)::Graph
    n = size(similarity, 1)
    graph = Graph(n)
    for i in 1:n
        for j in 1:i-1
            similarity[i, j] > cutoff && add_edge!(graph, i, j)
        end
    end
    graph
end
Out[20]:
In [21]:
# this successively builds the similarity matrix and then
# builds the graph from the similarity matrix
cutoff = .4
similarity = sparse_similarity(user_word)
word_graph = build_graph_from_similarity(full(similarity), cutoff)
Out[21]:
Some remarks: the cost function is quite expensive to evaluate, so we do not compute it after every update. It might be useful to evaluate it adaptively instead.
Some of the helper functions look a little strange; this is because these matrices are huge, some of them are not sparse, and memory usage can get out of control. Hence the column-by-column iterations, the precomputations, and so on.
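One possible way to evaluate the objective adaptively (a hypothetical sketch, not used in the run below; the growth factor, cap, and thresholds are arbitrary) is to lengthen the stride between objective evaluations while the objective is still moving a lot, and shorten it again as the change approaches the tolerance:

# Hypothetical helper: adapt the number of multiplicative updates between
# (expensive) objective evaluations based on the last observed change.
function adaptive_stride(stride::Int, delta::Float64, tolerance::Float64;
                         min_stride = 5, max_stride = 100)
    if delta > 10 * tolerance
        min(2 * stride, max_stride)       # far from converged: check the objective less often
    else
        max(div(stride, 2), min_stride)   # close to the tolerance: check more often
    end
end

In the loop further down, this would replace the fixed stride with stride = adaptive_stride(stride, delta, tolerance) after each objective evaluation.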
In [22]:
# calculating the graph Laplacians and splitting each into positive and negative parts
L_c = laplacian_matrix(graph)
word_laplacian = laplacian_matrix(word_graph)
wl_plus = (abs(word_laplacian)+word_laplacian)/2
wl_minus = (abs(word_laplacian)-word_laplacian)/2
gl_plus = (abs(L_c)+L_c)/2
gl_minus = (abs(L_c)-L_c)/2
Out[22]:
In [28]:
# parameters and initializing W and U
clusters = 60
α = 10    # weight of the user-graph regularization
β = 10    # weight of the word-graph regularization
users = size(L_c,1)
words = size(word_laplacian, 1)
# random sparse initialization: roughly half the entries of W and U set to 0.5
W = .5 * spones(sprand(words, clusters, .5))
U = .5 * spones(sprand(users, clusters, .5))
Out[28]:
In [30]:
# memory friendly update functions
function update_U(U::SparseMatrixCSC, W::SparseMatrixCSC)::SparseMatrixCSC
    WpW = W' * W
    return U .* sqrt((user_word * W + α * gl_minus * U) ./ (U * WpW + α * gl_plus * U))
end

function update_W(U::SparseMatrixCSC, W::SparseMatrixCSC)::SparseMatrixCSC
    UpU = U' * U
    return W .* sqrt((user_word' * U + β * wl_minus * W) ./ (W * UpU + β * wl_plus * W))
end
Out[30]:
In [32]:
# memory friendly Frobenius norm and objective function
function my_frobenius(uw::SparseMatrixCSC, U::SparseMatrixCSC, W::SparseMatrixCSC)::Float64
    (users, words) = size(uw)
    wp = W'    # precompute the transpose so we can slice it column by column
    clusters = size(U,2)
    result = 0.0
    @showprogress for j in 1:words
        uwp_j = U*wp[:, j]    # j-th column of the reconstruction U * W'
        result += norm(uw[:, j] - uwp_j)^2
    end
    result
end

function obj(U::SparseMatrixCSC, W::SparseMatrixCSC)::Float64
    my_frobenius(user_word, U, W) + α * trace(U' * L_c * U) + β * trace(W' * word_laplacian * W)
end
Out[32]:
This is where we run the algorithm. It is somewhat time-intensive, but not prohibitively so.
In [47]:
tolerance = .05
delta = 1000
stride = 10    # number of updates between (expensive) objective evaluations
err = obj(U, W)
while delta > tolerance
    for i in 1:stride
        U = update_U(U, W);
        W = update_W(U, W);
    end
    newerr = obj(U,W)
    delta = abs(newerr - err)
    err = newerr
end
In [49]:
JLD.save("/media/henripal/hd1/data/U_60.jld", "U_60", U)
In [51]:
# another helper function: assigns each user to the community with the highest membership weight
function assign_communities(u::SparseMatrixCSC)
    (n_user, n_cluster) = size(u)
    communities = Array{Int64,1}(n_user)
    @showprogress for user in 1:n_user
        communities[user] = indmax(u[user, :])
    end
    communities
end
Out[51]:
In [52]:
comm = assign_communities(U)
Out[52]:
In [53]:
using Plots
In [55]:
histogram(comm, nbins = 60)
Out[55]:
In [10]:
U_60 = JLD.load("/media/henripal/hd1/data/U_60.jld", "U_60")
Out[10]:
In [12]:
using DataFrames
In [25]:
name_followers = readtable("/media/henripal/hd1/data/name_to_follower.csv", header = false);
In [26]:
rename!(name_followers,:x1,:name)
rename!(name_followers,:x2, :followers)
Out[26]:
In [32]:
sort!(name_followers, cols= :followers, rev = true);
In [35]:
name_followers = name_followers[1:10000,:]
Out[35]:
In [36]:
name_followers[:ind] = [name_to_index[n] for n in name_followers[:name]]
Out[36]:
In [38]:
user_vectors = Array{Float64,2}(10000, 60)
Out[38]:
In [41]:
for i in 1:10000, j in 1:60
    user_vectors[i, j] = U_60[name_followers[:ind][i], j]
end
In [43]:
user_cluster = DataFrame(user_vectors)
Out[43]:
In [62]:
# assign each of the top users to its dominant class and count how often each class occurs
class = [indmax(user_vectors[i,:]) for i in 1:10000];
class_freq = zeros(60)
for class_no in class
    class_freq[class_no] = class_freq[class_no] + 1
end
In [74]:
best_classes = [i for i in 1:60 if class_freq[i] > 250]
Out[74]:
In [78]:
new_class = [indmax(user_vectors[i, best_classes]) for i in 1:10000]
Out[78]:
In [79]:
name_followers[:class] = string.(new_class)
Out[79]:
In [80]:
writetable("/media/henripal/hd1/data/attributes.tsv",name_followers[:,[:name, :class, :followers]])
In [53]:
writetable("/media/henripal/hd1/data/vectors.tsv",user_cluster, header = false)