In [2]:
using JSON, ProgressMeter, JLD, LightGraphs, MatrixMarket
In [2]:
# a stand-in for python's os.walk:
# recursively walks path and applies fn to every file found
# (open closes each file after fn returns)
function dirwalk(path::AbstractString, fn::Function)
    content = readdir(path)
    for c in content
        p = joinpath(path, c)
        if isdir(p)
            dirwalk(p, fn)
        elseif isfile(p)
            println(p)
            open(fn, p)
        end
    end
end
Out[2]:
In [3]:
# dictionary where the key is the name of the tweeter and the value is an array
# of strings containing the people the tweeter retweeted
const di = Dict{String,Array{String,1}}()
function add_data_to_dict(f::IOStream)
    lines = readlines(f)
    try
        tweets = JSON.parse(lines[1])
        for tweet in tweets
            if tweet["retweet"] != "N"
                if haskey(di, tweet["name"])
                    push!(di[tweet["name"]], tweet["original_name"])
                else
                    di[tweet["name"]] = [tweet["original_name"]]
                end
            end
        end
    catch
        # skip files with empty or malformed JSON
    end
end
Out[3]:
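For reference, here is the shape of input the parser above assumes: each file holds a single JSON line containing an array of tweet objects with at least the retweet, name, and original_name fields. The sample below is hypothetical, just to illustrate the format.
In [ ]:
# hypothetical sample (made-up names and values) of the expected JSON format
sample = """[{"name": "alice", "retweet": "Y", "original_name": "bob"},
             {"name": "carol", "retweet": "N", "original_name": ""}]"""
for tweet in JSON.parse(sample)
    if tweet["retweet"] != "N"
        println(tweet["name"], " retweeted ", tweet["original_name"])
    end
end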
In [4]:
# this converts the data from di
# into a list of names and a name_to_index dictionary that we
# can use to build the graph
function fill_data(di::Dict{String,Array{String,1}})
    name_to_index = Dict{String, Int64}()
    names = String[]
    for (k, vs) in di
        push!(names, k)
        for v in vs
            push!(names, v)
        end
    end
    names = unique(names)
    for (i, n) in enumerate(names)
        name_to_index[n] = i
    end
    return names, name_to_index
end
Out[4]:
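A quick sanity check on a toy dictionary (hypothetical names) illustrates the round trip between names and indices:
In [ ]:
# toy example with made-up names to illustrate fill_data's output
toy = Dict("alice" => ["bob", "carol"], "bob" => ["alice"])
toy_names, toy_index = fill_data(toy)
@assert all(toy_names[toy_index[n]] == n for n in toy_names)
toy_names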
In [ ]:
# takes our data format (a dictionary of string -> [string] and a dictionary of string -> int)
# and builds the LightGraphs graph by adding all nodes and edges
function build_graph(graph_dict::Dict{String, Array{String, 1}},
                     name_to_index::Dict{String, Int64})
    graph = LightGraphs.Graph(length(name_to_index))
    @showprogress for (key, val) in graph_dict
        for item in val
            if item != "CC"  # @CCs in tweets need to be removed
                source = name_to_index[key]
                target = name_to_index[item]
                add_edge!(graph, source, target)
            end
        end
    end
    return graph
end
In [ ]:
# copies each row of m whose index is a key of non_tweeter_transfo
# (tweeter index -> non-tweeter index) onto the matching non-tweeter row
function fill_nontweeters(m::SparseMatrixCSC{Int64, Int64},
                          non_tweeter_transfo, n_lines::Int64, n_cols::Int64)::SparseMatrixCSC{Int64, Int64}
    is, js, vals = findnz(m)
    new_is = Array{Int64,1}()
    new_js = Array{Int64,1}()
    new_vals = Array{Int64,1}()
    for i in 1:length(is)
        if haskey(non_tweeter_transfo, is[i])
            push!(new_is, non_tweeter_transfo[is[i]])
            push!(new_js, js[i])
            push!(new_vals, vals[i])
        end
    end
    sparse(new_is, new_js, new_vals, n_lines, n_cols)
end
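A toy illustration of what fill_nontweeters does (all indices made up): if graph node 3 never tweeted but was retweeted by node 1, the transfo maps 1 => 3 and node 3 inherits node 1's word counts:
In [ ]:
# toy example: node 1 is a tweeter with word counts, node 3 a non-tweeter
m = sparse([1, 2], [1, 2], [5, 7], 3, 2)
transfo = Dict(1 => 3)  # tweeter index => non-tweeter index
fill_nontweeters(m, transfo, 3, 2)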
In [ ]:
# remaps the rows of m from corpus positions to graph indices via corpus_indices
function fill_reduced(m::SparseMatrixCSC{Int64, Int64},
                      corpus_indices, n_lines::Int64, n_cols::Int64)::SparseMatrixCSC{Int64, Int64}
    is, js, vals = findnz(m)
    sparse([corpus_indices[i] for i in is], js, vals, n_lines, n_cols)
end
In [5]:
# filling out the dictionary
dirwalk("/media/henripal/hd1/data/", add_data_to_dict)
In [2]:
# serialization options for later
# save("/media/henripal/hd1/data/temp.jld", "di", di)
# di = JLD.load("/media/henripal/hd1/data/temp.jld", "di")
Out[2]:
In [7]:
length(di)
Out[7]:
Now we need to build the graph over integers only (the graph structure has to stay lightweight) while still being able to recover who each node is. We create an array of unique names and a dictionary mapping each unique name to its index in the graph.
Note that the name field in the data is not ideal: it does not match the Twitter user's handle, can contain unicode, and is generally hard to work with.
In [8]:
names, name_to_index = fill_data(di)
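As a quick consistency check (cheap to run), every name should map back to itself through name_to_index:
In [ ]:
# spot check: the name <-> index mapping should round-trip
@assert all(names[name_to_index[n]] == n for n in names)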
In [9]:
# more serialization
#JLD.save("/media/henripal/hd1/data/names.jld", "names", names)
#JLD.save("/media/henripal/hd1/data/name_to_index.jld", "name_to_index", name_to_index)
In [11]:
#names = JLD.load("/media/henripal/hd1/data/names.jld","names")
#name_to_index = JLD.load("/media/henripal/hd1/data/name_to_index.jld", "name_to_index");
Now we fill the edges according to the retweet structure:
In [7]:
graph = build_graph(di, name_to_index)
In [ ]:
JLD.save("/media/henripal/hd1/data/graph.jld", "graph", graph)
In [ ]:
# the sparse bag-of-words matrix from the python notebook (thanks @wwymak)
corpus = MatrixMarket.mmread("/media/henripal/hd1/data/corp.mm")
corpus = convert(SparseMatrixCSC{Int64, Int64}, corpus);
In [ ]:
# these are the names from the tweets, extracted in the python notebook
corpus_names = readdlm("/media/henripal/hd1/data/corpus_names.csv", ',', String)
Some cleaning is required to make the Python data talk to the Julia data: we map the corpus names onto the graph indices, then handle graph nodes that never tweeted themselves by giving them the words of the user who retweeted them.
In [ ]:
n_lines = length(names)
n_cols = size(corpus)[2]
In [ ]:
corpus_indices = [name_to_index[corpus_name] for corpus_name in corpus_names]
corpus_indices_set = Set(corpus_indices)
In [ ]:
# we build the reverse dictionary: retweeted -> tweeter.
# this will be used later to build the words associated with the retweeter
# (if several tweeters retweeted the same user, the last one seen wins)
rev_di = Dict{String, String}()
for (tweeter, retweeteds) in di
    for retweeted in retweeteds
        rev_di[retweeted] = tweeter
    end
end
In [ ]:
# here non_tweeters are the subset of indices in the graph, not in the corpus.
# some list comprehension fun to build the mapping that will map them to their words
non_tweeter_indices = [i for i in 1:length(names) if !(i in corpus_indices_set)];
non_tweeter_names = [names[i] for i in non_tweeter_indices];
non_tweeter_transfo = Dict(zip([name_to_index[rev_di[n]] for n in non_tweeter_names], non_tweeter_indices))
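The comprehension above assumes every non-tweeter shows up in rev_di, which should hold by construction since non-tweeters only ever appear as retweeted users; a quick check confirms it:
In [ ]:
# every non-tweeter should have a retweeter recorded in rev_di
@assert all(haskey(rev_di, n) for n in non_tweeter_names)
length(non_tweeter_transfo)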
In [ ]:
# the user_word matrix, with only the common names (other lines show up as zeros)
user_word = fill_reduced(corpus, corpus_indices, n_lines, n_cols);
In [ ]:
# finally, we add the nontweeters, add the two matrices, and we are done!
nontweeters_word = fill_nontweeters(user_word, non_tweeter_transfo, n_lines, n_cols)
user_word = user_word + nontweeters_word
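A quick look at the stored entries of the merged matrix against the non-tweeter part alone; since tweeter and non-tweeter rows are disjoint, no counts should collide:
In [ ]:
# sanity check: merged matrix stored entries vs. the non-tweeter part
nnz(user_word), nnz(nontweeters_word)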
In [ ]:
JLD.save("/media/henripal/hd1/data/user_word.jld", "user_word", user_word)
In a dataset of limited size where connectivity is indicated by retweets, many nodes will be isolated. Happily, graph theory tells us that we should still get good coverage using only the largest connected component (the giant component), so we now create that subgraph and check that it is big enough.
In [13]:
# getting the connected components and sorting them by their size
conn = connected_components(graph)
sort!(conn, by = length, rev=true);
In [14]:
# getting the giant component
giant, giant_nodes = induced_subgraph(graph, conn[1])
giant
Out[14]:
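To back up the coverage claim, we can look at the giant component's share of all nodes:
In [ ]:
# fraction of the full graph captured by the giant component
nv(giant) / nv(graph)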
In [15]:
# and now the size of the second largest connected component:
induced_subgraph(graph, conn[2])[1]
Out[15]:
Looks good: the second largest connected component is tiny. To check that we are finding meaningful structure, we'll list the 20 most connected nodes and see if they make sense:
In [16]:
centrality = degree_centrality(giant);
centrality_tuples = collect(zip(centrality, giant_nodes));
sort!(centrality_tuples, by = x -> x[1], rev = true);
In [17]:
for i in 1:20
    println(names[centrality_tuples[i][2]])
end