In [2]:
using JSON, ProgressMeter, JLD, LightGraphs, TextAnalysis
In [2]:
# a stand-in for Python's os.walk:
# recursively walks path and applies fn to every file found
# (open(fn, p) closes the file once fn returns)
function dirwalk(path::AbstractString, fn::Function)
    content = readdir(path)
    for c in content
        p = joinpath(path, c)
        if isdir(p)
            dirwalk(p, fn)
        elseif isfile(p)
            println(p)  # log each file as it is processed
            open(fn, p)
        end
    end
end
Out[2]:
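In Julia 0.7 and later, Base provides walkdir, which does the same traversal; a minimal equivalent sketch of the helper above:
# equivalent traversal with Base.walkdir (Julia 0.7+)
for (root, dirs, files) in walkdir(path)
    for file in files
        open(fn, joinpath(root, file))
    end
end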
In [3]:
# dictionary where the key is a tweeter's name and the value is an array
# of strings naming the people that tweeter retweeted
const di = Dict{String,Array{String,1}}()

function add_data_to_dict(f::IOStream)
    lines = readlines(f)
    try
        tweets = JSON.parse(lines[1])
        for tweet in tweets
            if tweet["retweet"] != "N"
                if haskey(di, tweet["name"])
                    push!(di[tweet["name"]], tweet["original_name"])
                else
                    di[tweet["name"]] = [tweet["original_name"]]
                end
            end
        end
    catch
        # skip files that are empty or contain malformed JSON
    end
end
Out[3]:
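For reference, each data file is expected to carry one JSON line shaped like the array below. The field names come from the code above; the handle values and the "Y" flag are made-up illustrations, since the parser only checks for "N":
# hypothetical input illustrating the fields the parser reads
JSON.parse("""[{"name": "alice", "retweet": "Y", "original_name": "bob"}]""")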
In [5]:
# this converts the data from di
# into a list of names and a name_to_index dictionary that we
# can use to build the graph
function fill_data(di::Dict{String,Array{String,1}})
    name_to_index = Dict{String, Int64}()
    names = String[]
    for (k, vs) in di
        push!(names, k)
        for v in vs
            push!(names, v)
        end
    end
    names = unique(names)
    for (i, n) in enumerate(names)
        name_to_index[n] = i
    end
    return names, name_to_index
end
Out[5]:
In [7]:
dirwalk("/media/henripal/hd1/data/", add_data_to_dict)
In [9]:
# serialization options for later
# save("/media/henripal/hd1/data/temp.jld", "di", di)
# di = JLD.load("/media/henripal/hd1/data/temp.jld", "di")
In [7]:
length(di)
Out[7]:
Now we need to build the graph over plain integer node IDs (to keep the graph structure lightweight) while still being able to recover who each node is. We create an array of unique names and a dictionary mapping each name to its index in the graph.
Note that the name field in the data is not ideal: it does not correspond to the Twitter user's handle, can contain Unicode, and is generally harder to work with.
In [9]:
names, name_to_index = fill_data(di)
Out[9]:
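As a quick sanity check, names and name_to_index should be inverses of each other; something like:
# every name maps back to its own position in names
@assert all(name_to_index[names[i]] == i for i in 1:length(names))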
In [11]:
# more serialization
JLD.save("/media/henripal/hd1/data/names.jld", "names", names)
JLD.save("/media/henripal/hd1/data/name_to_index.jld", "name_to_index", name_to_index)
In [90]:
graph = Graph(length(names))
Out[90]:
Now we fill the edges according to the retweet structure:
In [91]:
for (key, val) in di
    @showprogress for item in val
        if item != "CC"  # @CC mentions in tweets need to be removed
            source = name_to_index[key]
            target = name_to_index[item]
            add_edge!(graph, source, target)
        end
    end
end
In a dataset of limited size where connectivity comes from retweets, many nodes will be isolated. Happily, random graph theory suggests that the largest connected component (the giant component) should still give good coverage of the network, so we now extract that subgraph and check that it is large enough (a quick coverage fraction is computed after the next cell).
In [92]:
# getting the connected components and sorting them by their size
conn = connected_components(graph)
sort!(conn, by = length, rev=true);
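As a quick check on the coverage claim above, the share of nodes that fall in the largest component is one line (assuming conn from the previous cell):
# fraction of all nodes captured by the giant component
length(conn[1]) / nv(graph)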
In [93]:
# getting the giant component
giant, giant_nodes = induced_subgraph(graph, conn[1])
giant
Out[93]:
In [94]:
# and now the size of the second largest connected component:
induced_subgraph(graph, conn[2])[1]
Out[94]:
Looks good: the second largest connected component is tiny. As a sanity check, we list the 20 best-connected nodes (by degree centrality) and see whether they make sense:
In [95]:
centrality = degree_centrality(giant);
centrality_tuples = collect(zip(centrality, giant_nodes));
sort!(centrality_tuples, by = x -> x[1], rev = true);
In [97]:
for i in 1:20
    println(names[centrality_tuples[i][2]])
end
Note that the first step here would be to sign the edges according to whether each retweet is an endorsement or not. For prototyping we skip this step, since we do not have the edited-tweet information; I believe it is available in the full Twitter API.
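If that information were available, one lightweight way to prototype signed edges would be to keep a sign map alongside the unweighted LightGraphs structure. This is only a sketch: is_endorsement is a hypothetical predicate standing in for logic we don't have.
# hypothetical predicate; replace with real endorsement detection when available
is_endorsement(source_name, target_name) = true

# store +1 for endorsements and -1 otherwise, keyed on the undirected edge
edge_sign = Dict{Tuple{Int,Int},Int}()
for (key, val) in di
    for item in val
        item == "CC" && continue
        s, t = name_to_index[key], name_to_index[item]
        edge_sign[(min(s, t), max(s, t))] = is_endorsement(key, item) ? 1 : -1
    end
end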
This is the main objective of the DualNMF approach presented in the paper. We will ignore connected words and words that appear fewer than 20 times.
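A sketch of that frequency filter using TextAnalysis, assuming the tweet texts have been collected into a (hypothetical) docs::Vector{String}; the actual pipeline lives in SparseMatrix.ipynb:
# build a corpus, count word occurrences, and keep only frequent words
crps = Corpus([StringDocument(t) for t in docs])
update_lexicon!(crps)
vocab = [w for (w, c) in lexicon(crps) if c >= 20]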
In [1]:
## see SparseMatrix.ipynb