The goal of this byte is to explore some algorithms and visualizations relating to networks
This Byte has three parts:
In [1]:
import copy
# open the file you have downloaded
# these files are organized
file = open("amazon.txt")
# this returns an array with one entry for each line ni the file
lines = file.readlines()
print len(lines)
# Note: the format of the snap files is to list a node (identified by a unique number)
# and all of the nodes it links to (also identified by numbers), on the same line, separated by tabs.
In [2]:
# construct the graph
# a set is an unordered collection of unique elements
edges = set()
# this will store our nodes
nodes = {}
# divide the line into the node and all of its edges
# for each line in the file that was loaded in
for line in lines:
# divide the line into the node and all of its edges
data = line.split()
a = int(data[0])
b = int(data[1])
# add the edge
edges.add((a, b))
# update the count for the number of times we've seen each node
nodes[a] = nodes.get(a, -1) + 1
nodes[b] = nodes.get(b, -1) + 1
print "number of unique edges"
print len(edges)
print "number of unique nodes"
print len(nodes)
In [3]:
# get the degrees of each node in a set of edges
def get_degrees(edges):
degree_counts={}
# for each pair of nodes (edge)
for i,j in edges:
# increment the count for the number of edges connected
# to each node by one
degree_counts[i] = degree_counts.get(i, 0) + 1
degree_counts[j] = degree_counts.get(j, 0) + 1
return degree_counts
# Delete all nodes in delete_nodes from edges
def delete_node(edges, delete_nodes):
# construct a new set of edges
new_edges = []
print "# of nodes to be deleted", len(delete_nodes)
# loop through all the current edges
for i, j in edges:
# if an edges two nodes are not in the
# set of nodes to be deleted
if i not in delete_nodes and j not in delete_nodes:
# append that edge to our new edges
new_edges.append((i,j))
return new_edges
# kcore algorithm
# We run the kcore algorithm to delete all
# the nodes whose cores are less than k
# returns a new set of edges and nodes
# including only those in the k core.
def kcore(edges, k):
# make a complete copy of the edges so we can delete or change
# things without messing up our original
edges = copy.deepcopy(edges)
# now for each pair of nodes, count the number of
degree_counts = get_degrees(edges)
# sort the nodes by degree and return
# only the node numbers (not their degree)
sorted_nodes = sorted(degree_counts, key = degree_counts.get)
print "largest degree: ", degree_counts[sorted_nodes[0]]
# repeatedly delete all nodes with degrees < k to find the k core
# if we run out of nodes, or the largest count is < k we should stop
while ((len(sorted_nodes) > 0) and (degree_counts[sorted_nodes[0]]<k)):
# collect nodes with degrees < k in to_delete
to_delete = set()
for node in sorted_nodes:
if degree_counts[node]<k:
to_delete.add(node)
else:
break
# delete all edges that include those nodes
edges = delete_node(edges, to_delete)
print "# of edges left:",len(edges)
# recount the degrees for this (smaller) graph
degree_counts = get_degrees(edges)
# resort the nodes
sorted_nodes = sorted(degree_counts, key = degree_counts.get)
return edges, sorted_nodes
In [7]:
core_edges, core_nodes=kcore(edges, 3)
In [4]:
# We can use this method to create
# an adjacency matrix to represent the graph
def build_neighborhood(edges, nodes):
neighborhood = {}
for node in nodes:
# create a place to store the neighbors
neighborhood[node]=set()
for edge in edges:
# if either side of the edge contains node
# add the other side as a neighbor
if node == edge[0]:
neighborhood[node].add(edge[1])
if node == edge[1]:
neighborhood[node].add(edge[0])
return neighborhood
# This method is used to discover the connected components
# The basic idea is Breadth First Search
# We start from a node and find all the nodes it can reach
# In this way we can get a cluster of nodes which is called
# a connected component
# to start, we pass in the edges,
def get_connected_components(edges, neighborhood, nodes):
result = []
nodes = set(nodes)
# keep track of what we've seen
visited = set()
# loop until there are no more nodes
while nodes:
# grab the first one
node = nodes.pop()
# create a new set for it
component = set()
# start searching from node
queue = [node]
while queue:
# pick a node and mark as visited
node = queue.pop(0)
visited.add(node)
# add it to our connected component
component.add(node)
# find all its neighbors
neighbors = neighborhood[node]
# add them to the queue (if we haven't seen them before)
for neighbor in neighbors:
if neighbor not in visited:
nodes.discard(neighbor)
queue.append(neighbor)
result.append(component)
return result
In [8]:
neighborhood = build_neighborhood(core_edges, core_nodes)
ret = get_connected_components(core_edges, neighborhood, core_nodes)
print "# of connected components",len(ret)
You may need to install the library "networkx". It is a very great tool to help you analyze graph data. You can combine it with Dephi to visualize and analyze social network. Here we use D3 library to visualize the graph data after running k-core algorithm. You can use other different fancy tools or graphs to visualize it.
In [ ]:
import networkx as nx
from networkx.readwrite import json_graph
import json
# create a graph and add al the edges
G=nx.Graph()
for edge in edges:
G.add_edge(edge[0],edge[1])
nld = json_graph.node_link_data(G)
# We store the data in a json file
# So the javascript code can read it
json.dump(nld, open('force.json','w'))
In [ ]:
from IPython.display import IFrame
# IPython Notebook can serve files and display them into
# inline frames. Prepend the path with the 'files' prefix.
viz_file = 'force.html'
IFrame(viz_file, width=700, height=550)
In [48]:
# code to analyze undirected graphs
%matplotlib inline
from pylab import *
import matplotlib.pyplot as plt
# get the degrees for each node (again)
nodes = get_degrees(edges)
v = nodes.values()
# this ensures that we don't have any values more than once
noRep = list(set(v))
noRep.sort()
x = []
y = []
for count in noRep:
# f is the number of times this value occurs
f = v.count(count)
x.append(count)
y.append(f)
figure()
loglog(x, y, '*')
xlabel('x')
ylabel('y')
title('power law plot')
show()
In [49]:
# code to analyze directed graphs
file = open("twitter.txt")
lines = file.readlines()
edges = set()
nodes_indegree = {}
nodes_outdegree = {}
# construct the indegree info and edges
# very similar to what we did for directed graphs
for line in lines:
data = line.split()
source = int(data[0])
endpoint = int(data[1])
# add the edge
edges.add((source, endpoint))
# update the count for the number of times we've seen each node
nodes_indegree[source] = nodes_indegree.get(source, -1) + 1
nodes_outdegree[endpoint] = nodes_outdegree.get(endpoint, -1) + 1
%matplotlib inline
from pylab import *
import matplotlib.pyplot as plt
# now show this to the viewer
v_indegree = nodes_indegree.values()
v_outdegree = nodes_outdegree.values()
noRep_indegree = list(set(v_indegree))
noRep_outdegree = list(set(v_outdegree))
noRep_indegree.sort()
noRep_outdegree.sort()
x_indegree = []
y_indegree = []
x_outdegree = []
y_outdegree = []
for count in noRep_indegree:
f = v_indegree.count(count)
x_indegree.append(count)
y_indegree.append(f)
for count in noRep_outdegree:
f = v_outdegree.count(count)
x_outdegree.append(count)
y_outdegree.append(f)
figure()
loglog(x_indegree, y_indegree, '*')
xlabel('x')
ylabel('y')
title('indegree distribution')
show()
figure()
loglog(x_outdegree, y_outdegree, '*')
xlabel('x')
ylabel('y')
title('outdegree distribution')
show()
==============
1) Now that you've seen how k-core works, time to explore the way in which k impacts the results. Experiment with different values of k. How does the number of connected components change as you increase k? How does the number of nodes in a the connected components change as you increase k? 2) Can you create a chart that shows the relationship between k and the number of connected components? Chart at least two different data sets and try to write something about how they are different 3) Can you make a version of k-core that works for directed graphs? For directed graphs the node degree is defined to be the in-degree + out-degree.