In [1]:
import networkx as nx
import numpy as np
import pickle as p
from os import path
from scipy.sparse import csr_matrix, lil_matrix
from matplotlib import pyplot as plt
%matplotlib inline
# Directory holding the raw BlogCatalog csv files; maybe_load_data() reads
# 'edges.csv' and 'group-edges.csv' from here.
data_loc = './../data/raw/BlogCatalog-dataset/data/'
BlogCatalog is a social blog directory that manages bloggers and their blogs. This dataset contains 10,312 bloggers, with unique ids running from 1 to 10,312, and 333,983 friendship pairs. Each blogger can belong to multiple groups; there are 39 groups, with indices ranging from 1 to 39.
In [2]:
def maybe_load_data(data='./../data/blogcatalog.data'):
    """Load the BlogCatalog dataset, building and caching it on first use.

    If a file already exists at ``data`` it is unpickled and returned
    unchanged. Otherwise the graph and labels are parsed from ``edges.csv``
    and ``group-edges.csv`` under the module-level ``data_loc`` directory,
    the result is pickled to ``data`` and then returned.

    Parameters
    ----------
    data : str
        Path of the pickle cache to read, or to create when it is missing.

    Returns
    -------
    dict
        When built from the csv files: ``{'NXGraph': g, 'LILLabels': m}``
        where ``g`` is an undirected ``networkx.Graph`` whose nodes are
        0-based indices assigned in order of first appearance in
        ``edges.csv`` and ``m`` is a ``lil_matrix`` of shape
        ``(len(g), 39)`` with a 1 wherever a node belongs to a group.
        When read from the cache: whatever object the file contains.

    Raises
    ------
    KeyError
        If ``group-edges.csv`` mentions a node id absent from ``edges.csv``.
    ValueError
        If a non-blank csv line does not have exactly two comma-separated
        fields.
    """
    if path.exists(data):
        print("Dataset is found. Skip reading...")
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # `data` at cache files that were produced by this function.
        with open(data, 'rb') as f:
            return p.load(f)

    iid = {}  # raw node id (string as written in the csv) -> 0-based index
    edgelist = []
    # Read edge pairs, streaming the file instead of loading it whole.
    with open(data_loc + 'edges.csv', 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing empty lines
            i, j = line.split(',')  # csv
            # setdefault hands out the next free index on first sighting.
            src = iid.setdefault(i, len(iid))
            dst = iid.setdefault(j, len(iid))
            edgelist.append((src, dst))

    # Create an nx undirected network
    bc = nx.Graph(edgelist)
    print("Number of nodes: ", len(bc))
    print("Number of edges: ", bc.size())

    # One row per node, one column per group.
    lil_labels = lil_matrix((len(bc), 39), dtype=int)
    # Read (node_id, label) file
    with open(data_loc + 'group-edges.csv', 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            node, group = line.split(',')
            # Group ids in the file are 1-based; columns are 0-based.
            lil_labels[iid[node], int(group) - 1] = 1

    # Pack data and cache it at the path the caller asked for, so the
    # exists-check above finds it on the next call.
    bc_dataset = {'NXGraph': bc, 'LILLabels': lil_labels}
    with open(data, 'wb') as f:
        p.dump(bc_dataset, f)
    return bc_dataset
# Load the cached dataset, or build it from the raw csv files when no cache exists.
bc_dataset = maybe_load_data()
In [3]:
# Sparse node-by-group label matrix stored under the 'LILLabels' key.
labels = bc_dataset['LILLabels']
In [4]:
# Export the graph as a plain edge list file unless a previous run already
# wrote it.
if not path.exists('./../data/blogcatalog.edges'):
    # Take the graph from `bc_dataset`: the name `bc` only exists as a local
    # variable inside maybe_load_data(), so referencing it here raised a
    # NameError on a fresh kernel.
    nx.write_edgelist(bc_dataset['NXGraph'],
                      path='./../data/blogcatalog.edges',
                      data=False)
else:
    print("Edge list file is found. Skip writing...")
In [5]:
# Short alias for the graph stored under the 'NXGraph' key.
b = bc_dataset['NXGraph']
In [6]:
# Degree of every node in the graph, largest first.
degree_list = sorted(map(b.degree, b), reverse=True)
Degree statistics:
In [7]:
# degree_list is sorted in descending order, so its two ends are the extremes.
highest_degree, lowest_degree = degree_list[0], degree_list[-1]
degree_mean = np.mean(degree_list)
degree_std = np.std(degree_list)
print("Maximum degree: {}".format(highest_degree))
print("Minimum degree: {}".format(lowest_degree))
print("Mean of degree distribution: {}".format(degree_mean))
print("Std variation of degree distribution: {}".format(degree_std))
Degree distribution plot:
In [8]:
# Sorted node degrees drawn against a logarithmic y-axis.
ax = plt.subplot(111)
ax.semilogy(degree_list)
ax.set_title("BlogCatalog network degree distribution")
ax.set_ylabel("Log degree")
ax.set_xlabel("Nodes")
ax.legend(["Log-scale degree"])
Out[8]:
In [11]:
import sys

# Make the project's source directory importable. Going through `sys.path`
# (rather than `from sys import path`) keeps the notebook-level name `path`
# bound to `os.path`, which the earlier `path.exists(...)` cells rely on when
# they are re-run.
if './../src/' not in sys.path:
    sys.path.append('./../src/')
from walks import WalkGenerator
from constrains import R, UTriangle, UWedge
In [10]:
# Walk generator configured with the R constraint
# (NOTE(review): presumably the unconstrained random walk; confirm in constrains.py).
random_walker = WalkGenerator(graph=b, constrain=R())
# Materialise everything the generator yields for 10 walks of length 80.
bc_context = list(random_walker(walk_length=80, num_walk=10))
In [32]:
from collections import defaultdict as dd

# Tally how often each node occurs in the first random-walk context.
random_walk_node_freqs = dd(int)
for visited_node in bc_context[0]:
    # The defaultdict creates a missing key as 0 before it is incremented.
    random_walk_node_freqs[visited_node] = random_walk_node_freqs[visited_node] + 1
In [33]:
# Per-node occurrence counts, most frequent first, on a logarithmic y-axis.
random_walk_node_hist = list(random_walk_node_freqs.values())
random_walk_node_hist.sort(reverse=True)
ax = plt.subplot(111)
ax.semilogy(random_walk_node_hist)
ax.set_title("Blogcatalog random walk node frequency distribution")
ax.set_ylabel("Log node count")
ax.set_xlabel("Nodes")
Out[33]:
In [34]:
# random_walk_node_hist is sorted in descending order, so its ends are the extremes.
most_frequent, least_frequent = random_walk_node_hist[0], random_walk_node_hist[-1]
frequency_mean = np.mean(random_walk_node_hist)
frequency_std = np.std(random_walk_node_hist)
print("Maximum frequency: {}".format(most_frequent))
print("Minimum frequency: {}".format(least_frequent))
print("Mean of frequency distribution: {}".format(frequency_mean))
print("Std variation of frequency distribution: {}".format(frequency_std))
In [39]:
# Dump the first random-walk context to disk: one line per row of 80 entries,
# with the entries separated by single spaces.
with open('../data/blogcatalog.random_context', 'w') as f:
    walk_rows = bc_context[0].reshape(-1, 80)
    f.writelines(
        ' '.join(str(node_id) for node_id in walk_row) + '\n'
        for walk_row in walk_rows
    )
In [12]:
# NOTE(review): `time` is not referenced by any cell visible here.
import time
# Walk generator configured with the UTriangle constraint from constrains.py
# (NOTE(review): presumably restricts walks using triangles; confirm in that module).
triangle_walker = WalkGenerator(graph=b, constrain=UTriangle())
In [16]:
# Materialise everything the triangle walker yields for 10 walks of length 80.
bc_triangle_context = list(triangle_walker(walk_length=80, num_walk=10))
In [17]:
from collections import defaultdict as dd

# Tally how often each node occurs in the first triangle-walk context.
triangle_walk_node_freqs = dd(int)
for visited_node in bc_triangle_context[0]:
    # The defaultdict creates a missing key as 0 before it is incremented.
    triangle_walk_node_freqs[visited_node] = triangle_walk_node_freqs[visited_node] + 1
In [18]:
# Per-node occurrence counts, most frequent first, on a logarithmic y-axis.
triangle_walk_node_hist = list(triangle_walk_node_freqs.values())
triangle_walk_node_hist.sort(reverse=True)
ax = plt.subplot(111)
ax.semilogy(triangle_walk_node_hist)
ax.set_title("Blogcatalog triangle walk node frequency distribution")
ax.set_ylabel("Log node count")
ax.set_xlabel("Nodes")
Out[18]:
In [20]:
with open('../data/blogcatalog.triangle_context', 'w') as f:
for i in bc_triangle_context[0].reshape(-1, 80):
f.write(' '.join(map(str, i)) + '\n')
In [ ]: