In [1]:
import json

# Parse the input file as JSON Lines: one JSON document per line, collected
# into the list `data`.
# The context manager guarantees the file handle is closed as soon as parsing
# finishes (a bare open() inside the for-statement leaves it to the garbage
# collector).
with open('internet-archive1.wane', 'r') as wane_file:
    # Whitespace-only lines are skipped so a trailing blank line does not make
    # json.loads raise.
    data = [json.loads(line) for line in wane_file if line.strip()]
In [2]:
# Number of records parsed from the input file.
len(data)
Out[2]:
In [3]:
import networkx as nx
import matplotlib.pyplot as plt
# Start from an empty undirected graph; nodes and edges are added in the
# cells that follow.
G=nx.Graph()
In [4]:
# First pass: one node per record URL, tagged as side 0 of the bipartite graph.
for record in data:
    G.add_node(record['url'], bipartite=0)

# Second pass: one node per person named in a record, tagged as side 1.
# Kept as a separate pass so all URL nodes are inserted before any person node.
for record in data:
    person_names = record['named_entities']['persons']
    G.add_nodes_from(person_names, bipartite=1)
In [5]:
# Connect every record URL to each person named in that record.
# The (url, person) pairs are produced directly while iterating the persons
# list, rather than indexing into it and wrapping/unwrapping a one-element
# list before extending.
edges = [
    (item['url'], person)
    for item in data
    for person in item['named_entities']['persons']
]
G.add_edges_from(edges)
In [6]:
# Total node count of G (URL nodes plus person nodes).
G.number_of_nodes()
Out[6]:
In [7]:
# Total number of URL-person edges in G.
G.number_of_edges()
Out[7]:
In [8]:
from networkx.algorithms import bipartite
In [9]:
# Split the nodes into the two sides of the bipartite graph using the
# 'bipartite' attribute assigned when the nodes were added:
#   top_nodes    -> nodes with bipartite == 0
#   bottom_nodes -> every other node in G
# bipartite.sets(G) is deliberately not called here: its result would be
# overwritten by the two assignments below, and on a disconnected graph
# networkx >= 2.0 raises AmbiguousSolution from that call.
top_nodes = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
bottom_nodes = set(G) - top_nodes
In [10]:
# Sanity check: ask networkx whether G is bipartite.
nx.is_bipartite(G)
Out[10]:
In [11]:
# NOTE(review): G was created with nx.Graph(), which is already undirected, so
# this looks like it only produces a copy of G — confirm whether G2 is needed.
G2 = G.to_undirected()
In [12]:
# Number of nodes on the bipartite == 0 side.
len(top_nodes)
Out[12]:
In [13]:
# Inspect which container type this networkx version returns for nodes().
type(G2.nodes())
Out[13]:
In [14]:
# Degree of each node, looked up separately for each side of the bipartite graph.
# NOTE(review): the return type of G.degree(nbunch) depends on the networkx
# version (a dict in 1.x, a DegreeView in 2.x and later) — confirm which
# version this notebook targets, since later cells consume these values.
degree_topnodes = G.degree(top_nodes)
degree_bottomnodes = G.degree(bottom_nodes)
In [15]:
# Inspect the type returned by G.degree(...) for a set of nodes.
type(degree_topnodes)
Out[15]:
In [18]:
# Keep only nodes whose degree is strictly greater than this cut-off.
DEGREE_THRESHOLD = 100

# dict(...) normalises the degree result before filtering: it accepts both a
# plain dict (networkx 1.x) and a DegreeView, which iterates as (node, degree)
# pairs (networkx 2.x and later). Calling dict.items(...) directly on a
# DegreeView raises TypeError because it is not a dict instance.
degree_topnodes1 = {
    node: deg
    for node, deg in dict(degree_topnodes).items()
    if deg > DEGREE_THRESHOLD
}
degree_bottomnodes1 = {
    node: deg
    for node, deg in dict(degree_bottomnodes).items()
    if deg > DEGREE_THRESHOLD
}
In [25]:
# Combine the high-degree nodes from both sides into one list, top side first.
# NOTE(review): despite its name, this list is passed to G.subgraph() in a
# later cell, i.e. these are the nodes that are kept — confirm the intent.
high_degree_top = list(degree_topnodes1)
high_degree_bottom = list(degree_bottomnodes1)
drop_list = high_degree_top + high_degree_bottom
type(drop_list)
Out[25]:
In [26]:
# Subgraph of G induced by the nodes in drop_list: only edges whose two
# endpoints are both in the list are retained.
H = G.subgraph(drop_list)
In [27]:
# Node count of the filtered subgraph.
H.number_of_nodes()
Out[27]:
In [28]:
# Edge count of the filtered subgraph.
H.number_of_edges()
Out[28]:
In [29]:
# NOTE(review): the return value is not assigned, so H itself is left as it
# was; to_undirected() returns a new graph rather than converting in place.
# H derives from an nx.Graph and is presumably already undirected — confirm
# whether this cell is needed at all.
H.to_undirected()
Out[29]:
In [30]:
# Export the filtered subgraph in GML format to the current directory.
nx.write_gml(H,"./graph3.gml")