In [1]:
import json
# Parse the input file as JSON, one object per line, into a list.
# The context manager closes the file handle as soon as reading finishes;
# a bare open() in the loop header would leave it open until garbage collection.
with open('internet-archive1.wane', 'r') as wane_file:
    data = [json.loads(line) for line in wane_file]

In [2]:
# Number of records parsed from the input file (one per line).
len(data)


Out[2]:
22761

In [3]:
import networkx as nx
import matplotlib.pyplot as plt
# Empty undirected graph; the following cells fill it with URL and person nodes.
G=nx.Graph()

In [4]:
# First side of the bipartite graph: one node per record URL, tagged bipartite=0.
G.add_nodes_from((record['url'] for record in data), bipartite=0)
# Second side: every person listed under a record's named entities, tagged bipartite=1.
# All URL nodes are inserted before any person node, matching the two-pass order.
G.add_nodes_from(
    (
        person
        for record in data
        for person in record['named_entities']['persons']
    ),
    bipartite=1,
)

In [5]:
# Build one (url, person) edge for every person listed on each record, then add
# them to the graph in a single bulk call.
# Iterating the person list directly replaces the index-based loop and the
# list-in-a-list/tuple()/extend() round-trip that produced one pair at a time.
edges = [
    (item['url'], person)
    for item in data
    for person in item['named_entities']['persons']
]
G.add_edges_from(edges)

In [6]:
# Total number of nodes in G (URL nodes plus person nodes).
G.number_of_nodes()


Out[6]:
65017

In [7]:
# Total number of URL-person edges in G.
G.number_of_edges()


Out[7]:
535723

In [8]:
from networkx.algorithms import bipartite

In [9]:
# Split the nodes into the two bipartite sides using the 'bipartite' node
# attribute set when the nodes were added: 0 marks URL nodes, the rest are persons.
# A preceding bipartite.sets(G) call was removed: both of its results were
# overwritten immediately by the two assignments below, so it was wasted work.
# NOTE(review): newer NetworkX releases may raise on bipartite.sets() for
# disconnected graphs unless top_nodes is supplied - confirm before reinstating it.
top_nodes = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
bottom_nodes = set(G) - top_nodes

In [10]:
# Sanity check: ask NetworkX whether G is bipartite.
nx.is_bipartite(G)


Out[10]:
True

In [11]:
# G was created as nx.Graph, so it is already undirected; this yields another graph object.
# NOTE(review): in the visible cells G2 is only used to inspect type(G2.nodes()) - confirm it is needed.
G2 = G.to_undirected()

In [12]:
# Number of nodes on the URL side (bipartite == 0).
len(top_nodes)


Out[12]:
22761

In [13]:
# Inspect which container type nodes() returns.
type(G2.nodes())


Out[13]:
list

In [14]:
# Degree of each node on either side, as a plain {node: degree} dict.
# dict(...) normalises the return value: where G.degree(nbunch) already returns a
# dict this is just a copy, and where it returns an iterable view of
# (node, degree) pairs it is converted, so later cells can always rely on dict methods.
degree_topnodes = dict(G.degree(top_nodes))
degree_bottomnodes = dict(G.degree(bottom_nodes))

In [15]:
# Inspect the type of the degree lookup.
type(degree_topnodes)


Out[15]:
dict

In [18]:
# Keep only the nodes on each side whose degree is strictly greater than the
# threshold. The threshold is named once instead of repeating the literal, and
# the mappings' own .items() replaces the unbound dict.items(...) call.
MIN_DEGREE = 100
degree_topnodes1 = {
    node: deg for node, deg in degree_topnodes.items() if deg > MIN_DEGREE
}
degree_bottomnodes1 = {
    node: deg for node, deg in degree_bottomnodes.items() if deg > MIN_DEGREE
}

In [25]:
# Nodes from both sides that passed the degree filter, URL side first.
# The dead `drop_list = []` initialisation was removed; it was overwritten at once.
# NOTE(review): despite its name, this list is passed to G.subgraph() in the
# following cell, i.e. these are the nodes that are KEPT, not dropped.
drop_list = list(degree_topnodes1.keys()) + list(degree_bottomnodes1.keys())
type(drop_list)


Out[25]:
list

In [26]:
# Subgraph of G restricted to the nodes listed in drop_list.
H = G.subgraph(drop_list)

In [27]:
# Number of nodes in the subgraph H.
H.number_of_nodes()


Out[27]:
2478

In [28]:
# Number of edges in the subgraph H.
H.number_of_edges()


Out[28]:
239055

In [29]:
# NOTE(review): the returned graph is only displayed, never assigned, so H itself
# is not changed by this cell - confirm whether `H = H.to_undirected()` was intended.
H.to_undirected()


Out[29]:
<networkx.classes.graph.Graph at 0x10c63b518>

In [30]:
# Write the subgraph H to a GML file in the current working directory.
nx.write_gml(H,"./graph3.gml")