In [1]:
import json
import numpy as np

In [2]:
# Load the raw network. The `with` block closes the file handle as soon as
# parsing finishes instead of leaving it open for the life of the kernel.
with open("./hashtag_network.json", "r", encoding="utf-8") as network_file:
    full = json.load(network_file)

In [3]:
# Bind working names to the edge and node lists. These are references to the
# lists stored inside `full`, not copies, so in-place edits show up in `full`.
new_edges = full['edges']
new_nodes = full['nodes']

In [4]:
# Size of the network before any filtering: (edge count, node count).
(len(new_edges), len(new_nodes))


Out[4]:
(133220, 52590)

In [5]:
# Prune the node list.
#
# A node is kept when its raw `size` is at least `node_thresh`. Every kept node
#   * gets its index in the unfiltered list as `id`,
#   * has `size` replaced by int(log(size)), floored at 1,
#   * loses any "x" / "y" entries (presumably layout coordinates).
# `label2nodeid` maps the label of each kept node to that id.
#
# NOTE: `size` is rewritten in place, so running this cell a second time
# without reloading the data would compare the log-scaled sizes against
# `node_thresh`.
node_thresh = 500
label2nodeid = {}
nodes2remove = []  # indices (in the unfiltered list) of the dropped nodes
kept_nodes = []
for i, node in enumerate(new_nodes):
    node['id'] = i
    if node['size'] < node_thresh:
        nodes2remove.append(i)
        continue
    label2nodeid[node['label']] = i
    node['size'] = max(int(np.log(int(node['size']))), 1)
    node.pop("x", None)
    node.pop("y", None)
    kept_nodes.append(node)

# Replace the list contents in one step. Popping the rejected indices one by
# one shifts the tail of the list on every call (quadratic overall); slice
# assignment is linear and still mutates the same list object, so every other
# reference to this list sees the filtered result.
new_nodes[:] = kept_nodes
print(len(new_nodes))


74

In [6]:
# Prune the edge list.
#
# An edge is kept when both its `source` and `target` labels are present in
# `label2nodeid` and its `size` is at least `edge_thresh`. Every kept edge
#   * gets its index in the unfiltered list as `id`,
#   * has `source` / `target` rewritten from labels to node ids,
#   * has `size` converted to an int, floored at 1 (the raw weight is kept;
#     unlike node sizes it is not log-scaled),
#   * gets an `attributes` string of the form "{Weight:<size>}".
#
# NOTE: `source` / `target` are rewritten in place, so running this cell a
# second time without reloading the data would look the ids up as labels.
edge_thresh = 30
edges2remove = []  # indices (in the unfiltered list) of the dropped edges
kept_edges = []
for i, edge in enumerate(new_edges):
    edge['id'] = i
    try:
        source_id = label2nodeid[edge['source']]
        target_id = label2nodeid[edge['target']]
        weight = edge['size']
    except KeyError:
        # An endpoint is not in `label2nodeid` (or the record lacks a key):
        # the edge cannot be part of the output.
        edges2remove.append(i)
        continue
    if weight < edge_thresh:
        edges2remove.append(i)
        continue
    edge['source'] = source_id
    edge['target'] = target_id
    edge['size'] = max(int(weight), 1)
    edge['attributes'] = "{Weight:" + str(edge['size']) + '}'
    kept_edges.append(edge)

# Replace the list contents in one step. Popping the rejected indices one by
# one shifts the tail of the list on every call (quadratic overall); slice
# assignment is linear and still mutates the same list object, so every other
# reference to this list sees the filtered result.
new_edges[:] = kept_edges
print(len(new_edges))


407

In [7]:
# Number of edges left after filtering.
len(new_edges)


Out[7]:
407

In [8]:
# `new_edges` was bound to `full['edges']` and filtered in place, so the
# list inside `full` reports the same reduced count.
len(full['edges'])


Out[8]:
407

In [9]:
# Assemble the filtered network with the same top-level keys as the input.
new_full = dict(nodes=new_nodes, edges=new_edges)

In [10]:
# Write the filtered network to disk. The `with` block guarantees the file is
# flushed and closed when the cell finishes, rather than whenever the
# anonymous file object happens to be garbage-collected.
with open('./hashtag_network_cleaned.json', "w") as cleaned_file:
    json.dump(new_full, cleaned_file)

In [ ]: