In [1]:
datadir = "/var/datasets/twitter/"

In [2]:
file = datadir + "twitter_combined.txt"

In [3]:
!ls -lh {file}


-rw-r--r--  1 migish  staff    42M May 30 15:23 /var/datasets/twitter/twitter_combined.txt

In [3]:
!tail {file}


99841247 364350420
99841247 175848601
99841247 117674417
99841247 152011250
99841247 53811382
99841247 154263215
99841247 194403468
99841247 180165101
99841247 253509115
99841247 463410501

In [4]:
!wc -l {file}


 2420766 /var/datasets/twitter/twitter_combined.txt

In [14]:
# sort first so that uniq can collapse duplicate edges (uniq only drops adjacent duplicates);
# the sort is commented out because the sorted file already exists.
#!sort {file} > {datadir}sorted_twitter_combined.txt
!uniq {datadir}sorted_twitter_combined.txt | wc -l


 1768149

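The raw file has 2,420,766 lines but only 1,768,149 distinct edges. The cell below is a
pure-Python equivalent of the sort | uniq pipeline above (a sketch, not part of the
original session), counting distinct (source, target) pairs directly.

In [ ]:
# Sketch: count distinct directed edges, mirroring sort | uniq | wc -l above.
unique_edges = set()
with open(file) as fh:
    for line in fh:
        u, v = line.split()
        unique_edges.add((u, v))
len(unique_edges)  # expected to match the 1768149 reported above
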
In [4]:
import networkx as nx
import json
from networkx.readwrite import json_graph

In [5]:
# Parse the edge list into a directed graph; duplicate lines collapse into a single edge.
G = nx.read_edgelist(file, create_using=nx.DiGraph(), nodetype=int)

In [6]:
G.size()


Out[6]:
1768149

In [7]:
G.order()


Out[7]:
81306

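size() is the number of edges and order() the number of nodes: the edge count equals the
distinct-line count above because read_edgelist collapses duplicate lines into a single
edge of the simple DiGraph. A quick sanity check (sketch, not in the original session):

In [ ]:
# Sketch: the parsed graph should match the counts established above.
assert G.size() == 1768149   # edges == distinct lines in the file
assert G.order() == 81306    # nodes
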
In [53]:
# Take a 4000-node sample; copy() makes it mutable (in NetworkX 2.x subgraph() returns a read-only view).
Gs = G.subgraph(list(G.nodes())[:4000]).copy()

In [54]:
Gs.size()


Out[54]:
5446

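The 4000-node sample is not one connected piece. Inspecting the sizes of its weakly
connected components (a sketch, not in the original session) motivates the 200-node
threshold used by cleanupgraph below.

In [ ]:
# Sketch: distribution of weakly connected component sizes in the sample.
from collections import Counter
Counter(len(c) for c in nx.weakly_connected_components(Gs)).most_common(10)
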
In [55]:
def cleanupgraph(G):
    """Drop weakly connected components with fewer than 200 nodes from G."""
    # Iterate over the components of a copy so that removing nodes from G
    # does not invalidate the component generator.
    comp = nx.weakly_connected_components(G.copy())
    for c in comp:
        if len(c) < 200:
            G.remove_nodes_from(c)

In [56]:
cleanupgraph(Gs)

In [57]:
Gs.size()


Out[57]:
4574

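After the cleanup only components with at least 200 nodes should remain; a quick check
(sketch, not in the original session):

In [ ]:
# Sketch: every surviving weakly connected component should have >= 200 nodes.
min(len(c) for c in nx.weakly_connected_components(Gs)) >= 200
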
In [58]:
# Save the cleaned subgraph as node-link JSON for the D3 force-layout plot below
d = json_graph.node_link_data(Gs)
for node in d['nodes']:
    node['name'] = node['id']               # label each node with its Twitter user id
    node['value'] = Gs.degree(node['id'])   # degree, used to scale node size in the plot

#d['adjacency'] = json_graph.adjacency_data(Gs)['adjacency']
with open('graph.json', 'w') as f:
    json.dump(d, f)

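The node-link JSON written above is what the force.js script in the next cell is expected
to read. A quick structural check (sketch, not in the original session) confirms the
'name' and 'value' attributes are present on each node.

In [ ]:
# Sketch: inspect the node-link JSON produced for the D3 force layout.
with open('graph.json') as fh:
    saved = json.load(fh)
list(saved.keys()), saved['nodes'][0]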

In [ ]:
%%html
<div id="d3-example"></div>
<style>
.node {stroke: #fff; stroke-width: 1.5px;}
marker {stroke: #999;}
.link {stroke: #999; stroke-opacity: .6;}
</style>
<script src="force.js"></script>

In [ ]:
#nx.draw_graphviz(G)



In [ ]: