In [24]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig
import networkx as nx

from load_data import load_citation_network, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [10]:
case_metadata = pd.read_csv(data_dir + 'clean/case_metadata_master.csv')
edgelist = pd.read_csv(data_dir + 'clean/edgelist_master.csv')

In [2]:
# net_dir = data_dir + 'clean/' + court_name + '/'
# case_metadata = pd.read_csv(net_dir + 'case_metadata.csv')

# edgelist = pd.read_csv(net_dir + 'edgelist.csv')
# edgelist.drop('Unnamed: 0', inplace=True, axis=1)

Compare iterrows vs itertuples


In [3]:
start = time.time()
# create graph and add metadata
G = nx.DiGraph()
G.add_nodes_from(case_metadata.index.tolist())
nx.set_node_attributes(G, 'date', case_metadata['date'].to_dict())
for index, edge in edgelist.iterrows():
    ing = edge['citing']
    ed = edge['cited']
    G.add_edge(ing, ed)
end = time.time()

print 'pandas took %d seconds to go though %d edges using iterrows'  % (end - start, edgelist.shape[0])


pandas took 29 seconds to go though 250465 edges using iterrows

In [4]:
# go through edglist using itertuples

start = time.time()
# create graph and add metadata
G = nx.DiGraph()
G.add_nodes_from(case_metadata.index.tolist())
nx.set_node_attributes(G, 'date', case_metadata['date'].to_dict())
for row in edgelist.itertuples():
    ing = row[1]
    ed = row[2]
    G.add_edge(ing, ed)
end = time.time()

print 'pandas took %d seconds to go though %d edges using itertuples'  % (end - start, edgelist.shape[0])


pandas took 1 seconds to go though 250465 edges using itertuples

load into igraph


In [37]:
# create a dictonary that maps court listener ids to igraph ids
cl_to_ig_id = {}
cl_ids = case_metadata['id'].tolist()
for i in range(case_metadata['id'].size):
    cl_to_ig_id[cl_ids[i]] = i

In [38]:
start = time.time()
V = case_metadata.shape[0]

g = ig.Graph(n=V, directed=True)
g.vs['date'] = case_metadata['date'].tolist()
g.vs['name'] = case_metadata['id'].tolist()

ig_edgelist = []
missing_cases = 0
start = time.time()
# i = 1
for row in edgelist.itertuples():
#     if log(i, 2) == int(log(i, 2)):
#         print 'edge %d' % i
#     i += 1

    cl_ing = row[1]
    cl_ed = row[2]

    if (cl_ing in cl_to_ig_id.keys()) and (cl_ed in cl_to_ig_id.keys()):
        ing = cl_to_ig_id[cl_ing]
        ed = cl_to_ig_id[cl_ed]
    else:
        missing_cases += 0
    
    ig_edgelist.append((ing, ed))
intermediate = time.time()

g.add_edges(ig_edgelist)
end = time.time()

print 'itertuples took %d seconds to go through %d edges'  % (intermediate - start, edgelist.shape[0])
print 'igraph took %d seconds to add %d edges'  % (end - start, edgelist.shape[0])


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-38-27ebfd643a17> in <module>()
     18     cl_ed = row[2]
     19 
---> 20     if (cl_ing in cl_to_ig_id.keys()) and (cl_ed in cl_to_ig_id.keys()):
     21         ing = cl_to_ig_id[cl_ing]
     22         ed = cl_to_ig_id[cl_ed]

KeyboardInterrupt: 

igraph find vs. select


In [ ]:
start = time.time()
R = 1000
for i in range(R):
    g.vs.find(name='92891')
end = time.time()
print 'g.vs.find took %E seconds per lookup' % ((end - start)/R)

In [ ]:
start = time.time()
R = 1000
for i in range(R):
    g.vs.select(name='92891')
end = time.time()
print 'g.vs.select took %E seconds per lookup' % ((end - start)/R)

In [ ]:
start = time.time()
R = 1000
for i in range(R):
    cl_to_ig_id[92891]
end = time.time()
print 'pandas df lookup took %E seconds per lookup' % ((end - start)/R)