In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from load_data import load_citation_network_igraph, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'scotus'

In [2]:
# this will be a little slow the first time you run it
G = load_citation_network_igraph(data_dir, court_name)

print 'loaded %s network with %d cases and %d edges' % (court_name, len(G.vs), len(G.es))


1 seconds for 250465 edges
loaded scotus network with 33248 cases and 250465 edges

randomly sample edges that are not there


In [183]:
desired_num_samples = 1000

all_indices = range(len(G.vs))

nonexistant_edge_list = []

start_time = time.time()
while len(nonexistant_edge_list) < desired_num_samples:
    # randomly select a pair of vertices
    rand_pair = np.random.choice(all_indices, size=2, replace=False)
    
    # check if there is currently an edge between the two vertices
    edge_check = G.es.select(_between=([rand_pair[0]], [rand_pair[1]]))
    
    # if edge does not exist add it to the list
    if len(edge_check) == 0: 
       
        # order the vertices by time
        if G.vs[rand_pair[0]]['year'] <= G.vs[rand_pair[1]]['year']:
            ing_id = rand_pair[1]
            ed_id = rand_pair[0]
        else:
            ing_id = rand_pair[0]
            ed_id = rand_pair[1]
            
        nonexistant_edge_list.append((ing_id, ed_id))
total_runtime = time.time() - start_time

print 'total_runtime %1.5f' % (total_runtime/desired_num_samples)

print 'len nonexistant_edge_list %d' % len(nonexistant_edge_list)


total_runtime 0.00452
len nonexistant_edge_list 1000

In [187]:
print 'estimated time to get to 500000 samples: %1.5f min' % (((total_runtime/desired_num_samples) * 500000)/60)


estimated time to get to 500000 samples: 37.65709 min

In [ ]: