In [1]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from load_data import load_citation_network, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Root of the data tree; the loading cell below reads cleaned inputs from
# '<data_dir>clean/'.
data_dir = '../../data/'
# Which network to load: 'all' selects the master metadata/edgelist files,
# any other value is used as a court subdirectory under '<data_dir>clean/'.
court_name = 'scotus'

Load the citation network into igraph


In [2]:
# Build the directed citation graph `g` for `court_name`.
#
# Inputs : `data_dir`, `court_name` (set in the configuration cell).
# Outputs: `case_metadata`, `edgelist` (DataFrames), `cl_to_ig_id` (CourtListener
#          id -> igraph vertex index), `g` (igraph.Graph with vertex attributes
#          'name', 'court', 'year'), `missing_cases` (number of skipped edges).

# --- load the case metadata and the citation edgelist -----------------------
if court_name == 'all':
    case_metadata = pd.read_csv(data_dir + 'clean/case_metadata_master.csv')
    edgelist = pd.read_csv(data_dir + 'clean/edgelist_master.csv')
else:
    net_dir = data_dir + 'clean/' + court_name + '/'
    if not os.path.exists(net_dir):
        os.makedirs(net_dir)
        # NOTE(review): make_court_subnetwork is not imported in the import
        # cell of this notebook; this branch raises NameError on a fresh
        # kernel unless it is imported from the project code -- confirm the
        # module it lives in and add the import.
        make_court_subnetwork(court_name, data_dir)

    case_metadata = pd.read_csv(net_dir + 'case_metadata.csv')

    edgelist = pd.read_csv(net_dir + 'edgelist.csv')
    # Drop the saved index column so the first two columns are the
    # (citing, cited) ids read positionally in the loop below.
    edgelist = edgelist.drop('Unnamed: 0', axis=1)

# --- map CourtListener ids to igraph vertex indices -------------------------
# Vertex i of the graph corresponds to row i of case_metadata.
cl_ids = case_metadata['id'].tolist()
cl_to_ig_id = {cl_id: i for i, cl_id in enumerate(cl_ids)}

# --- add nodes --------------------------------------------------------------
V = case_metadata.shape[0]
g = ig.Graph(n=V, directed=True)
# g.vs['date'] = case_metadata['date'].tolist()
g.vs['name'] = cl_ids

# --- create the igraph edgelist ---------------------------------------------
cases_w_metadata = set(cl_to_ig_id.keys())
ig_edgelist = []
missing_cases = 0
# Timer covers edge translation, edge insertion and attribute assignment
# (this is where the original's effective timer started).
start = time.time()
for row in edgelist.itertuples():
    # row[0] is the DataFrame index; row[1] / row[2] are the first two columns.
    cl_ing = row[1]
    cl_ed = row[2]

    if (cl_ing in cases_w_metadata) and (cl_ed in cases_w_metadata):
        # Only edges whose endpoints both have metadata are added.
        ig_edgelist.append((cl_to_ig_id[cl_ing], cl_to_ig_id[cl_ed]))
    else:
        # Count and skip the edge. Previously the counter was incremented by
        # 0 and the append ran unconditionally, re-adding the previous edge's
        # stale vertex ids (or raising NameError on the first row).
        missing_cases += 1

# --- add edges to graph -----------------------------------------------------
g.add_edges(ig_edgelist)

# --- add vertex attributes --------------------------------------------------
g.vs['court'] = case_metadata['court'].tolist()
# The year is the text before the first '-' of each 'date' string.
g.vs['year'] = [int(d.split('-')[0]) for d in case_metadata['date'].tolist()]

end = time.time()

# Single parenthesised argument: prints identically on Python 2 and Python 3.
print('%d seconds for %d edges' % (end - start, len(g.es)))


0 seconds for 250465 edges

In [3]:
# g.write_graphml(data_dir + 'clean/entire_law_net.graphml')
# G = ig.read_graphml(data_dir + 'clean/entire_law_net.graphml')

analyze


In [6]:
# Text summary of the graph: vertex and edge counts plus the attribute names.
g.summary()


Out[6]:
'IGRAPH DN-- 33248 250465 -- \n+ attr: court (v), name (v), year (v)'

In [7]:
# Drawing options forwarded to ig.plot as keyword arguments.
visual_style = {
    "layout": 'fr',

    # vertices
    'vertex_size': 20,
    'vertex_color': 'black',
    'vertex_frame_color': 'black',

    # edges
    "edge_width": .1,

    # canvas
    "bbox": (500, 500),
    "margin": 20,
}

# Keep the plot object so the next cell can display it.
p = ig.plot(g, **visual_style)

In [ ]:
# Display the plot object created in the previous cell.
p

In-degree distribution


In [26]:
indegrees = g.indegree()

# Histogram settings: unit-width bins covering degrees 0..dmax.
dmax = 100
binwidth = 1

fig, (ax_hist, ax_rank) = plt.subplots(1, 2, figsize=[20, 10])

# Left panel: histogram of in-degrees, truncated at dmax.
ax_hist.hist(indegrees, bins=range(0, dmax + binwidth, binwidth))
ax_hist.set_xlim([0, dmax])
ax_hist.set_ylim([0, 2e5])

# Right panel: in-degrees sorted in decreasing order on log-log axes.
ax_rank.loglog(sorted(indegrees, reverse=True), '-', marker='.', color='black',
               alpha=.7);


Out[26]:
[<matplotlib.lines.Line2D at 0x2073c8810>]

Out-degree distribution


In [28]:
outdegrees = g.outdegree()

# Histogram settings: unit-width bins covering degrees 0..dmax.
dmax = 50
binwidth = 1

fig, (ax_hist, ax_rank) = plt.subplots(1, 2, figsize=[20, 10])

# Left panel: histogram of out-degrees, truncated at dmax.
ax_hist.hist(outdegrees, bins=range(0, dmax + binwidth, binwidth))
ax_hist.set_xlim([0, dmax])
ax_hist.set_ylim([0, 2e5])

# Right panel: out-degrees sorted in decreasing order on log-log axes.
ax_rank.loglog(sorted(outdegrees, reverse=True), '-', marker='.', color='black',
               alpha=.7);


degree statistics by year


In [93]:
# One row per calendar year (both ends inclusive); the statistic columns are
# created empty here and filled in by the following cells.
first_year, last_year = 1631, 2016
year_range = range(first_year, last_year + 1)
stat_columns = ['count', 'avg_indegree', 'avg_outdegree']
year_quotient = pd.DataFrame(columns=stat_columns, index=year_range)

count number of cases


In [104]:
# Count the number of cases (vertices) per year.
# Every year in the table starts at 0 so years without cases are reported too.
# A vertex whose year is not in year_quotient.index raises KeyError, so
# out-of-range data is not dropped silently.
year_counts = {y: 0 for y in year_quotient.index}
for year in g.vs['year']:
    year_counts[year] += 1

# Assign through a Series keyed by year so pandas aligns each count with its
# row by label. The previous `year_counts.values()` relied on the dict's
# iteration order happening to match the DataFrame index.
year_quotient['count'] = pd.Series(year_counts)

Compute the average in- and out-degree per year


In [99]:
indegrees = g.indegree()
outdegrees = g.outdegree()

# One (initially empty) list of degrees per year in the table.
indegs_counts = dict((y, []) for y in year_quotient.index)
outdegs_counts = dict((y, []) for y in year_quotient.index)

# Collect the degrees of the cases decided in each year.
for vertex, indeg, outdeg in zip(g.vs, indegrees, outdegrees):
    year = vertex['year']
    indegs_counts[year].append(indeg)
    outdegs_counts[year].append(outdeg)

# Average the degrees by year; years without any case get 0.
for column, degs_by_year in (('avg_indegree', indegs_counts),
                             ('avg_outdegree', outdegs_counts)):
    for y, degs in degs_by_year.items():
        year_quotient.loc[y, column] = np.mean(degs) if degs else 0

In [105]:
# Show the per-year table (count, average in-degree, average out-degree).
# NOTE(review): the saved output for this cell looks stale -- single-year
# counts exceed the 33248 vertices reported by g.summary() above, and the
# execution counts are out of order. Re-run with Restart & Run All.
year_quotient


Out[105]:
count avg_indegree avg_outdegree
1631 1 0 0
1632 0 0 0
1633 0 0 0
1634 0 0 0
1635 0 0 0
1636 0 0 0
1637 0 0 0
1638 0 0 0
1639 0 0 0
1640 0 0 0
1641 1 0 0
1642 0 0 0
1643 0 0 0
1644 0 0 0
1645 0 0 0
1646 0 0 0
1647 0 0 0
1648 0 0 0
1649 0 0 0
1650 0 0 0
1651 0 0 0
1652 0 0 0
1653 0 0 0
1654 0 0 0
1655 0 0 0
1656 0 0 0
1657 0 0 0
1658 0 0 0
1659 0 0 0
1660 0 0 0
... ... ... ...
1987 49450 11.8642 8.54738
1988 48021 11.817 8.91839
1989 48383 12.0883 8.99225
1990 50880 11.9102 8.75336
1991 55713 11.8268 8.37444
1992 61236 10.5748 8.62138
1993 62913 10.28 8.69973
1994 62611 9.76102 8.90877
1995 64501 9.19328 8.83792
1996 70206 8.58008 8.80704
1997 65278 8.77588 9.42803
1998 64381 8.45804 9.77661
1999 62997 8.57444 9.99032
2000 62945 9.43416 10.2198
2001 65358 7.67707 10.3108
2002 69380 7.11789 10.1386
2003 70081 6.67971 10.4298
2004 72349 6.12384 10.0678
2005 83267 5.41775 9.55274
2006 88567 4.47566 9.18237
2007 97355 4.05657 8.617
2008 117499 2.79631 7.52348
2009 120519 2.36827 7.67797
2010 116141 1.80863 7.93733
2011 99383 1.36347 7.69446
2012 70879 0.567912 6.93746
2013 78739 0.15451 6.81313
2014 107262 0.0715631 6.07937
2015 104643 0.0302075 5.85419
2016 59877 0.0135945 4.95021

386 rows × 3 columns

Plot the time series


In [124]:
# Number of cases per year, shown from 1850 onwards.
fig, ax = plt.subplots(figsize=[10, 10])
ax.scatter(year_quotient.index,
           year_quotient['count'],
           marker='.',
           color='black')
ax.set_ylim(0, max(year_quotient['count']))
ax.set_xlim([1850, 2016])
ax.set_xlabel('year')
ax.set_ylabel('number of cases')


Out[124]:
<matplotlib.text.Text at 0x1ef3dd350>

In [130]:
# Average in-degree of the cases decided in each year, shown from 1900 onwards.
fig, ax = plt.subplots(figsize=[8, 8])
ax.scatter(year_quotient.index,
           year_quotient['avg_indegree'],
           marker='.',
           color='black')
ax.set_ylim(0, max(year_quotient['avg_indegree']))
ax.set_xlim([1900, 2016])
ax.set_xlabel('year')
ax.set_ylabel('average in-degree')


Out[130]:
<matplotlib.text.Text at 0x20a009710>

In [132]:
# Average out-degree of the cases decided in each year, shown from 1850 onwards.
fig, ax = plt.subplots(figsize=[8, 8])
ax.scatter(year_quotient.index,
           year_quotient['avg_outdegree'],
           marker='.',
           color='black')
ax.set_ylim(0, max(year_quotient['avg_outdegree']))
ax.set_xlim([1850, 2016])
ax.set_xlabel('year')
ax.set_ylabel('average out-degree')


Out[132]:
<matplotlib.text.Text at 0x14bd76610>

In [ ]: