In [3]:
import sys

sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

import igraph as ig

from load_data import load_citation_network, case_info

%load_ext autoreload
%autoreload 2
%matplotlib inline

data_dir = '../../data/'
court_name = 'all'


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

load into igraph


In [4]:
# Load the case metadata and citation edgelist, then build a directed igraph
# graph with one vertex per metadata row and one edge per citation whose two
# endpoints both have metadata.
if court_name == 'all':
    case_metadata = pd.read_csv(data_dir + 'clean/case_metadata_master.csv')

    edgelist = pd.read_csv(data_dir + 'clean/edgelist_master.csv')
else:
    net_dir = data_dir + 'clean/' + court_name + '/'
    if not os.path.exists(net_dir):
        os.makedirs(net_dir)
        # NOTE(review): make_court_subnetwork is not imported in the cells
        # visible here -- confirm it is in scope before using this branch.
        make_court_subnetwork(court_name, data_dir)

    case_metadata = pd.read_csv(net_dir + 'case_metadata.csv')

    edgelist = pd.read_csv(net_dir + 'edgelist.csv')
    edgelist.drop('Unnamed: 0', inplace=True, axis=1)

# create a dictionary that maps court listener ids to igraph ids
# (igraph id == row position in case_metadata)
cl_ids = case_metadata['id'].tolist()
cl_to_ig_id = {cl_id: i for i, cl_id in enumerate(cl_ids)}

# add nodes
V = case_metadata.shape[0]
g = ig.Graph(n=V, directed=True)
# g.vs['date'] = case_metadata['date'].tolist()
g.vs['name'] = cl_ids

# create igraph edgelist
cases_w_metadata = set(cl_to_ig_id.keys())
ig_edgelist = []
missing_cases = 0  # number of edgelist rows skipped for lack of metadata
start = time.time()  # timing covers edge translation + insertion only
for row in edgelist.itertuples():

    # row[0] is the DataFrame index; the next two fields are the endpoints
    cl_ing = row[1]
    cl_ed = row[2]

    if (cl_ing in cases_w_metadata) and (cl_ed in cases_w_metadata):
        # only append when both endpoints were resolved; appending
        # unconditionally would re-add the previous row's pair
        ig_edgelist.append((cl_to_ig_id[cl_ing], cl_to_ig_id[cl_ed]))
    else:
        missing_cases += 1

# add edges to graph
g.add_edges(ig_edgelist)

end = time.time()

# parenthesised form works as a statement in Python 2 and a call in Python 3
print('%d seconds for %d edges' % (end - start, len(g.es)))


84 seconds for 24997230 edges

In [ ]:


In [5]:
# add vertex attributes
g.vs['court'] =  case_metadata['court'].tolist()
g.vs['year'] = [int(d.split('-')[0]) for d in case_metadata['date'].tolist()]

In [10]:
# g.write_graphml(data_dir + 'clean/entire_law_net.graphml')
# G = ig.read_graphml(data_dir + 'clean/entire_law_net.graphml')

analyze


In [12]:
# Show igraph's text summary of the graph (sizes and attribute names).
g.summary()


Out[12]:
'IGRAPH DN-- 3760097 24997230 -- \n+ attr: court (v), name (v), year (v)'

In-degree distribution


In [13]:
indegrees = g.indegree()
dmax, binwidth = 100, 1

plt.figure(figsize=(20, 10))

# left panel: histogram with one bin per in-degree value, clipped at dmax
plt.subplot(1, 2, 1)
plt.hist(indegrees, bins=range(0, dmax + binwidth, binwidth));
plt.ylim([0, 2e5])
plt.xlim([0, dmax])

# right panel: in-degrees sorted in decreasing order, on log-log axes
plt.subplot(1, 2, 2)
plt.loglog(sorted(indegrees, reverse=True), '-', color='black', marker='.',
           alpha=.7);


Out-degree distribution


In [14]:
outdegrees = g.outdegree()
dmax, binwidth = 50, 1

# out degree distribution
plt.figure(figsize=(20, 10))

# left panel: histogram with one bin per out-degree value, clipped at dmax
plt.subplot(1, 2, 1)
plt.hist(outdegrees, bins=range(0, dmax + binwidth, binwidth));
plt.ylim([0, 2e5])
plt.xlim([0, dmax])

# right panel: out-degrees sorted in decreasing order, on log-log axes
plt.subplot(1, 2, 2)
plt.loglog(sorted(outdegrees, reverse=True), '-', color='black', marker='.',
           alpha=.7);


degree statistics by year


In [15]:
# one row per calendar year, 1631 through 2016 inclusive; columns start empty
year_range = range(1631, 2017)
year_quotient = pd.DataFrame(
    columns=['count', 'avg_indegree', 'avg_outdegree'],
    index=year_range,
)

count number of cases


In [16]:
# tally how many vertices (cases) fall in each year of the table's index;
# a year outside that index raises KeyError
year_counts = {y: 0 for y in year_quotient.index}
for v in g.vs:
    year_counts[v['year']] += 1

# read the tallies back in index order: dict.values() order is not guaranteed
# to follow the index (and is a view object, not a list, on Python 3)
year_quotient['count'] = [year_counts[y] for y in year_quotient.index]

get average in/out degrees


In [17]:
indegrees = g.indegree()
outdegrees = g.outdegree()

# one (initially empty) list of degrees per year, for each direction
indegs_counts = dict((y, []) for y in year_quotient.index)
outdegs_counts = dict((y, []) for y in year_quotient.index)

# file every case's degrees under the year of that case
for vertex in g.vs:
    case_year = vertex['year']
    indegs_counts[case_year].append(indegrees[vertex.index])
    outdegs_counts[case_year].append(outdegrees[vertex.index])

# average the degrees by year; years with no cases get 0
for y in indegs_counts.keys():
    for column, buckets in (('avg_indegree', indegs_counts),
                            ('avg_outdegree', outdegs_counts)):
        degs = buckets[y]
        year_quotient.loc[y, column] = np.mean(degs) if len(degs) != 0 else 0

In [18]:
# Display the per-year table (count, avg_indegree, avg_outdegree).
year_quotient


Out[18]:
count avg_indegree avg_outdegree
1631 1 0 0
1632 0 0 0
1633 0 0 0
1634 0 0 0
1635 0 0 0
1636 0 0 0
1637 0 0 0
1638 0 0 0
1639 0 0 0
1640 0 0 0
1641 1 0 0
1642 0 0 0
1643 0 0 0
1644 0 0 0
1645 0 0 0
1646 0 0 0
1647 0 0 0
1648 0 0 0
1649 0 0 0
1650 0 0 0
1651 0 0 0
1652 0 0 0
1653 0 0 0
1654 0 0 0
1655 0 0 0
1656 0 0 0
1657 0 0 0
1658 0 0 0
1659 0 0 0
1660 0 0 0
... ... ... ...
1987 49450 11.8642 8.54738
1988 48021 11.817 8.91839
1989 48383 12.0883 8.99225
1990 50880 11.9102 8.75336
1991 55713 11.8268 8.37444
1992 61236 10.5748 8.62138
1993 62913 10.28 8.69973
1994 62611 9.76102 8.90877
1995 64501 9.19328 8.83792
1996 70206 8.58008 8.80704
1997 65278 8.77588 9.42803
1998 64381 8.45804 9.77661
1999 62997 8.57444 9.99032
2000 62945 9.43416 10.2198
2001 65358 7.67707 10.3108
2002 69380 7.11789 10.1386
2003 70081 6.67971 10.4298
2004 72349 6.12384 10.0678
2005 83267 5.41775 9.55274
2006 88567 4.47566 9.18237
2007 97355 4.05657 8.617
2008 117499 2.79631 7.52348
2009 120519 2.36827 7.67797
2010 116141 1.80863 7.93733
2011 99383 1.36347 7.69446
2012 70879 0.567912 6.93746
2013 78739 0.15451 6.81313
2014 107262 0.0715631 6.07937
2015 104643 0.0302075 5.85419
2016 59877 0.0135945 4.95021

386 rows × 3 columns

Plot the time series


In [19]:
# number of cases per year, restricted to 1850-2016 on the x axis
plt.figure(figsize=(10, 10))
plt.scatter(year_quotient.index, year_quotient['count'],
            color='black', marker='.')
plt.xlim([1850, 2016])
plt.ylim(0, max(year_quotient['count']))
plt.xlabel('year')
plt.ylabel('number of cases')


Out[19]:
<matplotlib.text.Text at 0x18a4f2250>

In [20]:
# average in-degree per year, restricted to 1900-2016 on the x axis
plt.figure(figsize=(8, 8))
plt.scatter(year_quotient.index, year_quotient['avg_indegree'],
            color='black', marker='.')
plt.xlim([1900, 2016])
plt.ylim(0, max(year_quotient['avg_indegree']))
plt.xlabel('year')
plt.ylabel('average in-degree')


Out[20]:
<matplotlib.text.Text at 0x18a459a50>

In [21]:
# average out-degree per year, restricted to 1850-2016 on the x axis
plt.figure(figsize=(8, 8))
plt.scatter(year_quotient.index, year_quotient['avg_outdegree'],
            color='black', marker='.')
plt.xlim([1850, 2016])
plt.ylim(0, max(year_quotient['avg_outdegree']))
plt.xlabel('year')
plt.ylabel('average out-degree')


Out[21]:
<matplotlib.text.Text at 0x17a2f4810>

In [22]:
def CreateSubGraph(g, court, includeIfMatched=True):
    """Return a copy of ``g`` filtered by the vertices' 'court' attribute.

    A vertex "matches" when ``court in vertex['court']`` is true.

    Parameters
    ----------
    g : graph whose vertices carry a 'court' attribute
    court : value tested for membership in each vertex's 'court' attribute
    includeIfMatched : if truthy keep only matching vertices; otherwise keep
        only the non-matching ones

    Returns
    -------
    The graph obtained from ``g.as_directed()`` after deleting the unwanted
    vertices; ``g`` itself is not modified here.
    """
    drop_when_matched = not includeIfMatched

    doomed = []
    for vertex in g.vs:
        matched = court in vertex['court']
        if matched == drop_when_matched:
            doomed.append(vertex.index)

    pruned = g.as_directed()
    pruned.delete_vertices(doomed)
    return pruned

In [23]:
def YearQuotient(g, year_range=None):
    """Summarise a graph by the 'year' attribute of its vertices.

    Parameters
    ----------
    g : graph whose vertices carry a 'year' attribute and which provides
        ``indegree()`` and ``outdegree()``
    year_range : iterable of years used as the index of the result; defaults
        to 1631..2016 inclusive

    Returns
    -------
    pandas.DataFrame indexed by year with columns
        'count'         -- number of vertices in that year
        'avg_indegree'  -- mean in-degree of those vertices (0 if none)
        'avg_outdegree' -- mean out-degree of those vertices (0 if none)

    Raises
    ------
    KeyError if a vertex has a year outside ``year_range``.
    """
    if year_range is None:
        year_range = range(1631, 2016 + 1)
    years = list(year_range)

    case_counts = {y: 0 for y in years}
    indeg_totals = {y: 0 for y in years}
    outdeg_totals = {y: 0 for y in years}

    # read the whole attribute once rather than once per vertex; skip the
    # lookup on an empty graph, where the attribute may not exist
    vertex_years = g.vs['year'] if len(g.vs) else []
    for year, indeg, outdeg in zip(vertex_years, g.indegree(), g.outdegree()):
        case_counts[year] += 1
        indeg_totals[year] += indeg
        outdeg_totals[year] += outdeg

    def _average(total, count):
        # years without any case are reported as 0 rather than NaN
        return total / float(count) if count else 0.0

    # columns are built in index order, so nothing depends on dict ordering
    year_quotient = pd.DataFrame(index=years, columns=['count', 'avg_indegree', 'avg_outdegree'])
    year_quotient['count'] = [case_counts[y] for y in years]
    year_quotient['avg_indegree'] = [_average(indeg_totals[y], case_counts[y]) for y in years]
    year_quotient['avg_outdegree'] = [_average(outdeg_totals[y], case_counts[y]) for y in years]

    return year_quotient

In [24]:
def DegreePlots(g):
    """Draw three per-year scatter plots for graph ``g``.

    The plots show, from the table returned by ``YearQuotient(g)``: the
    number of cases, the average in-degree and the average out-degree per
    year. Each plot goes in its own figure; ``plt.show()`` is called once at
    the end. Nothing is returned.
    """
    year_quotient = YearQuotient(g)

    # (column, y-axis label, figure size, x-axis limits) for each plot
    panels = [
        ('count', 'number of cases', [10, 10], [1850, 2016]),
        ('avg_indegree', 'average in-degree', [8, 8], [1900, 2016]),
        ('avg_outdegree', 'average out-degree', [8, 8], [1850, 2016]),
    ]

    #Plot the time series
    for column, ylabel, figsize, xlim in panels:
        # a single plt.figure call per plot: calling plt.figure(k) and then
        # plt.figure(figsize=...) would leave an extra empty figure behind
        plt.figure(figsize=figsize)
        plt.scatter(year_quotient.index,
                    year_quotient[column],
                    marker='.',
                    color='black')
        plt.ylim(0, max(year_quotient[column]))
        plt.xlim(xlim)
        plt.xlabel('year')
        plt.ylabel(ylabel)

    plt.show()

In [25]:
def CompareDegreePlots(g,sub_g,overall_net='',sub_net=''):
    """Overlay the per-year statistics of two graphs in three scatter plots.

    Parameters
    ----------
    g : graph drawn in black
    sub_g : graph drawn in red
    overall_net : legend label for ``g``
    sub_net : legend label for ``sub_g``

    The statistics come from ``YearQuotient`` applied to each graph. One
    figure is drawn for each of: number of cases, average in-degree and
    average out-degree. ``plt.show()`` is called once at the end; nothing is
    returned.
    """
    year_quotient = YearQuotient(g)
    year_quotient_sub_g = YearQuotient(sub_g)

    # (column, figure size, x-axis limits, y-axis label, title) for each plot
    panels = [
        ('count', [10, 10], [1850, 2016], 'number of cases', 'counts'),
        ('avg_indegree', [8, 8], [1800, 2016], 'average in-degree', 'in-degree'),
        ('avg_outdegree', [8, 8], [1800, 2016], 'average out-degree', 'out-degree'),
    ]

    #Plot the time series
    for column, figsize, xlim, ylabel, title in panels:
        # a single plt.figure call per plot: calling plt.figure(k) and then
        # plt.figure(figsize=...) would leave an extra empty figure behind
        plt.figure(figsize=figsize)
        plt.scatter(year_quotient.index,
                    year_quotient[column],
                    marker='.',
                    color='black',
                    label=overall_net)
        plt.scatter(year_quotient_sub_g.index,
                    year_quotient_sub_g[column],
                    marker='.',
                    color='red',
                    label=sub_net)
        # the y axis must fit the larger of the two series
        plt.ylim(0, max(max(year_quotient[column]), max(year_quotient_sub_g[column])))
        plt.xlim(xlim)
        plt.xlabel('year')
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(loc='upper right')

    plt.show()

In [26]:
# Keep only the vertices whose 'court' attribute contains 'scotus'.
g_scotus = CreateSubGraph(g,'scotus')

In [27]:
# Full graph (black) against the scotus subgraph (red).
CompareDegreePlots(g,g_scotus,overall_net='overall',sub_net='scotus')


<matplotlib.figure.Figure at 0x17a31d450>

In [28]:
# Complement of g_scotus: drop the vertices whose 'court' contains 'scotus'.
g_minus_scotus = CreateSubGraph(g, 'scotus', includeIfMatched=False)

In [29]:
# Everything except scotus (black) against the scotus subgraph (red).
CompareDegreePlots(g_minus_scotus,g_scotus,overall_net='overall - scotus',sub_net='scotus')


<matplotlib.figure.Figure at 0x1108e25d0>

remove zero out-degree cases


In [56]:
def get_degree_trend_nonzero(g):
    """Per-year degree statistics after removing zero out-degree vertices.

    Parameters
    ----------
    g : graph whose vertices carry a 'year' attribute; it is not modified
        (the deletion is done on ``g.copy()``)

    Returns
    -------
    pandas.DataFrame indexed by year (1631..2016 inclusive) with columns
        'zero_count'       -- vertices of ``g`` with out-degree 0, per year
        'nonzero_count'    -- 'count' from ``YearQuotient`` of the reduced graph
        'nz_avg_indegree'  -- 'avg_indegree' of the reduced graph
        'nz_avg_outdegree' -- 'avg_outdegree' of the reduced graph

    The averages are computed on the reduced graph, i.e. after
    ``delete_vertices`` has been applied to the copy.
    """
    year_range = range(1631, 2016 + 1)
    nz_year_quotient = pd.DataFrame(index=year_range, columns=['zero_count' ,'nonzero_count', 'nz_avg_indegree', 'nz_avg_outdegree'])

    outdegrees = g.outdegree()
    zero_deg_cases = [i for i, outdeg in enumerate(outdegrees) if outdeg == 0]

    # count number of zero degree cases per year; keyed on this function's own
    # year range rather than on a notebook-level variable
    zd_year_counts = {y: 0 for y in year_range}
    for i in zero_deg_cases:
        zd_year_counts[g.vs[i]['year']] += 1

    # read the tallies back in index order instead of relying on dict ordering
    nz_year_quotient['zero_count'] = [zd_year_counts[y] for y in year_range]

    # remove zero degree cases
    g_nozero = g.copy()
    g_nozero.delete_vertices(zero_deg_cases)

    # compute the average in and out degree
    nz_year_quotient_temp = YearQuotient(g_nozero)

    nz_year_quotient['nonzero_count'] = nz_year_quotient_temp['count']
    nz_year_quotient['nz_avg_indegree'] = nz_year_quotient_temp['avg_indegree']
    nz_year_quotient['nz_avg_outdegree'] = nz_year_quotient_temp['avg_outdegree']

    return nz_year_quotient

In [61]:
# Zero-truncated per-year tables for the non-scotus and scotus graphs.
nz_deg_trend_all_noscotus = get_degree_trend_nonzero(g_minus_scotus)
nz_deg_trend_scotus = get_degree_trend_nonzero(g_scotus)

look at zero out-degree counts


In [63]:
plt.figure(figsize=(20, 10))

# left panel: per-year zero vs non-zero counts for the non-scotus graph
plt.subplot(1, 2, 1)
plt.title('all minus scotus')
plt.scatter(year_range, nz_deg_trend_all_noscotus['zero_count'],
            color='grey', marker='.', label='zero')
plt.scatter(year_range, nz_deg_trend_all_noscotus['nonzero_count'],
            color='black', marker='.', label='nonzero')
plt.ylim([0, max(nz_deg_trend_all_noscotus['nonzero_count'])])
plt.legend(loc='upper left')
plt.xlabel('year')
plt.ylabel('count')

# right panel: the same two series for the scotus graph
plt.subplot(1, 2, 2)
plt.title('scotus')
plt.scatter(year_range, nz_deg_trend_scotus['zero_count'],
            color='grey', marker='.', label='zero')
plt.scatter(year_range, nz_deg_trend_scotus['nonzero_count'],
            color='black', marker='.', label='nonzero')
plt.ylim([0, max(nz_deg_trend_scotus['nonzero_count'])])
plt.legend(loc='upper left')
plt.xlabel('year')
plt.ylabel('count')


Out[63]:
<matplotlib.text.Text at 0x17a5c7490>

In [86]:
# zero-truncated average out-degree per year: non-scotus (black), scotus (red)
plt.figure(figsize=(8, 8))
plt.scatter(year_range, nz_deg_trend_all_noscotus['nz_avg_outdegree'],
            color='black', marker='.', label = 'all minus scotus')
plt.scatter(year_range, nz_deg_trend_scotus['nz_avg_outdegree'],
            color='red', marker='.', label = 'scotus')

# the y axis must fit the larger of the two series
plt.ylim(0, max(max(nz_deg_trend_scotus['nz_avg_outdegree']),
                max(nz_deg_trend_all_noscotus['nz_avg_outdegree'])))
plt.xlim([1800, 2016])
plt.xlabel('year')
plt.ylabel('average out-degree (zeo-trunc)')
plt.title('out-degree for zero truncated network')
plt.legend(loc='upper right')


Out[86]:
<matplotlib.legend.Legend at 0x16b5c2390>

In [87]:
# Per-year ratio of scotus to non-scotus zero-truncated average out-degree.
# The columns are converted to lists once, up front, rather than on every
# loop iteration.
noscotus_outdegs = nz_deg_trend_all_noscotus['nz_avg_outdegree'].tolist()
scotus_outdegs = nz_deg_trend_scotus['nz_avg_outdegree'].tolist()

degree_rato = []
for d_scotus, d_noscotus in zip(scotus_outdegs, noscotus_outdegs):
    if d_noscotus == 0:
        # no non-scotus out-degree that year: report 0 instead of dividing by 0
        degree_rato.append(0)
    else:
        # "+ 0.0" forces float division under Python 2
        degree_rato.append(d_scotus / (d_noscotus + 0.0))

In [88]:
# ratio per year, with a red reference line at 1 (equal average out-degree)
plt.figure(figsize=(8, 8))
plt.scatter(year_range, degree_rato, color='black', marker='.')
plt.axhline(1, color='red')

plt.xlim([1800, 2016])
plt.ylim([0, 5])
plt.xlabel('year')
plt.ylabel('avg out-deg ratio')
plt.title('out-degree ratio scotus/all-scotus for zero truncated network')


Out[88]:
<matplotlib.text.Text at 0x16ea5de10>

In [ ]: