In [3]:
import sys
sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import igraph as ig
from load_data import load_citation_network, case_info
%load_ext autoreload
%autoreload 2
%matplotlib inline
# Relative path to the project's data directory; the loading cell reads the
# cleaned CSV files from its 'clean/' subdirectory.
data_dir = '../../data/'
# Which network to load: 'all' selects the master metadata/edgelist files,
# any other value selects the per-court files under 'clean/<court_name>/'.
court_name = 'all'
In [4]:
# Load the case metadata and the citation edgelist for the selected network.
if court_name == 'all':
    case_metadata = pd.read_csv(data_dir + 'clean/case_metadata_master.csv')
    edgelist = pd.read_csv(data_dir + 'clean/edgelist_master.csv')
else:
    net_dir = data_dir + 'clean/' + court_name + '/'
    if not os.path.exists(net_dir):
        os.makedirs(net_dir)
        # NOTE(review): make_court_subnetwork is not imported by any cell
        # visible in this notebook, so this call raises NameError unless the
        # name is brought into scope elsewhere -- confirm and add the import.
        make_court_subnetwork(court_name, data_dir)
    case_metadata = pd.read_csv(net_dir + 'case_metadata.csv')
    edgelist = pd.read_csv(net_dir + 'edgelist.csv')

# The edge loop below reads the citing/cited ids positionally (row[1], row[2]),
# so a leftover saved-index column must be removed first. The guard makes this
# safe for files that do not carry the column.
if 'Unnamed: 0' in edgelist.columns:
    edgelist = edgelist.drop('Unnamed: 0', axis=1)

# Map CourtListener case ids to igraph vertex ids (the metadata row position).
cl_ids = case_metadata['id'].tolist()
cl_to_ig_id = {cl_id: ig_id for ig_id, cl_id in enumerate(cl_ids)}

# Add one vertex per case, named by its CourtListener id.
V = len(cl_ids)
g = ig.Graph(n=V, directed=True)
g.vs['name'] = cl_ids

# Translate the edgelist into igraph vertex ids. Edges that touch a case with
# no metadata are skipped and counted instead of being added.
cases_w_metadata = set(cl_to_ig_id)
ig_edgelist = []
missing_cases = 0
start = time.time()
for row in edgelist.itertuples():
    # row[0] is the DataFrame index; the next two fields are citing and cited.
    cl_ing = row[1]
    cl_ed = row[2]
    if (cl_ing in cases_w_metadata) and (cl_ed in cases_w_metadata):
        ig_edgelist.append((cl_to_ig_id[cl_ing], cl_to_ig_id[cl_ed]))
    else:
        missing_cases += 1

# Add all edges in a single call.
g.add_edges(ig_edgelist)
end = time.time()

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('%d seconds for %d edges' % (end - start, len(g.es)))
print('%d edges skipped because a case had no metadata' % missing_cases)
In [ ]:
In [5]:
# add vertex attributes
g.vs['court'] = case_metadata['court'].tolist()
g.vs['year'] = [int(d.split('-')[0]) for d in case_metadata['date'].tolist()]
In [10]:
# g.write_graphml(data_dir + 'clean/entire_law_net.graphml')
# G = ig.read_graphml(data_dir + 'clean/entire_law_net.graphml')
In [12]:
# Display igraph's text summary of the citation graph built above.
g.summary()
Out[12]:
In [13]:
# In-degree distribution: histogram (left) and rank plot on log-log axes (right).
indegrees = g.indegree()
dmax = 100
binwidth = 1

fig, (ax_hist, ax_rank) = plt.subplots(1, 2, figsize=[20, 10])

# Histogram with unit-width bins, cropped to degrees in [0, dmax].
ax_hist.hist(indegrees, bins=range(0, dmax + binwidth, binwidth))
ax_hist.set_xlim([0, dmax])
ax_hist.set_ylim([0, 2e5])

# Degrees sorted from largest to smallest.
ax_rank.loglog(sorted(indegrees, reverse=True), '-', marker='.', color='black',
               alpha=.7);
In [14]:
# Out-degree distribution: histogram (left) and rank plot on log-log axes (right).
outdegrees = g.outdegree()
dmax = 50
binwidth = 1

fig, (ax_hist, ax_rank) = plt.subplots(1, 2, figsize=[20, 10])

# Histogram with unit-width bins, cropped to degrees in [0, dmax].
ax_hist.hist(outdegrees, bins=range(0, dmax + binwidth, binwidth))
ax_hist.set_xlim([0, dmax])
ax_hist.set_ylim([0, 2e5])

# Degrees sorted from largest to smallest.
ax_rank.loglog(sorted(outdegrees, reverse=True), '-', marker='.', color='black',
               alpha=.7);
In [15]:
# Years covered by the per-year summary table (both ends included).
year_range = range(1631, 2017)
# One row per year; the columns start empty and are filled by later cells.
year_quotient = pd.DataFrame(
    columns=['count', 'avg_indegree', 'avg_outdegree'],
    index=year_range,
)
Count the number of cases in each year.
In [16]:
# Tally how many cases (vertices) fall in each year of the table.
# A year outside the table's index raises KeyError.
year_counts = {y: 0 for y in year_quotient.index}
for v in g.vs:
    year_counts[v['year']] += 1

# Look every year up explicitly so the column lines up with the index instead
# of depending on the iteration order of the dict.
year_quotient['count'] = [year_counts[y] for y in year_quotient.index]
Compute the average in-degree and out-degree of the cases in each year.
In [17]:
indegrees = g.indegree()
outdegrees = g.outdegree()

# Per-year buckets holding the degrees of every case decided in that year.
indegs_counts = {y: [] for y in year_quotient.index}
outdegs_counts = {y: [] for y in year_quotient.index}
for year, indeg, outdeg in zip(g.vs['year'], indegrees, outdegrees):
    indegs_counts[year].append(indeg)
    outdegs_counts[year].append(outdeg)

# Average each bucket; years without any case get 0.
for y in indegs_counts:
    indegs = indegs_counts[y]
    outdegs = outdegs_counts[y]
    year_quotient.loc[y, 'avg_indegree'] = np.mean(indegs) if indegs else 0
    year_quotient.loc[y, 'avg_outdegree'] = np.mean(outdegs) if outdegs else 0
In [18]:
# Display the per-year table (count, avg_indegree, avg_outdegree).
year_quotient
Out[18]:
In [19]:
# Number of cases per year.
fig, ax = plt.subplots(figsize=[10, 10])
ax.scatter(year_quotient.index, year_quotient['count'], marker='.', color='black')
ax.set_ylim(0, max(year_quotient['count']))
ax.set_xlim([1850, 2016])
ax.set_xlabel('year')
ax.set_ylabel('number of cases')
Out[19]:
In [20]:
# Average in-degree per year.
fig, ax = plt.subplots(figsize=[8, 8])
ax.scatter(year_quotient.index, year_quotient['avg_indegree'], marker='.', color='black')
ax.set_ylim(0, max(year_quotient['avg_indegree']))
ax.set_xlim([1900, 2016])
ax.set_xlabel('year')
ax.set_ylabel('average in-degree')
Out[20]:
In [21]:
# Average out-degree per year.
fig, ax = plt.subplots(figsize=[8, 8])
ax.scatter(year_quotient.index, year_quotient['avg_outdegree'], marker='.', color='black')
ax.set_ylim(0, max(year_quotient['avg_outdegree']))
ax.set_xlim([1850, 2016])
ax.set_xlabel('year')
ax.set_ylabel('average out-degree')
Out[21]:
In [22]:
def CreateSubGraph(g, court, includeIfMatched=True):
    """Return a copy of ``g`` filtered by the vertices' 'court' attribute.

    A vertex "matches" when ``court in vertex['court']`` is true (a substring
    test if the attribute is a string).

    Parameters
    ----------
    g : graph whose vertices carry a 'court' attribute
    court : value looked for inside each vertex's 'court' attribute
    includeIfMatched : if true, keep only matching vertices; otherwise keep
        only the non-matching ones

    Returns
    -------
    The filtered copy; ``g`` itself is left unmodified.
    """
    keep_matches = bool(includeIfMatched)
    # A vertex is dropped when its match status disagrees with the requested mode.
    unwanted = [
        vertex.index
        for vertex in g.vs
        if (court in vertex['court']) != keep_matches
    ]
    subgraph = g.as_directed()
    subgraph.delete_vertices(unwanted)
    return subgraph
In [23]:
def YearQuotient(g, first_year=1631, last_year=2016):
    """Summarise a graph by year: case count and average in/out degree.

    Parameters
    ----------
    g : graph whose vertices carry an integer 'year' attribute and which
        provides ``indegree()`` and ``outdegree()`` (one value per vertex, in
        vertex order)
    first_year, last_year : inclusive bounds of the years to tabulate; the
        defaults reproduce the original hard-coded 1631-2016 span

    Returns
    -------
    pandas.DataFrame indexed by year with columns 'count', 'avg_indegree'
    and 'avg_outdegree'. Years without any case have count 0 and averages 0.

    Raises
    ------
    KeyError
        If a vertex has a year outside ``[first_year, last_year]``.
    """
    year_range = range(first_year, last_year + 1)

    years = [v['year'] for v in g.vs]
    indegrees = g.indegree()
    outdegrees = g.outdegree()

    # Collect the degrees of the cases decided in each year.
    indegs_by_year = {y: [] for y in year_range}
    outdegs_by_year = {y: [] for y in year_range}
    for year, indeg, outdeg in zip(years, indegrees, outdegrees):
        indegs_by_year[year].append(indeg)
        outdegs_by_year[year].append(outdeg)

    # Build every column by explicit per-year lookup so the values line up with
    # the index regardless of dict iteration order.
    year_quotient = pd.DataFrame(index=year_range,
                                 columns=['count', 'avg_indegree', 'avg_outdegree'])
    year_quotient['count'] = [len(indegs_by_year[y]) for y in year_range]
    year_quotient['avg_indegree'] = [
        np.mean(indegs_by_year[y]) if indegs_by_year[y] else 0.0
        for y in year_range
    ]
    year_quotient['avg_outdegree'] = [
        np.mean(outdegs_by_year[y]) if outdegs_by_year[y] else 0.0
        for y in year_range
    ]
    return year_quotient
In [24]:
def DegreePlots(g):
    """Plot the yearly case count, average in-degree and average out-degree.

    Draws three separate scatter figures from ``YearQuotient(g)`` and shows
    them. Returns nothing.
    """
    year_quotient = YearQuotient(g)

    # (column, y-axis label, figure size, first year shown on the x-axis)
    panels = [
        ('count', 'number of cases', [10, 10], 1850),
        ('avg_indegree', 'average in-degree', [8, 8], 1900),
        ('avg_outdegree', 'average out-degree', [8, 8], 1850),
    ]

    for column, ylabel, figsize, first_year in panels:
        # One figure per panel. The original also called plt.figure(<n>)
        # first, which opened an extra, empty figure each time.
        plt.figure(figsize=figsize)
        plt.scatter(year_quotient.index,
                    year_quotient[column],
                    marker='.',
                    color='black')
        plt.ylim(0, max(year_quotient[column]))
        plt.xlim([first_year, 2016])
        plt.xlabel('year')
        plt.ylabel(ylabel)

    plt.show()
In [25]:
def CompareDegreePlots(g, sub_g, overall_net='', sub_net=''):
    """Overlay the yearly statistics of two graphs on shared scatter plots.

    Parameters
    ----------
    g : graph plotted in black
    sub_g : graph plotted in red
    overall_net : legend label for ``g``
    sub_net : legend label for ``sub_g``

    Draws three figures (counts, in-degree, out-degree) from the
    ``YearQuotient`` tables of both graphs and shows them. Returns nothing.
    """
    year_quotient = YearQuotient(g)
    year_quotient_sub_g = YearQuotient(sub_g)

    # (column, y-axis label, title, figure size, first year on the x-axis)
    panels = [
        ('count', 'number of cases', 'counts', [10, 10], 1850),
        ('avg_indegree', 'average in-degree', 'in-degree', [8, 8], 1800),
        ('avg_outdegree', 'average out-degree', 'out-degree', [8, 8], 1800),
    ]

    for column, ylabel, title, figsize, first_year in panels:
        # One figure per panel. The original also called plt.figure(<n>)
        # first, which opened an extra, empty figure each time.
        plt.figure(figsize=figsize)
        plt.scatter(year_quotient.index,
                    year_quotient[column],
                    marker='.',
                    color='black',
                    label=overall_net)
        plt.scatter(year_quotient_sub_g.index,
                    year_quotient_sub_g[column],
                    marker='.',
                    color='red',
                    label=sub_net)
        # Scale the y-axis to whichever series reaches higher.
        plt.ylim(0, max(max(year_quotient[column]),
                        max(year_quotient_sub_g[column])))
        plt.xlim([first_year, 2016])
        plt.xlabel('year')
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(loc='upper right')

    plt.show()
In [26]:
# Subgraph keeping only the vertices whose 'court' attribute contains 'scotus'.
g_scotus = CreateSubGraph(g,'scotus')
In [27]:
# Full network (black) against the SCOTUS-only subgraph (red).
CompareDegreePlots(g,g_scotus,overall_net='overall',sub_net='scotus')
In [28]:
# Complement of g_scotus: drop every vertex whose 'court' attribute contains 'scotus'.
g_minus_scotus = CreateSubGraph(g, 'scotus', includeIfMatched=False)
In [29]:
# Network without SCOTUS (black) against the SCOTUS-only subgraph (red).
CompareDegreePlots(g_minus_scotus,g_scotus,overall_net='overall - scotus',sub_net='scotus')
In [56]:
def get_degree_trend_nonzero(g):
    """Yearly degree trends after removing cases with zero out-degree.

    Parameters
    ----------
    g : graph whose vertices carry an integer 'year' attribute

    Returns
    -------
    pandas.DataFrame indexed by year (1631-2016) with columns:
      'zero_count'       -- cases of that year whose out-degree in ``g`` is 0
      'nonzero_count'    -- cases of that year remaining after those are removed
      'nz_avg_indegree'  -- average in-degree per year in the reduced graph
      'nz_avg_outdegree' -- average out-degree per year in the reduced graph

    ``g`` itself is not modified; the removal happens on a copy.
    """
    year_range = range(1631, 2016 + 1)
    outdegrees = g.outdegree()

    # Vertices that cite nothing.
    zero_deg_cases = [i for i, deg in enumerate(outdegrees) if deg == 0]

    # Count them per year. The buckets come from the local year range; the
    # original read the notebook-global `year_quotient` here, which made the
    # function depend on another cell having been run.
    zd_year_counts = {y: 0 for y in year_range}
    for i in zero_deg_cases:
        zd_year_counts[g.vs[i]['year']] += 1

    # Remove the zero out-degree cases from a copy of the graph.
    # NOTE(review): degrees below are recomputed on the reduced graph, so edges
    # pointing at removed vertices no longer count -- confirm this is intended.
    g_nozero = g.copy()
    g_nozero.delete_vertices(zero_deg_cases)

    # Per-year count and average degrees of the reduced graph.
    nz_year_quotient_temp = YearQuotient(g_nozero)

    nz_year_quotient = pd.DataFrame(
        index=year_range,
        columns=['zero_count', 'nonzero_count', 'nz_avg_indegree', 'nz_avg_outdegree'])
    # Explicit per-year lookup keeps the column aligned with the index instead
    # of relying on dict iteration order.
    nz_year_quotient['zero_count'] = [zd_year_counts[y] for y in year_range]
    nz_year_quotient['nonzero_count'] = nz_year_quotient_temp['count']
    nz_year_quotient['nz_avg_indegree'] = nz_year_quotient_temp['avg_indegree']
    nz_year_quotient['nz_avg_outdegree'] = nz_year_quotient_temp['avg_outdegree']
    return nz_year_quotient
In [61]:
# Zero-truncated yearly trends for the network without SCOTUS and for SCOTUS alone.
nz_deg_trend_all_noscotus = get_degree_trend_nonzero(g_minus_scotus)
nz_deg_trend_scotus = get_degree_trend_nonzero(g_scotus)
Look at the number of cases per year with zero versus non-zero out-degree.
In [63]:
# Zero vs. non-zero out-degree case counts per year, one panel per network.
fig, axes = plt.subplots(1, 2, figsize=[20, 10])
panels = [
    ('all minus scotus', nz_deg_trend_all_noscotus),
    ('scotus', nz_deg_trend_scotus),
]

for ax, (title, trend) in zip(axes, panels):
    ax.set_title(title)
    # Each trend table is indexed by year, so its own index is the x-axis.
    ax.scatter(trend.index,
               trend['zero_count'],
               marker='.',
               color='grey',
               label='zero')
    ax.scatter(trend.index,
               trend['nonzero_count'],
               marker='.',
               color='black',
               label='nonzero')
    # Scale to the larger of the two series so neither is clipped (the
    # original used only the non-zero counts for the upper limit).
    ax.set_ylim([0, max(max(trend['zero_count']), max(trend['nonzero_count']))])
    ax.legend(loc='upper left')
    ax.set_xlabel('year')
    ax.set_ylabel('count')
Out[63]:
In [86]:
# Average out-degree per year after removing zero out-degree cases:
# network without SCOTUS (black) vs. SCOTUS (red).
noscotus_avg = nz_deg_trend_all_noscotus['nz_avg_outdegree']
scotus_avg = nz_deg_trend_scotus['nz_avg_outdegree']

fig, ax = plt.subplots(figsize=[8, 8])
# Each series is indexed by year, so its own index is the x-axis.
ax.scatter(noscotus_avg.index,
           noscotus_avg,
           marker='.',
           color='black',
           label='all minus scotus')
ax.scatter(scotus_avg.index,
           scotus_avg,
           marker='.',
           color='red',
           label='scotus')
ax.set_ylim(0, max(max(scotus_avg), max(noscotus_avg)))
ax.set_xlim([1800, 2016])
ax.set_xlabel('year')
# Label fixed: the original read 'zeo-trunc'.
ax.set_ylabel('average out-degree (zero-trunc)')
ax.set_title('out-degree for zero truncated network')
ax.legend(loc='upper right')
Out[86]:
In [87]:
# Per-year ratio of the SCOTUS average out-degree to the non-SCOTUS one
# (zero-truncated networks); 0 where the non-SCOTUS average is 0.
# The columns are converted to lists once, not on every loop iteration.
noscotus_avgs = nz_deg_trend_all_noscotus['nz_avg_outdegree'].tolist()
scotus_avgs = nz_deg_trend_scotus['nz_avg_outdegree'].tolist()

# The name `degree_rato` is kept as-is because the plotting cell reads it.
degree_rato = []
for d_noscotus, d_scotus in zip(noscotus_avgs, scotus_avgs):
    if d_noscotus == 0:
        degree_rato.append(0)
    else:
        # Adding 0.0 forces true division on Python 2.
        degree_rato.append(d_scotus / (d_noscotus + 0.0))
In [88]:
# Yearly SCOTUS / non-SCOTUS out-degree ratio, with a reference line at 1.
fig, ax = plt.subplots(figsize=[8, 8])
ax.scatter(year_range, degree_rato, marker='.', color='black')
ax.axhline(1, color='red')
ax.set_ylim([0, 5])
ax.set_xlim([1800, 2016])
ax.set_xlabel('year')
ax.set_ylabel('avg out-deg ratio')
ax.set_title('out-degree ratio scotus/all-scotus for zero truncated network')
Out[88]:
In [ ]: