In [118]:
import sys
sys.path.append('../../code/')
import os
import json
from datetime import datetime
import time
from math import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import igraph as ig
from load_data import load_citation_network, case_info
from helper_functions import *
%load_ext autoreload
%autoreload 2
%matplotlib inline
# root of the project's data files, relative to this notebook
data_dir = '../../data/'
# network to load: 'all' reads the master files, any other value reads
# the files stored under that court's own sub-directory
court_name = 'all'
In [2]:
# Load the case metadata and the citation edgelist for the selected network.
if court_name == 'all':
    case_metadata = pd.read_csv(data_dir + 'clean/case_metadata_master.csv')
    edgelist = pd.read_csv(data_dir + 'clean/edgelist_master.csv')
else:
    net_dir = data_dir + 'clean/' + court_name + '/'
    # NOTE(review): assumes the court subnetwork only has to be generated
    # when its directory does not exist yet -- confirm.
    if not os.path.exists(net_dir):
        os.makedirs(net_dir)
        make_court_subnetwork(court_name, data_dir)
    case_metadata = pd.read_csv(net_dir + 'case_metadata.csv')
    edgelist = pd.read_csv(net_dir + 'edgelist.csv')

# Drop the stray index column left behind by DataFrame.to_csv, if there is
# one, so that the first two columns are the citing and the cited case ids.
if 'Unnamed: 0' in edgelist.columns:
    edgelist = edgelist.drop('Unnamed: 0', axis=1)

# create a dictionary that maps court listener ids to igraph ids
# (the igraph id of a case is its row position in case_metadata)
cl_ids = case_metadata['id'].tolist()
cl_to_ig_id = {cl_id: ig_id for ig_id, cl_id in enumerate(cl_ids)}

# add nodes
V = case_metadata.shape[0]
g = ig.Graph(n=V, directed=True)
# g.vs['date'] = case_metadata['date'].tolist()
g.vs['name'] = cl_ids

# create igraph edgelist
cases_w_metadata = set(cl_to_ig_id.keys())
ig_edgelist = []
missing_cases = 0  # edges skipped because an endpoint has no metadata
start = time.time()
for row in edgelist.itertuples():
    # row[0] is the DataFrame index; the data columns start at row[1]
    cl_ing = row[1]
    cl_ed = row[2]

    if (cl_ing in cases_w_metadata) and (cl_ed in cases_w_metadata):
        ig_edgelist.append((cl_to_ig_id[cl_ing], cl_to_ig_id[cl_ed]))
    else:
        # Only edges whose two endpoints are known are added. Previously
        # the append ran for these rows too, re-adding the previous edge
        # (or failing on the first row), and the counter never moved.
        missing_cases += 1

# add edges to graph
g.add_edges(ig_edgelist)
end = time.time()
print('%d seconds for %d edges' % (end - start, len(g.es)))
In [ ]:
In [3]:
# attach the court and the decision year of every case to its vertex
g.vs['court'] = case_metadata['court'].tolist()
case_dates = case_metadata['date'].tolist()
# the year is the text in front of the first '-' of each date string
g.vs['year'] = [int(date.split('-', 1)[0]) for date in case_dates]
In [4]:
# g.write_graphml(data_dir + 'clean/entire_law_net.graphml')
# G = ig.read_graphml(data_dir + 'clean/entire_law_net.graphml')
In [5]:
# show igraph's summary of the citation graph as the cell output
g.summary()
Out[5]:
In [164]:
def compute_measure(x, measure, alpha=.1):
    """
    Summarise a sequence of numbers with the requested statistic.

    Parameters
    ----------
    x: sequence of numbers (the callers in this notebook pass lists of
        vertex degrees). It is expected to be non-empty.

    measure: name of the statistic, one of
        'mean'              - np.mean(x)
        'median'            - np.median(x)
        'upper_trimed_mean' - upper_trimed_mean(x, alpha)
        'max'               - largest value of x
        'n1'                - number of entries equal to 1
        'prop1'             - proportion of entries equal to 1

    alpha: forwarded to upper_trimed_mean; ignored by every other measure

    Returns
    -------
    the value of the requested statistic

    Raises
    ------
    ValueError: if measure is not one of the names listed above
    """
    if measure == 'mean':
        return np.mean(x)
    elif measure == 'median':
        return np.median(x)
    elif measure == 'upper_trimed_mean':
        return upper_trimed_mean(x, alpha)
    elif measure == 'max':
        return max(x)
    elif measure in ('n1', 'prop1'):
        n_ones = sum(1 for v in x if v == 1)
        if measure == 'n1':
            return n_ones
        # float() keeps this a true division under Python 2 as well
        return float(n_ones) / len(x)

    # same wording as the check at the top of get_degree_trend
    raise ValueError('improper measure')
In [165]:
def get_degree_trend(g, court='all', include_zero_outdegree=True,
                     measure='mean', alpha=.1):
    """
    Compute, for every year from 1631 to 2016, the number of cases and a
    summary statistic of their in- and out-degrees.

    Parameters
    ----------
    g: graph whose vertices carry a 'year' attribute (and a 'court'
        attribute when a court filter is used) and which provides
        indegree() and outdegree()

    court: 'all' to use every vertex, a single court identifier, or a
        list/tuple/set of court identifiers to keep

    include_zero_outdegree: if False, vertices with out-degree 0 are
        left out of both the counts and the degree statistics

    measure: statistic passed to compute_measure, one of 'mean', 'median',
        'upper_trimed_mean', 'max', 'n1', 'prop1'

    alpha: passed through to compute_measure

    Returns
    -------
    pandas DataFrame indexed by year with the columns 'count', 'indegree'
    and 'outdegree'. Years without any selected case get 0 in all three
    columns. The columns are numeric.

    Raises
    ------
    ValueError: if measure is not one of the supported names
    KeyError: if a selected vertex has a year outside 1631-2016
    """
    if measure not in ['mean', 'median', 'upper_trimed_mean', 'max', 'n1', 'prop1']:
        raise ValueError('improper measure')

    # normalise the court filter: None means "no filter", otherwise a set
    # so that the membership test in the loop is constant time
    if isinstance(court, (list, tuple, set, frozenset)):
        courts = set(court)
    elif court == 'all':
        courts = None
    else:
        courts = set([court])

    years = list(range(1631, 2016 + 1))

    indegrees = g.indegree()
    outdegrees = g.outdegree()
    # read each attribute once as a plain list rather than per vertex
    vertex_years = g.vs['year']
    vertex_courts = g.vs['court'] if courts is not None else None

    indegs_by_year = {y: [] for y in years}
    outdegs_by_year = {y: [] for y in years}

    # collect the degrees of the selected vertices for each year
    for i, outdeg in enumerate(outdegrees):
        if courts is not None and vertex_courts[i] not in courts:
            continue
        if not include_zero_outdegree and outdeg <= 0:
            continue

        year = vertex_years[i]
        indegs_by_year[year].append(indegrees[i])
        outdegs_by_year[year].append(outdeg)

    def summarise(values):
        # empty years are reported as 0 instead of calling the statistic
        if len(values) == 0:
            return 0
        # forward the caller's alpha (it used to be hard-coded to .1)
        return compute_measure(values, measure, alpha=alpha)

    # The columns are built in index order explicitly; relying on the
    # iteration order of a dict (or on a dict view) is not safe.
    trend = pd.DataFrame(index=years, columns=['count', 'indegree', 'outdegree'])
    trend['count'] = [len(indegs_by_year[y]) for y in years]
    trend['indegree'] = [summarise(indegs_by_year[y]) for y in years]
    trend['outdegree'] = [summarise(outdegs_by_year[y]) for y in years]

    return trend
In [24]:
def plot_single_trend(trend, court, measure, start_year=1850):
    """
    Draw three side-by-side scatter plots of a yearly trend table: the
    case count, the out-degree statistic and the in-degree statistic.

    Parameters
    ----------
    trend: DataFrame indexed by year with the columns 'count',
        'outdegree' and 'indegree'

    court: label of the network, used in the panel titles

    measure: name of the degree statistic, used in labels and titles

    start_year: left limit of the x axis (the right limit is 2016)
    """
    # one entry per panel: (column, y-axis label, title)
    panels = [
        ('count',
         'number of cases',
         'case count %s' % court),
        ('outdegree',
         '%s out-degree' % measure,
         '%s out degree of %s' % (measure, court)),
        ('indegree',
         '%s in-degree' % measure,
         '%s in degree of %s' % (measure, court)),
    ]

    plt.figure(figsize=[24, 8])

    for position, (column, y_label, title) in enumerate(panels, 1):
        values = trend[column]

        plt.subplot(1, 3, position)
        plt.scatter(trend.index, values, marker='.', color='black')
        # y axis runs from 0 up to the largest value in the column
        plt.ylim(0, max(values))
        plt.xlim([start_year, 2016])
        plt.xlabel('year')
        plt.ylabel(y_label)
        plt.title(title)
In [25]:
# trend_all is not created by any other cell of this notebook, so on a
# fresh kernel the plot call below would fail with a NameError. Build it
# here with the same measure that is passed to the plot.
# NOTE(review): the settings originally used for trend_all are not visible;
# include_zero_outdegree is left at its default -- confirm.
trend_all = get_degree_trend(g, court='all', measure='median')
plot_single_trend(trend_all, court='all', measure='median', start_year=1850)
In [126]:
# statistic used by the trend cells that follow; must be one of the names
# accepted by compute_measure
measure = 'upper_trimed_mean'
In [132]:
# degree trend restricted to the 'scotus' court, leaving out cases with
# out-degree zero; the wall-clock time of the call is reported
start = time.time()
trend_scotus = get_degree_trend(g, court='scotus', measure=measure,
                                include_zero_outdegree=False)
end = time.time()
print('get_degree_trend took %d seconds' % (end - start))
In [133]:
# degree trend restricted to the 'ca1' court, leaving out cases with
# out-degree zero; the wall-clock time of the call is reported
start = time.time()
trend_ca1 = get_degree_trend(g, court='ca1', measure=measure,
                             include_zero_outdegree=False)
end = time.time()
print('get_degree_trend took %d seconds' % (end - start))
In [130]:
def plot_compare_trends(A, netA, B, netB, start_year, measure):
    """
    Compare the yearly trends of two networks in two side-by-side scatter
    plots: case counts on the left, the out-degree statistic on the right.

    Parameters
    ----------
    A, B: DataFrames indexed by year with the columns 'count' and
        'outdegree'; A is drawn in red, B in blue

    netA, netB: labels of the two networks, used in legends and titles

    start_year: left limit of the x axis (the right limit is 2016)

    measure: name of the degree statistic, used in the label and title of
        the out-degree panel
    """
    plt.figure(figsize=[20, 10])

    # case count
    plt.subplot(1, 2, 1)
    plt.scatter(A.index,
                A['count'],
                marker='.',
                color='red',
                label=netA)
    plt.scatter(B.index,
                B['count'],
                marker='.',
                color='blue',
                label=netB)
    # y axis runs from 0 up to the larger of the two maxima
    plt.ylim(0, max(A['count'].max(), B['count'].max()))
    plt.xlim([start_year, 2016])
    plt.xlabel('year')
    plt.ylabel('number of cases')
    plt.title('case count of %s vs. %s' % (netA, netB))
    # 'upper right' is the valid matplotlib location name ('top right' is
    # not), and it matches the legend of the out-degree panel
    plt.legend(loc='upper right')

    # out-degree
    plt.subplot(1, 2, 2)
    plt.scatter(A.index,
                A['outdegree'],
                marker='.',
                color='red',
                label=netA)
    plt.scatter(B.index,
                B['outdegree'],
                marker='.',
                color='blue',
                label=netB)
    plt.ylim(0, max(A['outdegree'].max(), B['outdegree'].max()))
    plt.xlim([start_year, 2016])
    plt.xlabel('year')
    plt.ylabel('%s out-degree' % measure)
    plt.title('%s out-degree of %s vs. %s' % (measure, netA, netB))
    plt.legend(loc='upper right')
In [134]:
# compare the 'scotus' and 'ca1' trends from 1850 onwards
plot_compare_trends(trend_scotus, 'scotus',
trend_ca1, 'ca1',
start_year=1850,
measure=measure)
In [180]:
# statistic for the trend cells that follow, plus the alpha value that is
# handed to get_degree_trend together with it
measure = 'upper_trimed_mean'
# measure = 'prop1'
alpha = .1
In [181]:
# court identifiers 'ca1' through 'ca11', followed by 'cafc' and 'cadc'
fed_appellate = ['ca%d' % circuit for circuit in range(1, 12)] + ['cafc', 'cadc']
In [182]:
# degree trend restricted to the 'scotus' court with the measure and alpha
# chosen above, leaving out cases with out-degree zero; the wall-clock time
# of the call is reported
start = time.time()
trend_scotus = get_degree_trend(g, court='scotus', measure=measure,
                                alpha=alpha, include_zero_outdegree=False)
end = time.time()
print('get_degree_trend took %d seconds' % (end - start))
In [183]:
# degree trend over all courts listed in fed_appellate with the measure and
# alpha chosen above, leaving out cases with out-degree zero; the
# wall-clock time of the call is reported
start = time.time()
trend_fed_appellate = get_degree_trend(g, court=fed_appellate,
                                       measure=measure, alpha=alpha,
                                       include_zero_outdegree=False)
end = time.time()
print('get_degree_trend took %d seconds' % (end - start))
In [184]:
# compare the 'scotus' trend with the combined fed_appellate trend from
# 1850 onwards
plot_compare_trends(trend_scotus, 'scotus',
trend_fed_appellate, 'federal appellate',
start_year=1850,
measure=measure)
In [ ]:
# def plot_trend_ratio(A, netA, B, netB, measure):
# ratio = A['outdegree']
# years = A
# plt.figure(figsize= [20, 10])
# # case count
# plt.subplot(1,2,1)
# plt.scatter(years,
# ratio,
# marker='.',
# color='red',
# label=netA)
# plt.ylim(0, max(A['count'].max(), B['count'].max()))
# plt.xlim([start_year, 2016])
# plt.xlabel('year')
# plt.ylabel('number of cases')
# plt.title('case count of %s vs. %s' % (netA, netB))
# plt.legend(loc='top right')
In [ ]:
# load the jurisdictions table from the clean data directory
jurisdictions = pd.read_csv(data_dir + 'clean/jurisdictions.csv')
In [99]:
# order the jurisdictions by their 'count' column, largest first, then hand
# the 'name' and 'count' columns to print_full
jurisdictions = jurisdictions.sort_values(by='count', ascending=False)
print_full(jurisdictions, ['name', 'count'])
In [ ]:
In [ ]: