In [1]:
import sys
sys.path.append('../../../code/')
import os
import json
from datetime import datetime
import time
from math import *
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import igraph as ig
from load_data import load_citation_network, case_info
%load_ext autoreload
%autoreload 2
%matplotlib inline
data_dir = '../../data/'
court_name = 'scotus'
In [2]:
start = time.time()
if court_name == 'all':
case_metadata = pd.read_csv(data_dir + 'clean/case_metadata_master.csv')
edgelist = pd.read_csv(data_dir + 'clean/edgelist_master.csv')
else:
net_dir = data_dir + 'clean/' + court_name + '/'
if not os.path.exists(net_dir):
os.makedirs(net_dir)
make_court_subnetwork(court_name, data_dir)
case_metadata = pd.read_csv(net_dir + 'case_metadata.csv')
edgelist = pd.read_csv(net_dir + 'edgelist.csv')
edgelist.drop('Unnamed: 0', inplace=True, axis=1)
# create a dictonary that maps court listener ids to igraph ids
cl_to_ig_id = {}
cl_ids = case_metadata['id'].tolist()
for i in range(case_metadata['id'].size):
cl_to_ig_id[cl_ids[i]] = i
# add nodes
V = case_metadata.shape[0]
g = ig.Graph(n=V, directed=True)
# g.vs['date'] = case_metadata['date'].tolist()
g.vs['name'] = case_metadata['id'].tolist()
# create igraph edgelist
cases_w_metadata = set(cl_to_ig_id.keys())
ig_edgelist = []
missing_cases = 0
start = time.time()
for row in edgelist.itertuples():
cl_ing = row[1]
cl_ed = row[2]
if (cl_ing in cases_w_metadata) and (cl_ed in cases_w_metadata):
ing = cl_to_ig_id[cl_ing]
ed = cl_to_ig_id[cl_ed]
else:
missing_cases += 0
ig_edgelist.append((ing, ed))
# add edges to graph
g.add_edges(ig_edgelist)
# add vertex attributes
g.vs['court'] = case_metadata['court'].tolist()
g.vs['year'] = [int(d.split('-')[0]) for d in case_metadata['date'].tolist()]
end = time.time()
print '%d seconds for %d edges' % (end - start, len(g.es))
In [3]:
citation_ages = []
source_ages = []
for e in g.es:
source = g.vs[e.source]
target = g.vs[e.target]
source_year = source['year']
target_year = target['year']
edge_age = source_year - target_year
if edge_age > 0:
citation_ages.append(edge_age)
source_ages.append(source_year)
In [4]:
plt.figure(figsize=[20, 10])
plt.subplot(1,2,1)
plt.hist(citation_ages);
plt.xlim([0, max(citation_ages)])
plt.xlabel('citation age')
plt.ylabel('count')
plt.subplot(1,2,2)
plt.loglog(sorted(citation_ages, reverse=True), '-', marker='.', color='black',
alpha=.7);
plt.ylabel('log age')
plt.xlabel('log count')
Out[4]:
In [5]:
plt.scatter(source_ages, citation_ages)
plt.xlabel('source age')
plt.ylabel('citation age')
plt.xlim([min(source_ages), max(source_ages)])
plt.ylim([0, max(citation_ages)])
Out[5]:
In [19]:
bins = range(1810, 2020 + 1, 10)
bin_means = stats.binned_statistic(source_ages, citation_ages, statistic='mean', bins=bins).statistic
plt.scatter(bins[:-1], bin_means)
plt.xlim([min(bins), max(bins)])
plt.ylim([0, max(bin_means)])
plt.xlabel('decade')
plt.ylabel('decade mean')
Out[19]:
In [6]:
indegrees = g.indegree()
plt.scatter(g.vs['year'], indegrees,
marker='.',
color='black')
plt.xlabel('node age')
plt.ylabel('in degree')
plt.xlim([min(g.vs['year']), max(g.vs['year'])])
# plt.ylim([0, 200])
Out[6]:
In [ ]: