In [1]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'
import numpy as np
import sys
import matplotlib.pyplot as plt
from scipy.stats import rankdata
import cPickle as pickle
from collections import Counter
import pandas as pd
# graph package
import igraph as ig
# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info
from viz import print_describe
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from custom_vertex_metrics import *
from results import *
# which network to download data for
network_name = 'federal' # 'federal', 'ca1', etc
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')
In [23]:
jurisdictions = set(G.vs['court'])
plt.figure(figsize=[20, 20])
k = 0
for court in jurisdictions:
    k += 1
    plt.subplot(4, 4, k)

    cases = G.vs.select(court_eq=court)

    # count cases per year and plot them in year order
    year_counts = Counter(cases['year'])
    years_sorted = sorted(year_counts.keys())
    plt.plot(years_sorted,
             [year_counts[y] for y in years_sorted])

    plt.xlabel('year')
    plt.xlim([1750, 2016])
    plt.ylabel('counts')
    plt.title(court)
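The 4 x 4 grid above assumes the federal network has at most 16 jurisdictions; a quick sanity check with nothing repo-specific:

print 'number of jurisdictions: %d' % len(jurisdictions)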
In [25]:
H_seq = [1, 2, 5, 10, 20]
n_H = len(H_seq)

plt.figure(figsize=[8, 4 * n_H])
k = 0
for h in H_seq:
    k += 1

    cr = get_CiteRank(G, h)

    years = G.vs['year']
    cr_year_mean = get_year_aggregate(years, cr, np.mean)
    cr_year_median = get_year_aggregate(years, cr, np.median)

    # mean CiteRank by year
    plt.subplot(n_H, 2, 2 * k - 1)
    plt.plot(cr_year_mean.index, cr_year_mean)
    plt.ylabel('CiteRank_%d mean' % h)
    plt.xlabel('year')

    # median CiteRank by year
    plt.subplot(n_H, 2, 2 * k)
    plt.plot(cr_year_median.index, cr_year_median)
    plt.ylabel('CiteRank_%d median' % h)
    plt.xlabel('year')

    corr = np.corrcoef(cr, years)[0, 1]
    print "h: %d, corr: %f" % (h, corr)
In [72]:
outdegree = G.outdegree()
In [77]:
print 'mean outdegree: %f' % np.mean(outdegree)
print 'median outdegree: %f' % np.median(outdegree)
In [78]:
# citation age: year of the citing case (source) minus year of the cited case (target)
diffs = [G.vs[e[0]]['year'] - G.vs[e[1]]['year'] for e in G.get_edgelist()]
In [86]:
print_describe(diffs)
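print_describe is a small helper from the repo's viz module; presumably it prints summary statistics of a vector. A stand-in with the same effect, using only pandas (an assumption about its behavior, not its actual code):

def print_describe_sketch(values):
    # count, mean, std, min, quartiles, max
    print pd.Series(values).describe()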
In [87]:
bins = np.linspace(-40, 300, 100)
plt.hist(diffs, bins=bins)
plt.xlim(-40, 300)
plt.xlabel('citation age')
In [21]:
scotus_cases = G.vs.select(court_eq='scotus')
scotus_indegree = scotus_cases.indegree()
scotus_outdegree = scotus_cases.outdegree()
scotus_years = scotus_cases['year']
In [22]:
out_by_year = get_year_aggregate(scotus_years, scotus_outdegree, np.median)
in_by_year = get_year_aggregate(scotus_years, scotus_indegree, np.median)
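get_year_aggregate is another repo helper. From the way its result is used here (an .index attribute plotted against the values), it evidently returns a pandas Series indexed by year; a minimal stand-in under that assumption:

def year_aggregate_sketch(years, values, agg_fun):
    # group the values by year and apply the aggregation function
    return pd.Series(values, index=years).groupby(level=0).agg(agg_fun)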
In [25]:
plt.scatter(in_by_year.index, in_by_year,
label='indegree', color='red')
plt.scatter(out_by_year.index, out_by_year,
label='outdegree', color='blue')
plt.ylabel('median')
plt.xlabel('year')
plt.legend(loc='upper right')
plt.xlim([1800, 2017])
plt.ylim([0, 200])
plt.title('SCOTUS cases in federal network')
In [4]:
snap_dir = subnet_dir + 'snapshots/'
year = 2015
S = pd.read_csv(snap_dir + 'vertex_metrics_%d.csv' % year, index_col=0)
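The snapshot file is produced elsewhere in the experiment pipeline; a quick look at its shape and columns (standard pandas, the column names themselves depend on the pipeline):

print S.shape
print S.columns.tolist()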
In [9]:
h = 5
cr = get_CiteRank(G, h)
In [10]:
top_cr = pd.Series(cr, index=G.vs['name'])
In [16]:
for op_id in top_cr.sort_values(ascending=False)[:10].index:
    case_info(op_id)
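The CiteRank values for the same top ten opinions can also be printed directly, without the repo's case_info helper:

print top_cr.sort_values(ascending=False)[:10]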
In [ ]:
diffs = [G.vs[e[0]]['year'] - G.vs[e[1]]['year'] for e in G.get_edgelist()]
# edges where the cited case is *newer* than the citing case
backward = np.where(np.array(diffs) < 0)
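Before inspecting individual edges, it helps to see how many edges point backward in time at all (G.ecount() is the standard igraph edge count):

n_backward = len(backward[0])
print 'backward edges: %d of %d (%.4f%%)' % (n_backward, G.ecount(),
                                             100.0 * n_backward / G.ecount())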
In [71]:
i = 10
bad_edge = G.es[backward[0][i]]
source = G.vs[bad_edge.source]
target = G.vs[bad_edge.target]
print source
case_info(source['name'])
print
print target
case_info(target['name'])