In [2]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/data/courtlistener/'
import numpy as np
import sys
import matplotlib.pyplot as plt
from collections import Counter
# graph package
import igraph as ig
# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info
from stats.viz import *
from stats.dim_reduction import *
from stats.linear_model import *
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from make_tr_edge_df import *
# which network to download data for
network_name = 'federal' # 'federal', 'ca1', etc
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [3]:
# Load the citation network (vertices = cases, edges = citations) from GraphML.
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')
# display vertex/edge counts and attributes as a sanity check
G.summary()
Out[3]:
In [4]:
# PageRank on the directed graph and on its undirected projection
%time d_pagerank = G.pagerank()
%time u_pagerank = G.as_undirected().pagerank()
In [29]:
# Directed betweenness centrality — long-running on a network this size
# (the undirected version was left disabled for the same reason).
%time d_betweenness = G.betweenness(directed=True)
# %time u_betweenness = G.as_undirected().betweenness(directed=False)
In [ ]:
# NOTE: directed betweenness is a long-running computation (it was started at 6:35 pm and left in progress here)
In [ ]:
In [ ]:
In [212]:
# Closeness centrality; mode="IN" computes closeness over incoming edges
# (i.e. based on paths of citations a case receives).
%time d_closeness = G.closeness(mode="IN", normalized=True)
%time u_closeness = G.as_undirected().closeness(normalized=True)
In [211]:
# Eigenvector centrality on the directed and undirected versions of the graph.
%time d_eigen = G.eigenvector_centrality()
%time u_eigen = G.as_undirected().eigenvector_centrality()
In [5]:
# HITS: hub scores (cases that cite many authorities) and authority scores
# (cases cited by many hubs).
%time hubs = G.hub_score()
%time authorities = G.authority_score()
In [6]:
# Degree metrics: citations received (in), citations made (out), and total (all).
indegree = G.degree(mode="in")
outdegree = G.degree(mode="out")
degree = G.degree(mode="all")
In [7]:
# Assemble the node-level metrics into one DataFrame indexed by case name.
# NOTE: betweenness / closeness / eigenvector columns are omitted because those
# metrics are too slow to compute on the full network (see the cells above).
df = pd.DataFrame(index=G.vs['name'])
df['year'] = G.vs['year']
df['indegree'] = indegree
df['outdegree'] = outdegree
df['degree'] = degree
df['d_pagerank'] = d_pagerank
df['u_pagerank'] = u_pagerank
df['hubs'] = hubs
df['authorities'] = authorities

# FIX: all_metrics is used later (PCA cell) but was only defined inside
# commented-out code, which caused a NameError on a fresh run. Keep this list
# in sync with the metric columns added above.
all_metrics = ['indegree', 'outdegree', 'degree',
               'd_pagerank', 'u_pagerank',
               'hubs', 'authorities']
SCOTUS issue areas are grouped into three coarse types: Procedural, Substantive, and Other.
In [3]:
# Map each coarse case type to its SCDB issue-area codes (0-14).
type_to_issue = {'procedural': [1, 4, 6, 9],
                 'substantive': [2, 3, 5, 7, 8, 12, 14],
                 'other': [10, 11, 13, 0]}

# Invert the mapping: issue-area code -> type name.
issue_to_type = {issue: case_type
                 for case_type, issues in type_to_issue.items()
                 for issue in issues}
In [4]:
# create type attribute from the issue area
# issueArea is NaN for cases without an SCDB record — presumably these are
# non-SCOTUS or unmatched cases; they are mapped to code 0, which falls in
# the 'other' bucket — TODO confirm against the SCDB join.
G.vs['issueArea'] = [0 if np.isnan(i) else int(i) for i in G.vs['issueArea']]
G.vs['type'] = [issue_to_type[i] for i in G.vs['issueArea']]
In [17]:
# add the issue-area labels to the data frame
df['issueArea'] = G.vs['issueArea']
df['type'] = G.vs['type']

# get one subset per case type
df_sub = df[df['type'] == 'substantive']
df_pro = df[df['type'] == 'procedural']
df_oth = df[df['type'] == 'other']

# parenthesized print produces identical output under Python 2 and also
# works under Python 3
print('num substantive: %d' % df_sub.shape[0])
print('num procedural: %d' % df_pro.shape[0])
print('num other: %d' % df_oth.shape[0])
In [221]:
df.to_csv(subnet_dir + 'issue_area/metrics.csv', index=True)
In [184]:
df.columns
Out[184]:
In [218]:
metric = 'authorities'

# shared bins so the three type distributions are directly comparable
bins = np.linspace(min(df[metric]), max(df[metric]), 100)

# one overlaid histogram per case type; a single loop replaces three
# copy-pasted plt.hist calls (same draw order: substantive, procedural, other)
# NOTE(review): the histograms are opaque, so later draws hide earlier bars —
# consider alpha < 1 or histtype='step'.
for frame, color, type_name in [(df_sub, 'red', 'substantive'),
                                (df_pro, 'blue', 'procedural'),
                                (df_oth, 'green', 'other')]:
    plt.hist(frame[metric],
             bins=bins,
             color=color,
             label='%s (mean: %1.5f)' % (type_name, np.mean(frame[metric])))

plt.xlim([0, .2])
plt.ylim([0, 2000])
plt.xlabel(metric)
plt.legend(loc='upper right')
Out[218]:
In [220]:
# Compare the type proportions among the top T cases (ranked by the current
# metric) with the type proportions over all cases.
T = 100
ranked = df.sort_values(by=metric, ascending=False)
top_cases = ranked.head(T)['type']
top_breakdown = top_cases.value_counts(normalize=True)

all_breakdown = df['type'].value_counts(normalize=True)

# positive entries = types over-represented among the top T cases
diff = top_breakdown - all_breakdown
diff
Out[220]:
In [25]:
metric = 'indegree'

# restrict to procedural vs. substantive cases only
df_pro_sub = df[df['type'] != 'other']

T = 100

# observed proportion of the top T cases (by the metric) that are substantive
ranked_types = df_pro_sub.sort_values(by=metric, ascending=False)['type']
obs_top_breakdown = ranked_types.head(T).value_counts(normalize=True)
obs_prop_sub = obs_top_breakdown['substantive']
In [26]:
# Permutation test: is the observed share of substantive cases among the top T
# larger than what a random draw of T cases would give?
# NOTE(review): no RNG seed is set, so the p-value varies slightly run to run.
R = 1000
perm_prop_sub = np.zeros(R)
for r in range(R):
    # randomly select T cases without replacement
    perm_indices = np.random.choice(df_pro_sub.shape[0], replace=False, size=T)

    # type breakdown of the T sampled cases
    perm_breakdown = df_pro_sub.iloc[perm_indices]['type'].\
        value_counts(normalize=True)

    # proportion of the T cases that are substantive; .get guards against a
    # draw containing no substantive cases (the original indexing raised
    # KeyError in that case)
    perm_prop_sub[r] = perm_breakdown.get('substantive', 0.0)

# one-sided p-value: fraction of permutations at least as extreme as observed
# (equivalent to the original 1 - mean(perm < obs))
pval = np.mean(perm_prop_sub >= obs_prop_sub)
In [27]:
# Visualize the permutation distribution against the observed proportion.
plt.title('permutation test substantive vs. procedural (pval: %1.3f)' % pval)
plt.hist(perm_prop_sub, color='blue', label='permutation')
plt.axvline(obs_prop_sub, color='red', label='obs')
plt.xlabel(metric)
Out[27]:
In [ ]:
In [232]:
# PCA on the metric columns of procedural + substantive cases
# (get_PCA comes from stats.dim_reduction; scale=True presumably standardizes
# the columns first — TODO confirm against its implementation)
df_pro_sub = df[df['type'] != 'other']
U, D, V = get_PCA(df_pro_sub[all_metrics], scale=True)
In [238]:
# Scores plot of PCA components starting at component 6, colored by case type
# (plot_2class_scores comes from the project's stats code).
plot_2class_scores(U,
                   classes = df_pro_sub['type'],
                   start=6,
                   n_comp=5)
In [7]:
# get federal-only and scotus cases (the vertex 'court' attribute holds the
# jurisdiction; 'scotus' marks Supreme Court cases)
fed_cases = G.vs.select(court_ne='scotus')
scotus_cases = G.vs.select(court_eq='scotus')
# fed_jurisdictions = set(fed_cases['court'])

# edges with one endpoint federal and the other scotus (either direction)
fed_scotus = G.es.select(_between=(fed_cases, scotus_cases))

# split by direction: the source vertex's court identifies who is citing
scotus_to_fed = [e for e in fed_scotus if G.vs[e.source]['court']=='scotus']
fed_to_scotus = [e for e in fed_scotus if G.vs[e.target]['court']=='scotus']

# citations within each group
fed_to_fed = G.es.select(_between=(fed_cases, fed_cases))
scotus_to_scotus = G.es.select(_between=(scotus_cases, scotus_cases))
In [ ]:
# Fraction of all federal citations that point to scotus cases.
n_fed_citations = len(fed_to_fed) + len(fed_to_scotus)
n_fed_to_scotus = len(fed_to_scotus)
# float() forces true division under Python 2 (same as the + 0.0 trick)
prop_fed_to_scotus = float(n_fed_to_scotus) / n_fed_citations
In [ ]:
print "scotus makes up %1.2f percent of cases in federal + scotus" % (100 *(len(scotus_cases) + 0.0) / len(G.vs))
print "%1.2f percent of federal citations go to scotus cases" % (100 * prop_fed_to_scotus)
In [22]:
# distribution of issue types among scotus cases (proportions sum to 1)
scotus_types = pd.Series(Counter(scotus_cases['type'])) / len(scotus_cases)
scotus_types
Out[22]:
In [35]:
# Count fed->scotus citations by the type of the cited scotus case.
# A single Counter pass replaces three full scans of the edge list.
f2s_counts = Counter(G.vs[e.target]['type'] for e in fed_to_scotus)
f2s_oth = f2s_counts['other']
f2s_pro = f2s_counts['procedural']
f2s_sub = f2s_counts['substantive']
In [39]:
fed_cite_types = pd.Series([f2s_oth, f2s_pro, f2s_sub], index=['other', 'procedural', 'substantive']) / len(fed_to_scotus)
In [41]:
fed_cite_types
Out[41]:
In [ ]:
In [4]:
# get federal-only, DC circuit (cadc), and scotus cases
fed_cases = G.vs.select(court_ne='scotus')
dc_cases = G.vs.select(court_eq='cadc')
scotus_cases = G.vs.select(court_eq='scotus')
# fed_jurisdictions = set(fed_cases['court'])

# all citations made by scotus cases (source court == scotus)
scotus_citations = [e for e in G.es if G.vs[e.source]['court']=='scotus']

# edges between scotus and the DC circuit, then restricted to the
# scotus -> DC direction
dc_scotus = G.es.select(_between=(dc_cases, scotus_cases))
scotus_to_dc = [e for e in dc_scotus if G.vs[e.source]['court']=='scotus']
In [7]:
# What fraction of scotus citations go to the DC circuit?
n_scotus_citations = len(scotus_citations)
n_scotus_to_dc = len(scotus_to_dc)
# float() forces true division under Python 2 (same as the + 0.0 trick)
prop_scotus_to_dc = float(n_scotus_to_dc) / n_scotus_citations
In [9]:
Out[9]:
In [14]:
# FIX: typos in the printed messages ('percentation' -> 'percentage',
# 'citaions' -> 'citations'); parenthesized print is Python 2/3 compatible
print('percentage of all cases that are DC: %1.2f' % (100 * (len(dc_cases) + 0.0) / len(G.vs)))
print('percentage of scotus citations that go to DC: %1.2f' % (100*prop_scotus_to_dc))
In [15]:
%%time
# timing demo cell (Python 2 print statement)
print 'this will be really fast'
In [ ]: