In [2]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

data_dir = '/Users/iaincarmichael/data/courtlistener/'

import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
from collections import Counter

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info

from stats.viz import *
from stats.dim_reduction import *
from stats.linear_model import *

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from make_tr_edge_df import *


# which network to download data for
network_name = 'federal' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name + '_network.graphml')
G.summary()


/Users/iaincarmichael/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: RuntimeWarning: Could not add vertex ids, there is already an 'id' vertex attribute at foreign-graphml.c:443
  if __name__ == '__main__':
Out[3]:
'IGRAPH DN-- 959985 6649916 -- \n+ attr: court (v), id (v), issueArea (v), name (v), num_words (v), year (v)'

compute metrics


In [4]:
%time d_pagerank = G.pagerank()

%time u_pagerank = G.as_undirected().pagerank()


CPU times: user 2.81 s, sys: 262 ms, total: 3.07 s
Wall time: 3.2 s
CPU times: user 27.3 s, sys: 2.04 s, total: 29.3 s
Wall time: 34.9 s

In [29]:
# note: directed betweenness is expensive on a network of this size
%time d_betweenness = G.betweenness(directed=True)

# %time u_betweenness = G.as_undirected().betweenness(directed=False)

In [212]:
%time d_closeness = G.closeness(mode="IN", normalized=True)

%time u_closeness = G.as_undirected().closeness(normalized=True)

In [211]:
%time d_eigen = G.eigenvector_centrality()

%time u_eigen = G.as_undirected().eigenvector_centrality()

In [5]:
%time hubs = G.hub_score()

%time authorities = G.authority_score()


CPU times: user 14.1 s, sys: 691 ms, total: 14.8 s
Wall time: 15.8 s
CPU times: user 14.6 s, sys: 218 ms, total: 14.8 s
Wall time: 15.7 s

In [6]:
indegree = G.indegree()

outdegree = G.outdegree()

degree = G.degree()

In [7]:
# full version of the data frame, commented out until the betweenness,
# closeness and eigenvector metrics above finish computing:

# df = pd.DataFrame(index=G.vs['name'])

# df['year'] = G.vs['year']

# df['indegree'] = indegree
# df['outdegree'] = outdegree
# df['degree'] = degree
# df['d_pagerank'] = d_pagerank
# df['u_pagerank'] = u_pagerank
# df['d_betweenness'] = d_betweenness
# df['u_betweenness'] = u_betweenness
# df['d_closeness'] = d_closeness
# df['u_closeness'] = u_closeness
# df['d_eigen'] = d_eigen
# df['u_eigen'] = u_eigen
# df['hubs'] = hubs
# df['authorities'] = authorities

# all_metrics = ['indegree', 'outdegree', 'degree',
#                 'd_pagerank', 'u_pagerank',
#                 'd_betweenness', 'u_betweenness',
#                 'd_closeness', 'u_closeness',
#                 'd_eigen', 'u_eigen',
#                 'hubs', 'authorities']


df = pd.DataFrame(index=G.vs['name'])

df['year'] = G.vs['year']

df['indegree'] = indegree
df['outdegree'] = outdegree
df['degree'] = degree
df['d_pagerank'] = d_pagerank
df['u_pagerank'] = u_pagerank
df['hubs'] = hubs
df['authorities'] = authorities

# metrics currently in the data frame (swap in the full list above once
# the remaining metrics are computed)
all_metrics = ['indegree', 'outdegree', 'degree',
               'd_pagerank', 'u_pagerank',
               'hubs', 'authorities']

issue area

Procedural

  • 1 Criminal Procedure
  • 4 Due Process
  • 6 Attorneys
  • 9 Judicial Power

Substantive

  • 2 Civil Rights
  • 3 First Amendment
  • 5 Privacy
  • 7 Unions
  • 8 Economic Activity
  • 12 Federal Taxation
  • 14 Private Action

Other

  • 0 Missing
  • 10 Federalism
  • 11 Interstate Relations
  • 13 Miscellaneous

hypothesis

  • betweenness/closeness favor procedural cases
  • eigenvector metrics (eigenvector centrality, hubs, authorities) favor substantive cases

In [3]:
# map types to issues
type_to_issue = {'procedural': [1, 4, 6, 9],
                 'substantive': [2, 3, 5, 7, 8, 12, 14],
                 'other': [10, 11, 13, 0]}

# map issues to type (issue codes run from 0 to 14)
issue_to_type = {i: '' for i in range(14 + 1)}
for t in type_to_issue.keys():
    for i in type_to_issue[t]:
        issue_to_type[i] = t

In [4]:
# create type
G.vs['issueArea'] = [0 if np.isnan(i) else int(i) for i in G.vs['issueArea']]
G.vs['type'] = [issue_to_type[i] for i in G.vs['issueArea']]

In [17]:
# add to data frame
df['issueArea'] = G.vs['issueArea']
df['type'] = G.vs['type']

# get type subsets
df_sub = df[df['type'] == 'substantive']
df_pro = df[df['type'] == 'procedural']
df_oth = df[df['type'] == 'other']

print 'num substantive: %d' % df_sub.shape[0]
print 'num procedural: %d' % df_pro.shape[0]
print 'num other: %d' % df_oth.shape[0]


num substantive: 16891
num procedural: 9733
num other: 933361

Note: nearly all of the 'other' cases are missing issueArea codes, since SCDB only labels SCOTUS cases; every non-SCOTUS federal case defaults to 0, hence 'other'.

In [221]:
df.to_csv(subnet_dir + 'issue_area/metrics.csv', index=True)

compare metric vs. issue type


In [184]:
df.columns


Out[184]:
Index([u'year', u'indegree', u'outdegree', u'degree', u'd_pagerank',
       u'u_pagerank', u'd_betweenness', u'u_betweenness', u'd_closeness',
       u'u_closeness', u'd_eigen', u'u_eigen', u'hubs', u'authorities',
       u'issueArea', u'type'],
      dtype='object')

In [218]:
metric = 'authorities'

bins = np.linspace(min(df[metric]), max(df[metric]), 100)

# substantive
plt.hist(df_sub[metric],
         bins=bins,
         color='red',
         label='substantive (mean: %1.5f)' % np.mean(df_sub[metric]))

# procedural
plt.hist(df_pro[metric],
         bins=bins,
         color='blue',
         label='procedural (mean: %1.5f)' % np.mean(df_pro[metric]))

# other
plt.hist(df_oth[metric],
         bins=bins,
         color='green',
         label='other (mean: %1.5f)' % np.mean(df_oth[metric]))

plt.xlim([0, .2])
plt.ylim([0, 2000])

plt.xlabel(metric)
plt.legend(loc='upper right')


Out[218]:
<matplotlib.legend.Legend at 0x134dd9f50>

In [220]:
# look at proportion of top cases of each type
T = 100

top_cases = df.sort_values(by=metric, ascending=False).iloc[0:T]['type']
top_breakdown = top_cases.value_counts(normalize=True)

# compare to proportion of all cases
all_breakdown = df['type'].value_counts(normalize=True)

diff = top_breakdown - all_breakdown

diff


Out[220]:
other               NaN
procedural    -0.169041
substantive    0.214262
Name: type, dtype: float64
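
The NaN for 'other' appears because no 'other' cases make the top T, so value_counts drops the key. Aligning the two breakdowns before differencing avoids this (a sketch using pandas reindex):

In [ ]:
# align the top-T breakdown with the full set of types so that
# missing types count as 0 instead of producing NaN
top_breakdown = top_cases.value_counts(normalize=True).\
                reindex(all_breakdown.index, fill_value=0)

diff = top_breakdown - all_breakdown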

permutation test

Rank cases by a given metric, then look at the proportion of the top T (=100) cases that are substantive.


In [25]:
metric = 'indegree'

df_pro_sub = df[df['type'] != 'other']

T = 100

# observed proportion of top cases that are substantive
obs_top_breakdown = df_pro_sub.\
                    sort_values(by=metric, ascending=False).\
                    iloc[0:T]['type'].\
                    value_counts(normalize=True)
            
obs_prop_sub = obs_top_breakdown['substantive']

In [26]:
R = 1000


perm_prop_sub = [0] * R
for r in range(R):
    
    # randomly select T cases
    perm_indices = np.random.choice(range(df_pro_sub.shape[0]), replace=False, size=T)
    
    # compute the type breakdown of the T cases
    perm_breakdown = df_pro_sub.\
                     iloc[perm_indices]['type'].\
                     value_counts(normalize=True)
    
    # proportion of T cases that are substantive
    perm_prop_sub[r] = perm_breakdown['substantive']
    
perm_prop_sub = np.array(perm_prop_sub)
pval = 1 - np.mean(perm_prop_sub < obs_prop_sub)
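
Since the null distribution here is just drawing T cases without replacement, the permutation p-value can be cross-checked exactly with the hypergeometric distribution (a sketch, assuming scipy is available):

In [ ]:
from scipy.stats import hypergeom

# population of procedural + substantive cases, and the number that
# are substantive
n_total = df_pro_sub.shape[0]
n_sub = (df_pro_sub['type'] == 'substantive').sum()

# observed count of substantive cases among the top T
k_obs = int(round(obs_prop_sub * T))

# P(at least k_obs substantive cases in a random draw of T)
pval_exact = hypergeom.sf(k_obs - 1, n_total, n_sub, T)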

In [27]:
plt.title('permutation test substantive vs. procedural (pval: %1.3f)' % pval)
plt.hist(perm_prop_sub,
         color='blue',
         label='permutation')

plt.axvline(obs_prop_sub,
            color='red',
            label='obs')

plt.xlabel(metric)


Out[27]:
<matplotlib.text.Text at 0x114564e90>

Results

hubs, authorities, u_eigen, d_eigen, d_betweenness, u_betweenness are significant (confirming the hypothesis)

TODO: recompute u_closeness
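
A minimal sketch for the TODO, reusing the closeness call from the cells above:

In [ ]:
# recompute undirected closeness and add it to the saved metrics
u_closeness = G.as_undirected().closeness(normalized=True)
df['u_closeness'] = u_closeness
df.to_csv(subnet_dir + 'issue_area/metrics.csv', index=True)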


PC plot


In [232]:
df_pro_sub = df[df['type'] != 'other']

U, D, V = get_PCA(df_pro_sub[all_metrics], scale=True)

In [238]:
plot_2class_scores(U,
                   classes=df_pro_sub['type'],
                   start=6,
                   n_comp=5)
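
(get_PCA and plot_2class_scores come from the repo's stats code. For reference, a rough equivalent of the decomposition step with sklearn, under the assumption that get_PCA is PCA on scaled data, would be:)

In [ ]:
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

# scale the metric columns, then project onto principal components;
# the rows of U here are the component scores for each case
X = scale(df_pro_sub[all_metrics].values)
U = PCA().fit_transform(X)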


Does federal favor SCOTUS


In [7]:
# get federal-only and scotus cases
fed_cases = G.vs.select(court_ne='scotus')
scotus_cases = G.vs.select(court_eq='scotus')

# fed_jurisdictions = set(fed_cases['court'])

# edges between scotus/fed
fed_scotus = G.es.select(_between=(fed_cases, scotus_cases))
scotus_to_fed = [e for e in fed_scotus if G.vs[e.source]['court']=='scotus']
fed_to_scotus = [e for e in fed_scotus if G.vs[e.target]['court']=='scotus']

fed_to_fed = G.es.select(_between=(fed_cases, fed_cases))
scotus_to_scotus = G.es.select(_between=(scotus_cases, scotus_cases))

print len(fed_cases)
print len(scotus_cases)


932100
27885

In [ ]:
n_fed_citations = len(fed_to_scotus) + len(fed_to_fed)
n_fed_to_scotus = len(fed_to_scotus)

prop_fed_to_scotus = (n_fed_to_scotus + 0.0) / n_fed_citations

In [ ]:
print "scotus makes up %1.2f percent of cases in federal + scotus" % (100 *(len(scotus_cases) + 0.0) /  len(G.vs))

print "%1.2f percent of federal citations go to scotus cases" % (100 * prop_fed_to_scotus)

Does federal favor SCOTUS procedural cases


In [22]:
scotus_types = pd.Series(Counter(scotus_cases['type'])) / len(scotus_cases)
scotus_types


Out[22]:
other          0.045221
procedural     0.349041
substantive    0.605738
dtype: float64

In [35]:
f2s_oth = len([e for e in fed_to_scotus if G.vs[e.target]['type'] == 'other'])
f2s_pro = len([e for e in fed_to_scotus if G.vs[e.target]['type'] == 'procedural'])
f2s_sub = len([e for e in fed_to_scotus if G.vs[e.target]['type'] == 'substantive'])

In [39]:
fed_cite_types = pd.Series([f2s_oth, f2s_pro, f2s_sub], index=['other', 'procedural', 'substantive']) / len(fed_to_scotus)

In [41]:
fed_cite_types


Out[41]:
other          0.03044
procedural     0.52480
substantive    0.44476
dtype: float64
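
Relative to SCOTUS's own composition, procedural cases are over-cited by the federal courts (about 52% of citations vs. 35% of cases) while substantive cases are under-cited. The per-type ratio makes this explicit:

In [ ]:
# citation share divided by baseline share for each type
# (> 1 means federal courts over-cite that type of scotus case)
fed_cite_types / scotus_types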

Does SCOTUS favor DC

TODO: look at SCOTUS citations leaving SCOTUS and see if DC is over-represented



In [4]:
# get federal-only, dc circuit, and scotus cases
fed_cases = G.vs.select(court_ne='scotus')
dc_cases = G.vs.select(court_eq='cadc')
scotus_cases = G.vs.select(court_eq='scotus')

# fed_jurisdictions = set(fed_cases['court'])

# all citations made by scotus cases
scotus_citations = [e for e in G.es if G.vs[e.source]['court']=='scotus']

# edges between dc and scotus; keep the citations made by scotus
dc_scotus = G.es.select(_between=(dc_cases, scotus_cases))
scotus_to_dc = [e for e in dc_scotus if G.vs[e.source]['court']=='scotus']

In [7]:
n_scotus_citations = len(scotus_citations)
n_scotus_to_dc = len(scotus_to_dc)

prop_scotus_to_dc = (n_scotus_to_dc + 0.0) / n_scotus_citations

In [9]:
# fraction of all cases that are in the dc circuit
(len(dc_cases) + 0.0) / len(G.vs)

Out[9]:
0.02651708099605723

In [14]:
print 'percentage of all cases that are DC: %1.2f' % (100 * (len(dc_cases) + 0.0) / len(G.vs))
print 'percentage of scotus citations that go to DC: %1.2f' % (100 * prop_scotus_to_dc)


percentage of all cases that are DC: 2.65
percentage of scotus citations that go to DC: 1.03

So SCOTUS cites DC cases at well below their share of the network.
