In [45]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
# graph package
import igraph as ig
# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info
from stats.viz import *
from stats.dim_reduction import *
from stats.linear_model import *
sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from rankscore_experiment_sort import *
from rankscore_experiment_LR import *
from make_tr_edge_df import *
# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc
# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')
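A quick look at what was loaded; python-igraph's summary() reports vertex/edge counts and the attributes present on the graph:

# sanity check on the loaded citation network
print(G.summary())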
In [52]:
outdegs = np.array(G.outdegree())
indegs = np.array(G.indegree())
# to_keep = (indegs >= 1) & (outdegs >= 1)
# indegs = indegs[to_keep]
# outdegs = outdegs[to_keep]
In [53]:
plt.figure(figsize=[8,4])
plt.subplot(1,2,1)
plt.scatter(outdegs, indegs)
plt.xlabel('out degree')
plt.ylabel('in degree')
plt.xlim([0, 1.2*max(outdegs)])
plt.ylim([0, 1.2*max(indegs)])
plt.subplot(1,2,2)
plt.scatter(np.log(outdegs + 1), np.log(indegs + 1))  # +1 avoids log(0) for degree-zero vertices
plt.xlabel('log(out degree + 1)')
plt.ylabel('log(in degree + 1)')
In [54]:
get_SLR(outdegs, indegs)
In [56]:
get_SLR(np.log(outdegs + 1), np.log(indegs + 1))
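get_SLR comes from the repo's stats.linear_model module. As a rough guide to what such a helper reports, here is a minimal simple-linear-regression sketch using scipy.stats.linregress (an assumption about its behavior, not the repo's actual implementation):

# hypothetical stand-in for get_SLR: regress y on x and report the fit
from scipy import stats

def slr_sketch(x, y):
    slope, intercept, r, p, se = stats.linregress(x, y)
    print('slope: %1.5f  intercept: %1.5f  R^2: %1.5f' % (slope, intercept, r ** 2))
    return slope, intercept

# e.g. slr_sketch(np.log(outdegs + 1), np.log(indegs + 1))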
In [33]:
%time d_pagerank = G.pagerank()
In [29]:
%time u_pagerank = G.as_undirected().pagerank()
In [21]:
%time d_betweenness = G.betweenness(directed=True)
In [26]:
%time u_betweenness = G.as_undirected().betweenness(directed=False)
In [22]:
%time d_closeness = G.closeness(mode="IN", normalized=True)
In [25]:
%time u_closeness = G.as_undirected().closeness(normalized=True)
In [30]:
%time d_eigen = G.eigenvector_centrality()
In [31]:
%time u_eigen = G.as_undirected().eigenvector_centrality()
In [120]:
%time hubs = G.hub_score()
In [121]:
%time authorities = G.authority_score()
In [176]:
indegree = G.indegree()
outdegree = G.outdegree()
degree = G.degree()
In [177]:
df = pd.DataFrame(index=G.vs['name'])
df['year'] = G.vs['year']
df['indegree'] = indegree
df['outdegree'] = outdegree
df['degree'] = degree
df['d_pagerank'] = d_pagerank
df['u_pagerank'] = u_pagerank
df['d_betweenness'] = d_betweenness
df['u_betweenness'] = u_betweenness
df['d_closeness'] = d_closeness
df['u_closeness'] = u_closeness
df['d_eigen'] = d_eigen
df['u_eigen'] = u_eigen
df['hubs'] = hubs
df['authorities'] = authorities
In [178]:
plot_scatter_matrix(df.apply(lambda c: c/np.std(c)))
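plot_scatter_matrix is defined in stats.viz; a rough pandas equivalent, assuming it draws pairwise scatter plots of the standardized metric columns:

# approximate stand-in using pandas' built-in scatter matrix
from pandas.plotting import scatter_matrix  # pandas.tools.plotting in older pandas
scatter_matrix(df.apply(lambda c: c / np.std(c)), figsize=(12, 12), diagonal='hist')
plt.show()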
In [179]:
U, D, V = get_PCA(df, scale=True)
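get_PCA lives in stats.dim_reduction; a minimal sketch of what such a helper plausibly computes, assuming scale=True means center and scale each column before taking the SVD:

# hypothetical PCA helper: returns score directions U, singular values D,
# and loadings V from the SVD of the standardized data matrix
def pca_sketch(df, scale=True):
    X = df.values.astype(float)
    X = X - X.mean(axis=0)
    if scale:
        X = X / X.std(axis=0)
    U, D, Vt = np.linalg.svd(X, full_matrices=False)
    return U, D, Vt.T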
In [180]:
plot_scores(U,
            start=1,
            n_comp=5,
            title='')
In [146]:
# map types to issues
type_to_issue = {'procedural': [1, 4, 6, 9],
                 'substantive': [2, 3, 5, 7, 8, 12, 14],
                 'other': [10, 11, 13, 0]}
# map issues to type
issue_to_type = {i: '' for i in range(14 + 1)}  # issueArea codes run 0-14
for t in type_to_issue.keys():
    for i in type_to_issue[t]:
        issue_to_type[i] = t
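A quick sanity check (not in the original) that the inverse map assigns every issueArea code 0-14 to exactly one type:

# every code 0-14 should have a non-empty type after the loop above
assert set(issue_to_type.keys()) == set(range(15))
assert all(issue_to_type[i] in type_to_issue for i in range(15))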
In [182]:
# create type
G.vs['issueArea'] = [int(i) for i in G.vs['issueArea']]
G.vs['type'] = [issue_to_type[i] for i in G.vs['issueArea']]
# add to data frame
df['issueArea'] = G.vs['issueArea']
df['type'] = G.vs['type']
In [183]:
# get type subsets
df_sub = df[df['type'] == 'substantive']
df_pro = df[df['type'] == 'procedural']
df_oth = df[df['type'] == 'other']
print('num substantive: %d' % df_sub.shape[0])
print('num procedural: %d' % df_pro.shape[0])
print('num other: %d' % df_oth.shape[0])
In [184]:
df.columns
In [185]:
metric = 'indegree'
bins = np.linspace(min(df[metric]), max(df[metric]), 100)
# substantive
plt.hist(df_sub[metric],
         bins=bins,
         color='red',
         label='substantive (mean: %1.5f)' % np.mean(df_sub[metric]))
# procedural
plt.hist(df_pro[metric],
         bins=bins,
         color='blue',
         label='procedural (mean: %1.5f)' % np.mean(df_pro[metric]))
# other
plt.hist(df_oth[metric],
         bins=bins,
         color='green',
         label='other (mean: %1.5f)' % np.mean(df_oth[metric]))
# plt.xlim([0, .2])
# plt.ylim([0, 2000])
plt.xlabel(metric)
plt.legend(loc='upper right')
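Since the three groups differ greatly in size, raw counts can hide differences in shape; a hedged variant normalizes each histogram and adds transparency:

# density-normalized, semi-transparent version of the histograms above
for d, color, lab in [(df_sub, 'red', 'substantive'),
                      (df_pro, 'blue', 'procedural'),
                      (df_oth, 'green', 'other')]:
    plt.hist(d[metric], bins=bins, color=color, alpha=0.5,
             normed=True,  # density=True in matplotlib >= 2.1
             label=lab)
plt.xlabel(metric)
plt.legend(loc='upper right')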
In [207]:
R = 100
top_cases = df.sort_values(by=metric, ascending=False).iloc[0:R]['type']
top_breakdown = top_cases.value_counts(normalize=True)
all_breakdown = df['type'].value_counts(normalize=True)
top_breakdown - all_breakdown
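The same over/under-representation comparison extends to the other centrality columns; a sketch reusing R and all_breakdown from above:

# positive values: type is over-represented among the top R cases
for m in ['indegree', 'd_pagerank', 'authorities']:
    top_m = df.sort_values(by=m, ascending=False).iloc[0:R]['type']
    print(m)
    print(top_m.value_counts(normalize=True) - all_breakdown)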