In [1]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

data_dir = '/Users/iaincarmichael/data/courtlistener/'

import numpy as np
import sys

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_op_and_cl_files, download_master_edgelist, download_scdb
from helpful_functions import case_info

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from make_network_data import *
from make_graph import make_graph
from bag_of_words import make_tf_idf
from make_snapshots import make_snapshot_vertex_metrics, update_snapshot_vertex_metrics

# which network to download data for
network_name = 'scotus' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

set up the data directory


In [ ]:
setup_data_dir(data_dir)

In [ ]:
make_subnetwork_directory(data_dir, network_name)

data download

get opinion and cluster files from CourtListener

opinion and cluster files are saved in data_dir/raw/court/


In [ ]:
download_op_and_cl_files(data_dir, network_name)
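as an optional sanity check, list what landed under raw_dir; the exact file names depend on CourtListener's bulk export


In [ ]:
import os

# the downloaded opinion/cluster archives should now be under raw_dir
os.listdir(raw_dir)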

get the master edgelist from CL

master edgelist is saved in data_dir/raw/


In [ ]:
download_master_edgelist(data_dir)

download case data from the Supreme Court Database (SCDB)

SCDB data is saved in data_dir/scdb/


In [ ]:
download_scdb(data_dir)

network data

make the case metadata and edgelist

  • add the raw case metadata data frame to the raw/ folder
  • remove cases missing SCDB ids
  • remove the Detroit Lumber case
  • get the edgelist of cases within the desired subnetwork
  • save the case metadata and edgelist to subnet_dir/

In [ ]:
# create the raw case metadata data frame in the raw/ folder
make_subnetwork_raw_case_metadata(data_dir, network_name)

In [ ]:
# create clean case metadata and edgelist from raw data
clean_metadata_and_edgelist(data_dir, network_name)
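conceptually, the cleaning step boils down to a few pandas operations; below is a minimal sketch, not the actual clean_metadata_and_edgelist code, and the file name 'case_metadata_raw.csv' and the 'scdb_id' column are hypothetical


In [ ]:
# illustrative sketch only, NOT the actual implementation;
# the csv file name and the 'scdb_id' column are hypothetical
import pandas as pd

raw_metadata = pd.read_csv(raw_dir + 'case_metadata_raw.csv')

# keep only cases that were matched to an SCDB id
clean_metadata = raw_metadata.dropna(subset=['scdb_id'])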

make graph

creates the network with the desired case metadata and saves it as a .graphml file in subnet_dir/


In [ ]:
make_graph(subnet_dir, network_name)

NLP data

make case text files

grabs the opinion text for each case in the network and saves it as a text file in subnet_dir/textfiles/


In [ ]:
%%time
make_network_textfiles(data_dir, network_name)
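as a quick check that one file was written per case (the files are named by opinion id, which is how they get read back in below)


In [ ]:
import os

# should match the number of cases in the network
len([f for f in os.listdir(text_dir) if f.endswith('.txt')])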

make tf-idf matrix

creates the tf-idf matrix for the corpus of cases in the network and saves it to subnet_dir + 'nlp/'


In [2]:
%%time
make_tf_idf(text_dir, subnet_dir + 'nlp/')


CPU times: user 38min 25s, sys: 14.4 s, total: 38min 39s
Wall time: 38min 57s
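for reference, the same idea can be sketched with scikit-learn's TfidfVectorizer; this is only an illustration, make_tf_idf has its own implementation in bag_of_words.py


In [ ]:
# illustrative sketch of building a tf-idf matrix with scikit-learn,
# not what make_tf_idf actually does
import glob
from sklearn.feature_extraction.text import TfidfVectorizer

textfiles = sorted(glob.glob(text_dir + '*.txt'))

vectorizer = TfidfVectorizer(input='filename')
tfidf_matrix = vectorizer.fit_transform(textfiles)  # sparse, cases x terms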

load network


In [3]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name +'_network.graphml')

In [4]:
G.summary()


Out[4]:
'IGRAPH DN-- 27885 234312 -- \n+ attr: court (v), date (v), id (v), issueArea (v), name (v), year (v)'
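the summary lists the vertex attributes; to spot check a single case


In [ ]:
# attributes of one vertex (a case) as a dict
G.vs[0].attributes()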

compute snapshots


In [4]:
# d_ / u_ prefixes: directed / undirected versions of the metric
vertex_metrics = ['indegree', 'outdegree', 'degree',
                  'd_pagerank', 'u_pagerank', 'rev_pagerank',
                  'authorities', 'hubs',
                  'd_eigen', 'u_eigen',
                  'd_betweenness', 'u_betweenness',
                  'd_in_closeness', 'd_out_closeness',
                  'd_all_closeness', 'u_closeness']

# recent citations: citations received within the last t years
vertex_metrics += ['recentcite_' + str(t) for t in np.arange(1, 10 + 1)]
vertex_metrics += ['recentcite_' + str(t) for t in [15, 20, 25, 30, 35, 40]]

# time-parametrized ranking variants
vertex_metrics += ['citerank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]
vertex_metrics += ['polyrank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]
vertex_metrics += ['pagerank_' + str(t * 10) for t in range(1, 9 + 1)]

# opinion text length
vertex_metrics += ['num_words']

active_years = range(1900, 2015 + 1)
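before kicking off the (long) full run, here is a conceptual sketch of what a single snapshot involves; the real implementation is in make_snapshots.py, this is only an illustration


In [ ]:
# illustration only: restrict the citation graph to cases decided
# by a given year, then compute vertex metrics on that subgraph
snapshot_year = 1950
G_snapshot = G.subgraph(G.vs.select(year_le=snapshot_year))

indegree_1950 = G_snapshot.indegree()
pagerank_1950 = G_snapshot.pagerank()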

In [5]:
%%time
make_snapshot_vertex_metrics(G, active_years, vertex_metrics, subnet_dir)


year 1900, (2/117) at 00:05:50
year 1902, (4/117) at 00:06:51
year 1906, (8/117) at 00:09:14
/Users/iaincarmichael/Dropbox/Research/law/law-net/vertex_metrics_experiment/code/make_snapshots.py:242: RuntimeWarning: ARPACK solver failed to converge (3001 iterations, 0/1 eigenvectors converged) at arpack.c:776
  metric_column = G.eigenvector_centrality(arpack_options=arpack_options)
year 1914, (16/117) at 00:20:15
year 1930, (32/117) at 00:51:15
year 1962, (64/117) at 02:20:59
CPU times: user 8h 15min 22s, sys: 35.5 s, total: 8h 15min 57s
Wall time: 8h 16min 7s

update snapshots


In [39]:
# to_add = ['rev_pagerank', 'num_words']
# to_add += ['citerank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]

In [40]:
# to_add = ['polyrank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]

In [41]:
# to_add = ['d_in_closeness', 'd_out_closeness', 'd_all_closeness', 'd_eigen']

In [44]:
# to_add = ['pagerank_' + str(t * 10) for t in range(1, 9 + 1)]

In [51]:
%%time
# define to_add first, e.g. by uncommenting one of the lists above
update_snapshot_vertex_metrics(G, active_years, to_add, subnet_dir)


CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 6.91 µs

add text length

adds word count as a vertex attribute


In [5]:
# word count for each opinion, in vertex order
num_words = []
for op_id in G.vs['name']:
    with open(text_dir + op_id + '.txt', 'r') as f:
        num_words.append(len(f.read().split()))

G.vs['num_words'] = num_words

In [6]:
G.write_graphml(subnet_dir + network_name +'_network.graphml')
