In [67]:
# modify these for your own computer
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'

In [35]:
import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# our code
sys.path.append(repo_directory + 'code/')
from pipeline.download_data import (download_bulk_resource, download_master_edgelist,
                                    download_op_and_cl_files, download_scdb)


sys.path.append(repo_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
from make_snapshots import *
from make_graph import *
from data_dir_setup import *


# court
court = 'scotus'
network_name = 'scotus'

# directory set up
raw_dir = data_dir + 'raw/'
experiment_data_dir = data_dir + network_name + '/'
text_dir = experiment_data_dir + 'textfiles/'



# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline



set up the data directory


In [ ]:
setup_data_dir(data_dir)
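
setup_data_dir creates the folder skeleton the rest of the notebook expects. A minimal sketch of the idea, assuming the folder names used elsewhere in this notebook (the real implementation lives in data_dir_setup.py):

import os

def setup_data_dir_sketch(data_dir):
    # folders assumed by this notebook: raw/ for bulk downloads, scdb/ for
    # SCDB data, and the experiment folder with its textfiles/, nlp/ and
    # snapshots/ subfolders (the snapshots/ name is an assumption)
    subdirs = ['raw/', 'scdb/',
               'scotus/', 'scotus/textfiles/', 'scotus/nlp/', 'scotus/snapshots/']
    for sub in subdirs:
        path = data_dir + sub
        if not os.path.exists(path):
            os.makedirs(path)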

data download

get opinion and cluster files from CourtListener

opinion and cluster files are saved in data_dir/raw/court/


In [51]:
%time download_op_and_cl_files(data_dir, network_name)
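
download_op_and_cl_files pulls the bulk opinion and cluster archives for the given court. A rough sketch of the idea, assuming CourtListener bulk endpoints of the form https://www.courtlistener.com/api/bulk-data/<resource>/<court>.tar.gz (the exact URLs and unpacking layout are assumptions; see pipeline/download_data.py for the real logic):

import urllib
import tarfile

def download_court_bulk_sketch(data_dir, court):
    # fetch and unpack the opinion and cluster tarballs into raw/court/
    for resource in ['opinions', 'clusters']:
        url = 'https://www.courtlistener.com/api/bulk-data/%s/%s.tar.gz' % (resource, court)
        archive = data_dir + 'raw/%s_%s.tar.gz' % (court, resource)
        urllib.urlretrieve(url, archive)  # urllib.request.urlretrieve on Python 3
        tarfile.open(archive, 'r:gz').extractall(data_dir + 'raw/' + court + '/')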

get the master edgelist from CL

master edgelist is saved in data_dir/raw/


In [ ]:
%time download_master_edgelist(data_dir)

download case data from the Supreme Court Database (SCDB)

SCDB data is saved in data_dir/scdb/


In [ ]:
%time download_scdb(data_dir)

clean data

make the case metadata and edgelist

  • add the raw case metadata data frame to the raw/ folder
  • remove cases missing SCDB ids
  • remove the Detroit Lumber case (every later syllabus cites it in a boilerplate disclaimer, producing spurious citation edges)
  • get the edgelist of cases within the desired subnetwork
  • save the case metadata and edgelist to experiment_data_dir/ (a sketch of the cleaning step follows this list)
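
A minimal sketch of the cleaning step, assuming the raw metadata has 'id' and 'scdb_id' columns and the edgelist has 'citing' and 'cited' columns (the file names, column names, and the Detroit Lumber opinion id below are hypothetical; clean_metadata_and_edgelist is the real implementation):

import pandas as pd

def clean_sketch(data_dir, network_name):
    raw_dir = data_dir + 'raw/'
    experiment_dir = data_dir + network_name + '/'

    # hypothetical file names for illustration
    metadata = pd.read_csv(raw_dir + 'case_metadata_r.csv')
    edgelist = pd.read_csv(raw_dir + 'edgelist_master_r.csv')

    # drop cases we could not match to an SCDB record
    metadata = metadata.dropna(subset=['scdb_id'])

    # drop Detroit Timber & Lumber: the boilerplate syllabus disclaimer
    # quoting it shows up as a spurious citation in post-1906 cases
    detroit_lumber_id = 96405  # hypothetical CourtListener id
    metadata = metadata[metadata['id'] != detroit_lumber_id]

    # keep only edges whose endpoints both survive the cleaning
    keep = set(metadata['id'])
    edgelist = edgelist[edgelist['citing'].isin(keep) & edgelist['cited'].isin(keep)]

    metadata.to_csv(experiment_dir + 'case_metadata.csv', index=False)
    edgelist.to_csv(experiment_dir + 'edgelist.csv', index=False)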

In [ ]:
# create the raw case metadata data frame in the raw/ folder
%time make_subnetwork_raw_case_metadata(data_dir, network_name)

In [ ]:
# create clean case metadata and edgelist from raw data
%time clean_metadata_and_edgelist(data_dir, network_name)

make graph

creates the network with the desired case metadata and saves it as a .graphml file in experiment_data_dir/


In [20]:
%time make_graph(experiment_data_dir, network_name)


CPU times: user 6.05 s, sys: 281 ms, total: 6.33 s
Wall time: 8.8 s
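
Under the hood this amounts to turning the two cleaned CSVs into an igraph object. A sketch under the assumption that the files are case_metadata.csv and edgelist.csv with 'id', 'year', 'court', 'citing' and 'cited' columns (names are assumptions; make_graph is the real implementation):

import igraph as ig
import pandas as pd

def make_graph_sketch(experiment_data_dir, network_name):
    metadata = pd.read_csv(experiment_data_dir + 'case_metadata.csv')
    edgelist = pd.read_csv(experiment_data_dir + 'edgelist.csv')

    # directed citation graph: edges point from citing case to cited case
    G = ig.Graph(directed=True)
    G.add_vertices(metadata['id'].astype(str).tolist())  # becomes the 'name' attribute
    G.vs['year'] = metadata['year'].tolist()
    G.vs['court'] = metadata['court'].tolist()

    # map CourtListener ids to igraph vertex indices, then add edges in bulk
    index = {v['name']: v.index for v in G.vs}
    edges = [(index[str(c)], index[str(d)])
             for c, d in zip(edgelist['citing'], edgelist['cited'])]
    G.add_edges(edges)

    G.write_graphml(experiment_data_dir + network_name + '_network.graphml')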

make case text files

grabs the opinion text for each case in the network and saves it as a text file in experiment_data_dir/textfiles/


In [32]:
# make the text files for the given network
%time make_network_textfiles(data_dir, network_name)


CPU times: user 9min 31s, sys: 28.5 s, total: 10min
Wall time: 11min 22s
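
Each opinion in the bulk download is a JSON file whose text lives in fields like 'plain_text' or 'html_with_citations'. A sketch of the extraction, with the directory layout and field names treated as assumptions (make_network_textfiles handles the real cases, including proper HTML parsing):

import json
import os
import re

def make_textfiles_sketch(data_dir, network_name):
    opinion_dir = data_dir + 'raw/' + network_name + '/opinions/'
    text_dir = data_dir + network_name + '/textfiles/'

    for filename in os.listdir(opinion_dir):
        opinion = json.load(open(opinion_dir + filename))
        # prefer plain text; fall back to the HTML field and crudely strip tags
        text = opinion.get('plain_text') or opinion.get('html_with_citations', '')
        text = re.sub(r'<[^>]+>', ' ', text)
        case_id = filename.split('.')[0]  # one file per case, named by opinion id
        with open(text_dir + case_id + '.txt', 'w') as f:
            f.write(text.encode('utf-8'))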

make tf-idf matrix

creates the tf-idf matrix for the corpus of cases in the network and saves them to experiment_data_dir + 'nlp/'


In [33]:
%time make_tf_idf(text_dir, experiment_data_dir + 'nlp/', min_df=0, max_df=1)


CPU times: user 4min 47s, sys: 17.2 s, total: 5min 4s
Wall time: 6min 4s
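
make_tf_idf is essentially a TfidfVectorizer pass over the text files. A sketch with hypothetical output file names; note that if the min_df/max_df arguments pass straight through to sklearn, an integer max_df is an absolute document count while a float is a proportion, so max_df=1.0 keeps every term but max_df=1 would keep almost none:

import os
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def make_tf_idf_sketch(text_dir, nlp_dir, min_df=0, max_df=1.0):
    # one text file per case; sorted order fixes the row order of the matrix
    filenames = sorted(os.listdir(text_dir))
    paths = [text_dir + name for name in filenames]

    vectorizer = TfidfVectorizer(input='filename', min_df=min_df, max_df=max_df)
    tfidf_matrix = vectorizer.fit_transform(paths)

    # save the sparse matrix along with the row (case) and column (term) labels
    with open(nlp_dir + 'tfidf.p', 'wb') as f:
        pickle.dump({'matrix': tfidf_matrix,
                     'case_ids': [name.split('.')[0] for name in filenames],
                     'vocab': vectorizer.vocabulary_}, f)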

data for vertex metrics experiment

make snapshots


In [68]:
# load the graph
G = ig.Graph.Read_GraphML(experiment_data_dir + network_name + '_network.graphml')

In [72]:
G.summary()


Out[72]:
'IGRAPH DN-- 27885 234312 -- \n+ attr: court (v), id (v), issueArea (v), name (v), year (v)'

In [63]:
# metrics computed on each yearly snapshot of the citation network
vertex_metrics = ['indegree', 'outdegree', 'degree',
                  'd_pagerank', 'authorities', 'hubs']

# add recent citation counts over 5, 10, ..., 30 year windows
vertex_metrics += ['recentcite_' + str(t) for t in 5 * np.arange(1, 6 + 1)]

active_years = range(1900, 2015 + 1)

In [65]:
%time make_snapshot_vertex_metrics(G, active_years, vertex_metrics, experiment_data_dir)


year 1900, (2/117) at 16:52:52
year 1902, (4/117) at 16:52:53
year 1906, (8/117) at 16:52:56
year 1914, (16/117) at 16:53:03
year 1930, (32/117) at 16:53:19
year 1962, (64/117) at 16:54:13
CPU times: user 3min 9s, sys: 8.46 s, total: 3min 17s
Wall time: 3min 34s
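
make_snapshot_vertex_metrics computes the chosen metrics on the network as it existed at each active year. A sketch of the core loop, assuming each snapshot is the subgraph induced by cases decided up to that year and that results land in a snapshots/ subfolder (both are assumptions, and the metric definitions may differ from the real code):

import igraph as ig
import pandas as pd

def snapshot_metrics_sketch(G, active_years, experiment_data_dir):
    for year in active_years:
        # citation network as of this snapshot year
        H = G.subgraph(G.vs.select(year_le=year))

        df = pd.DataFrame(index=H.vs['name'])
        df['indegree'] = H.indegree()
        df['outdegree'] = H.outdegree()
        df['degree'] = H.degree()
        df['d_pagerank'] = H.pagerank()      # directed PageRank
        df['authorities'] = H.authority_score()
        df['hubs'] = H.hub_score()

        # recentcite_T: citations received from cases decided in the last T years
        for T in [5 * k for k in range(1, 7)]:
            df['recentcite_' + str(T)] = [
                sum(1 for u in H.neighbors(v, mode=ig.IN)
                    if H.vs[u]['year'] >= year - T)
                for v in range(H.vcount())]

        df.to_csv(experiment_data_dir + 'snapshots/vertex_metrics_%d.csv' % year)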
