In [1]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

data_dir = '/Users/iaincarmichael/data/courtlistener/'

import numpy as np
import sys

# graph package
import igraph as ig

# our code
sys.path.append(repo_directory + 'code/')
from setup_data_dir import setup_data_dir, make_subnetwork_directory
from pipeline.download_data import download_bulk_resource, download_master_edgelist, download_scdb
from helpful_functions import case_info

sys.path.append(repo_directory + 'vertex_metrics_experiment/code/')
from make_network_data import *
from make_graph import make_graph
from bag_of_words import make_tf_idf
from make_snapshots import make_snapshot_vertex_metrics, update_snapshot_vertex_metrics

# which network to download data for
network_name = 'federal' # 'federal', 'ca1', etc


# some sub directories that get used
raw_dir = data_dir + 'raw/'
subnet_dir = data_dir + network_name + '/'
text_dir = subnet_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

set up the data directory


In [ ]:
setup_data_dir(data_dir)

In [ ]:
make_subnetwork_directory(data_dir, network_name)
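
These two helpers create the folder skeleton the rest of the notebook writes into. A minimal sketch of an equivalent layout (the exact set of subdirectories is determined by the helpers themselves, so treat this as an assumption):

In [ ]:
import os

# hedged guess at the folder skeleton; the helpers may create more than this
for d in [raw_dir, data_dir + 'scdb/', subnet_dir, text_dir]:
    if not os.path.exists(d):
        os.makedirs(d)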

data download

get opinion and cluster files from CourtListener

opinion and cluster files are saved in data_dir/raw/court/


In [ ]:
download_op_and_cl_files(data_dir, network_name)
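
A quick sanity check after the download is to list what landed in the raw directory:

In [ ]:
import os

# the opinion and cluster files should now be somewhere under raw/
os.listdir(raw_dir)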

get the master edgelist from CourtListener

master edgelist is saved in data_dir/raw/


In [ ]:
download_master_edgelist(data_dir)
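
The master edgelist is one big citation table: each row is a citing opinion id and a cited opinion id. A hedged peek at it with pandas (the file name and column names below are assumptions, not the actual layout):

In [ ]:
import pandas as pd

# hypothetical file name and header; adjust to whatever download_master_edgelist saves
edges = pd.read_csv(raw_dir + 'edgelist_master.csv', nrows=5)
edges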

download data from the Supreme Court Database (SCDB)

SCDB data is saved in data_dir/scdb/


In [ ]:
download_scdb(data_dir)
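
The SCDB case-centered data is a plain CSV, so it can be inspected directly (the file name below is a placeholder; SCDB releases are versioned):

In [ ]:
import pandas as pd

# placeholder file name; substitute the versioned SCDB release that was downloaded
scdb = pd.read_csv(data_dir + 'scdb/scdb_case_centered.csv', nrows=5)
scdb.columns.tolist()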

network data

make the case metadata and edgelist

  • add the raw case metadata data frame to the raw/ folder
  • remove cases missing SCDB ids
  • remove the Detroit Lumber case
  • get the edgelist of cases within the desired subnetwork
  • save the cleaned case metadata and edgelist to subnet_dir/

In [ ]:
# create the raw case metadata data frame in the raw/ folder
make_subnetwork_raw_case_metadata(data_dir, network_name)

In [ ]:
# create clean case metadata and edgelist from raw data
clean_metadata_and_edgelist(data_dir, network_name)
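
For reference, a rough sketch of what the cleaning above amounts to, assuming the raw metadata and master edgelist load as pandas data frames; every file and column name below is hypothetical, not the pipeline's actual layout:

In [ ]:
import pandas as pd

# hypothetical sketch of clean_metadata_and_edgelist, not the actual implementation
raw_metadata = pd.read_csv(raw_dir + 'case_metadata.csv')  # hypothetical file name

# drop cases that should have an SCDB id but do not
clean_metadata = raw_metadata.dropna(subset=['scdb_id'])  # hypothetical column name

# drop the Detroit Lumber case
detroit_lumber_id = '...'  # placeholder for the real opinion id
clean_metadata = clean_metadata[clean_metadata['id'] != detroit_lumber_id]

# keep only edges whose endpoints are both in the subnetwork
case_ids = set(clean_metadata['id'])
edges = pd.read_csv(raw_dir + 'edgelist_master.csv')  # hypothetical file name
edges = edges[edges['citing'].isin(case_ids) & edges['cited'].isin(case_ids)]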

make graph

creates the network with the desired case metadata and saves it as a .graphml file in subnet_dir/


In [ ]:
make_graph(subnet_dir, network_name)
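
For reference, building a directed citation graph from a metadata table and an edgelist with igraph might look like the sketch below; the file and column names are assumptions, not the pipeline's actual output:

In [ ]:
import pandas as pd

# hedged sketch of what make_graph produces: a directed graph whose vertex
# names are opinion ids, with case metadata attached as vertex attributes
metadata = pd.read_csv(subnet_dir + 'case_metadata.csv')  # hypothetical file name
edges = pd.read_csv(subnet_dir + 'edgelist.csv')          # hypothetical file name

g = ig.Graph(directed=True)
g.add_vertices(metadata['id'].astype(str).tolist())
g.vs['year'] = metadata['year'].tolist()

# assumes the edgelist was already restricted to cases in the metadata
g.add_edges(zip(edges['citing'].astype(str), edges['cited'].astype(str)))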

NLP data

make case text files

grabs the opinion text for each case in the network and saves each one as a text file in subnet_dir/textfiles/


In [ ]:
%%time
make_network_textfiles(data_dir, network_name)
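
Exactly how the text is pulled out of the bulk opinion records depends on their layout; one plausible extraction, assuming each record is a JSON file with a plain_text field (both the path and the field name here are assumptions):

In [ ]:
import json

# hedged sketch: read one bulk opinion record and pull out its text
op_file = raw_dir + 'court/opinions/12345.json'  # hypothetical path
with open(op_file, 'r') as f:
    record = json.load(f)
text = record.get('plain_text', '')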

make tf-idf matrix

creates the tf-idf matrix for the corpus of cases in the network and saves it to subnet_dir + 'nlp/'


In [8]:
%%time
make_tf_idf(text_dir, subnet_dir + 'nlp/')


CPU times: user 10h 15min 16s, sys: 9min 10s, total: 10h 24min 27s
Wall time: 10h 34min 7s
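
For reference, the core of a tf-idf computation over a directory of text files can be done with scikit-learn; this sketches the general technique, not necessarily what make_tf_idf does internally:

In [ ]:
import glob
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf over the corpus of case text files; rows are documents, columns are terms
textfiles = sorted(glob.glob(text_dir + '*.txt'))
vectorizer = TfidfVectorizer(input='filename', stop_words='english')
tfidf_matrix = vectorizer.fit_transform(textfiles)  # sparse (n_docs, n_terms)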

load network


In [2]:
# load the graph
G = ig.Graph.Read_GraphML(subnet_dir + network_name + '_network.graphml')


/Users/iaincarmichael/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: RuntimeWarning: Could not add vertex ids, there is already an 'id' vertex attribute at foreign-graphml.c:443
  from ipykernel import kernelapp as app

In [3]:
G.summary()


Out[3]:
'IGRAPH DN-- 959985 6649916 -- \n+ attr: court (v), id (v), issueArea (v), name (v), num_words (v), year (v)'
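
With the graph loaded, individual cases can be looked up by their CourtListener opinion id, which is stored as the vertex name:

In [ ]:
# look up a case by opinion id and inspect a few of its attributes
v = G.vs.find(name=G.vs[0]['name'])  # substitute any opinion id of interest
v['year'], v['court'], v.indegree()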

compute snapshots


In [4]:
vertex_metrics = ['indegree', 'outdegree', 'degree',
                  'd_pagerank','u_pagerank',
                  'authorities', 'hubs',
                  'd_eigen', 'u_eigen']
                  #'d_betweenness', 'u_betweenness',
                  #'d_closeness', 'u_closeness']

# add recent citations
vertex_metrics += ['recentcite_' + str(t) for t in np.arange(1, 10 + 1)]
vertex_metrics += ['recentcite_' + str(t) for t in [15, 20, 25, 30, 35, 40]]

active_years = range(1900, 2015 + 1)
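
Each snapshot amounts to restricting the graph to cases decided on or before a given year and computing the metrics on that subgraph. A minimal sketch for one year and one metric:

In [ ]:
# sketch of a single snapshot: subgraph of cases up to the snapshot year,
# then a vertex metric computed on that subgraph
snapshot_year = 1950
vertices = [v.index for v in G.vs.select(year_le=snapshot_year)]
G_snap = G.subgraph(vertices)
indegrees = G_snap.indegree()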

In [10]:
%%time
make_snapshot_vertex_metrics(G, active_years, vertex_metrics, subnet_dir)


year 1900, (2/117) at 04:59:49
year 1902, (4/117) at 04:59:58
year 1906, (8/117) at 05:00:35
year 1914, (16/117) at 05:04:33
problem with d_eigen
problem with d_eigen
year 1930, (32/117) at 05:18:29
problem with d_eigen
year 1962, (64/117) at 05:38:07
problem with d_eigen
problem with d_eigen
CPU times: user 4h 34min 9s, sys: 1min 47s, total: 4h 35min 57s
Wall time: 4h 36min 8s
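
The "problem with d_eigen" messages above are presumably directed eigenvector centrality failing on some snapshots; citation networks are nearly acyclic, and the directed eigenvector problem is degenerate on such graphs. A guarded call (using the G_snap from the snapshot sketch above) might look like:

In [ ]:
# guard an eigenvector centrality computation that can fail on near-acyclic graphs
try:
    d_eigen = G_snap.eigenvector_centrality(directed=True)
except Exception:
    d_eigen = [np.nan] * G_snap.vcount()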

add textlength

count number of words in each opinion text.


In [6]:
# count the number of words in each opinion's text file
G.vs['num_words'] = [0] * len(G.vs)
for v in G.vs:
    with open(text_dir + v['name'] + '.txt', 'r') as f:
        text = f.read()
    v['num_words'] = len(text.split())

In [7]:
G.write_graphml(subnet_dir + network_name + '_network.graphml')

update snapshots


In [4]:
to_add = ['rev_pagerank', 'num_words']
to_add += ['citerank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]

In [5]:
to_add += ['polyrank_' + str(t) for t in [1, 2, 5, 10, 20, 50]]

In [6]:
%%time
update_snapshot_vertex_metrics(G, active_years, to_add, subnet_dir)


CPU times: user 26min 55s, sys: 1min 3s, total: 27min 58s
Wall time: 28min 7s
