In [4]:
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'

import os
import sys
import time
from math import *
import copy
import cPickle as pickle

# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# our code
sys.path.append(repo_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *
from helpful_functions import case_info
from setup_data_dir import *


sys.path.append(repo_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
from make_snapshots import *
from make_graph import *

# court
network_name = 'federal'
courts = ['scotus', 'cafc', 'cadc']
courts += ['ca' + str(i+1) for i in range(11)]

# directory set up
experiment_data_dir = data_dir + '%s/' % network_name
text_dir = experiment_data_dir + 'textfiles/'


# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline

set up data directory


In [ ]:
setup_data_dir(data_dir)
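
For reference, setup_data_dir just needs to lay down the directory skeleton the rest of the pipeline expects. A minimal sketch under that assumption (the subdirectory list is guessed from the paths used below, not taken from the actual function):

import os

def setup_data_dir_sketch(data_dir, network_name='federal'):
    # hypothetical stand-in for setup_data_dir: create the expected folders
    subdirs = ['raw/',
               network_name + '/',
               network_name + '/textfiles/',
               network_name + '/snapshots/']
    for subdir in subdirs:
        path = data_dir + subdir
        if not os.path.exists(path):
            os.makedirs(path)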

download opinion and cluster files


In [15]:
for court in courts:
    start = time.time()
    download_bulk_resource(court, 'clusters', data_dir)
    download_bulk_resource(court, 'opinions', data_dir)
    print '%s took %d seconds' % (court, time.time() - start)


requesting metadata for scotus
Downloading clusters data for court SCOTUS...
requesting metadata for scotus
Downloading opinions data for court SCOTUS...
scotus took 1140 seconds
requesting metadata for cafc
Downloading clusters data for court CAFC...
requesting metadata for cafc
Downloading opinions data for court CAFC...
cafc took 362 seconds
requesting metadata for cadc
Downloading clusters data for court CADC...
requesting metadata for cadc
Downloading opinions data for court CADC...
cadc took 668 seconds
requesting metadata for ca1
Downloading clusters data for court CA1...
requesting metadata for ca1
Downloading opinions data for court CA1...
ca1 took 783 seconds
requesting metadata for ca2
Downloading clusters data for court CA2...
requesting metadata for ca2
Downloading opinions data for court CA2...
ca2 took 737 seconds
requesting metadata for ca3
Downloading clusters data for court CA3...
requesting metadata for ca3
Downloading opinions data for court CA3...
ca3 took 1014 seconds
requesting metadata for ca4
Downloading clusters data for court CA4...
requesting metadata for ca4
Downloading opinions data for court CA4...
ca4 took 1324 seconds
requesting metadata for ca5
Downloading clusters data for court CA5...
requesting metadata for ca5
Downloading opinions data for court CA5...
ca5 took 1647 seconds
requesting metadata for ca6
Downloading clusters data for court CA6...
requesting metadata for ca6
Downloading opinions data for court CA6...
ca6 took 1136 seconds
requesting metadata for ca7
Downloading clusters data for court CA7...
requesting metadata for ca7
Downloading opinions data for court CA7...
ca7 took 1147 seconds
requesting metadata for ca8
Downloading clusters data for court CA8...
requesting metadata for ca8
Downloading opinions data for court CA8...
ca8 took 1166 seconds
requesting metadata for ca9
Downloading clusters data for court CA9...
requesting metadata for ca9
Downloading opinions data for court CA9...
ca9 took 2211 seconds
requesting metadata for ca10
Downloading clusters data for court CA10...
requesting metadata for ca10
Downloading opinions data for court CA10...
ca10 took 997 seconds
requesting metadata for ca11
Downloading clusters data for court CA11...
requesting metadata for ca11
Downloading opinions data for court CA11...
ca11 took 1069 seconds
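
For context, a rough sketch of what a single download step does, assuming CourtListener's bulk-data endpoint of the era; the URL pattern and destination path are assumptions, not the pipeline's actual download_bulk_resource:

import urllib

def download_bulk_resource_sketch(court, resource, data_dir):
    # hypothetical sketch: fetch one court's bulk tarball, where resource
    # is 'clusters' or 'opinions', and save it under raw/
    url = 'https://www.courtlistener.com/api/bulk-data/%s/%s.tar.gz' % (resource, court)
    urllib.urlretrieve(url, data_dir + 'raw/%s_%s.tar.gz' % (court, resource))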

download the master edgelist


In [ ]:
# download_master_edgelist(data_dir)

download scdb data


In [ ]:
download_scdb(data_dir)

make case metadata


In [30]:
start = time.time()
# build raw metadata for each court, appending as we go
for court in courts:
    court_data = get_raw_case_metadata_from_court(court, data_dir)
    
    # either initialize or append the data frame
    if court == courts[0]:
        case_metadata = court_data
    else:
        case_metadata = case_metadata.append(court_data)
        
print time.time() - start


2496.03245115

In [33]:
case_metadata.to_csv(data_dir + 'raw/%s_case_metadata_r.csv' % network_name, index=True)

clean scotus

kill SCOTUS cases with no SCDB ids


In [96]:
case_metadata = pd.read_csv(data_dir + 'raw/%s_case_metadata_r.csv' % network_name, index_col=0)
case_metadata.index = case_metadata.index.astype('str')

In [97]:
# scotus scdb ids
scdb_ids = case_metadata[case_metadata['court'] == 'scotus']['scdb_id']

# scotus cases with no scdb id
no_scdb_link = scdb_ids.index[scdb_ids.isnull()].tolist()

# remove SCOTUS cases with no SCDB id
case_metadata.drop(no_scdb_link, inplace=True)

# drop United States v. Detroit Timber & Lumber Co. (CL id 96405),
# an outlier cited by nearly every SCOTUS slip opinion's syllabus disclaimer
case_metadata.drop('96405', inplace=True)

In [99]:
case_metadata.to_csv(experiment_data_dir + 'case_metadata.csv', index=True)

get the federal subedgelist


In [163]:
# load master edgelist
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')

# only keep edges between cases in the federal network
case_ids = set(case_metadata.index)
edgelist = master_edgelist[master_edgelist.citing.isin(case_ids) & master_edgelist.cited.isin(case_ids)]

# save federal edgelist
edgelist.to_csv(experiment_data_dir + 'edgelist.csv', index=False)

make igraph object


In [ ]:
make_graph(experiment_data_dir, network_name)

The commented-out cells below are superseded by make_graph(); double check that make_graph works, then delete them.


In [164]:
# case_metadata = pd.read_csv(experiment_data_dir + 'case_metadata.csv', index_col=0)
# case_metadata.index = case_metadata.index.astype('str')

# edgelist = pd.read_csv(experiment_data_dir + 'edgelist.csv', index_col=False)
# edgelist['citing'] = edgelist['citing'].astype(str)
# edgelist['cited'] = edgelist['cited'].astype(str)

In [165]:
# # initialize graph
# G = ig.Graph(n=case_metadata.shape[0], directed=True)

# # add opinion names
# G.vs['name'] = case_metadata.index

# # opinion to ig index mapping
# op_to_ig = {op_id: G.vs.find(name=op_id).index  for op_id in G.vs['name']}

# # convert edgelist to ig ids
# edgelist_ig = edgelist.apply(lambda c: [op_to_ig[str(op_id)] for op_id in c])

# # add edges to graph
# G.add_edges(edgelist_ig.as_matrix().tolist())

In [121]:
# # add igraph index to case metadata
# case_metadata['ig_index'] = 0
# case_metadata.loc[G.vs['name'],'ig_index'] = range(len(G.vs))

# # set missing issueArea to 0
# no_issueArea = case_metadata.index[case_metadata['issueArea'].isnull()]
# case_metadata.loc[no_issueArea,'issueArea'] = 0
# case_metadata['issueArea'] = case_metadata['issueArea'].astype(int)

# # add year
# case_metadata['date'] = pd.to_datetime(case_metadata['date'])
# case_metadata['year'] = case_metadata['date'].apply(lambda d: d.year)

In [176]:
# # add node metadata to graph
# # assumes case_metadata rows are in the same order as G.vs
# G.vs['year'] = case_metadata['year']
# G.vs['issueArea'] = case_metadata['issueArea']
# G.vs['court'] = case_metadata['court']

In [179]:
# G.write_graphml(experiment_data_dir + 'federal_network.graphml')

TODO: add case metadata

make case text files


In [143]:
for court in courts:
    start = time.time()
    
    # ignore bad scotus cases
    if court == 'scotus':
        CLid_bad = no_scdb_link
    else:
        CLid_bad = None
    
    # make the text files for the given court
    make_text_files(data_dir,
                    court,
                    CLid_good=None,
                    CLid_bad=CLid_bad,
                    output_path=text_dir)
    
    print '%s took %d seconds' % (court, time.time() - start)


scotus took 687 seconds
cafc took 188 seconds
cadc took 449 seconds
ca1 took 587 seconds
ca2 took 614 seconds
ca3 took 610 seconds
ca4 took 539 seconds
ca5 took 966 seconds
ca6 took 671 seconds
ca7 took 601 seconds
ca8 took 578 seconds
ca9 took 1051 seconds
ca10 took 429 seconds
ca11 took 393 seconds
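
For reference, a rough sketch of what make_text_files presumably does: walk a court's opinion JSON files, pull out the opinion text, strip markup, and write one plain-text file per case, skipping any CourtListener ids in CLid_bad. The JSON field names (html_with_citations, plain_text) follow the CourtListener bulk format, but the real function may differ:

import os
import json
import re

def make_text_files_sketch(data_dir, court, CLid_bad=None, output_path=''):
    # hypothetical sketch of the text extraction step
    opinion_dir = data_dir + 'raw/%s/opinions/' % court
    bad_ids = set(CLid_bad) if CLid_bad is not None else set()
    for fname in os.listdir(opinion_dir):
        CLid = fname.split('.')[0]
        if CLid in bad_ids:
            continue
        with open(opinion_dir + fname) as f:
            opinion = json.load(f)
        # prefer the citation-annotated html, fall back to plain text
        text = opinion.get('html_with_citations') or opinion.get('plain_text') or ''
        text = re.sub(r'<[^>]+>', ' ', text)  # crude tag stripping
        with open(output_path + CLid + '.txt', 'w') as f:
            f.write(text.encode('utf-8'))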

make tf-idf matrix


In [1]:
# build the tf-idf matrix from the text files
%time make_tf_idf(text_dir, experiment_data_dir, min_df=0, max_df=1)
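
make_tf_idf is opaque here, so for reference a minimal sketch of the computation, assuming scikit-learn's TfidfVectorizer over the text files. Note that sklearn reads integer min_df/max_df as absolute document counts and floats as proportions, so the no-filtering defaults are written as floats in the sketch:

import os
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def make_tf_idf_sketch(text_dir, out_dir, min_df=0.0, max_df=1.0):
    # hypothetical sketch: one document per text file
    files = [text_dir + f for f in os.listdir(text_dir)]
    vectorizer = TfidfVectorizer(input='filename', min_df=min_df, max_df=max_df)
    tfidf_matrix = vectorizer.fit_transform(files)  # sparse, n_docs x n_terms
    # persist the matrix together with the file-to-row mapping
    with open(out_dir + 'tfidf.p', 'wb') as f:
        pickle.dump((tfidf_matrix, files), f)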

make snapshots


In [5]:
G = ig.Graph.Read_GraphML(experiment_data_dir + network_name +'_network.graphml')

In [7]:
# vertex metrics to compute at each snapshot
vertex_metrics = ['indegree', 'outdegree', 'degree',
                  'd_pagerank', 'authorities', 'hubs']

# years at which to take graph snapshots
active_years = range(1900, 2015 + 1)

In [10]:
%time make_snapshot_vertex_metrics(G, active_years, vertex_metrics, experiment_data_dir)


year 1900, (2/117) at 23:46:12
year 1902, (4/117) at 23:46:13
year 1906, (8/117) at 23:46:16
year 1914, (16/117) at 23:46:23
year 1930, (32/117) at 23:46:35
year 1962, (64/117) at 23:47:39
CPU times: user 17min 54s, sys: 34.3 s, total: 18min 28s
Wall time: 19min 6s
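
For reference, a rough sketch of what one snapshot computation looks like: restrict the citation graph to cases decided up to a given year, then compute the listed metrics with igraph. A minimal sketch, not the pipeline's actual make_snapshot_vertex_metrics:

import pandas as pd

def snapshot_metrics_sketch(G, year):
    # hypothetical sketch: metrics on the citation graph as of `year`
    vs = G.vs.select(year_le=year)
    H = G.subgraph(vs.indices)
    return pd.DataFrame({'CLid': H.vs['name'],
                         'indegree': H.indegree(),
                         'outdegree': H.outdegree(),
                         'degree': H.degree(),
                         'd_pagerank': H.pagerank(),
                         'authorities': H.authority_score(),
                         'hubs': H.hub_score()})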

make edge dataframe


In [ ]:
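
The edge dataframe cell was left empty in this export. A minimal sketch of what it might assemble, assuming one row per citation edge with the endpoint years read off the graph (the column names here are hypothetical):

import pandas as pd

def make_edge_df_sketch(G):
    # hypothetical sketch: one row per citation with both endpoint years
    rows = [(G.vs[e.source]['name'], G.vs[e.target]['name'],
             G.vs[e.source]['year'], G.vs[e.target]['year'])
            for e in G.es]
    edge_df = pd.DataFrame(rows, columns=['citing', 'cited',
                                          'citing_year', 'cited_year'])
    # age of the cited precedent at citation time
    edge_df['age'] = edge_df['citing_year'] - edge_df['cited_year']
    return edge_df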