In [4]:
# NOTE(review): absolute local paths — consider making these configurable
repo_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'
import os
import sys
import time
from math import *
import copy
import cPickle as pickle
# data
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
# graph
import igraph as ig
# our code
# NOTE(review): the wildcard imports below pollute the namespace; the names
# used later (download_bulk_resource, get_raw_case_metadata_from_court,
# setup_data_dir, make_text_files, make_tf_idf, make_snapshot_vertex_metrics,
# make_graph) all come from these starred modules
sys.path.append(repo_directory + 'code/')
from pipeline.download_data import *
from pipeline.make_raw_case_metadata import *
from helpful_functions import case_info
from setup_data_dir import *
sys.path.append(repo_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
from bag_of_words import *
from similarity_matrix import *
from make_snapshots import *
from make_graph import *
# court
# federal appellate network: SCOTUS, Federal Circuit, DC Circuit, CA1-CA11
network_name = 'federal'
courts = ['scotus', 'cafc', 'cadc']
courts += ['ca' + str(i+1) for i in range(11)]
# directory set up
# NOTE(review): duplicate assignment — data_dir was already set identically above
data_dir = '/Users/iaincarmichael/Documents/courtlistener/data/'
experiment_data_dir = data_dir + '%s/' % network_name
text_dir = experiment_data_dir + 'textfiles/'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [5]:
Out[5]:
In [ ]:
In [ ]:
# create the expected directory structure under data_dir
# (setup_data_dir comes from the `from setup_data_dir import *` above)
setup_data_dir(data_dir)
In [15]:
# download the bulk cluster and opinion exports for every court in the network
for court in courts:
    start = time.time()

    download_bulk_resource(court, 'clusters', data_dir)
    download_bulk_resource(court, 'opinions', data_dir)

    # fix: `start` was captured but never used — report per-court timing,
    # matching the style of the text-file cell later in the notebook
    print('%s took %d seconds' % (court, time.time() - start))
In [ ]:
# download_master_edgelist(data_dir)
In [ ]:
# download the Supreme Court Database (SCDB) files into data_dir
download_scdb(data_dir)
In [30]:
# build raw case metadata for every court in the network
start = time.time()

# fix: repeatedly calling DataFrame.append in a loop copies the frame each
# iteration (quadratic); collect the per-court frames and concatenate once
court_frames = []
for court in courts:
    court_frames.append(get_raw_case_metadata_from_court(court, data_dir))
case_metadata = pd.concat(court_frames)

print(time.time() - start)
In [33]:
# save the combined raw metadata
# fix: the `% network_name` formatting was applied to the None returned by
# to_csv instead of to the filename string (TypeError at runtime, and the
# file would have been written to a literal 'raw/%s_case_metadata_r.csv')
case_metadata.to_csv(data_dir + 'raw/%s_case_metadata_r.csv' % network_name, index=True)
In [96]:
# reload the raw metadata with CL opinion ids as a string index
# fix: `network_names` was undefined — the variable is `network_name`
case_metadata = pd.read_csv(data_dir + 'raw/%s_case_metadata_r.csv' % network_name, index_col=0)
case_metadata.index = case_metadata.index.astype('str')
In [97]:
# SCDB ids for the SCOTUS subset of the metadata
scdb_ids = case_metadata.loc[case_metadata['court'] == 'scotus', 'scdb_id']

# SCOTUS opinions that could not be linked to an SCDB record
# (no_scdb_link is reused below when making the text files)
no_scdb_link = scdb_ids[scdb_ids.isnull()].index.tolist()

# drop the unlinked SCOTUS cases, then the Detroit Lumber opinion (CL id 96405)
case_metadata = case_metadata.drop(no_scdb_link)
case_metadata = case_metadata.drop('96405')
In [99]:
# save the cleaned metadata into the experiment directory (index = CL ids)
case_metadata.to_csv(experiment_data_dir + 'case_metadata.csv', index=True)
In [163]:
# load master edgelist
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')
# only keep edges within federal circuit
case_ids = set(case_metadata.index)
edgelist = master_edgelist[master_edgelist.citing.isin(case_ids) & master_edgelist.cited.isin(case_ids)]
# save federal edgelist
edgelist.to_csv(experiment_data_dir + 'edgelist.csv', index=False)
In [ ]:
# build the citation graph from the saved metadata/edgelist files
# presumably writes <network_name>_network.graphml into experiment_data_dir —
# the Read_GraphML cell below relies on that filename; TODO confirm
make_graph(experiment_data_dir, network_name)
TODO: verify that `make_graph` reproduces the graph built by the commented-out cells below, then delete those cells.
In [164]:
# case_metadata = pd.read_csv(experiment_data_dir + 'case_metadata.csv', index_col=0)
# case_metadata.index = case_metadata.index.astype('str')
# edgelist = pd.read_csv(experiment_data_dir + 'edgelist.csv', index_col=False)
# edgelist['citing'] = edgelist['citing'].astype(str)
# edgelist['cited'] = edgelist['cited'].astype(str)
In [165]:
# # initialize graph
# G = ig.Graph(n=case_metadata.shape[0], directed=True)
# # add opinion names
# G.vs['name'] = case_metadata.index
# # opinion to ig index mapping
# op_to_ig = {op_id: G.vs.find(name=op_id).index for op_id in G.vs['name']}
# # convert edgelist to ig ids
# edgelist_ig = edgelist.apply(lambda c: [op_to_ig[str(op_id)] for op_id in c])
# # add edes to graph
# G.add_edges(edgelist_ig.as_matrix().tolist())
In [121]:
# # add igraph index to case metadata
# case_metadata['ig_index'] = 0
# case_metadata.loc[G.vs['name'],'ig_index'] = range(len(G.vs))
# # set missing issueArea to 0
# no_issueArea = case_metadata.index[case_metadata['issueArea'].isnull()]
# case_metadata.loc[no_issueArea,'issueArea'] = 0
# case_metadata['issueArea'] = case_metadata['issueArea'].astype(int)
# # add year
# case_metadata['date'] = pd.to_datetime(case_metadata['date'])
# case_metadata['year'] = case_metadata['date'].apply(lambda d: d.year)
In [176]:
# # add node metadata to graph
# # pretty sure this is the right order
# G.vs['year'] = case_metadata['year']
# G.vs['issueArea'] = case_metadata['issueArea']
# G.vs['issueArea'] = case_metadata['issueArea']
# G.vs['court'] = case_metadata['court']
In [179]:
# G.write_graphml(experiment_data_dir + 'federal_network.graphml')
TODO: add case metadata
In [143]:
# write one text file per opinion for each court in the network
for court in courts:
    start = time.time()

    # for SCOTUS, exclude the opinions that have no SCDB link
    CLid_bad = no_scdb_link if court == 'scotus' else None

    make_text_files(data_dir,
                    court,
                    CLid_good=None,
                    CLid_bad=CLid_bad,
                    output_path=text_dir)

    print('%s took %d seconds' % (court, time.time() - start))
In [1]:
# normalize textfiles
%time %time make_tf_idf(text_dir, experiment_data_dir, min_df=0, max_df=1)
In [5]:
# load the citation network saved as GraphML (e.g. federal_network.graphml)
G = ig.Graph.Read_GraphML(experiment_data_dir + network_name +'_network.graphml')
In [7]:
# vertex metrics computed for every yearly snapshot of the network
vertex_metrics = [
    'indegree',
    'outdegree',
    'degree',
    'd_pagerank',
    'authorities',
    'hubs',
]

# snapshot years: 1900 through 2015 inclusive
active_years = range(1900, 2016)
In [10]:
# compute the per-year vertex metric snapshots and save them to disk
%time make_snapshot_vertex_metrics(G, active_years, vertex_metrics, experiment_data_dir)
In [ ]: