This script cleans the scotus network.
SCOTUS hears on the order of 100 cases a year; however, a few years have several thousand .json files. Many of these .json files correspond to 'applications' to SCOTUS. We would like to remove these from the network.
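We remove a case if it has zero degree in the network AND its opinion text contains 'denied' or 'certiorari' (matched case-insensitively).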
Outline:
Problems:
TODO:
In [2]:
#!/usr/bin/env python3
from pandas import DataFrame, read_csv
import time
import pandas as pd
import sys
from igraph import *
import glob
import re
import json
from bs4 import *
import os
import numpy as np
In [3]:
# Read in the edge list (LGL format) as a directed graph. Later cells look
# vertices up by their 'name' attribute, which is of the form 'id<case id>'.
scotus_network = Graph.Read_Lgl(
    '../../data/created/scotus/original/scotus_net_all_lgl.txt',
    names='name',
    directed=True,
)
# igraph's summary() writes the summary to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
summary(scotus_network)
In [5]:
# Add in meta data: store each cluster's 'date_filed' value as the 'date'
# attribute of the vertex named 'id<case id>'.
cluster_dir = "../../data/downloaded/clusters/scotus"

# Only keep the .json cluster files: any other directory entry (e.g. a hidden
# file) would yield a bogus id and a missing-file error in the loop below.
case_list_cl = [
    os.path.splitext(f)[0]
    for f in os.listdir(cluster_dir)
    if f.endswith('.json')
]

loop_start = time.time()
for i in case_list_cl:
    filename = cluster_dir + "/" + str(i) + ".json"
    with open(filename, encoding='utf-8') as data_file:
        # json.load() no longer accepts an `encoding` argument (it was removed
        # in Python 3.9 and raises TypeError there); open() already decodes
        # the file as UTF-8.
        cluster_data = json.load(data_file)
    date = cluster_data['date_filed']
    scotus_network.vs.select(name='id' + str(i))['date'] = date
loop_end = time.time()
print('the loop took ' + str(loop_end - loop_start) + "s")
In [ ]:
# Find cases that have zero degree AND whose opinion text contains one of
#   'denied', 'certiorari', 'certiorari denied', 'certiorari granted'
# (all matched case-insensitively).
opinion_dir = "../../data/downloaded/opinions/scotus"

# NOTE(review): case_list_op was used below without ever being defined in this
# script (NameError on a top-to-bottom run). It is built here the same way as
# the cluster id list, from the opinions directory -- confirm this matches the
# list that was originally intended.
case_list_op = [
    os.path.splitext(f)[0]
    for f in os.listdir(opinion_dir)
    if f.endswith('.json')
]

# Opinion text fields, tried in this order until one is non-empty.
text_fields = ('html', 'html_with_citations', 'plain_text', 'html_lawbox')

denied_all = []  # deg = 0, contains 'denied'
certiorari_all = []  # deg = 0, contains 'certiorari'
certiorari_denied_all = []  # deg = 0, contains 'certiorari denied'
certiorari_granted_all = []  # deg = 0, contains 'certiorari granted'
years_all = []  # entries of the form '<tag>-<year>-<case id>'

# (compiled pattern, list collecting matching ids, tag used in years_all).
# Compiled once here instead of on every loop iteration.
phrase_checks = [
    (re.compile(r'denied', re.IGNORECASE), denied_all, 'd'),
    (re.compile(r'certiorari', re.IGNORECASE), certiorari_all, 'c'),
    (re.compile(r'certiorari denied', re.IGNORECASE),
     certiorari_denied_all, 'cd'),
    (re.compile(r'certiorari granted', re.IGNORECASE),
     certiorari_granted_all, 'cg'),
]

# NOTE(review): the original indentation was lost; k is kept as a count of
# the cases visited by the loop -- confirm if it was meant to count something
# else.
k = 0
loop_start = time.time()
for i in case_list_op:
    current_vertex = scotus_network.vs.select(name='id' + str(i))

    # An opinion with no vertex in the network has nothing to remove; skip it
    # instead of failing with IndexError on degree()[0].
    if len(current_vertex) > 0 and current_vertex.degree()[0] == 0:
        filename_op = opinion_dir + "/" + str(i) + ".json"
        with open(filename_op, encoding='utf-8') as data_file:
            # `encoding` is not a valid json.load() argument on Python 3.9+;
            # open() already decodes the file as UTF-8.
            op_data = json.load(data_file)

        # Take the first non-empty text field. The previous if/elif chain
        # tested the same condition in every branch, so only the first
        # fallback was ever reachable and the "no text" message never printed.
        text = ''
        for field in text_fields:
            candidate = op_data.get(field)
            if candidate:
                text = candidate
                break
        else:
            print('case ' + str(i) + ' has no text')

        year = current_vertex['date'][0].split('-')[0]

        for pattern, matching_ids, tag in phrase_checks:
            if pattern.search(text):
                matching_ids.append(i)
                years_all.append(tag + '-' + year + '-' + str(i))
    k = k + 1
loop_end = time.time()
print('the loop took ' + str(loop_end - loop_start) + "s")
In [ ]:
# Kick out cases we don't want: cases that have zero degree and whose text
# contains either 'denied' or 'certiorari'.
cases_to_kick_out = list(set(denied_all) | set(certiorari_all))

# Delete every unwanted vertex in a single call. Deleting one vertex at a time
# costs a full-graph name search plus a graph reindex per case.
bad_names = {'id' + str(i) for i in cases_to_kick_out}
if bad_names:
    bad_vertices = scotus_network.vs.select(name_in=bad_names)
    scotus_network.delete_vertices(bad_vertices)
In [ ]:
# Write the cleaned network to disk in GML format.
# NOTE(review): the output file name ends in .txt although the content is
# GML -- confirm downstream readers expect this path/format.
scotus_network.write_gml('../../data/created/scotus/clean/scotus_net_clean.txt')