In [2]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
import os
import sys
import time
from math import *
import copy
import cPickle as pickle
import re
import datetime
from collections import Counter
# data
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
# graph
import igraph as ig
# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe
from pipeline.make_raw_case_metadata import *
sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
# directory set up
data_dir = top_directory + 'data/'
court_name = 'scotus'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
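The helpers json_to_dict and get_text_from_json are imported from the repo's pipeline code above. For reference, here is a minimal sketch of what they plausibly do, assuming the CourtListener opinion JSON keeps its text in fields like 'plain_text' or 'html' (the field names and fallback order are our assumptions, not taken from the repo):
In [ ]:
import json
from bs4 import BeautifulSoup

def json_to_dict(path):
    # load a downloaded CourtListener opinion JSON file into a dict
    with open(path, 'r') as f:
        return json.load(f)

def get_text_from_json(opinion):
    # sketch: return text from the first populated text field;
    # the field names are an assumption about the CourtListener schema
    for field in ['plain_text', 'html', 'html_with_citations']:
        if opinion.get(field):
            # strip html tags if the field holds markup
            return BeautifulSoup(opinion[field], 'html.parser').get_text()
    return ''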
In [4]:
# load scdb
scdb_modern = pd.read_csv(data_dir + 'scdb/SCDB_2016_01_caseCentered_Citation.csv', index_col=0)
scdb_legacy = pd.read_csv(data_dir + 'scdb/SCDB_Legacy_03_caseCentered_Citation.csv', index_col=0)
scdb = scdb_legacy.append(scdb_modern)
In [3]:
%time case_metadata = get_raw_case_metadata_from_court(court_name, data_dir)
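The cells below rely on only a few pieces of case_metadata: its index (CourtListener opinion ids as strings), a 'date' column with datetime-like values, and an 'scdb_id' string column that is empty when the case has no SCDB link. A quick sanity check:
In [ ]:
# peek at the two columns used downstream
case_metadata[['date', 'scdb_id']].head()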
In [5]:
# boolean flags for each case (text_length will instead hold a character count)
cert_cases = pd.DataFrame(False,
                          index=case_metadata.index,
                          columns=['denied', 'certiorari', 'zero_degree',
                                   'scdb_link', 'in_scdb', 'text_length'])
cert_cases['text_length'] = 0

# add year
cert_cases['year'] = case_metadata['date'].apply(lambda d: d.year)
In [24]:
# string-search the opinion text for the words 'certiorari' or 'denied'
op_dir = data_dir + 'raw/' + court_name + '/opinions/'

# words we want to identify
bad_words = ['denied', 'certiorari']

i = 0
# check each opinion
for op_id in case_metadata.index:
    i += 1
    # print progress at each power of two
    if int(log(i, 2)) == log(i, 2):
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, len(case_metadata.index), current_time)

    # grab the opinion file
    op_path = op_dir + str(op_id) + '.json'
    opinion = json_to_dict(op_path)

    # get the lower case text
    text = get_text_from_json(opinion)
    text = text.lower()

    # check each word in the text file
    for word in bad_words:
        if word in text:
            cert_cases.loc[op_id, word] = True

    # record the text length so very short opinions can be flagged later
    cert_cases.loc[op_id, 'text_length'] = len(text)
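The progress check above tests whether i is a power of two via exact float equality on log(i, 2); that works at these loop sizes, but a bitwise test avoids floating point entirely. A small helper one might factor out for both loops in this notebook (the helper name is ours, not from the repo):
In [ ]:
def log2_progress(i, n):
    # print a timestamped progress line whenever i is a power of two;
    # i & (i - 1) == 0 is the standard bitwise power-of-two test
    if i > 0 and i & (i - 1) == 0:
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, n, current_time)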
In [26]:
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')

# dict keyed by case id indicating whether the case is mentioned in the edgelist
max_id = max(master_edgelist['citing'].max(), master_edgelist['cited'].max())
mentions = {str(op_id): False for op_id in range(1, max_id + 1)}

i = 0
for index, edge in master_edgelist.iterrows():
    i += 1
    if int(log(i, 2)) == log(i, 2):
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, len(master_edgelist), current_time)

    ing_op_id = str(edge['citing'])
    ed_op_id = str(edge['cited'])

    # cited opinion is mentioned
    mentions[ed_op_id] = True

    # citing opinion counts as mentioned only if the cited case is not detroit lumber
    if ed_op_id != 'g':
        mentions[ing_op_id] = True
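Since igraph is already imported, the same zero-degree screen can be done by building the citation graph directly. A sketch of the alternative (note it skips the detroit lumber special case above, so the flagged set can differ slightly):
In [ ]:
# build the citation graph; TupleList keeps the case ids as vertex names
edges = zip(master_edgelist['citing'].astype(str), master_edgelist['cited'].astype(str))
g = ig.Graph.TupleList(edges, directed=True)
mentioned = set(g.vs['name'])

# cases that never appear as either endpoint of an edge
zero_deg = [op_id for op_id in case_metadata.index if op_id not in mentioned]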
In [27]:
case_ids = set(case_metadata.index)
zero_deg_cases = [op_id for op_id in mentions.keys() if (not mentions[op_id]) and (op_id in case_ids)]
cert_cases.loc[zero_deg_cases, 'zero_degree'] = True
In [10]:
# scdb ids
scdb_ids = set(scdb.index)

for index, row in case_metadata.iterrows():
    # check if case has a link to an SCDB id
    if len(row['scdb_id']) > 0:
        cert_cases.loc[index, 'scdb_link'] = True

    # check if the linked SCDB id actually appears in SCDB
    if row['scdb_id'] in scdb_ids:
        cert_cases.loc[index, 'in_scdb'] = True
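Both flags can also be set without the loop; a vectorized sketch, relying on scdb_id holding an empty string when there is no link (cert_cases and case_metadata share the same index by construction):
In [ ]:
# vectorized equivalent of the loop above
cert_cases['scdb_link'] = case_metadata['scdb_id'].str.len() > 0
cert_cases['in_scdb'] = case_metadata['scdb_id'].isin(scdb_ids)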
In [15]:
# opinions with no link to an SCDB record
cert_cases[~cert_cases['scdb_link']].index
In [80]:
# checkpoint: save / reload the cert flags
# cert_cases.to_csv('cert_cases_data.csv', index=True)
# cert_cases = pd.read_csv('cert_cases_data.csv', index_col=0)
In [196]:
# initialize pandas series of yearly case counts
CL_year_counts = pd.Series(0, index=range(1754, 2017))
scdb_year_counts = pd.Series(0, index=range(1754, 2017))

# count cases per year; cast SCDB's mm/dd/yyyy date strings to int years
# so the Counter keys match the int-valued index below
CL_year_counter = Counter(cert_cases['year'])
scdb_year_counter = Counter(scdb['dateDecision'].apply(lambda d: int(d.split('/')[2])))

# fill the series
for y in CL_year_counts.index:
    CL_year_counts[y] = CL_year_counter[y]
    scdb_year_counts[y] = scdb_year_counter[y]

difference = CL_year_counts - scdb_year_counts

# years that have a lot of extra CourtListener cases
bad_years = difference[difference > 400].index.tolist()

# plot difference
difference.plot()
plt.ylabel('difference')
plt.title('yearly case counts')
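The Counter-and-fill loop can be collapsed with value_counts; an equivalent sketch:
In [ ]:
# yearly counts via value_counts, reindexed onto the full year range
years = range(1754, 2017)
CL_counts = cert_cases['year'].value_counts().reindex(years, fill_value=0)
scdb_years = scdb['dateDecision'].apply(lambda d: int(d.split('/')[2]))
scdb_counts = scdb_years.value_counts().reindex(years, fill_value=0)
(CL_counts - scdb_counts).plot()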
In [83]:
# fraction of cases flagged in each column (mean of booleans)
cert_cases.mean(axis=0)
In [17]:
cases_denied = cert_cases['denied']
cases_certiorari = cert_cases['certiorari']
cases_zero_degree = cert_cases['zero_degree']
cases_no_scdb_link = ~cert_cases['scdb_link']
cases_notin_scdb = ~cert_cases['in_scdb']
cases_bad_years = cert_cases['year'].apply(lambda y: y in bad_years)
# 2 cases with scdb links but that don't register in scdb
# cert_cases[~cases_no_scdb_link & cases_notin_scdb]
In [188]:
# cert_cases[cases_zero_degree & cases_denied & cases_certiorari & cases_no_scdb_link]
# cert_cases[~cases_bad_years & cases_notin_scdb]
cert_cases[cases_certiorari & cases_denied & cases_zero_degree & ~cases_no_scdb_link]
In [202]:
# spot check one of the flagged opinions
op_id = 106306
print case_metadata.loc[str(op_id)]
print
print case_info(op_id)
In [30]:
# save the opinion ids with no SCDB link for later inspection
cases_no_scdb = pd.Series(cert_cases[~cert_cases['scdb_link']].index)
cases_no_scdb.to_csv('no_scdb_link.csv', index=False)