In [2]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'

import os
import sys
import time
from math import *
import copy
import cPickle as pickle
import re
import datetime
from collections import Counter


# data
import numpy as np
import pandas as pd

# viz
import matplotlib.pyplot as plt


# graph
import igraph as ig


# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe

from pipeline.make_raw_case_metadata import *

sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *


# directory set up
data_dir = top_directory + 'data/'

court_name = 'scotus'

# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [4]:
# load scdb
scdb_modern = pd.read_csv(data_dir + 'scdb/SCDB_2016_01_caseCentered_Citation.csv', index_col=0)
scdb_legacy = pd.read_csv(data_dir + 'scdb/SCDB_Legacy_03_caseCentered_Citation.csv', index_col=0)

scdb = scdb_legacy.append(scdb_modern)

In [3]:
%time case_metadata = get_raw_case_metadata_from_court(court_name, data_dir)


CPU times: user 1min 30s, sys: 8.18 s, total: 1min 39s
Wall time: 1min 52s

In [5]:
cert_cases = pd.DataFrame(False,
                          index=case_metadata.index,
                          columns=['denied', 'certiorari', 'zero_degree', 'scdb_link', 'in_scdb', 'text_length'])

cert_cases['text_length'] = 0

# add year
cert_cases['year'] = case_metadata['date'].apply(lambda d: d.year)

In [ ]:

find cases with bad words or short text


In [24]:
# string search text for the words certiorari or denied

op_dir = data_dir + 'raw/' + court_name + '/opinions/'

# words we want to identify
bad_words = ['denied', 'certiorari']

i = 0

# check each opinion
for op_id in case_metadata.index:
    i += 1
    if int(log(i, 2)) == log(i, 2):
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, len(case_metadata.index), current_time)
    
    
    # grab the opinion file
    op_path = op_dir + str(op_id) + '.json'
    opinion = json_to_dict(op_path)
    
    # get the lower case text
    text = get_text_from_json(opinion)
    text = text.lower()
    
    # check each word in the text file
    for word in bad_words:
        if word in text:
            cert_cases.loc[op_id, word] = True
    
    # check if the text is really short
    cert_cases.loc[op_id,'text_length'] = len(text)


(1/63859) at 10:40:40
(2/63859) at 10:40:40
(4/63859) at 10:40:40
(8/63859) at 10:40:40
(16/63859) at 10:40:40
(32/63859) at 10:40:40
(64/63859) at 10:40:40
(128/63859) at 10:40:41
(256/63859) at 10:40:42
(512/63859) at 10:40:44
(1024/63859) at 10:40:50
(2048/63859) at 10:41:01
(4096/63859) at 10:41:19
(8192/63859) at 10:41:59
(16384/63859) at 10:43:18
(32768/63859) at 10:46:00

find cases with zero degree


In [26]:
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')

# dict keyed by case indicated if case is mentioned in the edgelist
max_id = max(master_edgelist['citing'].max(), master_edgelist['cited'].max())
mentions = {str(op_id): False for op_id in range(1, max_id + 1)}


i = 0
for index, edge in master_edgelist.iterrows():
    i += 1
    if int(log(i, 2)) == log(i, 2):
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, len(master_edgelist), current_time)
 
    # citing opinion mentioned
    ing_op_id = str(edge[0])
    ed_op_id = str(edge[1])

    # cited opinion mentioned
    mentions[ed_op_id] = True

    # citing mentioned only if cited is not detroit lumber
    if ed_op_id != 'g':
        mentions[ing_op_id] = True


(1/25292533) at 10:53:50
(2/25292533) at 10:53:50
(4/25292533) at 10:53:50
(8/25292533) at 10:53:50
(16/25292533) at 10:53:50
(32/25292533) at 10:53:50
(64/25292533) at 10:53:50
(128/25292533) at 10:53:50
(256/25292533) at 10:53:50
(512/25292533) at 10:53:50
(1024/25292533) at 10:53:50
(2048/25292533) at 10:53:50
(4096/25292533) at 10:53:50
(8192/25292533) at 10:53:51
(16384/25292533) at 10:53:52
(32768/25292533) at 10:53:54
(65536/25292533) at 10:53:58
(131072/25292533) at 10:54:06
(262144/25292533) at 10:54:23
(524288/25292533) at 10:54:55
(1048576/25292533) at 10:56:03
(2097152/25292533) at 10:58:56
(4194304/25292533) at 11:03:26
(8388608/25292533) at 11:11:58
(16777216/25292533) at 11:29:40

In [27]:
case_ids = set(case_metadata.index)
zero_deg_cases = [op_id for op_id in mentions.keys() if (not mentions[op_id]) and (op_id in case_ids)]

cert_cases.loc[zero_deg_cases, 'zero_degree'] = True

find cases without scdb links


In [10]:
# scdb ids
scdb_ids = set(scdb.index)


for index, row in case_metadata.iterrows():
    
    # check if case has link to SCDB id
    if len(row['scdb_id']) > 0:
        cert_cases.loc[index, 'scdb_link'] = True
        
        # check if SCDB id is in SCDB ids
        if row['scdb_id'] in scdb_ids:
            cert_cases.loc[index, 'in_scdb'] = True

In [15]:
cert_cases[~cert_cases['scdb_link']].index


Out[15]:
Index([u'142945', u'130423', u'106301', u'137047', u'1859650', u'137046',
       u'137045', u'127479', u'127478', u'106306',
       ...
       u'131844', u'131847', u'131846', u'143887', u'143880', u'143881',
       u'142944', u'143882', u'143883', u'130422'],
      dtype='object', name=u'id', length=35973)

In [ ]:

save cert_cases file


In [80]:
# cert_cases.to_csv('cert_cases_data.csv', index=True)
# cert_cases = pd.read_csv('cert_cases_data.csv', index_col=0)

compare year counts between CL and scdb


In [196]:
# initialize pandas series
CL_year_counts = pd.Series(0, index=range(1754, 2017))
scdb_year_counts = pd.Series(0, index=range(1754, 2017))

# count cases per year
CL_year_counter = Counter(cert_cases['year'])
scdb_year_counter = Counter(scdb['dateDecision'].apply(lambda d: d.split('/')[2]))

# make fill series
for y in CL_year_counts.index:
    CL_year_counts[y] = CL_year_counter[y]
    scdb_year_counts[y] = scdb_year_counter[y]
    
difference = CL_year_counts - scdb_year_counts

#years that have a lot of extra cases
bad_years = difference[difference > 400].index.tolist()


# plot difference
difference.plot()
plt.ylabel('difference')
plt.title('yearly case counts')


Out[196]:
Int64Index([1993, 1994, 2002, 2003, 2004, 2005], dtype='int64')

analyze cert_cases


In [83]:
cert_cases.mean(axis=0)


Out[83]:
denied             0.713306
certiorari         0.631720
zero_degree        0.405910
scdb_link          0.436681
text_length    12666.539814
year            1962.545859
in_scdb            0.436649
dtype: float64

In [17]:
cases_denied = cert_cases['denied']
cases_certiorari = cert_cases['certiorari']
cases_zero_degree = cert_cases['zero_degree']
cases_no_scdb_link = ~cert_cases['scdb_link']
cases_notin_scdb = ~cert_cases['in_scdb']


cases_bad_years = cert_cases['year'].apply(lambda y: y in bad_years)

# 2 cases with scdb links but that don't registar in scdb
# cert_cases[~cases_no_scdb_link & cases_notin_scdb]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-384f47290624> in <module>()
      6 
      7 
----> 8 cases_bad_years = cert_cases['year'].apply(lambda y: y in bad_years)
      9 
     10 # 2 cases with scdb links but that don't registar in scdb

//anaconda/envs/default2/lib/python2.7/site-packages/pandas/core/series.pyc in apply(self, func, convert_dtype, args, **kwds)
   2218         else:
   2219             values = self.asobject
-> 2220             mapped = lib.map_infer(values, f, convert=convert_dtype)
   2221 
   2222         if len(mapped) and isinstance(mapped[0], Series):

pandas/src/inference.pyx in pandas.lib.map_infer (pandas/lib.c:62658)()

<ipython-input-17-384f47290624> in <lambda>(y)
      6 
      7 
----> 8 cases_bad_years = cert_cases['year'].apply(lambda y: y in bad_years)
      9 
     10 # 2 cases with scdb links but that don't registar in scdb

NameError: global name 'bad_years' is not defined

In [188]:
# cert_cases[cases_zero_degree & cases_denied & cases_certiorari & cases_no_scdb_link]
# cert_cases[~cases_bad_years & cases_notin_scdb] 

cert_cases[cases_certiorari & cases_denied & cases_zero_degree & ~cases_no_scdb_link]


Out[188]:
denied certiorari zero_degree scdb_link text_length year in_scdb
id
112378 True True True True 87117 1990 True
85071 True True True True 35629 1814 True
103080 True True True True 1699 1938 True
94474 True True True True 9377 1896 True
1158161 True True True True 346 1964 True
88873 True True True True 21808 1874 True
99298 True True True True 1370 1919 True
106159 True True True True 1028 1961 True
95370 True True True True 60153 1900 True
95375 True True True True 13012 1900 True
87614 True True True True 24655 1864 True
95725 True True True True 3255 1902 True
88040 True True True True 2692 1869 True

In [202]:
op_id = 106306

print case_metadata.loc[str(op_id)]
print
print case_info(op_id)


date                                    1961-12-04
court                                       scotus
name       sylvester-johnson-v-state-of-new-jersey
judges                                            
scdb_id                                           
term                                          1961
Name: 106306, dtype: object

https://www.courtlistener.com/opinion/106306/sylvester-johnson-v-state-of-new-jersey/
None

cases missing SCDB links


In [30]:
cases_no_scdb = pd.Series(cert_cases[~cert_cases['scdb_link']].index)
cases_no_scdb.to_csv('no_scdb_link.csv', index=False)

In [ ]: