In [2]:
top_directory = '/Users/iaincarmichael/Dropbox/Research/law/law-net/'
import os
import sys
import time
from math import *
import copy
import cPickle as pickle
import re
import datetime
from collections import Counter
# data
import numpy as np
import pandas as pd
# viz
import matplotlib.pyplot as plt
# graph
import igraph as ig
# our code
sys.path.append(top_directory + 'code/')
from load_data import load_and_clean_graph, case_info
from pipeline.download_data import download_bulk_resource
from pipeline.make_clean_data import *
from viz import print_describe
from pipeline.make_raw_case_metadata import *
sys.path.append(top_directory + 'explore/vertex_metrics_experiment/code/')
from make_case_text_files import *
# directory set up
data_dir = top_directory + 'data/'
court_name = 'scotus'
# jupyter notebook settings
%load_ext autoreload
%autoreload 2
%matplotlib inline
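The helpers json_to_dict and get_text_from_json are imported from the repo's pipeline code above. For reference, here is a minimal sketch of what they plausibly do, assuming the CourtListener opinion JSON keeps its text in fields like 'plain_text' or 'html' (the field names and fallback order are our assumptions, not taken from the repo):
In [ ]:
import json
from bs4 import BeautifulSoup

def json_to_dict(path):
    # load a downloaded CourtListener opinion JSON file into a dict
    with open(path, 'r') as f:
        return json.load(f)

def get_text_from_json(opinion):
    # sketch: return text from the first populated text field;
    # the field names are an assumption about the CourtListener schema
    for field in ['plain_text', 'html', 'html_with_citations']:
        if opinion.get(field):
            # strip html tags if the field holds markup
            return BeautifulSoup(opinion[field], 'html.parser').get_text()
    return ''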
In [4]:
# load scdb
scdb_modern = pd.read_csv(data_dir + 'scdb/SCDB_2016_01_caseCentered_Citation.csv', index_col=0)
scdb_legacy = pd.read_csv(data_dir + 'scdb/SCDB_Legacy_03_caseCentered_Citation.csv', index_col=0)
scdb = scdb_legacy.append(scdb_modern)
In [3]:
%time case_metadata = get_raw_case_metadata_from_court(court_name, data_dir)
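The cells below rely on only a few pieces of case_metadata: its index (CourtListener opinion ids as strings), a 'date' column with datetime-like values, and an 'scdb_id' string column that is empty when the case has no SCDB link. A quick sanity check:
In [ ]:
# peek at the two columns used downstream
case_metadata[['date', 'scdb_id']].head()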
In [5]:
# boolean flags for each case (text_length will instead hold a character count)
cert_cases = pd.DataFrame(False,
                          index=case_metadata.index,
                          columns=['denied', 'certiorari', 'zero_degree',
                                   'scdb_link', 'in_scdb', 'text_length'])
cert_cases['text_length'] = 0

# add year
cert_cases['year'] = case_metadata['date'].apply(lambda d: d.year)
In [24]:
# string-search the opinion text for the words 'certiorari' or 'denied'
op_dir = data_dir + 'raw/' + court_name + '/opinions/'

# words we want to identify
bad_words = ['denied', 'certiorari']

i = 0
# check each opinion
for op_id in case_metadata.index:
    i += 1
    # print progress at each power of two
    if int(log(i, 2)) == log(i, 2):
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, len(case_metadata.index), current_time)

    # grab the opinion file
    op_path = op_dir + str(op_id) + '.json'
    opinion = json_to_dict(op_path)

    # get the lower case text
    text = get_text_from_json(opinion)
    text = text.lower()

    # check each word in the text file
    for word in bad_words:
        if word in text:
            cert_cases.loc[op_id, word] = True

    # record the text length so very short opinions can be flagged later
    cert_cases.loc[op_id, 'text_length'] = len(text)
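The progress check above tests whether i is a power of two via exact float equality on log(i, 2); that works at these loop sizes, but a bitwise test avoids floating point entirely. A small helper one might factor out for both loops in this notebook (the helper name is ours, not from the repo):
In [ ]:
def log2_progress(i, n):
    # print a timestamped progress line whenever i is a power of two;
    # i & (i - 1) == 0 is the standard bitwise power-of-two test
    if i > 0 and i & (i - 1) == 0:
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, n, current_time)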
In [26]:
master_edgelist = pd.read_csv(data_dir + 'raw/edgelist_master_r.csv')

# dict keyed by case id indicating whether the case is mentioned in the edgelist
max_id = max(master_edgelist['citing'].max(), master_edgelist['cited'].max())
mentions = {str(op_id): False for op_id in range(1, max_id + 1)}

i = 0
for index, edge in master_edgelist.iterrows():
    i += 1
    if int(log(i, 2)) == log(i, 2):
        current_time = datetime.datetime.now().strftime('%H:%M:%S')
        print '(%d/%d) at %s' % (i, len(master_edgelist), current_time)

    ing_op_id = str(edge['citing'])
    ed_op_id = str(edge['cited'])

    # cited opinion is mentioned
    mentions[ed_op_id] = True

    # citing opinion counts as mentioned only if the cited case is not detroit lumber
    if ed_op_id != 'g':
        mentions[ing_op_id] = True
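Since igraph is already imported, the same zero-degree screen can be done by building the citation graph directly. A sketch of the alternative (note it skips the detroit lumber special case above, so the flagged set can differ slightly):
In [ ]:
# build the citation graph; TupleList keeps the case ids as vertex names
edges = zip(master_edgelist['citing'].astype(str), master_edgelist['cited'].astype(str))
g = ig.Graph.TupleList(edges, directed=True)
mentioned = set(g.vs['name'])

# cases that never appear as either endpoint of an edge
zero_deg = [op_id for op_id in case_metadata.index if op_id not in mentioned]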
In [27]:
case_ids = set(case_metadata.index)
zero_deg_cases = [op_id for op_id in mentions.keys() if (not mentions[op_id]) and (op_id in case_ids)]
cert_cases.loc[zero_deg_cases, 'zero_degree'] = True
In [10]:
# scdb ids
scdb_ids = set(scdb.index)

for index, row in case_metadata.iterrows():
    # check if case has a link to an SCDB id
    if len(row['scdb_id']) > 0:
        cert_cases.loc[index, 'scdb_link'] = True

    # check if the linked SCDB id actually appears in SCDB
    if row['scdb_id'] in scdb_ids:
        cert_cases.loc[index, 'in_scdb'] = True
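Both flags can also be set without the loop; a vectorized sketch, relying on scdb_id holding an empty string when there is no link (cert_cases and case_metadata share the same index by construction):
In [ ]:
# vectorized equivalent of the loop above
cert_cases['scdb_link'] = case_metadata['scdb_id'].str.len() > 0
cert_cases['in_scdb'] = case_metadata['scdb_id'].isin(scdb_ids)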
In [15]:
# opinions with no link to an SCDB record
cert_cases[~cert_cases['scdb_link']].index
In [80]:
# checkpoint: save / reload the cert flags
# cert_cases.to_csv('cert_cases_data.csv', index=True)
# cert_cases = pd.read_csv('cert_cases_data.csv', index_col=0)
In [196]:
# initialize pandas series of yearly case counts
CL_year_counts = pd.Series(0, index=range(1754, 2017))
scdb_year_counts = pd.Series(0, index=range(1754, 2017))

# count cases per year; cast SCDB's mm/dd/yyyy date strings to int years
# so the Counter keys match the int-valued index below
CL_year_counter = Counter(cert_cases['year'])
scdb_year_counter = Counter(scdb['dateDecision'].apply(lambda d: int(d.split('/')[2])))

# fill the series
for y in CL_year_counts.index:
    CL_year_counts[y] = CL_year_counter[y]
    scdb_year_counts[y] = scdb_year_counter[y]

difference = CL_year_counts - scdb_year_counts

# years that have a lot of extra CourtListener cases
bad_years = difference[difference > 400].index.tolist()

# plot difference
difference.plot()
plt.ylabel('difference')
plt.title('yearly case counts')
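The Counter-and-fill loop can be collapsed with value_counts; an equivalent sketch:
In [ ]:
# yearly counts via value_counts, reindexed onto the full year range
years = range(1754, 2017)
CL_counts = cert_cases['year'].value_counts().reindex(years, fill_value=0)
scdb_years = scdb['dateDecision'].apply(lambda d: int(d.split('/')[2]))
scdb_counts = scdb_years.value_counts().reindex(years, fill_value=0)
(CL_counts - scdb_counts).plot()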
In [83]:
# fraction of cases flagged in each column (mean of booleans)
cert_cases.mean(axis=0)
In [17]:
cases_denied = cert_cases['denied']
cases_certiorari = cert_cases['certiorari']
cases_zero_degree = cert_cases['zero_degree']
cases_no_scdb_link = ~cert_cases['scdb_link']
cases_notin_scdb = ~cert_cases['in_scdb']
cases_bad_years = cert_cases['year'].apply(lambda y: y in bad_years)
# 2 cases with scdb links but that don't register in scdb
# cert_cases[~cases_no_scdb_link & cases_notin_scdb]
In [188]:
# cert_cases[cases_zero_degree & cases_denied & cases_certiorari & cases_no_scdb_link]
# cert_cases[~cases_bad_years & cases_notin_scdb]
cert_cases[cases_certiorari & cases_denied & cases_zero_degree & ~cases_no_scdb_link]
In [202]:
# spot check one of the flagged opinions
op_id = 106306
print case_metadata.loc[str(op_id)]
print
print case_info(op_id)
In [30]:
# save the opinion ids with no SCDB link for later inspection
cases_no_scdb = pd.Series(cert_cases[~cert_cases['scdb_link']].index)
cases_no_scdb.to_csv('no_scdb_link.csv', index=False)