notebook.community

Edit and run



In [22]:

    
import os, sys, io
import json
#import simplejson as json
from pprint import pprint
import datetime



In [2]:

    
data_path = '../../data'



In [3]:

    
# Given the juristidction, file type and root path to data
# Returns a list of case ids in that jurisdiction
def get_cases_in_jurisdiction( juris_abv = 'nced', file_type = 'opinions', data_path = '../../data'):
    
    # path leading to the jurisdiction files
    path = data_path + '/'+ file_type + '/' + juris_abv + '/'
    
    # TODO: throw an exception
    # Check that the directory exists
    if not os.path.isdir(path):
        print 'not a legal path'
        return []
    else:
        return [int(f.split('.json')[0]) for f in os.listdir(path)]



In [4]:

    
nced_case_ids = get_cases_in_jurisdiction('nced')



In [5]:



In [6]:

    
cl_file = data_path + '/clusters/nced/1361899.json'
op_file = data_path + '/opinions/nced/1361899.json'



In [15]:

    
# Open the cluster and opinion json files
with open(cl_file) as data_file:    
    cl_data_temp = json.load(data_file)
    
with open(op_file) as data_file:    
    op_data_temp = json.load(data_file)

# TODO: do this more succinctly
# Convert to utf8 from unicode
cl_data = {}
for k in cl_data_temp.keys():
    value = cl_data_temp[k]
    if k == 'opinions_cited':
        cl_data['opinions_cited'] = [v.encode('utf8') for v in value]
    elif type(value) == unicode:
        cl_data[k.encode('utf8')] = value.encode('utf8')
    else:
        cl_data[k.encode('utf8')] = value
        
        
op_data = {}
for k in op_data_temp.keys():
    value = op_data_temp[k]
    if k == 'opinions_cited':
        op_data['opinions_cited'] = [v.encode('utf8') for v in value]
    elif type(value) == unicode:
        op_data[k.encode('utf8')] = value.encode('utf8')
    else:
        op_data[k.encode('utf8')] = value



In [8]:

    
case_data = {}

for k in cl_data.keys():
    if k == 'case_name':
        case_data[k] = cl_data[k]
        
    if k == 'citation_id':
        case_data[k] = cl_data[k]

    if k == 'date_filed' 
        date_explode = cl_data['date_filed'].split('-') # make sure date is always in this format
        file_date = datetime.date(date_explode[0], date_explode[1], date_explode[2])
        case_data[k] = file_date

# Get the case text 
text = op_data['html']
if len(text) == 0:
    text = op_data['html_with_citations']
elif len(text) == 0:
    text = op_data['plain_text']
elif len(text) == 0:
    text = op_data['html_lawbox']
elif len(text) == 0:
    text = ''
    print('case ' + str(i) + ' has no text')
    
    
case_data['case_text'] = text



In [44]:

    
class Case:
    def __init__(self, op_file, cl_file):
    
        # Open the cluster and opinion json files
        with open(cl_file) as data_file:    
            cl_data_temp = json.load(data_file)

        with open(op_file) as data_file:    
            op_data_temp = json.load(data_file)

        # TODO: do this more succinctly
        # Convert to utf8 from unicode
        cl_data = {}
        for k in cl_data_temp.keys():
            value = cl_data_temp[k]
            if k == 'opinions_cited':
                cl_data['opinions_cited'] = [v.encode('utf8') for v in value]
            elif type(value) == unicode:
                cl_data[k.encode('utf8')] = value.encode('utf8')
            else:
                cl_data[k.encode('utf8')] = value

        op_data = {}
        for k in op_data_temp.keys():
            value = op_data_temp[k]
            if k == 'opinions_cited':
                op_data['opinions_cited'] = [v.encode('utf8') for v in value]
            elif type(value) == unicode:
                op_data[k.encode('utf8')] = value.encode('utf8')
            else:
                op_data[k.encode('utf8')] = value




        for k in cl_data.keys():
            if k == 'case_name':
                self.case_name = cl_data[k]

            if k == 'citation_id':
                self.case_id = cl_data[k]

            if k == 'date_filed': 
                date_explode = cl_data['date_filed'].split('-') # make sure date is always in this format
                file_date = datetime.date(int(date_explode[0]), int(date_explode[1]), int(date_explode[2]))
                self.date = file_date

        # Get the case text 
        text = op_data['html']
        if len(text) == 0:
            text = op_data['html_with_citations']
        elif len(text) == 0:
            text = op_data['plain_text']
        elif len(text) == 0:
            text = op_data['html_lawbox']
        elif len(text) == 0:
            text = ''
            print('case ' + str(i) + ' has no text')
        
        self.text = text
    
    def __repr__(self):
        return "Name: \t %s \n"\
               "Id \t %s \n"\
               "Date \t %s \n"\
                % (self.case_name, self.case_id, self.date )



In [45]:

    
case = Case(op_file, cl_file)



In [46]:

    
case.__repr__









    Out[46]:





<bound method Case.__repr__ of Name: 	 Stott v. Martin 
Id 	 1334670 
Date 	 1992-02-12 
>



In [ ]: