In [22]:
import os, sys, io
import json
#import simplejson as json
from pprint import pprint
import datetime
In [2]:
# Relative path to the root data directory; the file paths below are built from it.
data_path = '../../data'
In [3]:
def get_cases_in_jurisdiction(juris_abv='nced', file_type='opinions', data_path='../../data'):
    """Return the case ids found in one jurisdiction's data directory.

    The directory searched is ``<data_path>/<file_type>/<juris_abv>``. Each
    case is expected to be stored there as ``<case id>.json``.

    Parameters
    ----------
    juris_abv : str
        Jurisdiction abbreviation, used as the innermost directory name.
    file_type : str
        Kind of file to look for, used as the middle directory name.
    data_path : str
        Root path to the data.

    Returns
    -------
    list of int
        One id per ``<digits>.json`` file, in ``os.listdir`` order. An empty
        list (after printing a message) when the directory does not exist.
    """
    # path leading to the jurisdiction files
    path = os.path.join(data_path, file_type, juris_abv)

    # TODO: throw an exception
    # Check that the directory exists
    if not os.path.isdir(path):
        print('not a legal path')
        return []

    case_ids = []
    for file_name in os.listdir(path):
        stem, extension = os.path.splitext(file_name)
        # Only "<digits>.json" names are case files; anything else in the
        # directory (hidden files, notes, ...) would make int() raise.
        if extension == '.json' and stem.isdigit():
            case_ids.append(int(stem))
    return case_ids
In [4]:
# Case ids for the 'nced' jurisdiction, using the default file type and data path.
nced_case_ids = get_cases_in_jurisdiction(juris_abv='nced')
In [5]:
In [6]:
# Cluster and opinion json files for one nced case (file stem 1361899).
cl_file = data_path + '/clusters/nced/1361899.json'
op_file = data_path + '/opinions/nced/1361899.json'
In [15]:
def _utf8(value):
    """Encode a Python 2 ``unicode`` object as a UTF-8 byte string.

    Every other value is returned unchanged. This includes all values on
    Python 3, where ``str is bytes`` is False and text needs no conversion.
    """
    if str is bytes and isinstance(value, unicode):  # noqa: F821 (Python 2 only)
        return value.encode('utf8')
    return value


def _load_json_utf8(path):
    """Load the json object stored at ``path`` and convert it to utf8 from unicode.

    Keys and top-level text values are converted. The 'opinions_cited' list
    is converted element by element. All other values are kept as loaded.
    """
    with open(path) as data_file:
        raw = json.load(data_file)

    converted = {}
    for key, value in raw.items():
        if key == 'opinions_cited' and isinstance(value, list):
            value = [_utf8(item) for item in value]
        converted[_utf8(key)] = _utf8(value)
    return converted


# Open the cluster and opinion json files
cl_data = _load_json_utf8(cl_file)
op_data = _load_json_utf8(op_file)
In [8]:
# Collect the cluster fields of interest into a single dict.
case_data = {}
for field in ('case_name', 'citation_id'):
    if field in cl_data:
        case_data[field] = cl_data[field]

if cl_data.get('date_filed'):
    # make sure date is always in this format: 'year-month-day'
    # datetime.date needs integers, so each piece is converted explicitly.
    year, month, day = [int(part) for part in cl_data['date_filed'].split('-')[:3]]
    case_data['date_filed'] = datetime.date(year, month, day)

# Get the case text: take the first non-empty field, in order of preference.
# Each fallback is tried in turn; .get() also tolerates a missing or null field.
text = ''
for field in ('html', 'html_with_citations', 'plain_text', 'html_lawbox'):
    if op_data.get(field):
        text = op_data[field]
        break
if not text:
    print('case ' + str(case_data.get('citation_id')) + ' has no text')
case_data['case_text'] = text
In [44]:
class Case(object):
    """A single case assembled from its cluster and opinion json files.

    Attributes
    ----------
    case_name
        The cluster's 'case_name' value, or None when the key is absent.
    case_id
        The cluster's 'citation_id' value, or None when the key is absent.
    date
        ``datetime.date`` built from the cluster's 'date_filed' value, or
        None when that value is absent or empty.
    text
        The first non-empty opinion text field, or '' when all are empty.
    """

    # Opinion fields that can hold the case text, in order of preference.
    _TEXT_FIELDS = ('html', 'html_with_citations', 'plain_text', 'html_lawbox')

    def __init__(self, op_file, cl_file):
        """Build the case from the opinion file and the cluster file.

        Parameters
        ----------
        op_file : str
            Path to the opinion json file.
        cl_file : str
            Path to the cluster json file.
        """
        # Open the cluster and opinion json files
        cl_data = self._load_json(cl_file)
        op_data = self._load_json(op_file)

        # Always define the attributes so __repr__ works even when the
        # cluster file lacks one of the keys.
        self.case_name = cl_data.get('case_name')
        self.case_id = cl_data.get('citation_id')

        self.date = None
        date_filed = cl_data.get('date_filed')
        if date_filed:
            # make sure date is always in this format: 'year-month-day'
            year, month, day = [int(part) for part in date_filed.split('-')[:3]]
            self.date = datetime.date(year, month, day)

        # Get the case text: take the first non-empty field. Every fallback
        # is tried in turn, and a missing or null field counts as empty.
        self.text = ''
        for field in self._TEXT_FIELDS:
            if op_data.get(field):
                self.text = op_data[field]
                break
        if not self.text:
            print('case ' + str(self.case_id) + ' has no text')

    @staticmethod
    def _to_utf8(value):
        """Encode a Python 2 ``unicode`` object as UTF-8; pass anything else through.

        On Python 3 ``str is bytes`` is False, so values are never re-encoded.
        """
        if str is bytes and isinstance(value, unicode):  # noqa: F821 (Python 2 only)
            return value.encode('utf8')
        return value

    @classmethod
    def _load_json(cls, path):
        """Load the json object at ``path`` and convert it to utf8 from unicode.

        Keys and top-level text values are converted. The 'opinions_cited'
        list is converted element by element. Other values are kept as loaded.
        """
        with open(path) as data_file:
            raw = json.load(data_file)

        converted = {}
        for key, value in raw.items():
            if key == 'opinions_cited' and isinstance(value, list):
                value = [cls._to_utf8(item) for item in value]
            converted[cls._to_utf8(key)] = cls._to_utf8(value)
        return converted

    def __repr__(self):
        """Return the case name, id and date, one labelled line each."""
        return "Name: \t %s \n"\
               "Id \t %s \n"\
               "Date \t %s \n"\
               % (self.case_name, self.case_id, self.date)
In [45]:
# Build the example case from the opinion and cluster files chosen above.
case = Case(op_file=op_file, cl_file=cl_file)
In [46]:
# A bare last expression is displayed through the object's __repr__.
# `case.__repr__` without parentheses only shows the bound method object.
case
Out[46]:
In [ ]: