In [1]:
# Imports
import numpy
import pandas

import sklearn
import sklearn.dummy
import sklearn.ensemble
import sklearn.metrics
import sklearn.preprocessing
In [2]:
# Matplotlib setup
# Render figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn
# Apply seaborn's default plot styling globally.
seaborn.set()
In [3]:
# Load justice-centered SCDB data
# Source: Supreme Court Database (SCDB), 2013 release 01, justice-centered,
# citation-organized CSV (http://scdb.wustl.edu/). One row per justice-vote.
scdb_data = pandas.read_csv("data/SCDB_2013_01_justiceCentered_Citation.csv")
In [4]:
"""
Setup the outcome map.
Rows correspond to vote types. Columns correspond to disposition types.
Element values correspond to:
* -1: no precedential issued opinion or uncodable, i.e., DIGs
* 0: affirm, i.e., no change in precedent
* 1: reverse, i.e., change in precent
"""
outcome_map = pandas.DataFrame([[-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
[-1, 1, 0, 0, 0, 1, 0, -1, -1, -1, -1],
[-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
[-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
[-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
[-1, 0, 0, 0, -1, 0, -1, -1, -1, -1, -1]])
outcome_map.columns = range(1, 12)
outcome_map.index = range(1, 9)
def get_outcome(vote, disposition):
"""
Return the outcome code.
"""
if pandas.isnull(vote) or pandas.isnull(disposition):
return -1
return outcome_map.loc[int(vote), int(disposition)]
In [5]:
# Map the case-level disposition outcome.
# Row 1 of outcome_map gives the disposition -> outcome mapping; reindex
# aligns it to the (possibly NaN-containing) disposition column, yielding
# NaN for unmatched labels instead of raising on modern pandas.
scdb_data.loc[:, "case_outcome_disposition"] = outcome_map.loc[1].reindex(scdb_data["caseDisposition"]).values
scdb_data.loc[:, "lc_case_outcome_disposition"] = outcome_map.loc[1].reindex(scdb_data["lcDisposition"]).values
# Map the justice-level disposition outcome.
# BUG FIX: column selection previously used a tuple, which pandas treats
# as a single (MultiIndex) key; a list selects the two columns.
scdb_data.loc[:, "justice_outcome_disposition"] = scdb_data.loc[:, ["vote", "caseDisposition"]] \
    .apply(lambda row: get_outcome(row["vote"], row["caseDisposition"]), axis=1)
In the section below, we define methods that handle the execution and analysis of simulations. Simulations are built from a few components defined in the following cells: a court-to-circuit mapping, fixed class vocabularies for the categorical variables, a feature-encoding step, a per-term prediction method, and a simulation driver that trains on all prior terms and predicts the current one.
In [6]:
# Court to circuit mapping, which maps from SCDB codebook to the actual Circuit number
# http://scdb.wustl.edu/documentation.php?var=caseOrigin
# http://scdb.wustl.edu/documentation.php?var=caseSource
court_circuit_map = {1: 13,
                     2: 13, 3: 13, 4: 14, 5: 14, 6: 13, 7: 13, 8: 13,
                     9: 22, 10: 99, 12: 9, 13: 99, 14: 13, 15: 99, 16: 99,
                     17: 99, 18: 99, 19: 0, 20: 22, 21: 1, 22: 2, 23: 3,
                     24: 4, 25: 5, 26: 6, 27: 7, 28: 8, 29: 9, 30: 10,
                     31: 11, 32: 12, 41: 11, 42: 11, 43: 11, 44: 9, 45: 9,
                     46: 8, 47: 8, 48: 9, 49: 9, 50: 9, 51: 9, 52: 10, 53: 2,
                     54: 3, 55: 12, 56: 11, 57: 11, 58: 11, 59: 11, 60: 11,
                     61: 11, 62: 9, 63: 9, 64: 9, 65: 7, 66: 7, 67: 7, 68: 7,
                     69: 7, 70: 8, 71: 8, 72: 10, 73: 6, 74: 6, 75: 5, 76: 5,
                     77: 5, 78: 1, 79: 4, 80: 1, 81: 6, 82: 6, 83: 8, 84: 5,
                     85: 5, 86: 8, 87: 8, 88: 9, 89: 8, 90: 9, 91: 1, 92: 3,
                     93: 10, 94: 2, 95: 2, 96: 2, 97: 2, 98: 4, 99: 4, 100: 4,
                     101: 8, 102: 9, 103: 6, 104: 6, 105: 10, 106: 10, 107: 10,
                     108: 9, 109: 3, 110: 3, 111: 3, 112: 1, 113: 1, 114: 4,
                     115: 8, 116: 6, 117: 6, 118: 6, 119: 5, 120: 5, 121: 5,
                     122: 5, 123: 10, 124: 2, 125: 3, 126: 4, 127: 4, 128: 9,
                     129: 9, 130: 4, 131: 4, 132: 7, 133: 7, 134: 10, 150: 5,
                     151: 9, 152: 4, 153: 7, 155: 4, 160: 4, 162: 11, 163: 5,
                     164: 11, 165: 7, 166: 7, 167: 8, 168: 6, 169: 5, 170: 8,
                     171: 3, 172: 3, 173: 2, 174: 4, 175: 6, 176: 3, 177: 3,
                     178: 5, 179: 4, 180: 4, 181: 7, 182: 6, 183: 3, 184: 9,
                     185: 11, 186: 8, 187: 5, 300: 0, 301: 0, 302: 0, 400: 99,
                     401: 99, 402: 99, 403: 11, 404: 8, 405: 9, 406: 2, 407: 3,
                     408: 11, 409: 11, 410: 7, 411: 7, 412: 8, 413: 10, 414: 6,
                     415: 5, 416: 1, 417: 4, 418: 1, 419: 6, 420: 8,
                     421: 5, 422: 8, 423: 9, 424: 1, 425: 3, 426: 2,
                     427: 4, 428: 6, 429: 9, 430: 3, 431: 1, 432: 4, 433: 6,
                     434: 5, 435: 2, 436: 4, 437: 4, 438: 7,
                     439: 10, 440: 12, 441: 8, 442: 10, 443: 9}


def map_circuit(value):
    """
    Map an SCDB court code to its circuit number.

    :param value: SCDB caseSource/caseOrigin code; may be NaN or any
        unrecognized value
    :return: circuit number, or 0 when the code is unknown/missing
    """
    # BUG FIX: narrowed from a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit. KeyError covers unknown codes (incl.
    # NaN); TypeError covers unhashable inputs.
    try:
        return court_circuit_map[value]
    except (KeyError, TypeError):
        return 0
# Get lists of classes for categorical vars.
# Each list is the fixed class vocabulary passed to label_binarize in
# preprocess_data, so train and test matrices share identical columns.
# A leading 0 covers the NaN-fill sentinel used throughout.
def get_code_list(series):
    """Return [0] plus the sorted unique integer codes of a series (NaN -> 0)."""
    code_list = [0]
    code_list.extend(sorted(series.fillna(0).apply(int).unique().tolist()))
    return code_list

# Issue area
issue_area_codes = get_code_list(scdb_data['issueArea'])
# Issue
issue_codes = get_code_list(scdb_data['issue'])
# Courts (vocabulary comes from the mapping values, not the data)
court_circuit_codes = [0]
court_circuit_codes.extend(sorted(list(set(court_circuit_map.values()))))
# Admin action
admin_action_codes = get_code_list(scdb_data['adminAction'])
# Law types
law_type_codes = get_code_list(scdb_data['lawType'])
# Law supp types
law_supp_codes = get_code_list(scdb_data['lawSupp'])
# Cert reason
cert_reason_codes = get_code_list(scdb_data['certReason'])
# Jurisdiction
jurisdiction_codes = get_code_list(scdb_data['jurisdiction'])
# LC Disagreement
lc_disagreement_codes = get_code_list(scdb_data['lcDisagreement'])
# Justice codes
justice_codes = get_code_list(scdb_data['justice'])
# Parties: deduplicated union of petitioner and respondent codes
party_codes = get_code_list(scdb_data['petitioner'])
party_codes.extend(get_code_list(scdb_data['respondent']))
party_codes = sorted(list(set(party_codes)))
# LC outcome
lc_case_outcome_codes = get_code_list(scdb_data['lc_case_outcome_disposition'])
In [7]:
def preprocess_data(data):
    """
    Encode an SCDB justice-centered DataFrame into a one-hot feature matrix.

    Every categorical SCDB variable is binarized against the module-level
    fixed class lists so that matrices produced from different subsets of
    the data have identical column layouts.

    :param data: SCDB justice-centered DataFrame (subset of scdb_data)
    :return: 2-D numpy array of 0/1 features, one row per input record
    """
    def encode(series, classes):
        # NaN -> 0 sentinel class, then one-hot against the fixed class list.
        # `classes` is keyword-only in modern sklearn.
        return sklearn.preprocessing.label_binarize(series.fillna(0).apply(int),
                                                    classes=classes)

    # Encode admin action
    admin_action_encoded = encode(data['adminAction'], admin_action_codes)
    # Encode issue area and issue
    issue_area_encoded = encode(data['issueArea'], issue_area_codes)
    issue_encoded = encode(data['issue'], issue_codes)
    # Encode law type, law supp, cert reason, and jurisdiction
    law_type_encoded = encode(data['lawType'], law_type_codes)
    # BUG FIX: lawSupp was previously binarized against law_type_codes,
    # misaligning its columns; use the lawSupp vocabulary.
    law_supp_encoded = encode(data['lawSupp'], law_supp_codes)
    cert_reason_encoded = encode(data['certReason'], cert_reason_codes)
    jurisdiction_encoded = encode(data['jurisdiction'], jurisdiction_codes)
    # Encode courts. Computed locally instead of writing helper columns
    # back into `data`, which mutated the caller's frame and triggered
    # SettingWithCopyWarning when called on slices.
    case_source_encoded = encode(data['caseSource'].apply(map_circuit), court_circuit_codes)
    case_origin_encoded = encode(data['caseOrigin'].apply(map_circuit), court_circuit_codes)
    # Encode parties
    petitioner_encoded = encode(data['petitioner'], party_codes)
    respondent_encoded = encode(data['respondent'], party_codes)
    # Justice and lower-court outcome
    justice_encoded = encode(data['justice'], justice_codes)
    lc_outcome_encoded = encode(data['lc_case_outcome_disposition'], lc_case_outcome_codes)
    # Column order must stay stable across calls.
    return numpy.hstack((justice_encoded, admin_action_encoded, issue_area_encoded, issue_encoded,
                         law_type_encoded, law_supp_encoded, cert_reason_encoded, jurisdiction_encoded,
                         case_source_encoded, case_origin_encoded, petitioner_encoded, respondent_encoded,
                         lc_outcome_encoded))
# Test shape
# Sanity check: encode the full dataset once and report the resulting
# feature-matrix dimensions (rows x one-hot columns).
scdb_feature_data = preprocess_data(scdb_data)
print(scdb_feature_data.shape)
In [8]:
def predict_rf(historical_data, current_data, max_leaf_nodes=1024, n_estimators=1000, random_state=None):
    """
    Prediction method based on a simple random forest classifier.

    :param historical_data: SCDB DataFrame to use for out-of-sample calculation; must be a subset of SCDB
        justice-centered data known up to point in time
    :param current_data: SCDB DataFrame to use to generate predictions
    :param max_leaf_nodes: maximum number of leaf nodes per tree
    :param n_estimators: number of trees in the forest
    :param random_state: optional seed for reproducible forests; default None
        preserves the original (unseeded) behavior
    :return: vector containing P(reverse) for each current_data record
    """
    # Get features and targets
    feature_train_data = preprocess_data(historical_data)
    target_train_data = historical_data.loc[:, "justice_outcome_disposition"].values
    # Train only on codable outcomes (0 = affirm, 1 = reverse); -1 excluded.
    target_index = (target_train_data >= 0)

    # Train model
    model = sklearn.ensemble.RandomForestClassifier(max_leaf_nodes=max_leaf_nodes,
                                                    n_estimators=n_estimators,
                                                    min_samples_leaf=2,
                                                    random_state=random_state)
    model.fit(feature_train_data[target_index, :], target_train_data[target_index])

    # predict_proba columns follow model.classes_; look up class 1 explicitly
    # instead of hardcoding column index 1, which silently misreads (or
    # IndexErrors) when the training window contains a single class.
    prediction_score = model.predict_proba(preprocess_data(current_data))
    positive_column = list(model.classes_).index(1)
    return prediction_score[:, positive_column]
def run_simulation(simulation_data, term_list, prediction_method, score_method="binary"):
"""
This method defines the simulation driver.
:param simulation_data: SCDB DataFrame to use for simulation; must be a subset of SCDB justice-centered data
:param term_list: list of terms to simulate, e.g., [2000, 2001, 2002]
:param prediction_method: method that takes historical data and indicates, by justice, predictions for term
:param score_method: "binary" or "stratified"; binary maps to score >= 0.5, stratified maps to score <= random
:return: copy of simulation_data with additional columns representing predictions
"""
# Initialize predictions
return_data = simulation_data.copy()
return_data.loc[:, "prediction"] = numpy.nan
return_data.loc[:, "prediction_score"] = numpy.nan
# Iterate over all terms
for term in term_list:
print(term)
# Get indices for dockets to predict and use for historical data
before_term_index = simulation_data.loc[:, "term"] < term
current_term_index = (simulation_data.loc[:, "term"] == term) & (simulation_data.loc[:, "justice_outcome_disposition"] >= 0)
# Get the list of justices
term_justices = sorted(simulation_data.loc[current_term_index, "justice"].unique().tolist())
# Get the predictions
return_data.loc[current_term_index, "prediction_score"] = prediction_method(simulation_data.loc[before_term_index, :],
simulation_data.loc[current_term_index, :])
# Support both most_frequent and stratified approaches
if score_method == "binary":
return_data.loc[current_term_index, "prediction"] = (return_data.loc[current_term_index, "prediction_score"] >= 0.5).apply(int)
elif score_method == "stratified":
return_data.loc[current_term_index, "prediction"] = (return_data.loc[current_term_index, "prediction_score"] >= numpy.random.random(return_data.loc[current_term_index].shape[0])).apply(int)
else:
raise NotImplementedError
# Get the return range and return
term_index = return_data.loc[:, "term"].isin(term_list) & (return_data.loc[:, "justice_outcome_disposition"] >= 0)
return return_data.loc[term_index, :]
In [ ]:
# Set parameters
# First and last SCDB terms to simulate. Note end_term is passed to
# range(), which is exclusive, so terms start_term..end_term-1 run.
start_term = 1953
end_term = 2013
In [ ]:
%%time
# Run simulation for simplest model
print("predict_tree")
output_data = run_simulation(scdb_data, range(start_term, end_term), predict_rf)
In [ ]:
# Analyze results
# Justice-level performance: per-class precision/recall/F1, confusion
# matrix, overall accuracy, and F1 (default binary average over {0, 1}).
print(sklearn.metrics.classification_report(output_data["justice_outcome_disposition"],
                                            output_data["prediction"]))
print(sklearn.metrics.confusion_matrix(output_data["justice_outcome_disposition"],
                                       output_data["prediction"]))
print(sklearn.metrics.accuracy_score(output_data["justice_outcome_disposition"],
                                     output_data["prediction"]))
print(sklearn.metrics.f1_score(output_data["justice_outcome_disposition"],
                               output_data["prediction"]))
In [ ]:
# Get accuracy over time
# Flag each correct justice-level prediction, then plot the mean
# accuracy per term to see how performance varies across eras.
output_data.loc[:, "correct"] = (output_data["justice_outcome_disposition"] == output_data["prediction"])
court_case_accuracy_by_year = output_data.groupby("term")["correct"].mean()
court_case_accuracy_by_year.plot()
In [ ]:
# Run vote simulation: predict each case's outcome as the majority of
# its justices' predicted votes.
output_data.loc[:, "case_prediction"] = numpy.nan
# BUG FIX: the original loop assigned each docket's majority vote to
# EVERY row (output_data.loc[:, ...]), so after the loop the column held
# only the LAST docket's value. Compute the per-docket majority and map
# it back onto each row's own docket instead.
docket_majority = output_data.groupby('docketId')['prediction'] \
    .agg(lambda votes: int(votes.value_counts().idxmax()))
output_data.loc[:, "case_prediction"] = output_data['docketId'].map(docket_majority)
In [ ]:
# Case-level performance, with uncodable outcomes mapped to -1 so that
# NaNs do not break the sklearn metric functions.
# NOTE(review): with -1 present the labels are multiclass; f1_score's
# default binary averaging may fail or mislead here — confirm intent.
print(sklearn.metrics.classification_report(output_data["case_outcome_disposition"].fillna(-1),
                                            output_data["case_prediction"].fillna(-1)))
print(sklearn.metrics.confusion_matrix(output_data["case_outcome_disposition"].fillna(-1),
                                       output_data["case_prediction"].fillna(-1)))
print(sklearn.metrics.accuracy_score(output_data["case_outcome_disposition"].fillna(-1),
                                     output_data["case_prediction"].fillna(-1)))
print(sklearn.metrics.f1_score(output_data["case_outcome_disposition"].fillna(-1),
                               output_data["case_prediction"].fillna(-1)))
In [ ]: