In [1]:
# Imports
import numpy
import pandas
import sklearn
import sklearn.dummy
import sklearn.metrics
import sklearn.ensemble

In [2]:
# Matplotlib setup
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn
seaborn.set()

In [3]:
# Load justice-centered SCDB data
scdb_data = pandas.read_csv("data/SCDB_2013_01_justiceCentered_Citation.csv")

Disposition outcoming coding

In the section below, we transform the SCDB vote and caseDisposition variables into an outcome variable indicating whether the case overall and each Justice has affirmed or reverse.


In [4]:
"""
Setup the outcome map.

Rows correspond to vote types.  Columns correspond to disposition types.

Element values correspond to:
 * -1: no precedential issued opinion or uncodable, i.e., DIGs
 * 0: affirm, i.e., no change in precedent
 * 1: reverse, i.e., change in precent
"""
outcome_map = pandas.DataFrame([[-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
               [-1, 1, 0, 0, 0, 1, 0, -1, -1, -1, -1],
               [-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
               [-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
               [-1, 0, 1, 1, 1, 0, 1, -1, -1, -1, -1],
               [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
               [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
               [-1, 0, 0, 0, -1, 0, -1, -1, -1, -1, -1]])
outcome_map.columns = range(1, 12)
outcome_map.index = range(1, 9)

def get_outcome(vote, disposition):
    """
    Return the outcome code.
    """
    if pandas.isnull(vote) or pandas.isnull(disposition):
        return -1
    
    return outcome_map.loc[int(vote), int(disposition)]

In [5]:
# Map the case-level disposition outcome 
scdb_data.loc[:, "case_outcome_disposition"] = outcome_map.loc[1, scdb_data.loc[:, "caseDisposition"]].values
scdb_data.loc[:, "lc_case_outcome_disposition"] = outcome_map.loc[1, scdb_data.loc[:, "lcDisposition"]].values

# Map the justice-level disposition outcome
scdb_data.loc[:, "justice_outcome_disposition"] = scdb_data.loc[:, ("vote", "caseDisposition")] \
    .apply(lambda row: get_outcome(row["vote"], row["caseDisposition"]), axis=1)

Running a simulation

In the section below, we define methods that handle the execution and analysis of simulations. Simulations are based around the following concepts:

  • pre-processing methods: pre-processing methods handle cleaning, reshaping, transforming, and encoding input data
  • prediction methods: prediction methods take historical data and determine, for each term-justice, what prediction to make.

In [6]:
# Court to circuit mapping, which maps from SCDB codebook to the actual Circuit number
# http://scdb.wustl.edu/documentation.php?var=caseOrigin
# http://scdb.wustl.edu/documentation.php?var=caseSource
court_circuit_map = {1: 13,
                     2: 13, 3: 13, 4: 14, 5: 14, 6: 13, 7: 13, 8: 13,
                     9: 22, 10: 99, 12: 9, 13: 99, 14: 13, 15: 99, 16: 99,
                     17: 99, 18: 99, 19: 0, 20: 22, 21: 1, 22: 2, 23: 3,
                     24: 4, 25: 5, 26: 6, 27: 7, 28: 8, 29: 9, 30: 10,
                     31: 11, 32: 12, 41: 11, 42: 11, 43: 11, 44: 9, 45: 9,
                     46: 8, 47: 8, 48: 9, 49: 9, 50: 9, 51: 9, 52: 10, 53: 2,
                     54: 3, 55: 12, 56: 11, 57: 11, 58: 11, 59: 11, 60: 11,
                     61: 11, 62: 9, 63: 9, 64: 9, 65: 7, 66: 7, 67: 7, 68: 7,
                     69: 7, 70: 8, 71: 8, 72: 10, 73: 6, 74: 6, 75: 5, 76: 5,
                     77: 5, 78: 1, 79: 4, 80: 1, 81: 6, 82: 6, 83: 8, 84: 5,
                     85: 5, 86: 8, 87: 8, 88: 9, 89: 8, 90: 9, 91: 1, 92: 3,
                     93: 10, 94: 2, 95: 2, 96: 2, 97: 2, 98: 4, 99: 4, 100: 4,
                     101: 8, 102: 9, 103: 6, 104: 6, 105: 10, 106: 10, 107: 10,
                     108: 9, 109: 3, 110: 3, 111: 3, 112: 1, 113: 1, 114: 4,
                     115: 8, 116: 6, 117: 6, 118: 6, 119: 5, 120: 5, 121: 5,
                     122: 5, 123: 10, 124: 2, 125: 3, 126: 4, 127: 4, 128: 9,
                     129: 9, 130: 4, 131: 4, 132: 7, 133: 7, 134: 10, 150: 5,
                     151: 9, 152: 4, 153: 7, 155: 4, 160: 4, 162: 11, 163: 5,
                     164: 11, 165: 7, 166: 7, 167: 8, 168: 6, 169: 5, 170: 8,
                     171: 3, 172: 3, 173: 2, 174: 4, 175: 6, 176: 3, 177: 3,
                     178: 5, 179: 4, 180: 4, 181: 7, 182: 6, 183: 3, 184: 9,
                     185: 11, 186: 8, 187: 5, 300: 0, 301: 0, 302: 0, 400: 99,
                     401: 99, 402: 99, 403: 11, 404: 8, 405: 9, 406: 2, 407: 3,
                     408: 11, 409: 11, 410: 7, 411: 7, 412: 8, 413: 10, 414: 6,
                     415: 5, 416: 1, 417: 4, 418: 1, 419: 6, 420: 8,
                     421: 5, 422: 8, 423: 9, 424: 1, 425: 3, 426: 2,
                     427: 4, 428: 6, 429: 9, 430: 3, 431: 1, 432: 4, 433: 6,
                     434: 5, 435: 2, 436: 4, 437: 4, 438: 7,
                     439: 10, 440: 12, 441: 8, 442: 10, 443: 9}


def map_circuit(value):
    try:
        return court_circuit_map[value]
    except:
        return 0

# Get lists of classes for categorical vars
# Issue area
issue_area_codes = [0]
issue_area_codes.extend(sorted(scdb_data['issueArea'].fillna(0).apply(int).unique().tolist()))

# Issue
issue_codes = [0]
issue_codes.extend(sorted(scdb_data['issue'].fillna(0).apply(int).unique().tolist()))

# Courts
court_circuit_codes = [0]
court_circuit_codes.extend(sorted(list(set(court_circuit_map.values()))))

# Admin action
admin_action_codes = [0]
admin_action_codes.extend(sorted(scdb_data['adminAction'].fillna(0).apply(int).unique().tolist()))

# Law types
law_type_codes = [0]
law_type_codes.extend(sorted(scdb_data['lawType'].fillna(0).apply(int).unique().tolist()))

# Law supp types
law_supp_codes = [0]
law_supp_codes.extend(sorted(scdb_data['lawSupp'].fillna(0).apply(int).unique().tolist()))

# Cert reason
cert_reason_codes = [0]
cert_reason_codes.extend(sorted(scdb_data['certReason'].fillna(0).apply(int).unique().tolist()))

# Jurisdiction
jurisdiction_codes = [0]
jurisdiction_codes.extend(sorted(scdb_data['jurisdiction'].fillna(0).apply(int).unique().tolist()))

# LC Disagreement
lc_disagreement_codes = [0]
lc_disagreement_codes.extend(sorted(scdb_data['lcDisagreement'].fillna(0).apply(int).unique().tolist()))

# Justice codes
justice_codes = [0]
justice_codes.extend(sorted(scdb_data['justice'].fillna(0).apply(int).unique().tolist()))

# Parties
party_codes = [0]
party_codes.extend(sorted(scdb_data['petitioner'].fillna(0).apply(int).unique()))
party_codes.extend(sorted(scdb_data['respondent'].fillna(0).apply(int).unique()))
party_codes = sorted(list(set(party_codes)))

# LC outcome
lc_case_outcome_codes = [0]
lc_case_outcome_codes.extend(sorted(scdb_data['lc_case_outcome_disposition'].fillna(0).apply(int).unique().tolist()))

In [7]:
def preprocess_data(data):
    """
    Process SCDB data frame into features.
    """
    # Encode admin action
    admin_action_encoded = sklearn.preprocessing.label_binarize(data['adminAction'].fillna(0).apply(int),
                                                      admin_action_codes)
    # Encode issue area
    issue_area_encoded = sklearn.preprocessing.label_binarize(data['issueArea'].fillna(0).apply(int),
                                                      issue_area_codes)

    issue_encoded = sklearn.preprocessing.label_binarize(data['issue'].fillna(0).apply(int),
                                                      issue_codes)
    # Encode law type, cert reason, and jurisdiction
    law_type_encoded = sklearn.preprocessing.label_binarize(data['lawType'].fillna(0).apply(int),
                                                      law_type_codes)

    law_supp_encoded = sklearn.preprocessing.label_binarize(data['lawSupp'].fillna(0).apply(int),
                                                      law_type_codes)


    cert_reason_encoded = sklearn.preprocessing.label_binarize(data['certReason'].fillna(0).apply(int),
                                                      cert_reason_codes)

    jurisdiction_encoded = sklearn.preprocessing.label_binarize(data['jurisdiction'].fillna(0).apply(int),
                                                      jurisdiction_codes)
    # Encode courts
    data.loc[:, 'case_source_map'] = data['caseSource'].apply(map_circuit).apply(int)
    data.loc[:, 'case_origin_map'] = data['caseOrigin'].apply(map_circuit).apply(int)

    case_source_encoded = sklearn.preprocessing.label_binarize(data['case_source_map'].fillna(0).apply(int),
                                                      court_circuit_codes)
    case_origin_encoded = sklearn.preprocessing.label_binarize(data['case_origin_map'].fillna(0).apply(int),
                                                      court_circuit_codes)

    # Encode parties
    petitioner_encoded = sklearn.preprocessing.label_binarize(data['petitioner'].fillna(0).apply(int),
                                                              party_codes)
    respondent_encoded = sklearn.preprocessing.label_binarize(data['respondent'].fillna(0).apply(int),
                                                              party_codes)

    # Justice
    justice_encoded = sklearn.preprocessing.label_binarize(data['justice'].fillna(0).apply(int),
                                                              justice_codes)
    
    lc_outcome_encoded = sklearn.preprocessing.label_binarize(data['lc_case_outcome_disposition'].fillna(0).apply(int),
                                                  lc_case_outcome_codes)
    
    return numpy.hstack((justice_encoded, admin_action_encoded, issue_area_encoded, issue_encoded,
                         law_type_encoded, law_supp_encoded, cert_reason_encoded, jurisdiction_encoded,
                         case_source_encoded, case_origin_encoded, petitioner_encoded, respondent_encoded,
                         lc_outcome_encoded))

# Test shape
scdb_feature_data = preprocess_data(scdb_data)
print(scdb_feature_data.shape)


(76037, 1057)

In [8]:
def predict_rf(historical_data, current_data, max_leaf_nodes=1024, n_estimators=1000):
    """
    Prediction method based on simple random forest classifier.
    
    :param historical_data: SCDB DataFrame to use for out-of-sample calculationi; must be a subset of SCDB justice-centered 
      data known up to point in time
    :param current_data: SCDB DataFrame to use to generate predictions

    :return: vector containing predictions for each current_data record
    """
    
    # Get features and targets
    feature_train_data = preprocess_data(historical_data)
    target_train_data = historical_data.loc[:, "justice_outcome_disposition"].values
    target_index = (target_train_data >= 0)
    
    # Train model
    model = sklearn.ensemble.RandomForestClassifier(max_leaf_nodes=max_leaf_nodes,
                                                    n_estimators=n_estimators,
                                                    min_samples_leaf=2)
    model.fit(feature_train_data[target_index, :], target_train_data[target_index])
    prediction_score = model.predict_proba(preprocess_data(current_data))
    return prediction_score[:, 1]

def run_simulation(simulation_data, term_list,  prediction_method, score_method="binary"):
    """
    This method defines the simulation driver.
    
    :param simulation_data: SCDB DataFrame to use for simulation; must be a subset of SCDB justice-centered data
    :param term_list: list of terms to simulate, e.g., [2000, 2001, 2002]
    :param prediction_method: method that takes historical data and indicates, by justice,  predictions for term
    :param score_method: "binary" or "stratified"; binary maps to score >= 0.5, stratified maps to score <= random
    :return: copy of simulation_data with additional columns representing predictions
    """
    # Initialize predictions
    return_data = simulation_data.copy()
    return_data.loc[:, "prediction"] = numpy.nan
    return_data.loc[:, "prediction_score"] = numpy.nan

    # Iterate over all terms
    for term in term_list:
        print(term)
        # Get indices for dockets to predict and use for historical data
        before_term_index = simulation_data.loc[:, "term"] < term
        current_term_index = (simulation_data.loc[:, "term"] == term) & (simulation_data.loc[:, "justice_outcome_disposition"] >= 0)

        # Get the list of justices
        term_justices = sorted(simulation_data.loc[current_term_index, "justice"].unique().tolist())

        # Get the predictions
        return_data.loc[current_term_index, "prediction_score"] =  prediction_method(simulation_data.loc[before_term_index, :],
                                                                                simulation_data.loc[current_term_index, :])

        # Support both most_frequent and stratified approaches
        if score_method == "binary":
            return_data.loc[current_term_index, "prediction"] = (return_data.loc[current_term_index, "prediction_score"] >= 0.5).apply(int)
        elif score_method == "stratified":
            return_data.loc[current_term_index, "prediction"] = (return_data.loc[current_term_index, "prediction_score"] >= numpy.random.random(return_data.loc[current_term_index].shape[0])).apply(int)
        else:
            raise NotImplementedError

    # Get the return range and return
    term_index = return_data.loc[:, "term"].isin(term_list) & (return_data.loc[:, "justice_outcome_disposition"] >= 0)
    return return_data.loc[term_index, :]

In [ ]:
# Set parameters
start_term = 1953
end_term = 2013

Predicting case outcomes with a random forest

In the simulation below, we demonstrate the performance of the baseline model to predict case outcome based solely on historical court reversal rate.

The results indicate an accuracy of 63.72% with a frequency-weighted F1 score of 49.6%.


In [ ]:
%%time
# Run simulation for simplest model
print("predict_tree")
output_data = run_simulation(scdb_data, range(start_term, end_term), predict_rf)


predict_tree
1953

In [ ]:
# Analyze results
print(sklearn.metrics.classification_report(output_data["justice_outcome_disposition"],
                                      output_data["prediction"]))
print(sklearn.metrics.confusion_matrix(output_data["justice_outcome_disposition"],
                                      output_data["prediction"]))
print(sklearn.metrics.accuracy_score(output_data["justice_outcome_disposition"],
                                      output_data["prediction"]))
print(sklearn.metrics.f1_score(output_data["justice_outcome_disposition"],
                                      output_data["prediction"]))

In [ ]:
# Get accuracy over time
output_data.loc[:, "correct"] = (output_data["justice_outcome_disposition"] == output_data["prediction"])
court_case_accuracy_by_year = output_data.groupby("term")["correct"].mean()
court_case_accuracy_by_year.plot()

In [ ]:
# Run vote simulation
output_data.loc[:, "case_prediction"] = numpy.nan

# Iterate over all dockets
for docket_id, docket_data in output_data.groupby('docketId'):
    # Count predictions from docket
    output_data.loc[:, "case_prediction"]  = int(docket_data.loc[:, "prediction"].value_counts().idxmax())

In [ ]:
print(sklearn.metrics.classification_report(output_data["case_outcome_disposition"].fillna(-1),
                                      output_data["case_prediction"].fillna(-1)))
print(sklearn.metrics.confusion_matrix(output_data["case_outcome_disposition"].fillna(-1),
                                      output_data["case_prediction"].fillna(-1)))
print(sklearn.metrics.accuracy_score(output_data["case_outcome_disposition"].fillna(-1),
                                      output_data["case_prediction"].fillna(-1)))
print(sklearn.metrics.f1_score(output_data["case_outcome_disposition"].fillna(-1),
                                      output_data["case_prediction"].fillna(-1)))

In [ ]: