In [1]:
DOC = '''Supreme Court Oral Argument Predictor (SCOAP)

Creates models for predicting outcomes of Supreme Court oral
arguments. Pulls justice-specific phrases associated with
winning and losing arguments.

LICENSE:    MIT
AUTHOR:     theonaunheim@gmail.com
COPYRIGHT:  2017, Theo Naunheim
VERSION:    0.4.3
MODIFIED:   2017-03-26
DATA DIR:   .scoap
REQUIRES:   Jupyter Notebook and Xpdf/Poppler

WARNING:    THIS SCRIPT DOWNLOADS AND PROCESSES A LARGE
            VOLUME OF MATERIAL. IT IS COMPUTATIONALLY
            EXPENSIVE AND TAKES A NON-NEGLIGIBLE AMOUNT
            OF TIME AND BANDWIDTH.

'''

In [2]:
# Standard library imports
import asyncio
import copy
import itertools
import os
import re
import string
import sys
import zipfile

# Web/data imports
import bs4
import numpy as np
import pandas as pd
import requests

# Scikit learn imports
import sklearn
import sklearn.feature_extraction
import sklearn.metrics
import sklearn.model_selection
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.pipeline
import sklearn.svm
import sklearn.ensemble

In [3]:
# Constants and constant-ish things.

# Debug flag cuts down amount of data used.
DEBUG = False

# Website URLs for downloads
TRANSCRIPT_INFO = 'https://www.supremecourt.gov/oral_arguments/argument_transcript/'
TRANSCRIPT_DOWNLOADS = 'https://www.supremecourt.gov/oral_arguments/'
SCDB_CSV_DOWNLOAD_LINK = 'http://scdb.wustl.edu/_brickFiles/2016_01/SCDB_2016_01_justiceCentered_Docket.csv.zip'

# Transcript years for dynamic URL creation
START_YEAR = 2006
END_YEAR = 2017

# OS-specific path for PDF to text extraction utility.
if os.name == 'nt':
    PDF2TEXT_PATH = r'C:\Program Files\Xpdf\pdftotext.exe'
elif os.name == 'posix':
    PDF2TEXT_PATH = '/usr/bin/pdftotext'
else:
    raise Exception('This script requires Xpdf/Poppler utility pdftotext to run.')

# Paths for SCOAP specific data.
DATA_FOLDER = os.path.join(os.path.expanduser('~'), '.scoap')
SCDB_ZIP_NAME = SCDB_CSV_DOWNLOAD_LINK.rpartition('/')[2]
SCDB_CSV_NAME = SCDB_ZIP_NAME.rpartition('.')[0]
SCDB_ZIP_PATH = os.path.join(DATA_FOLDER, SCDB_ZIP_NAME)
SCDB_CSV_PATH = SCDB_ZIP_PATH.rpartition('.')[0]

# The current term justices and cases we wish to analyze.
CURRENT_JUSTICES = ['Roberts', 'Kennedy', 'Thomas', 'Ginsburg', 'Breyer', 'Alito', 'Sotomayor', 'Kagan']
CURRENT_CASES = ['15-214', '15-1031', '15-497', '15-1189', '16-369',
                 '16-254', '15-118', '15-1248', '16-32', '15-1194',
                 '16-54', '15-9260', '16-149', '16-1256', '15-1500',
                 '15-1391', '15-1406', '15-827', '15-1498', '16-348',
                 '15-1293', '15-1358', '15-8544', '15-797', '15-1204',
                 '15-680', '15-1262', '14-1538', '15-649', '15-866',
                 '15-513', '15-927', '15-423', '15-1251', '15-1111',
                 '14-1055', '15-1191', '15-537', '15-5991', '15-628', 
                 '15-8049', '14-9496', '15-777', '15-606', '15-7250',]

# Voting relationships for OT15, courtesy of http://www.scotusblog.com/statistics/
VOTING_RELATIONSHIPS = {"KENNEDY"  :{"KENNEDY":1.00,"SCALIA":0.82,"THOMAS":0.71,"KAGAN":0.95,"ROBERTS":0.88,"GINSBURG":0.84,"ALITO":0.82,"BREYER":0.91,"SOTOMAYOR":0.79},
                        "SCALIA"   :{"KENNEDY":0.82,"SCALIA":1.00,"THOMAS":0.88,"KAGAN":0.82,"ROBERTS":0.88,"GINSBURG":0.71,"ALITO":0.94,"BREYER":0.82,"SOTOMAYOR":0.65},
                        "THOMAS"   :{"KENNEDY":0.71,"SCALIA":0.88,"THOMAS":1.00,"KAGAN":0.67,"ROBERTS":0.75,"GINSBURG":0.62,"ALITO":0.78,"BREYER":0.67,"SOTOMAYOR":0.64},
                        "KAGAN"    :{"KENNEDY":0.95,"SCALIA":0.82,"THOMAS":0.67,"KAGAN":1.00,"ROBERTS":0.87,"GINSBURG":0.87,"ALITO":0.81,"BREYER":0.92,"SOTOMAYOR":0.81},
                        "ROBERTS"  :{"KENNEDY":0.88,"SCALIA":0.88,"THOMAS":0.75,"KAGAN":0.87,"ROBERTS":1.00,"GINSBURG":0.78,"ALITO":0.84,"BREYER":0.84,"SOTOMAYOR":0.77},
                        "GINSBURG" :{"KENNEDY":0.84,"SCALIA":0.71,"THOMAS":0.62,"KAGAN":0.87,"ROBERTS":0.78,"GINSBURG":1.00,"ALITO":0.73,"BREYER":0.86,"SOTOMAYOR":0.88},
                        "ALITO"    :{"KENNEDY":0.82,"SCALIA":0.94,"THOMAS":0.78,"KAGAN":0.81,"ROBERTS":0.84,"GINSBURG":0.73,"ALITO":1.00,"BREYER":0.77,"SOTOMAYOR":0.64},
                        "BREYER"   :{"KENNEDY":0.91,"SCALIA":0.82,"THOMAS":0.67,"KAGAN":0.92,"ROBERTS":0.84,"GINSBURG":0.86,"ALITO":0.77,"BREYER":1.00,"SOTOMAYOR":0.83},
                        "SOTOMAYOR":{"KENNEDY":0.79,"SCALIA":0.65,"THOMAS":0.64,"KAGAN":0.81,"ROBERTS":0.77,"GINSBURG":0.88,"ALITO":0.64,"BREYER":0.83,"SOTOMAYOR":1.00}}

In [4]:
# Define function.
def create_dataframe():
    '''Create a skeleton for our df.'''
    df = pd.DataFrame(columns=['CASE',
                               'DOCKET',
                               'ARGUMENT_YEAR',
                               'ARGUMENT_LINK',
                               'ARGUMENT_PATH',])
    return df


# Run function.
arg_df = create_dataframe()

In [5]:
# Define function.
def get_argument_metadata(df, start=START_YEAR - 1, end=END_YEAR + 1):
    '''This fetches oral argument location metadata.'''
    # For each year
    for year in range(start, end):
        # Create web address and download data 
        address = TRANSCRIPT_INFO + str(year)
        r = requests.get(address)
        # Parse data
        try:
            soup = bs4.BeautifulSoup(r.text, 'lxml')
            table = soup.find('table', 'table datatables')
            for row in table.findAll('tr'):
                link = row.find('a')
                case = row.find('span')
                # Write table info to dataframe.
                if link:
                    link_text = link.text[:-2].lower()
                    case_text = case.text
                    link_tail = link.attrs['href'].lstrip('../')
                    full_link = TRANSCRIPT_DOWNLOADS + link_tail
                    # Write to frame
                    path = os.path.join(DATA_FOLDER, link_text, 'argument.pdf')
                    df = df.append({'CASE': case_text,
                                    'DOCKET': link_text,
                                    'ARGUMENT_LINK': full_link,
                                    'ARGUMENT_PATH': path,
                                    'ARGUMENT_YEAR': str(year)}, ignore_index=True)
        except AttributeError:
            print('Attribute error. Probably an empty page.')
    return df


# Run function.
arg_df = get_argument_metadata(arg_df)


Attribute error. Probably an empty page.

In [6]:
# Show dataframe for clarity.
arg_df.head(3)


Out[6]:
CASE DOCKET ARGUMENT_YEAR ARGUMENT_LINK ARGUMENT_PATH
0 Hudson v. Michigan (Reargued) 04-1360 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/04-1360/argument.pdf
1 Washington v. Recuenco 05-83 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-83/argument.pdf
2 Burlington N. & S. F. R. Co. v. White 05-259 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-259/argument.pdf

In [7]:
# Debug to shorten time during testing
if DEBUG:
    arg_df = arg_df.iloc[-10:].copy()

In [8]:
# Define function.
def make_directories(row):
    '''All cases get their own folder.'''
    try:
        path = os.path.join(DATA_FOLDER, row['DOCKET'])
        os.makedirs(path)
    except FileExistsError:
        pass

    
# Apply function. Output unnecessary.
_ = arg_df.apply(make_directories, axis=1)

In [9]:
# Define function.
def download_pdfs(row):
    '''Get PDFs and put in the folder if necessary.'''
    # If there's a link and no file, download.
    if row['ARGUMENT_LINK'] is not np.NaN:
        if os.path.exists(row['ARGUMENT_PATH']):
            return False
        r = requests.get(row['ARGUMENT_LINK'], stream=True)
        with open(row['ARGUMENT_PATH'], 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024): 
                if chunk:
                    f.write(chunk)

                    
# Apply function. No assinment required.
_ = arg_df.apply(download_pdfs, axis=1)

In [10]:
arg_df.head(3)


Out[10]:
CASE DOCKET ARGUMENT_YEAR ARGUMENT_LINK ARGUMENT_PATH
0 Hudson v. Michigan (Reargued) 04-1360 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/04-1360/argument.pdf
1 Washington v. Recuenco 05-83 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-83/argument.pdf
2 Burlington N. & S. F. R. Co. v. White 05-259 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-259/argument.pdf

In [11]:
# Define functions.
async def get_text(pdf_path): 
    '''This function is a coroutine for a single pdf2text.py instance.'''
    # Create the subprocess, redirect the standard output into a pipe
    process = await asyncio.create_subprocess_exec(PDF2TEXT_PATH,
                                                   pdf_path,
                                                   '-',
                                                   stdout=asyncio.subprocess.PIPE,
                                                   stderr=asyncio.subprocess.PIPE) 
    # Read output
    data = await process.communicate() 
    # Have process exit and return data.
    await process.wait()
    # Decode cp1252 for windows
    try:
        decoded_data = data[0].decode('cp1252')
    # And UTF-8 for Linux.
    except:
        decoded_data = data[0].decode()
    return decoded_data


async def get_all_text(pdf_paths):
    '''This gathers the pdf2text.py results.'''
    # Create list for return results.
    result_list = []
    # Create a list of tasks
    input_len = len(pdf_paths)
    num_chunks = (input_len // 10) + 1
    chunked_input = np.array_split(pdf_paths, num_chunks)
    # Now run each of the chunks in parallel to speed things up.
    for chunk in chunked_input:
        # Create tasks
        tasks = [get_text(path) for path in chunk]
        # Run all the tasks in parallel
        results = await asyncio.gather(*tasks)
        # Put the zipped (path, results) in result list
        for path, result in zip(chunk, results):
            result_list.append((path, result))
    return result_list


def add_arguments(df):
    '''Adds argument text to df.'''
    # Get unique PDFs
    unique_pdfs = df['ARGUMENT_PATH'].unique()
    # Windows only supports proactorloop.
    if os.name == 'nt':
        loop = asyncio.ProactorEventLoop()
    elif os.name == 'posix':
        loop = asyncio.SelectorEventLoop()
    else:
        loop == None
    asyncio.set_event_loop(loop)
    # Run our coroutine to extract text.
    arg_data = loop.run_until_complete(get_all_text(unique_pdfs))
    # Loop no longer necessary.
    loop.close()
    # Create dataframe for data.
    tdf = pd.DataFrame.from_records(arg_data, columns=['ARGUMENT_PATH', 'TEXT'])
    # Join to input df and fill na.
    df = df.merge(tdf, how='left', on='ARGUMENT_PATH').fillna('')
    return df


# Run function
arg_df = add_arguments(arg_df)

In [12]:
# Show dataframe for clarity.
arg_df.head(3)


Out[12]:
CASE DOCKET ARGUMENT_YEAR ARGUMENT_LINK ARGUMENT_PATH TEXT
0 Hudson v. Michigan (Reargued) 04-1360 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/04-1360/argument.pdf 1\n\nIN THE SUPREME COURT OF THE UNITED STATES...
1 Washington v. Recuenco 05-83 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-83/argument.pdf 1\n\nIN THE SUPREME COURT OF THE UNITED STATES...
2 Burlington N. & S. F. R. Co. v. White 05-259 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-259/argument.pdf 1\n\nIN THE SUPREME COURT OF THE UNITED STATES...

In [13]:
# Define function.
def cut_unnecessary_text(df):
    '''This function cuts low information text from transcript.'''
    # First chop off the caption ('PROCEEDINGS' or 'P R O C E E D I N G S')
    capture_string = r'P\s?R\s?O\s?C\s?E\s?E\s?D\s?I\s?N\s?G\s?S([\s\S]*\Z)'
    df['TEXT'] = df['TEXT'].str.extract(capture_string,
                                        expand=False,
                                        flags=re.MULTILINE)
    # First we specify the patterns we don't want
    patterns_to_cut = [

        # Cut carriage returns and form feeds because f*** those guys.
        r'[\r\f]',

        # Remove tables at end ##:## 4 within no more than 100 chars of Alderson
        (r'\s*' +
         r'Alderson Reporting Company' + 
         # period because a.m. messes it up.
         r'[\s\S.]{0,75}\d?\d:\d?\d' * 3 +
         r'[\s\S]*' +
         r'\Z'),
        
        # Remove [2004 - 2005] footer
        r'1111 14th[\s\S]{0,100}20005',
        
        # Remove [2006 - 2016] header/footer unofficial
        r'Alderson[\s\S]{0,100}Review',

        # Remove [2006 - 2016] header/footer official
        r'Alderson[\s\S]{0,100}[oO]fficial',
    
        # Remove Genric Alderson 
        r'Alderson Reporting Company',

        # Cut court reporter annotations
        r'[(\[][\s\S]{0,100}[)\]]',

        # Cut line numbers, page numbers, all other low-information numbers
        r'[0-9]',

        # Cut PAGE
        r'[Pp][Aa][Gg][Ee]',

    ]    

    # Replace above patterns with empty space.
    for pattern in patterns_to_cut:
        df['TEXT'] = df['TEXT'].str.replace(pat=pattern,
                                            repl='',
                                            flags=re.MULTILINE)
    return df


# Run function.
arg_df = cut_unnecessary_text(arg_df)

In [14]:
# Show df for clarity
arg_df.head(3)


Out[14]:
CASE DOCKET ARGUMENT_YEAR ARGUMENT_LINK ARGUMENT_PATH TEXT
0 Hudson v. Michigan (Reargued) 04-1360 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/04-1360/argument.pdf \n\n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe...
1 Washington v. Recuenco 05-83 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-83/argument.pdf \n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'l...
2 Burlington N. & S. F. R. Co. v. White 05-259 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-259/argument.pdf \n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'll he...

In [15]:
# Define function.
def create_heading_columns(df):
    '''This function finds the section headings for each case'''
    # Create Petitioner oral argument heading col
    pet_arg_pattern = ''.join([r'(',
                               r'ORAL ARGUMENT[\S\s]{,200}',
                               r'(?:PETITIONER|APPELLANT)S?',
                               # As appointed by this court optional
                               r'(?:[\S\s]{,50}THIS COURT)?',
                               r')'])
    df['PET_ARG_HEADING'] = df['TEXT'].str.extract(pet_arg_pattern,
                                                   expand=False,
                                                   flags=re.MULTILINE).fillna('')
        
    # Create Respondent oral argument heading col
    res_arg_pattern = ''.join([r'(ORAL ARGUMENT[\S\s]{,200}',
                               r'(?:RESPONDENT|APPELLEE)S?',
                               # As appointed by this court optional
                               r'(?:[\S\s]{,50}THIS COURT)?',
                               r')'])
    df['RES_ARG_HEADING'] = df['TEXT'].str.extract(res_arg_pattern,
                                                   expand=False,
                                                   flags=re.MULTILINE).fillna('')
    
    # Create Petitioner rebuttal heading col
    pet_reb_pattern = ''.join([r'(REBUTTAL ARGUMENT[\S\s]{,200}',
                               r'(?:PETITIONER|APPELLANT)S?',
                               # As appointed by this court optional
                               r'(?:[\S\s]{,50}THIS COURT)?',
                               r')'])
    df['PET_REB_HEADING'] = df['TEXT'].str.extract(pet_reb_pattern,
                                                   expand=False,
                                                   flags=re.MULTILINE).fillna('')
    
    return df


# TODO:
# IN ##-#### optional ... r'(?:[\S\s]{,10}IN[\S\s]{,5}-)?'

# Run function
arg_df = create_heading_columns(arg_df).fillna('')

In [16]:
# Define function.
def extract_petitioner_arg(df):
    '''Pulls out petitioner argument using section headers.'''

    # Create extraction (between pet arg heading and res arg heading)
    df['PET_ARG_REGEX'] = df.apply(lambda row: ''.join([row['PET_ARG_HEADING'],
                                                        r'([\S\s]*?)',
                                                        r'(?:ORAL)']),
                                   axis=1)
    
    # Extract and create petitioner argument column
    df['PETITIONER_ARGUMENT'] = df.apply(lambda row: re.findall(row['PET_ARG_REGEX'],
                                                                row['TEXT'],
                                                                flags=re.MULTILINE),
                                         axis=1)
    
    # If no match, empty string. Else, take match's.
    df['PETITIONER_ARGUMENT'] = df['PETITIONER_ARGUMENT'].map(lambda matches: ''.join(matches))
    
    return df


# Run function.
arg_df = extract_petitioner_arg(arg_df)

In [17]:
# Define function.
def extract_respondent_arg(df):
    '''Pulls out respondent argument using previously generated section heads.'''
    # Get respondent argument (between res arg heading and pet reb heading)
    df['RES_ARG_REGEX'] = df.apply(lambda row: ''.join([row['RES_ARG_HEADING'],
                                                        r'([\S\s]*?)',
                                                        r'(?:REBUTTAL)|(?:ORAL)']),
                                   axis=1)

    df['RESPONDENT_ARGUMENT'] = df.apply(lambda row: re.findall(row['RES_ARG_REGEX'],
                                                                row['TEXT'],
                                                                flags=re.MULTILINE),
                                         axis=1)
    
    # If no match, empty string. Else, take match's.
    df['RESPONDENT_ARGUMENT'] = df['RESPONDENT_ARGUMENT'].map(lambda matches: ''.join(matches))
    
    return df


# Run function.
arg_df = extract_respondent_arg(arg_df)

In [18]:
# Define function.
def extract_petitioner_reb(df):
    '''Pulls out petitioner rebuttal using previously generated section heads.'''
    # Get petitioner rebuttal (between pet reb heading and res reb heading)
    df['PET_REB_REGEX'] = df.apply(lambda row: ''.join([row['PET_REB_HEADING'],
                                                        r'([\S\s]*?)',
                                                        r'(?:\Z)']),
                                   axis=1)

    df['PETITIONER_REBUTTAL'] = df.apply(lambda row: re.search(row['PET_REB_REGEX'],
                                                               row['TEXT'],
                                                               flags=re.MULTILINE).group(1),
                                         axis=1)
    
    return df


# Run function.
arg_df = extract_petitioner_reb(arg_df)

In [19]:
# TODO. If transcript omits info (e.g. SAMSUNG/WAXMAN 15-777), no match.
len(arg_df)


Out[19]:
854

In [20]:
# Show dataframe for clarity
arg_df.head(3)


Out[20]:
CASE DOCKET ARGUMENT_YEAR ARGUMENT_LINK ARGUMENT_PATH TEXT PET_ARG_HEADING RES_ARG_HEADING PET_REB_HEADING PET_ARG_REGEX PETITIONER_ARGUMENT RES_ARG_REGEX RESPONDENT_ARGUMENT PET_REB_REGEX PETITIONER_REBUTTAL
0 Hudson v. Michigan (Reargued) 04-1360 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/04-1360/argument.pdf \n\n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe... ORAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON BEHA... ORAL ARGUMENT OF TIMOTHY A. BAUGHMAN\n\n\n\nON... REBUTTAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON ... ORAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON BEHA... \n\n\n\n\nMR. MORAN:\n\nMr. Chief Justice, and... ORAL ARGUMENT OF TIMOTHY A. BAUGHMAN\n\n\n\nON... \n\n\n\n\n\nMR. BAUGHMAN:\n\nThank you, Mr. Mo... REBUTTAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON ... \n\n\n\nMR. MORAN:\n\n\n\nThank you --\n\nCHIE...
1 Washington v. Recuenco 05-83 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-83/argument.pdf \n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'l... ORAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nON BE... ORAL ARGUMENT OF GREGORY C. LINK\n\n\n\nON BEH... REBUTTAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nO... ORAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nON BE... \n\n\n\n\n\nMR. WHISMAN:\n\nMr. Chief Justice,... ORAL ARGUMENT OF GREGORY C. LINK\n\n\n\nON BEH... \n\n\n\n\nAnd on\n\nMR. LINK:\n\nMr. Chief Jus... REBUTTAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nO... \n\n\n\nJUSTICE SCALIA:\n\nThank you, counsel....
2 Burlington N. & S. F. R. Co. v. White 05-259 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-259/argument.pdf \n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'll he... ORAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\nON ... ORAL ARGUMENT OF DONALD A. DONATI\n\n\n\nON BE... REBUTTAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\... ORAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\nON ... \n\n\n\n\n\nMR. PHILLIPS:\n\nThank you, Mr. Ch... ORAL ARGUMENT OF DONALD A. DONATI\n\n\n\nON BE... \n\n\n\n\n\nMR. DONATI:\n\nMr. Chief Justice, ... REBUTTAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\... \n\n\n\nMR. PHILLIPS:\n\nThank you, Mr. Chief ...

In [21]:
bak = arg_df.copy()

In [22]:
def split_arguments(df):
    '''Split argument into a series of comments.'''    
    # Must be double quote because raw string addressing ' for O'Connor
    comment_pattern = r"([A-Z.'\s]{5,25}:\s[\s\S]*?)(?=[A-Z'.\s]{5,25}[:\Z])"
    for column in ['PETITIONER_ARGUMENT',
                   'RESPONDENT_ARGUMENT',
                   'PETITIONER_REBUTTAL']:
        # We only want periods in the middle of names.
        df[column] = df[column].str.findall(comment_pattern)
    return df


# Run functions
arg_df = split_arguments(arg_df)

In [23]:
def tuplify_cell(cell_value):
    '''Helper function for tuplify_argument().'''
    return_value = []
    for comment in cell_value:
        justice, _, comment = comment.partition(':')
        return_value.append(tuple([justice.replace('.', '').strip(),
                                   comment.strip()]))
    return return_value


def tuplify_arguments(df):
    '''Turn question strings into (justice, text) tuples.'''
    for column in ['PETITIONER_ARGUMENT',
                   'RESPONDENT_ARGUMENT',
                   'PETITIONER_REBUTTAL']:
        df[column] = df[column].map(tuplify_cell)
    return df.fillna('')


# Run function
arg_df = tuplify_arguments(arg_df)

In [24]:
def condense_cell(cell_value):
    '''Helper function for condense_argument().'''
    return_dict = {}
    for input_tuple in cell_value:
        justice, comment = input_tuple
        try:
            return_dict[justice].append(comment)
        except KeyError:
            return_dict[justice] = [comment]
    return return_dict


def condense_arguments(df):
    '''Turn args into: {'justice': ['comment 1', 'comment 2']}'''
    for column in ['PETITIONER_ARGUMENT',
                   'RESPONDENT_ARGUMENT',
                   'PETITIONER_REBUTTAL']:
        df[column] = df[column].map(condense_cell)
    return df


arg_df = condense_arguments(arg_df)

In [25]:
arg_df.head(3)


Out[25]:
CASE DOCKET ARGUMENT_YEAR ARGUMENT_LINK ARGUMENT_PATH TEXT PET_ARG_HEADING RES_ARG_HEADING PET_REB_HEADING PET_ARG_REGEX PETITIONER_ARGUMENT RES_ARG_REGEX RESPONDENT_ARGUMENT PET_REB_REGEX PETITIONER_REBUTTAL
0 Hudson v. Michigan (Reargued) 04-1360 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/04-1360/argument.pdf \n\n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe... ORAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON BEHA... ORAL ARGUMENT OF TIMOTHY A. BAUGHMAN\n\n\n\nON... REBUTTAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON ... ORAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON BEHA... {'JUSTICE SCALIA': ['Counsel, what -- what do ... ORAL ARGUMENT OF TIMOTHY A. BAUGHMAN\n\n\n\nON... {'JUSTICE SCALIA': ['I'd be happy to argue tha... REBUTTAL ARGUMENT OF DAVID A. MORAN\n\n\n\nON ... {'CHIEF JUSTICE ROBERTS': ['You think there --...
1 Washington v. Recuenco 05-83 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-83/argument.pdf \n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'l... ORAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nON BE... ORAL ARGUMENT OF GREGORY C. LINK\n\n\n\nON BEH... REBUTTAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nO... ORAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nON BE... {'JUSTICE SCALIA': ['Was deadly weapon still a... ORAL ARGUMENT OF GREGORY C. LINK\n\n\n\nON BEH... {'JUSTICE SCALIA': ['It didn't It said the Ho... REBUTTAL ARGUMENT OF JAMES M. WHISMAN\n\n\n\nO... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ...
2 Burlington N. & S. F. R. Co. v. White 05-259 2005 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/05-259/argument.pdf \n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'll he... ORAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\nON ... ORAL ARGUMENT OF DONALD A. DONATI\n\n\n\nON BE... REBUTTAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\... ORAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\nON ... {'JUSTICE SCALIA': ['But has the language and... ORAL ARGUMENT OF DONALD A. DONATI\n\n\n\nON BE... {'JUSTICE SCALIA': ['I'm -- I'm a supervisor, ... REBUTTAL ARGUMENT OF CARTER G. PHILLIPS\n\n\n\... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice...

In [26]:
# Define function creating secondary df.
def create_scdb_df():
    '''Download data frome the SCDB and instantiate dataframe.'''

    # If we've already download database, just load.
    if os.path.exists(SCDB_CSV_PATH):
        pass
    else:
        # Get data
        r = requests.get(SCDB_CSV_DOWNLOAD_LINK, stream=True)
        with open(SCDB_ZIP_PATH, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024): 
                if chunk:
                    f.write(chunk)
        # Unzip context manager
        with zipfile.ZipFile(SCDB_ZIP_PATH) as zip_file:
            # Read data context manager
            with zip_file.open(SCDB_CSV_NAME) as pseudo_file:
                data = pseudo_file.read()
                # Write data with context manager and write to csv.
                with open(SCDB_CSV_PATH, 'wb+') as f:
                    f.write(data)
    
    # Now create dataframe from csv.
    case_df = pd.read_csv(SCDB_CSV_PATH, encoding='latin-1')

    return case_df


# Run df.
case_df = create_scdb_df()

In [27]:
# Show case dataframe for clarity
case_df.head(3)


Out[27]:
caseId docketId caseIssuesId voteId dateDecision decisionType usCite sctCite ledCite lexisCite ... majVotes minVotes justice justiceName vote opinion direction majority firstAgreement secondAgreement
0 1946-001 1946-001-01 1946-001-01-01 1946-001-01-01-01-01 11/18/1946 1 329 U.S. 1 67 S. Ct. 6 91 L. Ed. 3 1946 U.S. LEXIS 1724 ... 8 1 86 HHBurton 2.0 1.0 1.0 1.0 NaN NaN
1 1946-001 1946-001-01 1946-001-01-01 1946-001-01-01-01-02 11/18/1946 1 329 U.S. 1 67 S. Ct. 6 91 L. Ed. 3 1946 U.S. LEXIS 1724 ... 8 1 84 RHJackson 1.0 1.0 2.0 2.0 NaN NaN
2 1946-001 1946-001-01 1946-001-01-01 1946-001-01-01-01-03 11/18/1946 1 329 U.S. 1 67 S. Ct. 6 91 L. Ed. 3 1946 U.S. LEXIS 1724 ... 8 1 81 WODouglas 1.0 1.0 2.0 2.0 NaN NaN

3 rows × 61 columns


In [28]:
cut_arg_df = arg_df[['DOCKET',
                     'CASE', 
                     'PETITIONER_ARGUMENT',
                     #'RESPONDENT_ARGUMENT',
                     'PETITIONER_REBUTTAL']]

cut_arg_df.head(3)


Out[28]:
DOCKET CASE PETITIONER_ARGUMENT PETITIONER_REBUTTAL
0 04-1360 Hudson v. Michigan (Reargued) {'JUSTICE SCALIA': ['Counsel, what -- what do ... {'CHIEF JUSTICE ROBERTS': ['You think there --...
1 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ...
2 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice...

In [29]:
cut_case_df = case_df[['docket', 'majority', 'partyWinning', 'justiceName']]
cut_case_df.columns = ['DOCKET', 'majority', 'partyWinning', 'JUSTICE']

cut_case_df.head(3)


Out[29]:
DOCKET majority partyWinning JUSTICE
0 24 1.0 1.0 HHBurton
1 24 2.0 1.0 RHJackson
2 24 2.0 1.0 WODouglas

In [30]:
# Join case_df and arg_df to create joint dataframe jdf.
jdf = pd.merge(cut_arg_df,
               cut_case_df,
               how='left',
               on='DOCKET')

# Drop (Reargued) because it creates dupes.
contains_reargue = jdf['CASE'].str.contains('Reargue')
jdf = jdf[~contains_reargue]

In [31]:
# Show joint dataframe for clarity (all tail end will be np.NaN)
jdf.head(3)


Out[31]:
DOCKET CASE PETITIONER_ARGUMENT PETITIONER_REBUTTAL majority partyWinning JUSTICE
9 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 JGRoberts
10 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 1.0 1.0 JPStevens
11 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 AScalia

In [32]:
'''

From documentation on 'partyWinning' column:

http://scdb.wustl.edu/documentation.php?var=partyWinning

0: no favorable disposition for petitioning party apparent
1: petitioning party received a favorable disposition
2: favorable disposition for petitioning party unclear 

We want to be able to separate those who won from those who did not
win. Consequently, we drop all cases where the decision was
ambiguous, or the winner was not apparent.

We then convert the 0 to False and the 1 to True, which gives
us a True/False 'PETITIONER_WINS' column.

PETITIONER_WINS
0 -> False
1 -> True

If the petitioner wins, it is because it was the decision of the
majority of the court. We can accurately describe the nature of this
column as 'PETITIONER_WINS_MAJORITY'.

'''

jdf = jdf[jdf['partyWinning'] != 2.0].copy()
jdf['PETITIONER_WINS_MAJORITY'] = jdf['partyWinning'].astype(bool)

In [33]:
'''

From documentation on 'majority' columns:

http://scdb.wustl.edu/documentation.php?var=majority

1: dissent
2: majority 

We want to convert this into a 'VOTED_WITH_MAJORITY' column.
To do this we subtract one from each and every value so that
dissent becomes 0 and majority becomes 1.

majority
0: dissent (result 1 - 1)
1: majority (result from 2 - 1)

Then we convert the 0 to False and 1 to True, so that we have a
'VOTED_WITH_MAJORITY' column.

VOTED_WITH_MAJORITY
0 -> False
1 -> True

'''

jdf['majority_minus_one'] = jdf['majority'] - 1
jdf['VOTED_WITH_MAJORITY'] = jdf['majority_minus_one'].astype(bool)
jdf


Out[33]:
DOCKET CASE PETITIONER_ARGUMENT PETITIONER_REBUTTAL majority partyWinning JUSTICE PETITIONER_WINS_MAJORITY majority_minus_one VOTED_WITH_MAJORITY
9 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 JGRoberts True 1.0 True
10 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 1.0 1.0 JPStevens True 0.0 False
11 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 AScalia True 1.0 True
12 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 AMKennedy True 1.0 True
13 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 DHSouter True 1.0 True
14 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 CThomas True 1.0 True
15 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 1.0 1.0 RBGinsburg True 0.0 False
16 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 SGBreyer True 1.0 True
17 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 SAAlito True 1.0 True
18 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 JGRoberts False 1.0 True
19 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 JPStevens False 1.0 True
20 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 AScalia False 1.0 True
21 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 AMKennedy False 1.0 True
22 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 DHSouter False 1.0 True
23 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 CThomas False 1.0 True
24 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 RBGinsburg False 1.0 True
25 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 SGBreyer False 1.0 True
26 05-259 Burlington N. & S. F. R. Co. v. White {'JUSTICE SCALIA': ['But has the language and... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... 2.0 0.0 SAAlito False 1.0 True
27 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 2.0 0.0 JPStevens False 1.0 True
28 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 2.0 0.0 AScalia False 1.0 True
29 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 1.0 0.0 AMKennedy False 0.0 False
30 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 2.0 0.0 DHSouter False 1.0 True
31 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 1.0 0.0 CThomas False 0.0 False
32 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 2.0 0.0 RBGinsburg False 1.0 True
33 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 2.0 0.0 SGBreyer False 1.0 True
34 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 1.0 0.0 JGRoberts False 0.0 False
35 05-352 United States v. Gonzalez-Lopez {'JUSTICE SCALIA': ['When did -- when did we ... {'JUSTICE STEVENS': ['Well, Mr. Dreeben, do yo... 1.0 0.0 SAAlito False 0.0 False
36 05-5992 Zedner v. United States {'JUSTICE SCALIA': ['If the The delay -- the ... {'JUSTICE SCALIA': ['People can't And the I ... 2.0 1.0 JPStevens True 1.0 True
37 05-5992 Zedner v. United States {'JUSTICE SCALIA': ['If the The delay -- the ... {'JUSTICE SCALIA': ['People can't And the I ... 2.0 1.0 AScalia True 1.0 True
38 05-5992 Zedner v. United States {'JUSTICE SCALIA': ['If the The delay -- the ... {'JUSTICE SCALIA': ['People can't And the I ... 2.0 1.0 AMKennedy True 1.0 True
... ... ... ... ... ... ... ... ... ... ...
7147 15-1406 Goodyear Tire & Rubber Co. v. Haeger {'JUSTICE KAGAN': ['And, Mr. Bergeron, could ... {'JUSTICE SOTOMAYOR': ['There is something -- ... NaN NaN NaN True NaN True
7148 15-827 Endrew F. v. Douglas County School Dist. RE-1 {'MR FISHER': ['Mr. Chief Justice, and may it ... {'MR FISHER': ['Three points, Your Honors. Tw... NaN NaN NaN True NaN True
7149 15-1498 Lynch v. Dimaya {'MR KNEEDLER': ['Mr. Chief Justice, and may i... {'MR KNEEDLER': ['First, we explained in our o... NaN NaN NaN True NaN True
7150 16-348 Midland Funding, LLC v. Johnson {'JUSTICE KAGAN': ['Before you get to that ... {'MR SHANMUGAM': ['Thank you, Mr. Chief ... NaN NaN NaN True NaN True
7151 15-1293 Lee v. Tam {'MR STEWART': ['Thank you, Mr. Chief Justice,... {'MR STEWART': ['Let make three quick points. ... NaN NaN NaN True NaN True
7152 15-1358 Ziglar v. Abbasi {'GENERAL GERSHENGORN': ['Mr. Chief Justice, a... {'GENERAL GERSHENGORN': ['But if Thank you, M... NaN NaN NaN True NaN True
7153 15-8544 Beckles v. United States {'CHIEF JUSTICE ROBERTS': ['right? of depar... {'MS BERGMANN': ['I'd like to make -- start w... NaN NaN NaN True NaN True
7154 15-797 Moore v. Texas {'JUSTICE KAGAN': ['', 'We wouldn't need that,... {'CHIEF JUSTICE ROBERTS': [''], 'MR SLOAN': ['... NaN NaN NaN True NaN True
7155 15-1204 Jennings v. Rodriguez {'JUSTICE KAGAN': ['sorry', 'I was going to sh... {'GENERAL GERSHENGORN': ['Justice. Thank you,... NaN NaN NaN True NaN True
7156 15-680 Bethune-Hill v. Virginia State Bd. of Elections {'JUSTICE KAGAN': ['Mr. Elias, could I make su... {'MR ELIAS': ['Mr. Chief Justice, and may it ... NaN NaN NaN True NaN True
7157 15-1262 McCrory v. Harris {'JUSTICE KAGAN': ['Mr. Clement, that passage ... {'MR CLEMENT': ['A few points in rebuttal. ... NaN NaN NaN True NaN True
7158 14-1538 Life Technologies Corp. v. Promega Corp. {'JUSTICE KAGAN': ['that from? Where -- where... {'MR PHILLIPS': ['Thank you, Mr. Chief Justice... NaN NaN NaN True NaN True
7159 15-649 Czyzewski v. Jevic Holding Corp. {'JUSTICE KAGAN': ['', 'isn't mentioned somepl... {'MS SPINELLI': ['Respondents' position fails ... NaN NaN NaN True NaN True
7160 15-497 Fry v. Napoleon Community Schools {'JUSTICE KAGAN': ['Could -- could I ask about... {'MR BAGENSTOS': ['Thank you, Mr. Chief Justi... NaN NaN NaN True NaN True
7161 15-866 Star Athletica, L.L.C. v. Varsity Brands, Inc. {'JUSTICE KAGAN': ['How is your argument di... {'JUSTICE KAGAN': ['Well, can't the school jus... NaN NaN NaN True NaN True
7162 15-513 State Farm Fire & Casualty Co. v. United State... {'JUSTICE KAGAN': ['', '', 'couple of times --... {'MS SULLIVAN': ['When you write the opinion ... NaN NaN NaN True NaN True
7163 15-927 SCA Hygiene Products Aktiebolag v. First Quali... {'JUSTICE SOTOMAYOR': ['They have some cases ... {'JUSTICE KAGAN': ['Well, Mr. Black, I take it... NaN NaN NaN True NaN True
7164 15-423 Bolivarian Republic of Venezuela v. Helmerich ... {'JUSTICE KAGAN': ['In the last provision of ... {'MS STETSON': ['Ms. Carroll describes this c... NaN NaN NaN True NaN True
7165 15-1251 NLRB v. SW General, Inc. {'GENERAL GERSHENGORN': ['Mr. Chief Justice, a... {'GENERAL GERSHENGORN': ['Justice. Thank you,... NaN NaN NaN True NaN True
7166 15-1111 Bank of America Corp. v. Miami {'MR KATYAL': ['Thank you, Mr. Chief Justice, ... {'MR KATYAL': ['Four points, Your Honor. Fi... NaN NaN NaN True NaN True
7167 14-1055 Lightfoot v. Cendant Mortgage {'CHIEF JUSTICE ROBERTS': ['So a dozen You do... {'JUSTICE KAGAN': ['personal jurisdiction. ... NaN NaN NaN True NaN True
7168 15-1191 Lynch v. Morales-Santana {'MR KNEEDLER': ['Mr. Chief Justice, and may i... {'MR KNEEDLER': ['Thank you, Mr. Chief Justice... NaN NaN NaN True NaN True
7169 15-537 Bravo-Fernandez v. United States {'JUSTICE KAGAN': ['Ms. Blatt, it does seem to... {'MS BLATT': ['Thanks. Thank you, Mr. Chief ... NaN NaN NaN True NaN True
7170 15-5991 Shaw v. United States {'JUSTICE KAGAN': ['So, Ms. Bell, I guess -- I... {'MS BELL': ['Thank you, Mr. Chief Justice. ... NaN NaN NaN True NaN True
7171 15-628 Salman v. United States {'JUSTICE KAGAN': ['The only case Ms. Shapiro... {'CHIEF JUSTICE ROBERTS': ['Finish your sente... NaN NaN NaN True NaN True
7172 15-8049 Buck v. Davis {'JUSTICE KAGAN': ['But, for example, last yea... {'MS SWARNS': ['This Court has long recognized... NaN NaN NaN True NaN True
7173 14-9496 Manuel v. Joliet {'JUSTICE KAGAN': ['Mr. Eisenhammer, why shoul... {'MR EISENHAMMER': ['Thank you. Just to ans... NaN NaN NaN True NaN True
7174 15-777 Samsung Electronics Co., v. Apple Inc. {'JUSTICE KAGAN': ['Could I really quickly mak... {'JUSTICE BREYER': ['The problem, of course, i... NaN NaN NaN True NaN True
7175 15-606 Pena-Rodriguez v. Colorado {'MR FISHER': ['Mr. Chief Justice, and may it ... {'MR FISHER': ['Thank you. I'd like to make ... NaN NaN NaN True NaN True
7176 15-7250 Manrique v. United States {'JUSTICE KAGAN': ['Mr. Rashkind --', 'Mr. Ras... {'MR RASHKIND': ['Thank you, Your Honor. If... NaN NaN NaN True NaN True

7132 rows × 10 columns


In [34]:
'''

We can determine whether a petitioner won over a specific justice based on:

1. Whether the petitioner won over a majority, and
2. Whether the specific justice was a part of that majority.

If the answer to both of these questions is the same (that is, either
both the answers are Yes or both the answers are no), then the
petitioner won over the justice.

Logically:

    P_WINS_MAJ,  J_VOTES_MAJ = P_WINS_J
    
        If petitioner wins majority and justice voted with majority, the petitioner won over the justice

    P_WINS_MAJ, ~J_VOTES_MAJ = P_LOSES_J
    
        If petitioner wins majority and justice NOT a part of the majority, petitioner did not win justice

   ~P_WINS_MAJ,  J_VOTES_MAJ = P_LOSES_J
   
        If petitioner does NOT win majority and justice voted in majority, petitioner did not win justice

   ~P_WINS_MAJ,  ~J_VOTES_MAJ = P_WINS_J
   
        If petitioner does NOT win majority and justice voted NOT in majority, petitioner won justice


'''

def determine_vote(row):
    # If petitioner wins majority
    if row['PETITIONER_WINS_MAJORITY']:
        # Pet wins majority AND justice voted with majority
        if row['VOTED_WITH_MAJORITY']:
            return True
        # Pet wins majority AND justice voted against majority
        else:
            return False
    # If petitioner loses majority
    else:
        # Pet loses majority AND justice voted with majority
        if row['VOTED_WITH_MAJORITY']:
            return False
        # Pet loses majority AND justice voted against majority
        else:
            return True

# Voted with majority
jdf['VOTED_FOR_PETITIONER'] = jdf.apply(determine_vote, axis=1)

In [35]:
jdf.head(3)


Out[35]:
DOCKET CASE PETITIONER_ARGUMENT PETITIONER_REBUTTAL majority partyWinning JUSTICE PETITIONER_WINS_MAJORITY majority_minus_one VOTED_WITH_MAJORITY VOTED_FOR_PETITIONER
9 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 JGRoberts True 1.0 True True
10 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 1.0 1.0 JPStevens True 0.0 False False
11 05-83 Washington v. Recuenco {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... 2.0 1.0 AScalia True 1.0 True True

In [36]:
# Demonstration dataframe
pd.DataFrame(data={'Justice Votes With Majority': ['Petitioner Wins Justice',
                                                   'Petitioner Loses Justice'], 
                   'Justice Votes Againt  Majority': ['Petitioner Loses Justice',
                                                      'Petitioner Wins Justice']},
             index=['Petitioner Wins Majority',
                    'Petitioner Loses Majority'])


Out[36]:
Justice Votes Againt Majority Justice Votes With Majority
Petitioner Wins Majority Petitioner Loses Justice Petitioner Wins Justice
Petitioner Loses Majority Petitioner Wins Justice Petitioner Loses Justice

In [37]:
jdf[['CASE', 'JUSTICE', 'VOTED_FOR_PETITIONER']].dropna().head(9)


Out[37]:
CASE JUSTICE VOTED_FOR_PETITIONER
9 Washington v. Recuenco JGRoberts True
10 Washington v. Recuenco JPStevens False
11 Washington v. Recuenco AScalia True
12 Washington v. Recuenco AMKennedy True
13 Washington v. Recuenco DHSouter True
14 Washington v. Recuenco CThomas True
15 Washington v. Recuenco RBGinsburg False
16 Washington v. Recuenco SGBreyer True
17 Washington v. Recuenco SAAlito True

In [38]:
# Define function.
def trim_columns(df):

    # Trim columns
    df = df[['DOCKET',
             'CASE',
             'JUSTICE',
             'PETITIONER_ARGUMENT',
             #'RESPONDENT_ARGUMENT',
             'PETITIONER_REBUTTAL',
             'VOTED_FOR_PETITIONER']]
    
    return df
    
    
# Run function.
jdf = trim_columns(jdf)

In [39]:
# Show joint dataframe for clarity.
jdf.head(3)


Out[39]:
DOCKET CASE JUSTICE PETITIONER_ARGUMENT PETITIONER_REBUTTAL VOTED_FOR_PETITIONER
9 05-83 Washington v. Recuenco JGRoberts {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... True
10 05-83 Washington v. Recuenco JPStevens {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... False
11 05-83 Washington v. Recuenco AScalia {'JUSTICE SCALIA': ['Was deadly weapon still a... {'JUSTICE SCALIA': ['Thank you, counsel. Mr. ... True

In [40]:
# Define filter function.
def filter_justice_data(row):
    '''Converts SCDB: RHJackson to JACKSON, which can be pulled from JUSTICE JACKSON.
    
       Then for JACKSON:
             {'JUSTICE JACKSON': [1, 2],
              'JUSTICE ROBERTS': [2, 3]}

       Becomes [1,2] for JACKSON's row.
    
    '''
    
    # Handle SCDB justice names. Based on capitalization
    # which messes up SDOConnor -> Connor
    # SDOConner should be OCONNER, not CONNER.
    if row['JUSTICE'] == 'SDOConnor':
        row['JUSTICE'] = 'SDOconnor'
    # Pick first lower case letter and start name one previous
    lower_mask = [letter.islower() for letter in row['JUSTICE']]
    first_lower = lower_mask.index(True)
    one_prior = first_lower - 1
    row['JUSTICE'] = row['JUSTICE'][one_prior:].upper()
            
    # Handle text columns
    for index in ['PETITIONER_ARGUMENT',
                  #'RESPONDENT_ARGUMENT',
                  'PETITIONER_REBUTTAL',]:
        # Find if justice name is in any of the keys.
        # 1 if found in string, 0 if not.
        # [1, 0, 0] -> True
        justice_represented = any([key.count(row['JUSTICE']) for key in row[index].keys()])
        # If represented, fill with value.
        if justice_represented:
            for key in row[index].keys():
                if row['JUSTICE'] in key and 'JUSTICE' in key:
                    try:
                        row[index] = row[index][key]
                    except TypeError:
                        # Fallback to edit distance?
                        pass
            # If a number has not been placed in the cell, place zero.
            if type(row[index]) == dict:
                row[index] = []
        # If not represented
        else:
            row[index] = []

    return row


# Apply function. If justice is NA ... not yet decided
jdf = (jdf.dropna(subset=['JUSTICE'])
          .apply(filter_justice_data, axis=1))

In [41]:
# Show joint dataframe for clarity
jdf.head(3)


Out[41]:
DOCKET CASE JUSTICE PETITIONER_ARGUMENT PETITIONER_REBUTTAL VOTED_FOR_PETITIONER
9 05-83 Washington v. Recuenco ROBERTS [Is the jury given a\n\ncopy of the information?] [Although under\n\nAnd those are the\n\n\n\nso... True
10 05-83 Washington v. Recuenco STEVENS [Could you -- could you\n\n\n\nclarify one thi... [But then we'd have Justice\n\n\n\nScalia's ca... False
11 05-83 Washington v. Recuenco SCALIA [Was deadly weapon still an\n\n\n\nenhancement... [Thank you, counsel.\n\nMr. Whisman, can I -- ... True

In [42]:
# Write argument data df to csv
arg_data_csv_path = os.path.join(os.path.expanduser('~'),
                                 '.scoap',
                                 'argument_data.csv')
jdf.to_csv(arg_data_csv_path, encoding='utf-8')

In [43]:
# Create text_df
text_df = pd.melt(jdf,
                  id_vars=['JUSTICE',
                           'DOCKET',
                           'VOTED_FOR_PETITIONER'],
                  value_vars=['PETITIONER_ARGUMENT',
                              #'RESPONDENT_ARGUMENT',
                              'PETITIONER_REBUTTAL'],
                  var_name='ARG_TYPE',
                  value_name='TEXT')

text_df.head(3)


Out[43]:
JUSTICE DOCKET VOTED_FOR_PETITIONER ARG_TYPE TEXT
0 ROBERTS 05-83 True PETITIONER_ARGUMENT [Is the jury given a\n\ncopy of the information?]
1 STEVENS 05-83 False PETITIONER_ARGUMENT [Could you -- could you\n\n\n\nclarify one thi...
2 SCALIA 05-83 True PETITIONER_ARGUMENT [Was deadly weapon still an\n\n\n\nenhancement...

In [44]:
# Define function
def reorient_args(row):
    '''Apply function to make respondent arguments useful.
        
    WARNING: HAND-WAVY, UNSCIENTIFIC FEATURE ENGINEERING BELOW.
    THIS ACTUALLY DECREASES ACCURACY AT PRESENT.
    
    We hamfistedly force the petitioner argument, respondent
    argument, and petitioner rebuttal into a single type of
    entry. Where before we had:
    
    JUSTICE, PET_ARG, PETITIONER_WINS
    JUSTICE, RES_ARG, PETITIONER_WINS
    JUSTICE, PET_REB, PETITIONER_WINS
    
    We will now have:
    
    JUSTICE, PET_ARG, QUESTIONEE_WON
    JUSTICE, RES_ARG, QUESTIONEE_WON
    JUSTICE, PET_REB, QUESTIONEE_WON
    
    The first notable change is that we transform PETITIONER_WINS
    to QUESTIONEE_WON. Previously, we could see what text is
    associated with petitioner wins because target PET_WINS was
    framed in terms of the petitioner. It was previously useless
    for respondent comments. We can get around this by reframing
    the target in terms of "Did the party to whom the justice
    directed the comment win?" instead of "Did the petitioner
    win?".
    
    This requires a big assumption: namely that petitioner
    arguments, respondent arguments, and petitioner rebuttals
    are roughly interchangable. In other words, we are presuming
    that justices will use similar terms i.e. "Your argument is
    bad and you should feel bad" whether it's the petitioner or
    it's the respondent.
    
    This theoretically results in some loss of prediction quality:
    negative words directed at a respondent may be markedly
    different in quality from those directed at a petitioner.
    However, this trades off with the fact that we have uroughly
    doubled the number of samples.
    '''

    vote_pet = row['VOTED_FOR_PETITIONER']
    arg_type = row['ARG_TYPE']
    
    if arg_type in ['RESPONDENT_ARGUMENT']:
        if vote_pet is True:
            voted_for_speaker = False
        else:
            voted_for_speaker = True

    if arg_type in ['PETITIONER_ARGUMENT', 'PETITIONER_REBUTTAL']:
        if vote_pet is True:
            voted_for_speaker = True
        else:
            voted_for_speaker = False

    return voted_for_speaker


# Run function
text_df['QUESTIONEE_WON'] = text_df.apply(reorient_args, axis=1)
text_df


Out[44]:
JUSTICE DOCKET VOTED_FOR_PETITIONER ARG_TYPE TEXT QUESTIONEE_WON
0 ROBERTS 05-83 True PETITIONER_ARGUMENT [Is the jury given a\n\ncopy of the information?] True
1 STEVENS 05-83 False PETITIONER_ARGUMENT [Could you -- could you\n\n\n\nclarify one thi... False
2 SCALIA 05-83 True PETITIONER_ARGUMENT [Was deadly weapon still an\n\n\n\nenhancement... True
3 KENNEDY 05-83 True PETITIONER_ARGUMENT [On -- on that point, I have\n\n\n\n-- I have ... True
4 SOUTER 05-83 True PETITIONER_ARGUMENT [, Now, did you have to prove\n\n\n\nthat beca... True
5 THOMAS 05-83 True PETITIONER_ARGUMENT [] True
6 GINSBURG 05-83 False PETITIONER_ARGUMENT [I thought the deadly weapon\n\n\n\n-- the def... False
7 BREYER 05-83 True PETITIONER_ARGUMENT [] True
8 ALITO 05-83 True PETITIONER_ARGUMENT [] True
9 ROBERTS 05-259 False PETITIONER_ARGUMENT [It has been endorsed\n\nby the EEOC, though] False
10 STEVENS 05-259 False PETITIONER_ARGUMENT [May I ask you this\n\n\n\nhypothetical?\n\n\n... False
11 SCALIA 05-259 False PETITIONER_ARGUMENT [But has the language and\n\n\n\n doesn't.\n\... False
12 KENNEDY 05-259 False PETITIONER_ARGUMENT [from the forklift forever or a year?\n\n\n\nt... False
13 SOUTER 05-259 False PETITIONER_ARGUMENT [Okay, but if that argument\n\nis sound -, Yes... False
14 THOMAS 05-259 False PETITIONER_ARGUMENT [] False
15 GINSBURG 05-259 False PETITIONER_ARGUMENT [They say one lunch, but\n\n\n\nnot if there's... False
16 BREYER 05-259 False PETITIONER_ARGUMENT [Well, the answer would be\n\n\n\nbecause Cong... False
17 ALITO 05-259 False PETITIONER_ARGUMENT [But he says, you know, as you\n\nIsn't a chan... False
18 STEVENS 05-352 False PETITIONER_ARGUMENT [How do you reconcile your\n\n\n\nposition wit... False
19 SCALIA 05-352 False PETITIONER_ARGUMENT [When did -- when did we\n\n\n\nfirst hold tha... False
20 KENNEDY 05-352 True PETITIONER_ARGUMENT [But, of\n\nWell, that was my -- even\n\n\n\ni... True
21 SOUTER 05-352 False PETITIONER_ARGUMENT [It's a\n\nThe only issue in that case\n\n\n\n... False
22 THOMAS 05-352 True PETITIONER_ARGUMENT [] True
23 GINSBURG 05-352 False PETITIONER_ARGUMENT [But in this -- in this\n\n\n\ncase, Mr. Dreeb... False
24 BREYER 05-352 False PETITIONER_ARGUMENT [] False
25 ROBERTS 05-352 True PETITIONER_ARGUMENT [Well, but what if --\n\n\n\njust to take an e... True
26 ALITO 05-352 True PETITIONER_ARGUMENT [It's actually easier than\n\nWell, why would ... True
27 STEVENS 05-5992 True PETITIONER_ARGUMENT [] True
28 SCALIA 05-5992 True PETITIONER_ARGUMENT [If the\n\nThe delay -- the delay was\n\n\n\nn... True
29 KENNEDY 05-5992 True PETITIONER_ARGUMENT [Prior to that time,\nIt couldn't\n\nYou're no... True
... ... ... ... ... ... ...
14116 ALITO 14-280 False PETITIONER_REBUTTAL [] False
14117 SOTOMAYOR 14-280 True PETITIONER_REBUTTAL [] True
14118 KAGAN 14-280 True PETITIONER_REBUTTAL [] True
14119 ROBERTS 14-7505 True PETITIONER_REBUTTAL [] True
14120 SCALIA 14-7505 True PETITIONER_REBUTTAL [] True
14121 KENNEDY 14-7505 True PETITIONER_REBUTTAL [] True
14122 THOMAS 14-7505 True PETITIONER_REBUTTAL [] True
14123 GINSBURG 14-7505 True PETITIONER_REBUTTAL [] True
14124 BREYER 14-7505 True PETITIONER_REBUTTAL [] True
14125 ALITO 14-7505 False PETITIONER_REBUTTAL [] False
14126 SOTOMAYOR 14-7505 True PETITIONER_REBUTTAL [] True
14127 KAGAN 14-7505 True PETITIONER_REBUTTAL [] True
14128 ROBERTS 14-857 True PETITIONER_REBUTTAL [] True
14129 SCALIA 14-857 True PETITIONER_REBUTTAL [] True
14130 KENNEDY 14-857 False PETITIONER_REBUTTAL [] False
14131 THOMAS 14-857 False PETITIONER_REBUTTAL [] False
14132 GINSBURG 14-857 False PETITIONER_REBUTTAL [] False
14133 BREYER 14-857 False PETITIONER_REBUTTAL [] False
14134 ALITO 14-857 True PETITIONER_REBUTTAL [] True
14135 SOTOMAYOR 14-857 False PETITIONER_REBUTTAL [] False
14136 KAGAN 14-857 False PETITIONER_REBUTTAL [] False
14137 ROBERTS 14-840 True PETITIONER_REBUTTAL [] True
14138 SCALIA 14-840 False PETITIONER_REBUTTAL [] False
14139 KENNEDY 14-840 True PETITIONER_REBUTTAL [] True
14140 THOMAS 14-840 False PETITIONER_REBUTTAL [] False
14141 GINSBURG 14-840 True PETITIONER_REBUTTAL [] True
14142 BREYER 14-840 True PETITIONER_REBUTTAL [] True
14143 ALITO 14-840 True PETITIONER_REBUTTAL [] True
14144 SOTOMAYOR 14-840 True PETITIONER_REBUTTAL [] True
14145 KAGAN 14-840 True PETITIONER_REBUTTAL [] True

14146 rows × 6 columns


In [45]:
# Define function
def create_text_df(df):
    '''Clean up text.'''
    
    # ' '.join[question_1, question_2, question_3] so single string
    df['TEXT'] = df['TEXT'].map(lambda item: ' '.join(item))

    # Create string of punctuation chars to remove (but not '-')
    punctuation = string.punctuation.replace('-', '').replace('/', '')
        
    # Remove punctuation via [.!?,;:] regex
    df['TEXT'] = df['TEXT'].str.replace('[' +
                                        punctuation +
                                        ']',
                                        # Replacement value
                                        '')

    # Remove double dash pattern
    df['TEXT'] = df['TEXT'].str.replace('--',
                                        # Replacement value
                                        '')
    
    # Get rid of all items without text.
    df = df.loc[df['TEXT'].str.strip().str.len() > 0,:]

    return df


# Run function
text_df = create_text_df(text_df)
text_df


Out[45]:
JUSTICE DOCKET VOTED_FOR_PETITIONER ARG_TYPE TEXT QUESTIONEE_WON
0 ROBERTS 05-83 True PETITIONER_ARGUMENT Is the jury given a\n\ncopy of the information True
1 STEVENS 05-83 False PETITIONER_ARGUMENT Could you could you\n\n\n\nclarify one thing ... False
2 SCALIA 05-83 True PETITIONER_ARGUMENT Was deadly weapon still an\n\n\n\nenhancement ... True
3 KENNEDY 05-83 True PETITIONER_ARGUMENT On on that point I have\n\n\n\n I have one qu... True
4 SOUTER 05-83 True PETITIONER_ARGUMENT Now did you have to prove\n\n\n\nthat because... True
6 GINSBURG 05-83 False PETITIONER_ARGUMENT I thought the deadly weapon\n\n\n\n the defini... False
9 ROBERTS 05-259 False PETITIONER_ARGUMENT It has been endorsed\n\nby the EEOC though False
10 STEVENS 05-259 False PETITIONER_ARGUMENT May I ask you this\n\n\n\nhypothetical\n\n\n\n... False
11 SCALIA 05-259 False PETITIONER_ARGUMENT But has the language and\n\n\n\n doesnt\n\n\n... False
12 KENNEDY 05-259 False PETITIONER_ARGUMENT from the forklift forever or a year\n\n\n\nthe... False
13 SOUTER 05-259 False PETITIONER_ARGUMENT Okay but if that argument\n\nis sound - Yes bu... False
15 GINSBURG 05-259 False PETITIONER_ARGUMENT They say one lunch but\n\n\n\nnot if theres a ... False
16 BREYER 05-259 False PETITIONER_ARGUMENT Well the answer would be\n\n\n\nbecause Congre... False
17 ALITO 05-259 False PETITIONER_ARGUMENT But he says you know as you\n\nIsnt a change i... False
18 STEVENS 05-352 False PETITIONER_ARGUMENT How do you reconcile your\n\n\n\nposition with... False
19 SCALIA 05-352 False PETITIONER_ARGUMENT When did when did we\n\n\n\nfirst hold that t... False
20 KENNEDY 05-352 True PETITIONER_ARGUMENT But of\n\nWell that was my even\n\n\n\nin the... True
21 SOUTER 05-352 False PETITIONER_ARGUMENT Its a\n\nThe only issue in that case\n\n\n\nis... False
23 GINSBURG 05-352 False PETITIONER_ARGUMENT But in this in this\n\n\n\ncase Mr Dreeben we... False
25 ROBERTS 05-352 True PETITIONER_ARGUMENT Well but what if \n\n\n\njust to take an examp... True
26 ALITO 05-352 True PETITIONER_ARGUMENT Its actually easier than\n\nWell why would it ... True
28 SCALIA 05-5992 True PETITIONER_ARGUMENT If the\n\nThe delay the delay was\n\n\n\nnot ... True
29 KENNEDY 05-5992 True PETITIONER_ARGUMENT Prior to that time\nIt couldnt\n\nYoure not qu... True
30 SOUTER 05-5992 True PETITIONER_ARGUMENT And it makes very clear\n\nNo but isnt isnt t... True
32 GINSBURG 05-5992 True PETITIONER_ARGUMENT For\n\nWhat about Mr Zas that\n\n\n\nthe court... True
33 BREYER 05-5992 True PETITIONER_ARGUMENT Yes but on on that one\n\n\n\nthe the Second... True
34 ROBERTS 05-5992 True PETITIONER_ARGUMENT of justice finding for all time\n\n\n\nbeginni... True
36 STEVENS 05-18 False PETITIONER_ARGUMENT Do you think the right\n\n\n\nincludes any cos... False
37 SCALIA 05-18 True PETITIONER_ARGUMENT I thought the GAO study\n\n\n\nincluded not ju... True
38 KENNEDY 05-18 True PETITIONER_ARGUMENT But\n\nI I would have thought\n\n\n\nthat you... True
... ... ... ... ... ... ...
13149 SCALIA 13-550 True PETITIONER_REBUTTAL But if ­­ if we agree that \n\nthere had to be... True
13150 KENNEDY 13-550 True PETITIONER_REBUTTAL Where ­­ where are you \n\nreading from Well y... True
13152 GINSBURG 13-550 True PETITIONER_REBUTTAL You also said did you \n\n\n\nnot that ­­ that... True
13153 BREYER 13-550 True PETITIONER_REBUTTAL Do you think both parties \n\n\n\nagree that t... True
13154 ALITO 13-550 True PETITIONER_REBUTTAL If we forget what happened \n\n\n\nat earlier ... True
13155 SOTOMAYOR 13-550 True PETITIONER_REBUTTAL One of the things that I \n\n\n\nwas looking f... True
13156 KAGAN 13-550 True PETITIONER_REBUTTAL And if I could just ask \n\n\n\nagain Mr Frede... True
13382 ROBERTS 13-1174 True PETITIONER_REBUTTAL Well hear argument \n\n\n\nfirst this morning ... True
13383 SCALIA 13-1174 True PETITIONER_REBUTTAL But these statutes do not \n\n\n\nspeak of con... True
13384 KENNEDY 13-1174 True PETITIONER_REBUTTAL But ­­ but your ­­ your \n\n\n\nanswer is that... True
13386 GINSBURG 13-1174 True PETITIONER_REBUTTAL Doesnt Rule  say \n\nunless otherwise provided... True
13387 BREYER 13-1174 True PETITIONER_REBUTTAL How do you do that \n\nBecause not ­­ not all ... True
13388 ALITO 13-1174 True PETITIONER_REBUTTAL Well but that ­­\nAnd that is Petitioners bri... True
13389 SOTOMAYOR 13-1174 True PETITIONER_REBUTTAL Could you tell me \n\n\n\nfirst what differenc... True
13390 KAGAN 13-1174 True PETITIONER_REBUTTAL Are you then saying that \n\n\n\ntheres no pra... True
13409 ROBERTS 13-1075 True PETITIONER_REBUTTAL We will hear \n\n\n\nargument next this mornin... True
13410 SCALIA 13-1075 True PETITIONER_REBUTTAL Didnt ­­ didnt the \n\n\n\nprivate right of a... True
13411 KENNEDY 13-1075 False PETITIONER_REBUTTAL Were ­­ were those ­­\nDidnt our cases ­­\n­­... False
13413 GINSBURG 13-1075 False PETITIONER_REBUTTAL I thought at least if a \n\n\n\nstatute passed... False
13414 BREYER 13-1075 False PETITIONER_REBUTTAL Well if that ­­ if thats \n\nall ­­ If in fact... False
13416 SOTOMAYOR 13-1075 False PETITIONER_REBUTTAL You are equating \n\n\n\nreasonable cause or ­... False
13417 KAGAN 13-1075 False PETITIONER_REBUTTAL Ms ­­ Ms Prelogar Im \n\n\n\nwondering what y... False
13427 ROBERTS 13-435 True PETITIONER_REBUTTAL Well hear argument \n\n\n\nnext this morning i... True
13428 SCALIA 13-435 True PETITIONER_REBUTTAL Do you think that ­­ Justice Scalia \nDo you t... True
13429 KENNEDY 13-435 True PETITIONER_REBUTTAL If we adopt your position \n\n\n\nwhich I take... True
13431 GINSBURG 13-435 True PETITIONER_REBUTTAL So youre saying that \n\nthis evidence what we... True
13432 BREYER 13-435 True PETITIONER_REBUTTAL But suppose it is actually \n\n\n\ndisputed an... True
13433 ALITO 13-435 True PETITIONER_REBUTTAL Well that may be true but \n\n\n\ndo you deny ... True
13434 SOTOMAYOR 13-435 True PETITIONER_REBUTTAL Whats wrong with that \n\n\n\nTheres an assump... True
13435 KAGAN 13-435 True PETITIONER_REBUTTAL Well Mr Shanmugam \n\nsuppose ­­ Mr Shanmugam ... True

5225 rows × 6 columns


In [46]:
# Create test/train split for text data
split = sklearn.model_selection.train_test_split

# Split test and train.
train_text_df, test_text_df = split(text_df, test_size = 0.2)
train_text_df = train_text_df.copy()
test_text_df = test_text_df.copy()

In [47]:
train_text_df.head(3)


Out[47]:
JUSTICE DOCKET VOTED_FOR_PETITIONER ARG_TYPE TEXT QUESTIONEE_WON
4984 SOTOMAYOR 11-1351 True PETITIONER_ARGUMENT Well one of the\n\n\n\nstrongest arguments by ... True
4969 SCALIA 11-9953 False PETITIONER_ARGUMENT Mr Bourke was was it\n\nwithin the control of... False
7278 ROBERTS 04-10566 False PETITIONER_REBUTTAL Where you have a treaty that becomes part of\n... False

In [48]:
train_text_df['JUSTICE'].unique()


Out[48]:
array(['SOTOMAYOR', 'SCALIA', 'ROBERTS', 'GINSBURG', 'ALITO', 'STEVENS',
       'KENNEDY', 'BREYER', 'KAGAN', 'SOUTER', 'THOMAS'], dtype=object)

In [49]:
# Define function
def create_pipelines(df):
    '''Creates pipelines for each justice.'''
    # Basic setup.
    gb = text_df.groupby('JUSTICE')
    justices = df['JUSTICE'].unique()
    dataframes = [gb.get_group(justice) for justice in justices]
    nb_pipelines = []
    sgd_pipelines = []
    rf_pipelines = []
    
    # Probably a vectorized way to do this.
    for justice, dataframe in zip(justices, dataframes):

        # Make aliases
        Pipe = sklearn.pipeline.Pipeline
        Vectorizer = sklearn.feature_extraction.text.CountVectorizer
        Transformer = sklearn.feature_extraction.text.TfidfTransformer
        MultiNB = sklearn.naive_bayes.MultinomialNB
        SGD = sklearn.linear_model.SGDClassifier
        RF = sklearn.ensemble.RandomForestClassifier

        # Reuseable arguments.
        vectorizer_params = {'ngram_range': (3, 5),
                             'min_df': 10}
        
        transformer_params = {'use_idf': True}

        ############# Multinomial Naive Bayes classifier
        nb_pipeline = Pipe([('vectorizer', Vectorizer(**vectorizer_params)),
                            ('transformer', Transformer(**transformer_params)),
                            ('classifier', MultiNB()),])
        try:
            nb_pipeline = nb_pipeline.fit(dataframe['TEXT'],
                                          dataframe['QUESTIONEE_WON'])
        except (ValueError, AttributeError):
            nb_pipeline = None
        nb_pipelines.append(nb_pipeline)

        ############ Gradient descent SGD
        sgd_pipeline = Pipe([('vectorizer', Vectorizer(**vectorizer_params)),
                             ('transformer', Transformer(**transformer_params)),
                             ('classifier', SGD(loss='log', penalty='l2')),])    
        try:
            sgd_pipeline = sgd_pipeline.fit(dataframe['TEXT'],
                                            dataframe['QUESTIONEE_WON'])
        except (ValueError, AttributeError):
            sgd_pipeline = None
        sgd_pipelines.append(sgd_pipeline)
        
        ############ RF
        rf_pipeline = Pipe([('vectorizer', Vectorizer(**vectorizer_params)),
                            ('transformer', Transformer(**transformer_params)),
                            ('classifier', RF(n_estimators=100))])    
        try:
            rf_pipeline = rf_pipeline.fit(dataframe['TEXT'],
                                          dataframe['QUESTIONEE_WON'])
        except (ValueError, AttributeError):
            rf_pipeline = None
        rf_pipelines.append(rf_pipeline)
    
    return [item for item in zip(justices,
                                 nb_pipelines, 
                                 sgd_pipelines, 
                                 rf_pipelines)]


# Create test and train pipelines
pipelines = create_pipelines(train_text_df)

In [50]:
pipelines[0][1]


Out[50]:
Pipeline(steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=10,
        ngram_range=(3, 5), preprocessor=None, stop_words=None,
      ...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [51]:
# Define function for creating an argument for add_predictions()
def create_model_dict(model_pipelines):
    '''For convenience we create an associative array of models.'''
    model_dict = {}
    for justice, nb_pipe, sgd_pipe, rf_pipe in model_pipelines:
        # Nested dicts
        model_dict[justice] = {}
        model_dict[justice]['SGD'] = sgd_pipe
        model_dict[justice]['NB'] = nb_pipe
        model_dict[justice]['RF'] = rf_pipe 
    return model_dict


# Run function
model_dict = create_model_dict(pipelines)

In [52]:
# Define function to add predictions to test frame
def add_predictions(row, model_dict, model_type):
    '''Apply() function for adding predictions'''
    justice_name = row['JUSTICE']
    try:
        model = model_dict[justice_name][model_type]
        prediction = model.predict([row['TEXT']])[0]
    # If no model, predict will not be an attribute.
    # No justice, no peace (also no model).
    except (KeyError, AttributeError):
        return np.NaN
    return prediction


test_text_df['NB_PREDICTION'] = test_text_df.apply(add_predictions,
                                                   args=(model_dict, 'NB'),
                                                   axis=1).astype(bool)

test_text_df['SGD_PREDICTION'] = test_text_df.apply(add_predictions,
                                                    args=(model_dict, 'SGD'),
                                                    axis=1).astype(bool)

test_text_df['RF_PREDICTION'] = test_text_df.apply(add_predictions,
                                                   args=(model_dict, 'RF'),
                                                   axis=1).astype(bool)

In [53]:
test_text_df.head(3)


Out[53]:
JUSTICE DOCKET VOTED_FOR_PETITIONER ARG_TYPE TEXT QUESTIONEE_WON NB_PREDICTION SGD_PREDICTION RF_PREDICTION
8900 STEVENS 06-666 True PETITIONER_REBUTTAL We are\n\nWe are talking about not\n\njust a d... True False False False
8551 KENNEDY 06-11429 False PETITIONER_REBUTTAL It is simply our\n\nWell but there has to be\n... False True True False
4644 ROBERTS 11-798 True PETITIONER_ARGUMENT That matters that\n\n\n\nmatters in your view... True True True True

In [54]:
# Check output for clarity
test_text_df.head(3)

# Assess accuracy
score = sklearn.metrics.accuracy_score
test_text_df = test_text_df.dropna()

# Conduct scoring
nb_score = score(test_text_df['QUESTIONEE_WON'],
                 test_text_df['NB_PREDICTION'])
sgd_score = score(test_text_df['QUESTIONEE_WON'],
                  test_text_df['SGD_PREDICTION'])
rf_score = score(test_text_df['QUESTIONEE_WON'],
                 test_text_df['RF_PREDICTION'])

# Format as string
base_string = '''
\n
The Naive Bayes model scored {:.1%}.\n\n
The Stochastic Gradient Decent model scored {:.1%}.\n\n
The Random Forest model scored {:.1%}.\n\n

This can't be real. TODO.
'''

print(base_string.format(nb_score, sgd_score, rf_score))




The Naive Bayes model scored 72.9%.


The Stochastic Gradient Decent model scored 83.7%.


The Random Forest model scored 94.6%.



This can't be real. TODO.


In [55]:
score = sklearn.metrics.roc_auc_score(test_text_df['QUESTIONEE_WON'].values,
                                      test_text_df['RF_PREDICTION'].values)

score


Out[55]:
0.94360588874499463

In [56]:
# Define function
def get_nb_phrases(nb_pipeline, number):
    '''Pull relevant phrases from model'''
    nb_vec = nb_pipeline.named_steps['vectorizer']
    nb_clf = nb_pipeline.named_steps['classifier']
    nb_names = nb_vec.get_feature_names()
    # nb_clf.feature_log_prob_[0] is for False (voted against party)
    # nb_clf.feature_log_prob_[1] is for True (voted for party)
    nb_probs = nb_clf.feature_log_prob_[1]
    nb_series = pd.Series({name: prob
                           for name, prob
                           in zip(nb_names, nb_probs)})
    # Turn into series
    top_values_nb = nb_series.sort_values(ascending=False).head(number).copy()
    top_values_nb.name = 'Top Naive Bayes Log Prob'
    bottom_values_nb = nb_series.sort_values(ascending=True).head(number).copy()
    bottom_values_nb.name = 'Bottom Naive Bayes Log Prob'
    return (top_values_nb, bottom_values_nb)


# Define function
def get_sgd_phrases(sgd_pipeline, number):
    '''Pull phrases from model.'''
    sgd_clf = sgd_pipeline.named_steps['classifier']
    sgd_vec = sgd_pipeline.named_steps['vectorizer']
    sgd_names = sgd_vec.get_feature_names()
    # sgd_clf.coef_[0] is for False (voted against party)
    sgd_probs = sgd_clf.coef_[0]
    sgd_series = pd.Series({name: prob
                            for name, prob
                            in zip(sgd_names, sgd_probs)})
    # Turn into series.
    top_values_sgd = sgd_series.sort_values(ascending=False).head(number).copy()
    top_values_sgd.name = 'Top SGD Log Prob'
    bottom_values_sgd = sgd_series.sort_values(ascending=True).head(number).copy()
    bottom_values_sgd.name = 'Bottom SGD Log Prob'
    return(top_values_sgd, bottom_values_sgd)


# Define function
def get_rf_phrases(rf_pipeline, number):
    '''Pull phrases from model. Importances are both top and bottom items.'''
    rf_clf = rf_pipeline.named_steps['classifier']
    rf_vec = rf_pipeline.named_steps['vectorizer']
    rf_names = rf_vec.get_feature_names()
    # rf_clf.geature_importances
    rf_probs = rf_clf.feature_importances_
    rf_series = pd.Series({name: prob
                           for name, prob
                           in zip(rf_names, rf_probs)})
    # Turn into series.
    top_values_rf = rf_series.sort_values(ascending=False).head(number).copy()
    top_values_rf.name = 'Top RF Feature Imp'
    bottom_values_rf = rf_series.sort_values(ascending=True).head(number).copy()
    bottom_values_rf.name = 'Bottom RF Feature Imp'
    return(top_values_rf, bottom_values_rf)

In [57]:
# Define function
def create_phrase_series(pipelines, number=500):
    '''Top and bottom phrase DFs. Two columns per justice in each (SGD & NB models).'''
    
    # Create data holding dicts
    return_value = []
    
    # Iterate through pipelines to get data we need.
    for justice, nb_pipeline, sgd_pipeline, rf_pipeline in pipelines:
        
        # Skip any empty pipelines (insufficient comments)
        if any([nb_pipeline is None, 
                sgd_pipeline is None,
                rf_pipeline is None]):
            continue

        # Get actual phrases
        top_values_nb, bottom_values_nb = get_nb_phrases(nb_pipeline,
                                                         number)
        top_values_sgd, bottom_values_sgd = get_sgd_phrases(sgd_pipeline,
                                                            number)
        top_values_rf, bottom_values_rf = get_rf_phrases(rf_pipeline,
                                                         number)

        # Add to return value
        return_value.append({'justice': justice,
                             'TOP_NB': top_values_nb,
                             'BOTTOM_NB': bottom_values_nb,
                             'TOP_SGD': top_values_sgd,
                             'BOTTOM_SGD': bottom_values_sgd,
                             'TOP_RF': top_values_rf,
                             'BOTTOM_RF': bottom_values_rf})

    # Return list of dicts.
    return return_value
        
    
# Run function
justice_data = create_phrase_series(pipelines)

In [58]:
# Show sample data for clarity
pd.DataFrame(justice_data[6]['BOTTOM_RF']).head(3)


Out[58]:
Bottom RF Feature Imp
would have to 0.000326
to make the 0.000343
are talking about 0.000351

In [59]:
# Define function
def create_frequency_dfs(justice_data, text_df):
    '''This function takes bottom phrases and computes frequency.'''
        
    # Results
    bottom_phrase_results = []
    
    for data_dict in justice_data:
        
        # Get justice name
        justice = data_dict['justice']

        # Get bottom values
        bottom_nb_phrases = data_dict['BOTTOM_NB']
        bottom_sgd_phrases = data_dict['BOTTOM_SGD']
        bottom_rf_phrases = data_dict['BOTTOM_RF']
        bottom_phrases = (bottom_nb_phrases.append(bottom_sgd_phrases)
                                           .append(bottom_rf_phrases)
                                           .drop_duplicates()
                                           .index.values)
        
        # Create won and lost dataframes.
        won_df = text_df[(text_df['JUSTICE'] == justice) &
                         (text_df['QUESTIONEE_WON'] == True)]
        lost_df = text_df[(text_df['JUSTICE'] == justice) &
                          (text_df['QUESTIONEE_WON'] == False)]
        
        # To string
        won_string = won_df['TEXT'].str.lower().str.cat(sep=' ')
        lost_string = lost_df['TEXT'].str.lower().str.cat(sep=' ')

   
        # Calculate bottom phrases
        for phrase in bottom_phrases:
            won_count = 0
            lost_count = 0
            
            # Get counts
            won_count += won_string.count(phrase)
            lost_count += lost_string.count(phrase)
            all_count = won_count + lost_count
            if all_count == 0:
                percentage = np.NaN
            else:
                percentage = won_count / all_count
            # Stick in results (list of dicts)
            bottom_phrase_results.append({'JUSTICE': justice,
                                          'PHRASE': phrase,
                                          'AT_WINNER_COUNT': won_count,
                                          'AT_LOSER_COUNT': lost_count,
                                          'AT_WINNER_PERCENT': percentage})

    # Create bottom dataframe
    bottom_df = pd.DataFrame(bottom_phrase_results)
    bottom_df = bottom_df.set_index(['JUSTICE', 'PHRASE'])
    bottom_df = bottom_df[['AT_WINNER_COUNT', 'AT_LOSER_COUNT', 'AT_WINNER_PERCENT']]
    bottom_df['AT_LOSER_PERCENT'] = 1 - bottom_df['AT_WINNER_PERCENT']

    return bottom_df


# Run function
bottom_freq_df = create_frequency_dfs(justice_data, text_df)
bottom_freq_df.head(5).dropna()


Out[59]:
AT_WINNER_COUNT AT_LOSER_COUNT AT_WINNER_PERCENT AT_LOSER_PERCENT
JUSTICE PHRASE
SOTOMAYOR why should we 2 10 0.166667 0.833333
could you tell me what 3 5 0.375000 0.625000
that there are 2 6 0.250000 0.750000
are you suggesting 2 7 0.222222 0.777778
between the two 2 5 0.285714 0.714286

In [60]:
# Write bottom_freq_df to file
bottom_csv_path = os.path.join(DATA_FOLDER, 'bottom_phrases.csv')
bottom_freq_df.to_csv(bottom_csv_path, encoding='utf-8')

In [61]:
# Define function
def create_tabulation_df():
    '''(Case, justice, arg_vect) x  model.'''
    
    # Make justice/model multiindex for columns
    cases = CURRENT_CASES
    # Really should have standardized this earlier
    justices = [justice.upper() for justice in CURRENT_JUSTICES]
    arg_types = ['PETITIONER_ARGUMENT',
                 'RESPONDENT_ARGUMENT',
                 'PETITIONER_REBUTTAL']
    models = ['NB', 'SGD', 'RF']
    cja_index = pd.MultiIndex.from_product([cases, justices, arg_types])

    # Make dataframe
    tabulation_df = pd.DataFrame(index=cja_index, columns=models, data=np.NaN)
    return tabulation_df


# Run function
tabulation_df = create_tabulation_df()

In [62]:
# Demo for clarity ... should be empty.
tabulation_df.head(3)


Out[62]:
NB SGD RF
15-214 ROBERTS PETITIONER_ARGUMENT NaN NaN NaN
RESPONDENT_ARGUMENT NaN NaN NaN
PETITIONER_REBUTTAL NaN NaN NaN

In [63]:
def make_current_df(arg_df):
    # Create a lookup dataframe.
    lookup_df = arg_df[arg_df['DOCKET'].isin(CURRENT_CASES)]
    output_rows = []
    input_rows = [row.to_dict() for index, row in lookup_df.iterrows()]
    for justice in CURRENT_JUSTICES:
        for row_dict in input_rows:
            dict_copy = copy.deepcopy(row_dict)
            dict_copy['JUSTICE'] = justice
            output_rows.append(dict_copy)
    return pd.DataFrame.from_dict(output_rows)

# Create new df
current_df = make_current_df(arg_df)
# Run previous functions
current_df = current_df.apply(filter_justice_data, axis=1)
current_df.head(3)


Out[63]:
ARGUMENT_LINK ARGUMENT_PATH ARGUMENT_YEAR CASE DOCKET JUSTICE PETITIONER_ARGUMENT PETITIONER_REBUTTAL PET_ARG_HEADING PET_ARG_REGEX PET_REB_HEADING PET_REB_REGEX RESPONDENT_ARGUMENT RES_ARG_HEADING RES_ARG_REGEX TEXT
0 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/15-214/argument.pdf 2016 Murr v. Wisconsin 15-214 ROBERTS [, , , And those would\n\nI thought your\n\n\n... [] ORAL ARGUMENT OF JOHN M. GROEN\n\n\n\nON BEHAL... ORAL ARGUMENT OF JOHN M. GROEN\n\n\n\nON BEHAL... REBUTTAL ARGUMENT OF JOHN M. GROEN\n\n\n\nON B... REBUTTAL ARGUMENT OF JOHN M. GROEN\n\n\n\nON B... {'CHIEF JUSTICE ROBERTS': ['Well, that's -- an... ORAL ARGUMENT OF RICHARD J. LAZARUS\nON BEHALF... ORAL ARGUMENT OF RICHARD J. LAZARUS\nON BEHALF... \n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe w...
1 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/15-1031/argument.pdf 2016 Howell v. Howell 15-1031 ROBERTS [This -- this is a\n\n\n\npretty basic questio... [] ORAL ARGUMENT OF ADAM G. UNIKOWSKY\n\n\n\nON B... ORAL ARGUMENT OF ADAM G. UNIKOWSKY\n\n\n\nON B... REBUTTAL ARGUMENT OF ADAM G. UNIKOWSKI\n\n\n\n... REBUTTAL ARGUMENT OF ADAM G. UNIKOWSKI\n\n\n\n... {'CHIEF JUSTICE ROBERTS': ['', 'I think -What ... ORAL ARGUMENT OF CHARLES W. WIRKEN\n\n\n\nON B... ORAL ARGUMENT OF CHARLES W. WIRKEN\n\n\n\nON B... \n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'l...
2 https://www.supremecourt.gov/oral_arguments/ar... /home/theo/.scoap/15-1189/argument.pdf 2016 Impressions Products, Inc. v. Lexmark Int'l, Inc. 15-1189 ROBERTS [I'm sorry to\n\n\n\ninterrupt you, but that l... [] ORAL ARGUMENT OF ANDREW J. PINCUS\n\n\n\nON BE... ORAL ARGUMENT OF ANDREW J. PINCUS\n\n\n\nON BE... REBUTTAL ARGUMENT OF ANDREW J. PINCUS\n\n\n\nO... REBUTTAL ARGUMENT OF ANDREW J. PINCUS\n\n\n\nO... {'CHIEF JUSTICE ROBERTS': ['Why -- why is norm... ORAL ARGUMENT OF CONSTANTINE L. TRELA, JR.\n\n... ORAL ARGUMENT OF CONSTANTINE L. TRELA, JR.\n\n... \n\n\n\n\n\n\n\nCHIEF JUSTICE ROBERTS:\n\nWe'l...

In [64]:
# Define function
def create_lookup_series(current_df):
    '''Place text in df for processing. Each justice gets same data.'''
    # Flatten and reindex
    lookup_df = current_df[['DOCKET',
                            'JUSTICE',
                            'PETITIONER_ARGUMENT',
                            #'RESPONDENT_ARGUMENT',
                            'PETITIONER_REBUTTAL']]
    lookup_df = lookup_df.set_index(['DOCKET', 'JUSTICE'])
    # Flatten lists
    lookup_df = lookup_df.applymap(lambda x: ' '.join(x))
    # Make lookup series
    lookup_series = lookup_df.stack()
    # Sort and dedupe (where do dupes come from?)
    lookup_series.sort_index(inplace=True)
    lookup_series.drop_duplicates(inplace=True)
    return lookup_series


# Run
lookup_series = create_lookup_series(current_df)

In [65]:
lookup_series.head(3)


Out[65]:
DOCKET   JUSTICE                      
14-1055  ALITO     PETITIONER_ARGUMENT                                                     
         BREYER    PETITIONER_ARGUMENT    It's tough.\n\nI mean, I find\n\n\n\nthis pret...
         GINSBURG  PETITIONER_ARGUMENT    Does that include --\n\n\n\nyou -- you said su...
dtype: object

In [66]:
# Define function
def populate_tabulation_df(row, lookup_series):
    '''Fill in the dataframe. Meant to be applied.'''
    try:
        case, justice, arg_type = row.name
        # Make this not chained indexing
        value = lookup_series[case][justice][arg_type]
        row[['NB', 'SGD', 'RF']] = value, value, value
    except KeyError:
        row[['NB', 'SGD', 'RF']] = np.NaN, np.NaN, np.NaN
    return row


# Run function
tabulation_df = tabulation_df.apply(populate_tabulation_df,
                                    args=(lookup_series,),
                                    axis=1)

In [67]:
tabulation_df.head(3)


Out[67]:
NB SGD RF
15-214 ROBERTS PETITIONER_ARGUMENT And those would\n\nI thought your\n\n\n\nar... And those would\n\nI thought your\n\n\n\nar... And those would\n\nI thought your\n\n\n\nar...
RESPONDENT_ARGUMENT NaN NaN NaN
PETITIONER_REBUTTAL NaN NaN NaN

In [68]:
# Define function
def run_predictions(column, model_dict, tabulation_df):
    '''This applied function adds results to the result series.
    
    It is initially framed in terms of "QUESTIONEE_WINS", which is
    the output of the model.predict(). It is then converted to 
    "PLAINTIFF_WINS", by flipping the respondent argument (e.g.
    if questionee is plaintiff because plaintiff arg or
    plaintiff rebuttal, QUESTIONEE_WINS == PLAINTIFF_WINS ...
    if respondent argument, QUESTOINEE_WINS != PLAINTIFF_WINS).
    
    '''

    # There has to be a better way to vectorize with groupby.
    model_name = column.name
    
    # Need copy so we can iterrate and change "in place"
    column_copy = column.copy()
    
    # iterate through items
    for index, text in column_copy.iteritems():
        case, justice, arg_type = index
        # Get text
        try:
            model = model_dict[justice][model_name]
            # Cannot comapre np.NaN
            if model is None or pd.isnull(text):
                column.loc[index] = np.NaN
                continue
            # If you've already gone over it, it's bool. Therefore skip.
            if type(text) is np.bool_ or type(text) is bool:
                continue
            # Predict
            prediction = model.predict([text])[0]
            # Flip prediction because speaker -> party flip recorrect.
            if arg_type == 'RESPONDENT_ARGUMENT':
                prediction = not prediction
            # Write back to clolumn
            column.loc[index] = prediction
        except KeyError:
            column.loc[index] = np.NaN
    
    return column


# Run function
tabulation_df = tabulation_df.apply(run_predictions,
                                    axis=0,
                                    args=(model_dict, tabulation_df))

In [69]:
# Demo for clarity
tabulation_df.head(3)


Out[69]:
NB SGD RF
15-214 ROBERTS PETITIONER_ARGUMENT True True True
RESPONDENT_ARGUMENT NaN NaN NaN
PETITIONER_REBUTTAL NaN NaN NaN

In [70]:
def modified_sum(row):
    if row['RESPONDENT_VOTES'] < row['PETITIONER_VOTES']:
        return 'Petitioner'
    if row['RESPONDENT_VOTES'] > row['PETITIONER_VOTES']:
        return 'Respondent'
    else:
        return None

#### Define function
def calculate_votes(tabulation_df):
    # Consensus vector ... vectorize this.
    consensus = pd.Series(index=tabulation_df.index
                                             .droplevel(2)
                                             .copy(),
                          dtype='object')
    consensus.name = 'VOTES'
    # Iterate through tabulation
    tdf = tabulation_df.unstack()
    tdf = tdf.apply(lambda row: pd.value_counts(row.values), axis=1)
    tdf.columns = ['RESPONDENT_VOTES', 'PETITIONER_VOTES']
    tdf = tdf.fillna(0)
    tdf['VOTE'] = tdf.apply(modified_sum, axis=1)
    return tdf

# Run function
votes = calculate_votes(tabulation_df)
votes.head(8)


Out[70]:
RESPONDENT_VOTES PETITIONER_VOTES VOTE
14-1055 ALITO 0.0 3.0 Petitioner
BREYER 2.0 1.0 Respondent
GINSBURG 0.0 3.0 Petitioner
KAGAN 0.0 3.0 Petitioner
KENNEDY 0.0 0.0 None
ROBERTS 0.0 3.0 Petitioner
SOTOMAYOR 0.0 6.0 Petitioner
THOMAS 0.0 0.0 None

In [71]:
def harmonize_empty(votes, VOTING_RELATIONSHIPS):
    '''If null, make this justice copy another similarly-minded justice.'''

    voting_df = pd.DataFrame(VOTING_RELATIONSHIPS)
    # Don't want our inputed picks affecting other imputed picks.
    imputed_probabilities = []
        
    for index, row in votes.iterrows():
        # Parse
        case, justice = index
        if row['VOTE'] is None:
            # Get similarity rankings: ALITO: 7, BREYER: 2, KAGAN: 3
            similarity_ranks = voting_df.loc[justice].argsort()
            # Then rank so we have BREYER: 2, KAGAN: 3, ALITO: 7
            similarity_order = similarity_ranks.sort_values()
            # Similar justice list: [BREYER, KAGAN, ALITO]
            most_similar = similarity_order.index.values

            # Go through justice list to get closest.
            for sim_justice in most_similar:
                if sim_justice == 'SCALIA':
                    continue
                other_justice_prob = votes.loc[(case, sim_justice)]['VOTE']
                if other_justice_prob is None:
                    continue
                else:
                    imputed_probabilities.append({'case': case,
                                                  'justice': justice,
                                                  'prob': other_justice_prob})
                    
    # Now all imputed_probs are complete. Add back in.
    for prob in imputed_probabilities:
        index_tuple = tuple([prob['case'], prob['justice']])
        votes.loc[index_tuple, 'VOTE'] = prob['prob']
    return None


harmonize_empty(votes, VOTING_RELATIONSHIPS)
votes.head(8)


Out[71]:
RESPONDENT_VOTES PETITIONER_VOTES VOTE
14-1055 ALITO 0.0 3.0 Petitioner
BREYER 2.0 1.0 Respondent
GINSBURG 0.0 3.0 Petitioner
KAGAN 0.0 3.0 Petitioner
KENNEDY 0.0 0.0 Petitioner
ROBERTS 0.0 3.0 Petitioner
SOTOMAYOR 0.0 6.0 Petitioner
THOMAS 0.0 0.0 Respondent

In [72]:
def get_petitioner_votes(row):
    '''Helper function for apply.'''
    vc = row.value_counts()
    try:
        petitioner_count = vc['Petitioner']
    except KeyError:
        petitioner_count = 0
    return petitioner_count


def get_respondent_votes(row):
    '''Helper function for apply.'''
    vc = row.value_counts()
    try:
        respondent_count = vc['Respondent']
    except KeyError:
        respondent_count = 0
    return respondent_count


def process_votes(votes):
    '''Apply function for results.'''
    # Get rid of superfluous columns
    result = votes[['VOTE']]
    # Turn into dataframe.
    result = result.unstack()
    # Get rid of superfluous multiindex
    result.columns = result.columns.droplevel(0)
    # Add winner and loser counts.
    result['PET_VOTES'] = result.apply(get_petitioner_votes, axis=1)
    result['RES_VOTES'] = result.apply(get_respondent_votes, axis=1)
    # Arbitrary
    result['VICTOR'] = result['PET_VOTES'] > result['RES_VOTES']
    result['VICTOR'] = result['VICTOR'].map({True: 'Petitioner',
                                             False: 'Respondent'})
    return result


result = process_votes(votes)

try:
    result = result.drop('15-1112')
except Exception:
    pass

result


Out[72]:
ALITO BREYER GINSBURG KAGAN KENNEDY ROBERTS SOTOMAYOR THOMAS PET_VOTES RES_VOTES VICTOR
14-1055 Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner Respondent 6 2 Petitioner
14-1538 Respondent Petitioner Petitioner Respondent Respondent Petitioner Respondent Petitioner 4 4 Respondent
14-9496 Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner 7 1 Petitioner
15-1031 Petitioner Petitioner Petitioner Petitioner Petitioner Respondent Respondent Petitioner 6 2 Petitioner
15-1111 Respondent Respondent Petitioner Respondent Petitioner Petitioner Petitioner Petitioner 5 3 Petitioner
15-118 Respondent Petitioner Petitioner Respondent Petitioner Respondent Petitioner Petitioner 5 3 Petitioner
15-1189 Respondent Petitioner Respondent Respondent Petitioner Petitioner Respondent Petitioner 4 4 Respondent
15-1191 Respondent Respondent Petitioner Petitioner Petitioner Petitioner Petitioner Respondent 5 3 Petitioner
15-1194 Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner 7 1 Petitioner
15-1204 Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner 8 0 Petitioner
15-1248 Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner 8 0 Petitioner
15-1251 Respondent Respondent Petitioner Petitioner Petitioner Respondent Petitioner Respondent 4 4 Respondent
15-1262 Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Respondent Petitioner 7 1 Petitioner
15-1293 Petitioner Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Petitioner 7 1 Petitioner
15-1358 Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Respondent Respondent 5 3 Petitioner
15-1391 Respondent Respondent Petitioner Respondent Respondent Respondent Petitioner Respondent 2 6 Respondent
15-1406 Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner Respondent 6 2 Petitioner
15-1498 Petitioner Petitioner Petitioner Petitioner Respondent Petitioner Respondent Petitioner 6 2 Petitioner
15-1500 Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner 7 1 Petitioner
15-214 Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner 8 0 Petitioner
15-423 Petitioner Petitioner Respondent Petitioner Respondent Petitioner Petitioner Petitioner 6 2 Petitioner
15-497 Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner 7 1 Petitioner
15-513 Respondent Petitioner Petitioner Petitioner Petitioner Respondent Respondent Petitioner 5 3 Petitioner
15-537 Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Respondent Respondent 5 3 Petitioner
15-5991 Petitioner Respondent Respondent Respondent Petitioner Petitioner Respondent Respondent 3 5 Respondent
15-606 Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Respondent Petitioner 6 2 Petitioner
15-628 Respondent Petitioner Respondent Respondent Petitioner Petitioner Petitioner Petitioner 5 3 Petitioner
15-649 Respondent Respondent Respondent Respondent Petitioner Petitioner Petitioner Respondent 3 5 Respondent
15-680 Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Respondent Petitioner 6 2 Petitioner
15-7250 Respondent Petitioner Petitioner Respondent Petitioner Petitioner Petitioner Petitioner 6 2 Petitioner
15-777 Petitioner Respondent Petitioner Respondent Petitioner Petitioner Respondent Respondent 4 4 Respondent
15-797 Respondent Respondent Petitioner Respondent Petitioner Petitioner Petitioner Respondent 4 4 Respondent
15-8049 Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Respondent Respondent 6 2 Petitioner
15-827 Respondent Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner 6 2 Petitioner
15-8544 Respondent Respondent Petitioner Respondent Petitioner Petitioner Petitioner Respondent 4 4 Respondent
15-866 Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner 8 0 Petitioner
15-9260 Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Petitioner Respondent 6 2 Petitioner
15-927 Respondent Respondent Petitioner Respondent Respondent Petitioner Respondent Respondent 2 6 Respondent
16-1256 None None None None None None None None 0 0 Respondent
16-149 Petitioner Respondent Petitioner Petitioner Petitioner Petitioner Respondent Respondent 5 3 Petitioner
16-254 Respondent Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner 7 1 Petitioner
16-32 Respondent Petitioner Petitioner Petitioner Respondent Respondent Petitioner Petitioner 5 3 Petitioner
16-348 Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner Petitioner 8 0 Petitioner
16-369 Petitioner Respondent Respondent Petitioner Petitioner Respondent Respondent Respondent 3 5 Respondent
16-54 Respondent Petitioner Petitioner Petitioner Petitioner Respondent Petitioner Petitioner 6 2 Petitioner

In [74]:
# Write results to file.
result_csv_path = os.path.join(DATA_FOLDER, 'case_results.csv')
result.to_csv(result_csv_path, encoding='utf-8')

In [75]:
result['VICTOR'].value_counts()


Out[75]:
Petitioner    33
Respondent    12
Name: VICTOR, dtype: int64