In [1]:
DOC = '''Supreme Court Oral Argument Predictor (SCOAP)
Creates models for predicting outcomes of Supreme Court oral
arguments. Pulls justice-specific phrases associated with
winning and losing arguments.
LICENSE: MIT
AUTHOR: theonaunheim@gmail.com
COPYRIGHT: 2017, Theo Naunheim
VERSION: 0.4.3
MODIFIED: 2017-03-26
DATA DIR: .scoap
REQUIRES: Jupyter Notebook and Xpdf/Poppler
WARNING: THIS SCRIPT DOWNLOADS AND PROCESSES A LARGE
VOLUME OF MATERIAL. IT IS COMPUTATIONALLY
EXPENSIVE AND TAKES A NON-NEGLIGIBLE AMOUNT
OF TIME AND BANDWIDTH.
'''
In [2]:
# Standard library imports
import asyncio
import copy
import itertools
import os
import re
import string
import sys
import zipfile
# Web/data imports
import bs4
import numpy as np
import pandas as pd
import requests
# Scikit learn imports
import sklearn
import sklearn.feature_extraction
import sklearn.metrics
import sklearn.model_selection
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.pipeline
import sklearn.svm
import sklearn.ensemble
In [3]:
# Constants and constant-ish things.
# Debug flag cuts down amount of data used.
DEBUG = False
# Website URLs for downloads.
TRANSCRIPT_INFO = 'https://www.supremecourt.gov/oral_arguments/argument_transcript/'
TRANSCRIPT_DOWNLOADS = 'https://www.supremecourt.gov/oral_arguments/'
SCDB_CSV_DOWNLOAD_LINK = 'http://scdb.wustl.edu/_brickFiles/2016_01/SCDB_2016_01_justiceCentered_Docket.csv.zip'
# Transcript years for dynamic URL creation; get_argument_metadata()
# widens this range by one year on each side.
START_YEAR = 2006
END_YEAR = 2017
# OS-specific path for PDF to text extraction utility (pdftotext).
if os.name == 'nt':
    PDF2TEXT_PATH = r'C:\Program Files\Xpdf\pdftotext.exe'
elif os.name == 'posix':
    PDF2TEXT_PATH = '/usr/bin/pdftotext'
else:
    # Fail fast at import time on unsupported platforms.
    raise Exception('This script requires Xpdf/Poppler utility pdftotext to run.')
# Paths for SCOAP specific data (a ~/.scoap cache directory).
DATA_FOLDER = os.path.join(os.path.expanduser('~'), '.scoap')
# Zip name is the final URL path segment; CSV name drops the '.zip' suffix.
SCDB_ZIP_NAME = SCDB_CSV_DOWNLOAD_LINK.rpartition('/')[2]
SCDB_CSV_NAME = SCDB_ZIP_NAME.rpartition('.')[0]
SCDB_ZIP_PATH = os.path.join(DATA_FOLDER, SCDB_ZIP_NAME)
SCDB_CSV_PATH = SCDB_ZIP_PATH.rpartition('.')[0]
# The current term justices and cases we wish to analyze.
CURRENT_JUSTICES = ['Roberts', 'Kennedy', 'Thomas', 'Ginsburg', 'Breyer', 'Alito', 'Sotomayor', 'Kagan']
CURRENT_CASES = ['15-214', '15-1031', '15-497', '15-1189', '16-369',
                 '16-254', '15-118', '15-1248', '16-32', '15-1194',
                 '16-54', '15-9260', '16-149', '16-1256', '15-1500',
                 '15-1391', '15-1406', '15-827', '15-1498', '16-348',
                 '15-1293', '15-1358', '15-8544', '15-797', '15-1204',
                 '15-680', '15-1262', '14-1538', '15-649', '15-866',
                 '15-513', '15-927', '15-423', '15-1251', '15-1111',
                 '14-1055', '15-1191', '15-537', '15-5991', '15-628',
                 '15-8049', '14-9496', '15-777', '15-606', '15-7250',]
# Pairwise voting agreement rates for OT15, courtesy of
# http://www.scotusblog.com/statistics/ -- outer key agrees with inner
# key in the given fraction of cases (diagonal is 1.00 by construction).
VOTING_RELATIONSHIPS = {"KENNEDY"  :{"KENNEDY":1.00,"SCALIA":0.82,"THOMAS":0.71,"KAGAN":0.95,"ROBERTS":0.88,"GINSBURG":0.84,"ALITO":0.82,"BREYER":0.91,"SOTOMAYOR":0.79},
                        "SCALIA"   :{"KENNEDY":0.82,"SCALIA":1.00,"THOMAS":0.88,"KAGAN":0.82,"ROBERTS":0.88,"GINSBURG":0.71,"ALITO":0.94,"BREYER":0.82,"SOTOMAYOR":0.65},
                        "THOMAS"   :{"KENNEDY":0.71,"SCALIA":0.88,"THOMAS":1.00,"KAGAN":0.67,"ROBERTS":0.75,"GINSBURG":0.62,"ALITO":0.78,"BREYER":0.67,"SOTOMAYOR":0.64},
                        "KAGAN"    :{"KENNEDY":0.95,"SCALIA":0.82,"THOMAS":0.67,"KAGAN":1.00,"ROBERTS":0.87,"GINSBURG":0.87,"ALITO":0.81,"BREYER":0.92,"SOTOMAYOR":0.81},
                        "ROBERTS"  :{"KENNEDY":0.88,"SCALIA":0.88,"THOMAS":0.75,"KAGAN":0.87,"ROBERTS":1.00,"GINSBURG":0.78,"ALITO":0.84,"BREYER":0.84,"SOTOMAYOR":0.77},
                        "GINSBURG" :{"KENNEDY":0.84,"SCALIA":0.71,"THOMAS":0.62,"KAGAN":0.87,"ROBERTS":0.78,"GINSBURG":1.00,"ALITO":0.73,"BREYER":0.86,"SOTOMAYOR":0.88},
                        "ALITO"    :{"KENNEDY":0.82,"SCALIA":0.94,"THOMAS":0.78,"KAGAN":0.81,"ROBERTS":0.84,"GINSBURG":0.73,"ALITO":1.00,"BREYER":0.77,"SOTOMAYOR":0.64},
                        "BREYER"   :{"KENNEDY":0.91,"SCALIA":0.82,"THOMAS":0.67,"KAGAN":0.92,"ROBERTS":0.84,"GINSBURG":0.86,"ALITO":0.77,"BREYER":1.00,"SOTOMAYOR":0.83},
                        "SOTOMAYOR":{"KENNEDY":0.79,"SCALIA":0.65,"THOMAS":0.64,"KAGAN":0.81,"ROBERTS":0.77,"GINSBURG":0.88,"ALITO":0.64,"BREYER":0.83,"SOTOMAYOR":1.00}}
In [4]:
# Define function.
def create_dataframe():
    '''Return an empty skeleton DataFrame with the argument-metadata columns.'''
    columns = ['CASE',
               'DOCKET',
               'ARGUMENT_YEAR',
               'ARGUMENT_LINK',
               'ARGUMENT_PATH']
    return pd.DataFrame(columns=columns)
# Run function.
arg_df = create_dataframe()
In [5]:
# Define function.
def get_argument_metadata(df, start=START_YEAR - 1, end=END_YEAR + 1):
    '''Scrape oral-argument transcript metadata for each term year.

    Args:
        df: skeleton DataFrame from create_dataframe().
        start, end: half-open range of index-page years to scrape.

    Returns:
        A new DataFrame with one row appended per transcript link found.
    '''
    records = []
    # For each year
    for year in range(start, end):
        # Create web address and download the index page for that year.
        address = TRANSCRIPT_INFO + str(year)
        r = requests.get(address)
        # Parse data
        try:
            soup = bs4.BeautifulSoup(r.text, 'lxml')
            table = soup.find('table', 'table datatables')
            for row in table.findAll('tr'):
                link = row.find('a')
                case = row.find('span')
                # Only rows carrying an <a> tag are transcript entries.
                if link:
                    # NOTE(review): [:-2] presumably trims a trailing
                    # punctuation/whitespace pair from the docket text --
                    # confirm against a live index page.
                    link_text = link.text[:-2].lower()
                    case_text = case.text
                    # lstrip('../') strips leading '.'/'/' CHARACTERS (not
                    # the literal '../' prefix); that is what we want here,
                    # but it is easy to misread.
                    link_tail = link.attrs['href'].lstrip('../')
                    full_link = TRANSCRIPT_DOWNLOADS + link_tail
                    path = os.path.join(DATA_FOLDER, link_text, 'argument.pdf')
                    records.append({'CASE': case_text,
                                    'DOCKET': link_text,
                                    'ARGUMENT_LINK': full_link,
                                    'ARGUMENT_PATH': path,
                                    'ARGUMENT_YEAR': str(year)})
        except AttributeError:
            # soup.find() returned None for years with no transcript table.
            print('Attribute error. Probably an empty page.')
    # BUGFIX: row-by-row DataFrame.append was quadratic and was removed in
    # pandas 2.0; collect plain records and concatenate once instead.
    return pd.concat([df, pd.DataFrame(records, columns=df.columns)],
                     ignore_index=True)
# Run function.
arg_df = get_argument_metadata(arg_df)
In [6]:
# Show dataframe for clarity.
arg_df.head(3)
Out[6]:
In [7]:
# Debug to shorten time during testing: keep only the last 10 rows so the
# download/parse steps below stay fast.
if DEBUG:
    arg_df = arg_df.iloc[-10:].copy()
In [8]:
# Define function.
def make_directories(row):
    '''Create a per-docket folder under DATA_FOLDER (no-op when it exists).

    Args:
        row: DataFrame row with a 'DOCKET' key.
    '''
    path = os.path.join(DATA_FOLDER, row['DOCKET'])
    # exist_ok=True replaces the try/except FileExistsError dance.
    os.makedirs(path, exist_ok=True)
# Apply function. Output unnecessary.
_ = arg_df.apply(make_directories, axis=1)
In [9]:
# Define function.
def download_pdfs(row):
    '''Download a case's argument PDF into its folder unless already present.

    Args:
        row: DataFrame row with 'ARGUMENT_LINK' and 'ARGUMENT_PATH'.

    Returns:
        False when the file already exists; otherwise None.
    '''
    # BUGFIX: `x is not np.NaN` only catches the one nan singleton object;
    # use pd.notna() to detect any missing link value.
    if pd.notna(row['ARGUMENT_LINK']):
        if os.path.exists(row['ARGUMENT_PATH']):
            return False
        r = requests.get(row['ARGUMENT_LINK'], stream=True)
        with open(row['ARGUMENT_PATH'], 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                # Skip keep-alive (empty) chunks.
                if chunk:
                    f.write(chunk)
# Apply function. No assignment required.
_ = arg_df.apply(download_pdfs, axis=1)
In [10]:
arg_df.head(3)
Out[10]:
In [11]:
# Define functions.
async def get_text(pdf_path):
    '''Coroutine: run one pdftotext instance and return the extracted text.

    Args:
        pdf_path: path of the argument PDF to convert.

    Returns:
        Decoded stdout of pdftotext ('-' sends its output to stdout).
    '''
    # Create the subprocess, redirect stdout/stderr into pipes.
    process = await asyncio.create_subprocess_exec(PDF2TEXT_PATH,
                                                   pdf_path,
                                                   '-',
                                                   stdout=asyncio.subprocess.PIPE,
                                                   stderr=asyncio.subprocess.PIPE)
    # communicate() reads all output AND waits for process exit, so the
    # original follow-up process.wait() was redundant and is dropped.
    stdout_data, _ = await process.communicate()
    # Decode cp1252 for Windows output ...
    # NOTE(review): cp1252 decodes almost any byte sequence, so the UTF-8
    # fallback below rarely triggers even on Linux -- confirm intent.
    try:
        decoded_data = stdout_data.decode('cp1252')
    # ... and fall back to UTF-8 (e.g. Linux). BUGFIX: the bare `except:`
    # also swallowed KeyboardInterrupt/SystemExit; catch decode errors only.
    except UnicodeDecodeError:
        decoded_data = stdout_data.decode()
    return decoded_data
async def get_all_text(pdf_paths):
    '''Gather pdftotext results for every path, roughly ten at a time.

    Args:
        pdf_paths: sequence of PDF file paths.

    Returns:
        List of (path, extracted_text) tuples.
    '''
    results = []
    # Split the work into ~10-wide chunks and run each chunk concurrently,
    # so we never spawn more than a handful of subprocesses at once.
    chunk_count = (len(pdf_paths) // 10) + 1
    for chunk in np.array_split(pdf_paths, chunk_count):
        texts = await asyncio.gather(*(get_text(path) for path in chunk))
        # Pair every path with its extracted text, preserving order.
        results.extend(zip(chunk, texts))
    return results
def add_arguments(df):
    '''Extract transcript text for every unique PDF and merge it into df.

    Args:
        df: DataFrame with an 'ARGUMENT_PATH' column.

    Returns:
        df with a new 'TEXT' column (missing text filled with '').
    '''
    # Get unique PDFs so each file is converted only once.
    unique_pdfs = df['ARGUMENT_PATH'].unique()
    # Windows only supports subprocesses on the proactor loop.
    if os.name == 'nt':
        loop = asyncio.ProactorEventLoop()
    elif os.name == 'posix':
        loop = asyncio.SelectorEventLoop()
    else:
        # BUGFIX: this branch used `loop == None`, a comparison with no
        # effect that left `loop` undefined; assign explicitly. (Unknown
        # platforms will still fail below at run_until_complete.)
        loop = None
    asyncio.set_event_loop(loop)
    # Run our coroutine to extract text.
    arg_data = loop.run_until_complete(get_all_text(unique_pdfs))
    # Loop no longer necessary.
    loop.close()
    # Create dataframe for the (path, text) records.
    tdf = pd.DataFrame.from_records(arg_data, columns=['ARGUMENT_PATH', 'TEXT'])
    # Join to input df and fill na.
    df = df.merge(tdf, how='left', on='ARGUMENT_PATH').fillna('')
    return df
# Run function
arg_df = add_arguments(arg_df)
In [12]:
# Show dataframe for clarity.
arg_df.head(3)
Out[12]:
In [13]:
# Define function.
def cut_unnecessary_text(df):
    '''Strip low-information boilerplate from the transcript text.

    Args:
        df: DataFrame with a 'TEXT' column of raw transcript text.

    Returns:
        The same DataFrame with 'TEXT' reduced to the proceedings body.
    '''
    # First chop off the caption ('PROCEEDINGS' or 'P R O C E E D I N G S')
    capture_string = r'P\s?R\s?O\s?C\s?E\s?E\s?D\s?I\s?N\s?G\s?S([\s\S]*\Z)'
    df['TEXT'] = df['TEXT'].str.extract(capture_string,
                                        expand=False,
                                        flags=re.MULTILINE)
    # Patterns we don't want, applied in this order.
    patterns_to_cut = [
        # Cut carriage returns and form feeds.
        r'[\r\f]',
        # Remove tables at end: three ##:## stamps, each within no more
        # than ~75 chars, following 'Alderson Reporting Company'.
        (r'\s*' +
         r'Alderson Reporting Company' +
         # Period kept in the class because 'a.m.' otherwise breaks it.
         r'[\s\S.]{0,75}\d?\d:\d?\d' * 3 +
         r'[\s\S]*' +
         r'\Z'),
        # Remove [2004 - 2005] footer
        r'1111 14th[\s\S]{0,100}20005',
        # Remove [2006 - 2016] header/footer unofficial
        r'Alderson[\s\S]{0,100}Review',
        # Remove [2006 - 2016] header/footer official
        r'Alderson[\s\S]{0,100}[oO]fficial',
        # Remove generic Alderson
        r'Alderson Reporting Company',
        # Cut court reporter annotations, e.g. '(Laughter.)'
        r'[(\[][\s\S]{0,100}[)\]]',
        # Cut line numbers, page numbers, all other low-information numbers
        r'[0-9]',
        # Cut PAGE
        r'[Pp][Aa][Gg][Ee]',
    ]
    # Replace above patterns with empty string.
    for pattern in patterns_to_cut:
        # BUGFIX: pandas >= 2.0 defaults str.replace to regex=False, which
        # would treat these patterns literally; request regex explicitly.
        df['TEXT'] = df['TEXT'].str.replace(pat=pattern,
                                            repl='',
                                            flags=re.MULTILINE,
                                            regex=True)
    return df
# Run function.
arg_df = cut_unnecessary_text(arg_df)
In [14]:
# Show df for clarity
arg_df.head(3)
Out[14]:
In [15]:
# Define function.
def _heading_pattern(prefix, party_alternation):
    '''Build a heading-capture regex: PREFIX ... PARTY [... THIS COURT].'''
    return ''.join([r'(',
                    prefix,
                    r'[\S\s]{,200}',
                    party_alternation,
                    # 'as appointed by this court' is optional.
                    r'(?:[\S\s]{,50}THIS COURT)?',
                    r')'])
def create_heading_columns(df):
    '''Locate the three section headings in each transcript.

    The three previously copy-pasted pattern builds are deduplicated into
    the _heading_pattern() helper.

    Args:
        df: DataFrame with a 'TEXT' column.

    Returns:
        df with PET_ARG_HEADING, RES_ARG_HEADING and PET_REB_HEADING
        columns added ('' where no heading was found).
    '''
    specs = [('PET_ARG_HEADING', 'ORAL ARGUMENT', r'(?:PETITIONER|APPELLANT)S?'),
             ('RES_ARG_HEADING', 'ORAL ARGUMENT', r'(?:RESPONDENT|APPELLEE)S?'),
             ('PET_REB_HEADING', 'REBUTTAL ARGUMENT', r'(?:PETITIONER|APPELLANT)S?')]
    for column, prefix, party in specs:
        pattern = _heading_pattern(prefix, party)
        df[column] = df['TEXT'].str.extract(pattern,
                                            expand=False,
                                            flags=re.MULTILINE).fillna('')
    return df
# TODO:
# IN ##-#### optional ... r'(?:[\S\s]{,10}IN[\S\s]{,5}-)?'
# Run function (the extra fillna('') guards rows whose TEXT was missing).
arg_df = create_heading_columns(arg_df).fillna('')
In [16]:
# Define function.
def extract_petitioner_arg(df):
    '''Pull out the petitioner argument using the section headers.

    The argument is the text between the petitioner-argument heading and
    the next 'ORAL' (start of the following section).

    Args:
        df: DataFrame with 'TEXT' and 'PET_ARG_HEADING' columns.

    Returns:
        df with PET_ARG_REGEX and PETITIONER_ARGUMENT columns added.
    '''
    # BUGFIX: headings are literal text scraped from the transcript and may
    # contain regex metacharacters (periods, parentheses); escape them so
    # re.findall cannot crash or mis-match.
    df['PET_ARG_REGEX'] = df.apply(lambda row: ''.join([re.escape(row['PET_ARG_HEADING']),
                                                        r'([\S\s]*?)',
                                                        r'(?:ORAL)']),
                                   axis=1)
    # Extract and create petitioner argument column.
    df['PETITIONER_ARGUMENT'] = df.apply(lambda row: re.findall(row['PET_ARG_REGEX'],
                                                                row['TEXT'],
                                                                flags=re.MULTILINE),
                                         axis=1)
    # If no match, empty string. Else, join the matches.
    df['PETITIONER_ARGUMENT'] = df['PETITIONER_ARGUMENT'].map(lambda matches: ''.join(matches))
    return df
# Run function.
arg_df = extract_petitioner_arg(arg_df)
In [17]:
# Define function.
def extract_respondent_arg(df):
    '''Pull out the respondent argument using the section headers.

    The argument is the text between the respondent-argument heading and
    the next 'REBUTTAL' or 'ORAL' section marker.

    Args:
        df: DataFrame with 'TEXT' and 'RES_ARG_HEADING' columns.

    Returns:
        df with RES_ARG_REGEX and RESPONDENT_ARGUMENT columns added.
    '''
    # BUGFIX: the original terminator '(?:REBUTTAL)|(?:ORAL)' split the
    # WHOLE pattern at the top-level '|' -- it matched 'heading...REBUTTAL'
    # OR any bare 'ORAL' -- so arguments followed by an ORAL section were
    # never captured. Group the alternation so it only ends the capture.
    # Headings are also escaped since they may hold regex metacharacters.
    df['RES_ARG_REGEX'] = df.apply(lambda row: ''.join([re.escape(row['RES_ARG_HEADING']),
                                                        r'([\S\s]*?)',
                                                        r'(?:REBUTTAL|ORAL)']),
                                   axis=1)
    df['RESPONDENT_ARGUMENT'] = df.apply(lambda row: re.findall(row['RES_ARG_REGEX'],
                                                                row['TEXT'],
                                                                flags=re.MULTILINE),
                                         axis=1)
    # If no match, empty string. Else, join the matches.
    df['RESPONDENT_ARGUMENT'] = df['RESPONDENT_ARGUMENT'].map(lambda matches: ''.join(matches))
    return df
# Run function.
arg_df = extract_respondent_arg(arg_df)
In [18]:
# Define function.
def extract_petitioner_reb(df):
    '''Pull out the petitioner rebuttal (heading through end of transcript).

    Args:
        df: DataFrame with 'TEXT' and 'PET_REB_HEADING' columns.

    Returns:
        df with PET_REB_REGEX and PETITIONER_REBUTTAL columns added
        ('' when the transcript has no matching heading).
    '''
    # NOTE(review): when PET_REB_HEADING is '' the pattern matches the whole
    # TEXT (pre-existing behavior) -- confirm that is intended.
    df['PET_REB_REGEX'] = df.apply(lambda row: ''.join([re.escape(row['PET_REB_HEADING']),
                                                        r'([\S\s]*?)',
                                                        r'(?:\Z)']),
                                   axis=1)
    # BUGFIX: re.search(...).group(1) raised AttributeError when a
    # transcript omits the heading (e.g. SAMSUNG/WAXMAN 15-777); fall back
    # to '' instead of crashing.
    def _search(row):
        match = re.search(row['PET_REB_REGEX'], row['TEXT'], flags=re.MULTILINE)
        return match.group(1) if match else ''
    df['PETITIONER_REBUTTAL'] = df.apply(_search, axis=1)
    return df
# Run function.
arg_df = extract_petitioner_reb(arg_df)
In [19]:
# TODO. If transcript omits info (e.g. SAMSUNG/WAXMAN 15-777), no match.
len(arg_df)
Out[19]:
In [20]:
# Show dataframe for clarity
arg_df.head(3)
Out[20]:
In [21]:
bak = arg_df.copy()
In [22]:
def split_arguments(df):
    '''Split each argument blob into a list of per-speaker comments.

    A comment runs from 'SPEAKER NAME: ' up to (but excluding) the next
    speaker label, or to the end of the text.

    Args:
        df: DataFrame with the three argument text columns.

    Returns:
        df with those columns converted to lists of comment strings.
    '''
    # Double-quoted because the raw string needs ' for O'Connor.
    # BUGFIX: the lookahead originally ended '[:\Z]', but inside a character
    # class '\Z' is not an anchor (and is a bad escape on Python >= 3.7);
    # use an alternation: the next speaker label, or end of string.
    comment_pattern = r"([A-Z.'\s]{5,25}:\s[\s\S]*?)(?=[A-Z'.\s]{5,25}:|\Z)"
    for column in ['PETITIONER_ARGUMENT',
                   'RESPONDENT_ARGUMENT',
                   'PETITIONER_REBUTTAL']:
        # We only want periods in the middle of names.
        df[column] = df[column].str.findall(comment_pattern)
    return df
# Run functions
arg_df = split_arguments(arg_df)
In [23]:
def tuplify_cell(cell_value):
    '''Helper for tuplify_arguments(): ['NAME: text', ...] -> [(NAME, text), ...].'''
    tuples = []
    for comment in cell_value:
        # Split on the first colon; strip periods/space from the speaker.
        speaker, _, text = comment.partition(':')
        speaker = speaker.replace('.', '').strip()
        tuples.append((speaker, text.strip()))
    return tuples
def tuplify_arguments(df):
    '''Turn each comment string into a (justice, text) tuple.'''
    argument_columns = ['PETITIONER_ARGUMENT',
                        'RESPONDENT_ARGUMENT',
                        'PETITIONER_REBUTTAL']
    for column in argument_columns:
        df[column] = df[column].map(tuplify_cell)
    return df.fillna('')
# Run function
arg_df = tuplify_arguments(arg_df)
In [24]:
def condense_cell(cell_value):
    '''Helper for condense_arguments().

    Turns [(justice, comment), ...] into {justice: [comment, ...]},
    preserving comment order per justice.
    '''
    condensed = {}
    for justice, comment in cell_value:
        # setdefault replaces the try/except KeyError insertion dance.
        condensed.setdefault(justice, []).append(comment)
    return condensed
def condense_arguments(df):
    '''Turn args into: {'justice': ['comment 1', 'comment 2']}'''
    for column in ['PETITIONER_ARGUMENT',
                   'RESPONDENT_ARGUMENT',
                   'PETITIONER_REBUTTAL']:
        df[column] = df[column].map(condense_cell)
    return df
arg_df = condense_arguments(arg_df)
In [25]:
arg_df.head(3)
Out[25]:
In [26]:
# Define function creating secondary df.
def create_scdb_df():
    '''Download SCDB data (if not already cached) and load it as a DataFrame.

    Returns:
        DataFrame of the justice-centered SCDB CSV.
    '''
    # Download and unzip only when the CSV is not already cached
    # (the original used an inverted `if exists: pass / else:` shape).
    if not os.path.exists(SCDB_CSV_PATH):
        # Stream the zip archive to disk.
        r = requests.get(SCDB_CSV_DOWNLOAD_LINK, stream=True)
        with open(SCDB_ZIP_PATH, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
        # Extract the CSV bytes from the archive ...
        with zipfile.ZipFile(SCDB_ZIP_PATH) as zip_file:
            with zip_file.open(SCDB_CSV_NAME) as pseudo_file:
                data = pseudo_file.read()
        # ... and persist them next to the zip.
        with open(SCDB_CSV_PATH, 'wb+') as f:
            f.write(data)
    # Now create dataframe from csv.
    case_df = pd.read_csv(SCDB_CSV_PATH, encoding='latin-1')
    return case_df
# Run df.
case_df = create_scdb_df()
In [27]:
# Show case dataframe for clarity
case_df.head(3)
Out[27]:
In [28]:
cut_arg_df = arg_df[['DOCKET',
'CASE',
'PETITIONER_ARGUMENT',
#'RESPONDENT_ARGUMENT',
'PETITIONER_REBUTTAL']]
cut_arg_df.head(3)
Out[28]:
In [29]:
cut_case_df = case_df[['docket', 'majority', 'partyWinning', 'justiceName']]
cut_case_df.columns = ['DOCKET', 'majority', 'partyWinning', 'JUSTICE']
cut_case_df.head(3)
Out[29]:
In [30]:
# Join case_df and arg_df to create joint dataframe jdf.
jdf = pd.merge(cut_arg_df,
cut_case_df,
how='left',
on='DOCKET')
# Drop (Reargued) because it creates dupes.
contains_reargue = jdf['CASE'].str.contains('Reargue')
jdf = jdf[~contains_reargue]
In [31]:
# Show joint dataframe for clarity (all tail end will be np.NaN)
jdf.head(3)
Out[31]:
In [32]:
'''
From documentation on 'partyWinning' column:
http://scdb.wustl.edu/documentation.php?var=partyWinning
0: no favorable disposition for petitioning party apparent
1: petitioning party received a favorable disposition
2: favorable disposition for petitioning party unclear
We want to be able to separate those who won from those who did not
win. Consequently, we drop all cases where the decision was
ambiguous, or the winner was not apparent.
We then convert the 0 to False and the 1 to True, which gives
us a True/False 'PETITIONER_WINS' column.
PETITIONER_WINS
0 -> False
1 -> True
If the petitioner wins, it is because it was the decision of the
majority of the court. We can accurately describe the nature of this
column as 'PETITIONER_WINS_MAJORITY'.
'''
# BUGFIX: '!= 2.0' also keeps NaN rows (cases missing from SCDB / not yet
# decided), and astype(bool) maps NaN to True, silently labeling them as
# petitioner wins. Keep only explicit 0/1 dispositions. (Those NaN rows
# were dropped later on NaN JUSTICE anyway, so the final data is the same.)
jdf = jdf[jdf['partyWinning'].isin([0.0, 1.0])].copy()
jdf['PETITIONER_WINS_MAJORITY'] = jdf['partyWinning'].astype(bool)
In [33]:
'''
From documentation on 'majority' columns:
http://scdb.wustl.edu/documentation.php?var=majority
1: dissent
2: majority
We want to convert this into a 'VOTED_WITH_MAJORITY' column.
To do this we subtract one from each and every value so that
dissent becomes 0 and majority becomes 1.
majority
0: dissent (result 1 - 1)
1: majority (result from 2 - 1)
Then we convert the 0 to False and 1 to True, so that we have a
'VOTED_WITH_MAJORITY' column.
VOTED_WITH_MAJORITY
0 -> False
1 -> True
'''
# majority is coded 1 = dissent, 2 = majority (see docstring above).
jdf['majority_minus_one'] = jdf['majority'] - 1
# BUGFIX: the subtract-then-astype(bool) trick mapped NaN (no SCDB match)
# to True; comparing against 2 maps dissent->False, majority->True and
# NaN->False. (NaN rows are dropped later on NaN JUSTICE regardless.)
jdf['VOTED_WITH_MAJORITY'] = jdf['majority'] == 2
jdf
Out[33]:
In [34]:
'''
We can determine whether a petitioner won over a specific justice based on:
1. Whether the petitioner won over a majority, and
2. Whether the specific justice was a part of that majority.
If the answer to both of these questions is the same (that is, either
both the answers are Yes or both the answers are no), then the
petitioner won over the justice.
Logically:
P_WINS_MAJ, J_VOTES_MAJ = P_WINS_J
If petitioner wins majority and justice voted with majority, the petitioner won over the justice
P_WINS_MAJ, ~J_VOTES_MAJ = P_LOSES_J
If petitioner wins majority and justice NOT a part of the majority, petitioner did not win justice
~P_WINS_MAJ, J_VOTES_MAJ = P_LOSES_J
If petitioner does NOT win majority and justice voted in majority, petitioner did not win justice
~P_WINS_MAJ, ~J_VOTES_MAJ = P_WINS_J
If petitioner does NOT win majority and justice voted NOT in majority, petitioner won justice
'''
def determine_vote(row):
    '''Return True when the justice voted for the petitioner.

    The petitioner won over a justice exactly when "petitioner won the
    majority" and "justice voted with the majority" agree (see the truth
    table in the prose above).
    '''
    pet_wins_majority = row['PETITIONER_WINS_MAJORITY']
    justice_in_majority = row['VOTED_WITH_MAJORITY']
    if pet_wins_majority:
        # Petitioner won: justice sided with petitioner iff in the majority.
        return True if justice_in_majority else False
    # Petitioner lost: justice sided with petitioner iff in the dissent.
    return False if justice_in_majority else True
# Voted with majority
jdf['VOTED_FOR_PETITIONER'] = jdf.apply(determine_vote, axis=1)
In [35]:
jdf.head(3)
Out[35]:
In [36]:
# Demonstration dataframe
pd.DataFrame(data={'Justice Votes With Majority': ['Petitioner Wins Justice',
'Petitioner Loses Justice'],
'Justice Votes Againt Majority': ['Petitioner Loses Justice',
'Petitioner Wins Justice']},
index=['Petitioner Wins Majority',
'Petitioner Loses Majority'])
Out[36]:
In [37]:
jdf[['CASE', 'JUSTICE', 'VOTED_FOR_PETITIONER']].dropna().head(9)
Out[37]:
In [38]:
# Define function.
def trim_columns(df):
    '''Keep only the identifier, argument-text and target columns.'''
    keep = ['DOCKET',
            'CASE',
            'JUSTICE',
            'PETITIONER_ARGUMENT',
            #'RESPONDENT_ARGUMENT',
            'PETITIONER_REBUTTAL',
            'VOTED_FOR_PETITIONER']
    return df[keep]
# Run function.
jdf = trim_columns(jdf)
In [39]:
# Show joint dataframe for clarity.
jdf.head(3)
Out[39]:
In [40]:
# Define filter function.
def filter_justice_data(row):
    '''Normalize this row's SCDB justice name and keep only that justice's comments.

    SCDB names like 'RHJackson' become 'JACKSON', which can be matched in
    speaker keys like 'JUSTICE JACKSON'. Then for JACKSON:
        {'JUSTICE JACKSON': [1, 2],
         'JUSTICE ROBERTS': [2, 3]}
    becomes [1, 2] for JACKSON's row; rows whose justice never speaks
    get an empty list.
    '''
    # Handle SCDB justice names: the surname starts one character before
    # the first lowercase letter, which messes up SDOConnor -> Connor.
    # SDOConnor should be OCONNOR, not CONNOR, so lowercase its 'C' first.
    if row['JUSTICE'] == 'SDOConnor':
        row['JUSTICE'] = 'SDOconnor'
    # Pick first lower case letter and start name one previous.
    lower_mask = [letter.islower() for letter in row['JUSTICE']]
    first_lower = lower_mask.index(True)
    one_prior = first_lower - 1
    row['JUSTICE'] = row['JUSTICE'][one_prior:].upper()
    # Handle text columns
    for index in ['PETITIONER_ARGUMENT',
                  #'RESPONDENT_ARGUMENT',
                  'PETITIONER_REBUTTAL',]:
        # Find if justice name is in any of the keys.
        # count() is 1+ if found in string, 0 if not: [1, 0, 0] -> True.
        justice_represented = any([key.count(row['JUSTICE']) for key in row[index].keys()])
        # If represented, fill with value.
        if justice_represented:
            for key in row[index].keys():
                # Require 'JUSTICE' in the key so counsel who shares a
                # surname with a justice is not matched.
                if row['JUSTICE'] in key and 'JUSTICE' in key:
                    try:
                        row[index] = row[index][key]
                    except TypeError:
                        # Once row[index] has been replaced by a list, a
                        # second matching key indexes a list with a str;
                        # swallow that. Fallback to edit distance?
                        pass
            # If no comment list was placed in the cell, place an empty one.
            if type(row[index]) == dict:
                row[index] = []
        # If not represented
        else:
            row[index] = []
    return row
# Apply function. If justice is NA the case has not yet been decided.
jdf = (jdf.dropna(subset=['JUSTICE'])
          .apply(filter_justice_data, axis=1))
In [41]:
# Show joint dataframe for clarity
jdf.head(3)
Out[41]:
In [42]:
# Write argument data df to csv
arg_data_csv_path = os.path.join(os.path.expanduser('~'),
'.scoap',
'argument_data.csv')
jdf.to_csv(arg_data_csv_path, encoding='utf-8')
In [43]:
# Create text_df
text_df = pd.melt(jdf,
id_vars=['JUSTICE',
'DOCKET',
'VOTED_FOR_PETITIONER'],
value_vars=['PETITIONER_ARGUMENT',
#'RESPONDENT_ARGUMENT',
'PETITIONER_REBUTTAL'],
var_name='ARG_TYPE',
value_name='TEXT')
text_df.head(3)
Out[43]:
In [44]:
# Define function
def reorient_args(row):
    '''Reframe the target from "did the petitioner win?" to "did the party
    being questioned win?" so all three argument types share one target.

    WARNING: HAND-WAVY, UNSCIENTIFIC FEATURE ENGINEERING.
    THIS ACTUALLY DECREASES ACCURACY AT PRESENT.

    For respondent arguments the questionee is the respondent, so the vote
    flag is inverted; petitioner arguments and rebuttals keep it as-is.
    The big assumption is that justice language is roughly interchangeable
    whichever party it is directed at ("your argument is bad and you should
    feel bad"), trading some prediction quality for roughly double the
    number of samples.

    Args:
        row: melted row with 'VOTED_FOR_PETITIONER' and 'ARG_TYPE'.

    Returns:
        bool: True when the questioned party won this justice's vote.

    Raises:
        ValueError: on an unknown ARG_TYPE (previously this path crashed
        with an UnboundLocalError on `voted_for_speaker`).
    '''
    vote_pet = row['VOTED_FOR_PETITIONER']
    arg_type = row['ARG_TYPE']
    # `is True` (not truthiness) mirrors the original behavior exactly for
    # any non-bool values that may sneak in.
    if arg_type == 'RESPONDENT_ARGUMENT':
        return vote_pet is not True
    if arg_type in ('PETITIONER_ARGUMENT', 'PETITIONER_REBUTTAL'):
        return vote_pet is True
    raise ValueError('Unknown ARG_TYPE: {!r}'.format(arg_type))
# Run function
text_df['QUESTIONEE_WON'] = text_df.apply(reorient_args, axis=1)
text_df
Out[44]:
In [45]:
# Define function
def create_text_df(df):
    '''Flatten each comment list into one cleaned string and drop empty rows.

    Args:
        df: melted frame whose 'TEXT' cells are lists of comment strings.

    Returns:
        Filtered DataFrame with cleaned single-string 'TEXT' values.
    '''
    # ' '.join([question_1, question_2, ...]) so each cell is one string.
    df['TEXT'] = df['TEXT'].map(lambda item: ' '.join(item))
    # Punctuation characters to remove -- everything except '-' and '/'.
    punctuation = string.punctuation.replace('-', '').replace('/', '')
    # BUGFIX: pandas >= 2.0 defaults str.replace to regex=False; this
    # character class must be treated as a regex.
    df['TEXT'] = df['TEXT'].str.replace('[' + punctuation + ']',
                                        '',
                                        regex=True)
    # The double dash is a plain literal, not a regex.
    df['TEXT'] = df['TEXT'].str.replace('--', '', regex=False)
    # Get rid of all items without text.
    df = df.loc[df['TEXT'].str.strip().str.len() > 0, :]
    return df
# Run function
text_df = create_text_df(text_df)
text_df
Out[45]:
In [46]:
# Create test/train split for text data
split = sklearn.model_selection.train_test_split
# Split test and train.
train_text_df, test_text_df = split(text_df, test_size = 0.2)
train_text_df = train_text_df.copy()
test_text_df = test_text_df.copy()
In [47]:
train_text_df.head(3)
Out[47]:
In [48]:
train_text_df['JUSTICE'].unique()
Out[48]:
In [49]:
# Define function
def create_pipelines(df):
    '''Fit NB, SGD and RF text-classification pipelines for each justice.

    Args:
        df: training frame with 'JUSTICE', 'TEXT' and 'QUESTIONEE_WON'.

    Returns:
        List of (justice, nb_pipeline, sgd_pipeline, rf_pipeline) tuples;
        a pipeline is None when fitting failed (e.g. too few comments to
        satisfy min_df).
    '''
    # Aliases and shared hyper-parameters, hoisted out of the loop.
    Pipe = sklearn.pipeline.Pipeline
    Vectorizer = sklearn.feature_extraction.text.CountVectorizer
    Transformer = sklearn.feature_extraction.text.TfidfTransformer
    vectorizer_params = {'ngram_range': (3, 5),
                         'min_df': 10}
    transformer_params = {'use_idf': True}

    def _fit(classifier, frame):
        '''Build a vectorize->tfidf->classify pipeline; None when unfit-able.'''
        pipeline = Pipe([('vectorizer', Vectorizer(**vectorizer_params)),
                         ('transformer', Transformer(**transformer_params)),
                         ('classifier', classifier)])
        try:
            return pipeline.fit(frame['TEXT'], frame['QUESTIONEE_WON'])
        except (ValueError, AttributeError):
            return None

    # BUGFIX: the original grouped on the GLOBAL text_df (train AND test
    # rows) instead of the df argument, leaking test data into training.
    gb = df.groupby('JUSTICE')
    results = []
    for justice in df['JUSTICE'].unique():
        frame = gb.get_group(justice)
        # Multinomial Naive Bayes classifier.
        nb = _fit(sklearn.naive_bayes.MultinomialNB(), frame)
        # Stochastic gradient descent classifier.
        # NOTE(review): loss='log' was renamed 'log_loss' in sklearn 1.3.
        sgd = _fit(sklearn.linear_model.SGDClassifier(loss='log', penalty='l2'),
                   frame)
        # Random forest classifier.
        rf = _fit(sklearn.ensemble.RandomForestClassifier(n_estimators=100),
                  frame)
        results.append((justice, nb, sgd, rf))
    return results
# Create test and train pipelines
pipelines = create_pipelines(train_text_df)
In [50]:
pipelines[0][1]
Out[50]:
In [51]:
# Define function for creating an argument for add_predictions()
def create_model_dict(model_pipelines):
    '''Build {justice: {'SGD': ..., 'NB': ..., 'RF': ...}} from pipeline tuples.'''
    return {justice: {'SGD': sgd_pipe,
                      'NB': nb_pipe,
                      'RF': rf_pipe}
            for justice, nb_pipe, sgd_pipe, rf_pipe in model_pipelines}
# Run function
model_dict = create_model_dict(pipelines)
In [52]:
# Define function to add predictions to test frame
def add_predictions(row, model_dict, model_type):
    '''Apply() helper: predict this row's outcome with its justice's model.

    Args:
        row: frame row with 'JUSTICE' and 'TEXT'.
        model_dict: {justice: {model_type: fitted pipeline or None}}.
        model_type: 'NB', 'SGD' or 'RF'.

    Returns:
        The model's prediction, or NaN when no model exists for the justice.
    '''
    justice_name = row['JUSTICE']
    try:
        model = model_dict[justice_name][model_type]
        prediction = model.predict([row['TEXT']])[0]
    # If no model, predict will not be an attribute.
    # No justice, no peace (also no model).
    except (KeyError, AttributeError):
        # BUGFIX: np.nan instead of np.NaN -- the NaN alias was removed
        # in NumPy 2.0.
        return np.nan
    return prediction
# BUGFIX: the original chained .astype(bool) onto each apply(), coercing the
# np.NaN returned for justices without a model to True. The later dropna()
# then removed nothing and the accuracy scores were inflated (see the
# "This can't be real. TODO" note below). Keep the NaNs so dropna() works.
test_text_df['NB_PREDICTION'] = test_text_df.apply(add_predictions,
                                                   args=(model_dict, 'NB'),
                                                   axis=1)
test_text_df['SGD_PREDICTION'] = test_text_df.apply(add_predictions,
                                                    args=(model_dict, 'SGD'),
                                                    axis=1)
test_text_df['RF_PREDICTION'] = test_text_df.apply(add_predictions,
                                                   args=(model_dict, 'RF'),
                                                   axis=1)
In [53]:
test_text_df.head(3)
Out[53]:
In [54]:
# Check output for clarity
test_text_df.head(3)
# Assess accuracy
score = sklearn.metrics.accuracy_score
test_text_df = test_text_df.dropna()
# Conduct scoring
nb_score = score(test_text_df['QUESTIONEE_WON'],
test_text_df['NB_PREDICTION'])
sgd_score = score(test_text_df['QUESTIONEE_WON'],
test_text_df['SGD_PREDICTION'])
rf_score = score(test_text_df['QUESTIONEE_WON'],
test_text_df['RF_PREDICTION'])
# Format as string
base_string = '''
\n
The Naive Bayes model scored {:.1%}.\n\n
The Stochastic Gradient Decent model scored {:.1%}.\n\n
The Random Forest model scored {:.1%}.\n\n
This can't be real. TODO.
'''
print(base_string.format(nb_score, sgd_score, rf_score))
In [55]:
score = sklearn.metrics.roc_auc_score(test_text_df['QUESTIONEE_WON'].values,
test_text_df['RF_PREDICTION'].values)
score
Out[55]:
In [56]:
# Define function
def _feature_names(vectorizer):
    '''Return the vectorizer vocabulary, old- and new-sklearn compatible.

    get_feature_names() was removed in scikit-learn 1.2 in favor of
    get_feature_names_out().
    '''
    try:
        return vectorizer.get_feature_names_out()
    except AttributeError:
        return vectorizer.get_feature_names()
def _ranked_series(names, scores, number, top_name, bottom_name):
    '''Return (top, bottom) Series of `number` phrases ranked by score.'''
    series = pd.Series(dict(zip(names, scores)))
    top = series.sort_values(ascending=False).head(number).copy()
    top.name = top_name
    bottom = series.sort_values(ascending=True).head(number).copy()
    bottom.name = bottom_name
    return (top, bottom)
def get_nb_phrases(nb_pipeline, number):
    '''Pull the highest/lowest log-probability phrases from an NB pipeline.'''
    nb_vec = nb_pipeline.named_steps['vectorizer']
    nb_clf = nb_pipeline.named_steps['classifier']
    # nb_clf.feature_log_prob_[0] is for False (voted against party);
    # nb_clf.feature_log_prob_[1] is for True (voted for party).
    return _ranked_series(_feature_names(nb_vec),
                          nb_clf.feature_log_prob_[1],
                          number,
                          'Top Naive Bayes Log Prob',
                          'Bottom Naive Bayes Log Prob')
# Define function
def get_sgd_phrases(sgd_pipeline, number):
    '''Pull the highest/lowest coefficient phrases from an SGD pipeline.'''
    sgd_vec = sgd_pipeline.named_steps['vectorizer']
    sgd_clf = sgd_pipeline.named_steps['classifier']
    # Binary problem: sgd_clf.coef_[0] holds the single weight vector.
    return _ranked_series(_feature_names(sgd_vec),
                          sgd_clf.coef_[0],
                          number,
                          'Top SGD Log Prob',
                          'Bottom SGD Log Prob')
# Define function
def get_rf_phrases(rf_pipeline, number):
    '''Pull the most/least important phrases from a random-forest pipeline.

    Feature importances rank both the top and bottom items.
    '''
    rf_vec = rf_pipeline.named_steps['vectorizer']
    rf_clf = rf_pipeline.named_steps['classifier']
    return _ranked_series(_feature_names(rf_vec),
                          rf_clf.feature_importances_,
                          number,
                          'Top RF Feature Imp',
                          'Bottom RF Feature Imp')
In [57]:
# Define function
def create_phrase_series(pipelines, number=500):
    '''Collect top/bottom phrase Series per justice for all three models.

    Justices with any unfit (None) pipeline are skipped because there is
    nothing to pull phrases from.

    Returns:
        List of dicts keyed: justice, TOP_NB, BOTTOM_NB, TOP_SGD,
        BOTTOM_SGD, TOP_RF, BOTTOM_RF.
    '''
    collected = []
    for justice, nb_pipeline, sgd_pipeline, rf_pipeline in pipelines:
        # Skip any empty pipelines (insufficient comments).
        if nb_pipeline is None or sgd_pipeline is None or rf_pipeline is None:
            continue
        # Get the actual phrase rankings from each model.
        top_nb, bottom_nb = get_nb_phrases(nb_pipeline, number)
        top_sgd, bottom_sgd = get_sgd_phrases(sgd_pipeline, number)
        top_rf, bottom_rf = get_rf_phrases(rf_pipeline, number)
        collected.append({'justice': justice,
                          'TOP_NB': top_nb,
                          'BOTTOM_NB': bottom_nb,
                          'TOP_SGD': top_sgd,
                          'BOTTOM_SGD': bottom_sgd,
                          'TOP_RF': top_rf,
                          'BOTTOM_RF': bottom_rf})
    # Return list of dicts.
    return collected
# Run function: one dict of top/bottom phrase Series per modeled justice.
justice_data = create_phrase_series(pipelines)
In [58]:
# Show sample data for clarity: lowest RF feature importances for the
# justice at position 6 of justice_data.
pd.DataFrame(justice_data[6]['BOTTOM_RF']).head(3)
Out[58]:
In [59]:
# Define function
def create_frequency_dfs(justice_data, text_df):
    '''This function takes bottom phrases and computes frequency.

    For every "bottom" phrase of every justice, counts how often the
    phrase appears in that justice's text aimed at eventual winners
    vs. eventual losers, and the winner-directed percentage.

    Parameters:
        justice_data: list of dicts from create_phrase_series().
        text_df: DataFrame with JUSTICE, QUESTIONEE_WON and TEXT columns.

    Returns a DataFrame indexed by (JUSTICE, PHRASE).
    '''
    bottom_phrase_results = []
    for data_dict in justice_data:
        justice = data_dict['justice']
        # Combine the bottom phrases from all three models.
        # pd.concat replaces Series.append, which pandas 2.0 removed.
        combined = pd.concat([data_dict['BOTTOM_NB'],
                              data_dict['BOTTOM_SGD'],
                              data_dict['BOTTOM_RF']])
        # NOTE(review): drop_duplicates() on a Series dedupes by *value*
        # (the score), not by phrase label; distinct phrases sharing a
        # score are dropped while repeated phrases with differing scores
        # survive. Preserved as-is — confirm this is intended.
        bottom_phrases = combined.drop_duplicates().index.values
        # Split this justice's text by whether the questionee won.
        won_df = text_df[(text_df['JUSTICE'] == justice) &
                         (text_df['QUESTIONEE_WON'] == True)]
        lost_df = text_df[(text_df['JUSTICE'] == justice) &
                          (text_df['QUESTIONEE_WON'] == False)]
        # One lowercase string per outcome for substring counting.
        won_string = won_df['TEXT'].str.lower().str.cat(sep=' ')
        lost_string = lost_df['TEXT'].str.lower().str.cat(sep=' ')
        for phrase in bottom_phrases:
            won_count = won_string.count(phrase)
            lost_count = lost_string.count(phrase)
            all_count = won_count + lost_count
            # Guard the division when the phrase never appears.
            # np.nan: the np.NaN alias was removed in NumPy 2.0.
            if all_count == 0:
                percentage = np.nan
            else:
                percentage = won_count / all_count
            bottom_phrase_results.append({'JUSTICE': justice,
                                          'PHRASE': phrase,
                                          'AT_WINNER_COUNT': won_count,
                                          'AT_LOSER_COUNT': lost_count,
                                          'AT_WINNER_PERCENT': percentage})
    # Build the result frame once from the collected records.
    bottom_df = pd.DataFrame(bottom_phrase_results)
    bottom_df = bottom_df.set_index(['JUSTICE', 'PHRASE'])
    bottom_df = bottom_df[['AT_WINNER_COUNT', 'AT_LOSER_COUNT',
                           'AT_WINNER_PERCENT']]
    bottom_df['AT_LOSER_PERCENT'] = 1 - bottom_df['AT_WINNER_PERCENT']
    return bottom_df
# Run function: per-phrase winner/loser frequencies for bottom phrases.
bottom_freq_df = create_frequency_dfs(justice_data, text_df)
# Sample of the result; dropna hides phrases that never appeared.
bottom_freq_df.head(5).dropna()
Out[59]:
In [60]:
# Write bottom_freq_df to a CSV in the SCOAP data folder.
bottom_csv_path = os.path.join(DATA_FOLDER, 'bottom_phrases.csv')
bottom_freq_df.to_csv(bottom_csv_path, encoding='utf-8')
In [61]:
# Define function
def create_tabulation_df():
    '''(Case, justice, arg_vect) x model.

    Builds an all-NaN DataFrame whose rows are the cartesian product of
    CURRENT_CASES x CURRENT_JUSTICES x argument types (MultiIndex) and
    whose columns are the three model names.
    '''
    cases = CURRENT_CASES
    # Justice names are upper-cased elsewhere in the pipeline; match that.
    justices = [justice.upper() for justice in CURRENT_JUSTICES]
    arg_types = ['PETITIONER_ARGUMENT',
                 'RESPONDENT_ARGUMENT',
                 'PETITIONER_REBUTTAL']
    models = ['NB', 'SGD', 'RF']
    cja_index = pd.MultiIndex.from_product([cases, justices, arg_types])
    # np.nan: the np.NaN alias was removed in NumPy 2.0.
    tabulation_df = pd.DataFrame(index=cja_index, columns=models, data=np.nan)
    return tabulation_df
# Run function: empty (case, justice, arg_type) x model skeleton.
tabulation_df = create_tabulation_df()
In [62]:
# Demo for clarity ... should be empty (all NaN).
tabulation_df.head(3)
Out[62]:
In [63]:
def make_current_df(arg_df):
    '''Cross-join current-term argument rows with the sitting justices.

    Every docket row of arg_df that belongs to CURRENT_CASES is copied
    once per justice in CURRENT_JUSTICES, with the copy's JUSTICE field
    set to that justice.
    '''
    pending = arg_df[arg_df['DOCKET'].isin(CURRENT_CASES)]
    records = [row.to_dict() for _, row in pending.iterrows()]
    expanded = []
    for justice in CURRENT_JUSTICES:
        for record in records:
            # Deep copy so per-justice edits never share nested objects.
            justice_record = copy.deepcopy(record)
            justice_record['JUSTICE'] = justice
            expanded.append(justice_record)
    return pd.DataFrame.from_dict(expanded)
# Create new df: current-term rows replicated per justice.
current_df = make_current_df(arg_df)
# Run previous functions (filter_justice_data defined earlier in the file;
# presumably restricts each row's text to the row's justice — verify).
current_df = current_df.apply(filter_justice_data, axis=1)
current_df.head(3)
Out[63]:
In [64]:
# Define function
def create_lookup_series(current_df):
    '''Place text in df for processing. Each justice gets same data.

    Returns a Series indexed by (DOCKET, JUSTICE, arg_type) whose values
    are the argument word-lists joined into single strings.
    '''
    lookup_df = current_df[['DOCKET',
                            'JUSTICE',
                            'PETITIONER_ARGUMENT',
                            # RESPONDENT_ARGUMENT intentionally excluded
                            # (was commented out in the original).
                            'PETITIONER_REBUTTAL']]
    lookup_df = lookup_df.set_index(['DOCKET', 'JUSTICE'])
    # Each cell holds a list of strings; flatten to one string per cell.
    # DataFrame.map superseded applymap (deprecated in pandas 2.1);
    # fall back for pandas versions without DataFrame.map.
    try:
        lookup_df = lookup_df.map(lambda x: ' '.join(x))
    except AttributeError:
        lookup_df = lookup_df.applymap(lambda x: ' '.join(x))
    # stack() moves the argument-type columns into a third index level.
    lookup_series = lookup_df.stack()
    # Sort and dedupe (where do dupes come from?). Chained, non-inplace
    # calls: same result without hidden mutation.
    lookup_series = lookup_series.sort_index().drop_duplicates()
    return lookup_series
# Run: build the (DOCKET, JUSTICE, arg_type) -> text lookup.
lookup_series = create_lookup_series(current_df)
In [65]:
lookup_series.head(3)
Out[65]:
In [66]:
# Define function
def populate_tabulation_df(row, lookup_series):
    '''Fill in the dataframe. Meant to be applied row-wise.

    row.name is the (case, justice, arg_type) MultiIndex tuple. The
    matching argument text is copied into all three model columns, or
    NaN when no text exists for that tuple.
    '''
    try:
        case, justice, arg_type = row.name
        # Single .loc tuple lookup — resolves the original
        # "make this not chained indexing" TODO.
        value = lookup_series.loc[(case, justice, arg_type)]
        row[['NB', 'SGD', 'RF']] = value, value, value
    except KeyError:
        # np.nan: the np.NaN alias was removed in NumPy 2.0.
        row[['NB', 'SGD', 'RF']] = np.nan, np.nan, np.nan
    return row
# Run function: copy each tuple's argument text into the model columns.
tabulation_df = tabulation_df.apply(populate_tabulation_df,
args=(lookup_series,),
axis=1)
In [67]:
tabulation_df.head(3)
Out[67]:
In [68]:
# Define function
def run_predictions(column, model_dict, tabulation_df):
    '''This applied function adds results to the result series.
    It is initially framed in terms of "QUESTIONEE_WINS", which is
    the output of the model.predict(). It is then converted to
    "PLAINTIFF_WINS", by flipping the respondent argument (e.g.
    if questionee is plaintiff because plaintiff arg or
    plaintiff rebuttal, QUESTIONEE_WINS == PLAINTIFF_WINS ...
    if respondent argument, QUESTIONEE_WINS != PLAINTIFF_WINS).

    tabulation_df is accepted for interface compatibility but unused.
    '''
    # There has to be a better way to vectorize with groupby.
    model_name = column.name
    # Copy so we can iterate while writing back into `column`.
    column_copy = column.copy()
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for index, text in column_copy.items():
        case, justice, arg_type = index
        try:
            model = model_dict[justice][model_name]
            # pd.isnull handles NaN, which cannot be compared directly.
            if model is None or pd.isnull(text):
                column.loc[index] = np.nan
                continue
            # Already a bool means this cell was processed earlier; skip.
            if isinstance(text, (bool, np.bool_)):
                continue
            # Predict QUESTIONEE_WINS from the argument text.
            prediction = model.predict([text])[0]
            # Flip prediction: speaker -> party correction for the
            # respondent's argument.
            if arg_type == 'RESPONDENT_ARGUMENT':
                prediction = not prediction
            # Write back to column.
            column.loc[index] = prediction
        except KeyError:
            column.loc[index] = np.nan
    return column
# Run function: one column per model, text replaced by predictions.
tabulation_df = tabulation_df.apply(run_predictions,
axis=0,
args=(model_dict, tabulation_df))
In [69]:
# Demo for clarity
tabulation_df.head(3)
Out[69]:
In [70]:
def modified_sum(row):
    '''Translate vote tallies into a winner label; ties yield None.'''
    petitioner = row['PETITIONER_VOTES']
    respondent = row['RESPONDENT_VOTES']
    if petitioner > respondent:
        return 'Petitioner'
    elif respondent > petitioner:
        return 'Respondent'
    return None
#### Define function
def calculate_votes(tabulation_df):
'''Aggregate per-model boolean predictions into per-(case, justice) vote tallies and a winner label.'''
# NOTE(review): `consensus` is built but never used below — dead code?
# Consensus vector ... vectorize this.
consensus = pd.Series(index=tabulation_df.index
.droplevel(2)
.copy(),
dtype='object')
consensus.name = 'VOTES'
# Widen so each row holds all model x arg_type predictions for one
# (case, justice) pair, then count predicted values per row.
# NOTE(review): pd.value_counts is deprecated in pandas 2.1+.
tdf = tabulation_df.unstack()
tdf = tdf.apply(lambda row: pd.value_counts(row.values), axis=1)
# NOTE(review): this rename assumes the counted values always produce
# exactly two columns ordered (False, True) -> (RESPONDENT, PETITIONER);
# if the ordering or the column count differs, the labels would be
# silently wrong — verify against real output.
tdf.columns = ['RESPONDENT_VOTES', 'PETITIONER_VOTES']
tdf = tdf.fillna(0)
# Label the winner per row; ties yield None (see modified_sum).
tdf['VOTE'] = tdf.apply(modified_sum, axis=1)
return tdf
# Run function: per-(case, justice) vote tallies plus winner label.
votes = calculate_votes(tabulation_df)
votes.head(8)
Out[70]:
In [71]:
def harmonize_empty(votes, VOTING_RELATIONSHIPS):
    '''If null, make this justice copy another similarly-minded justice.

    Mutates `votes` in place and returns None. Imputed values are
    collected first and written back afterwards, so imputed picks never
    feed other imputed picks.
    '''
    voting_df = pd.DataFrame(VOTING_RELATIONSHIPS)
    imputed_probabilities = []
    for index, row in votes.iterrows():
        case, justice = index
        if row['VOTE'] is None:
            # Order justices by ascending similarity score, e.g.
            # {ALITO: 7, BREYER: 2, KAGAN: 3} -> [BREYER, KAGAN, ALITO].
            # (The original argsort().sort_values() chain produced a
            # different ordering than its own comments describe.)
            most_similar = voting_df.loc[justice].sort_values().index.values
            # Walk the list until the first justice with a usable vote.
            for sim_justice in most_similar:
                if sim_justice == 'SCALIA':
                    continue
                other_justice_prob = votes.loc[(case, sim_justice)]['VOTE']
                if other_justice_prob is None:
                    continue
                imputed_probabilities.append({'case': case,
                                              'justice': justice,
                                              'prob': other_justice_prob})
                # Stop at the most similar justice with a vote; the
                # original fell through and kept overwriting with
                # ever-less-similar justices.
                break
    # Now all imputed values are complete. Add back in.
    for prob in imputed_probabilities:
        index_tuple = (prob['case'], prob['justice'])
        votes.loc[index_tuple, 'VOTE'] = prob['prob']
    return None
# Impute missing votes in place from similarly-minded justices.
harmonize_empty(votes, VOTING_RELATIONSHIPS)
votes.head(8)
Out[71]:
def get_petitioner_votes(row):
    '''Helper function for apply: count 'Petitioner' entries in a row.'''
    counts = row.value_counts()
    if 'Petitioner' in counts:
        return counts['Petitioner']
    return 0
def get_respondent_votes(row):
    '''Helper function for apply: count 'Respondent' entries in a row.'''
    # Series.get supplies the zero default when the label is absent.
    return row.value_counts().get('Respondent', 0)
def process_votes(votes):
    '''Pivot per-(case, justice) votes into one row per case with vote
    tallies and the predicted victor.'''
    # Keep only the final vote column.
    result = votes[['VOTE']]
    # One row per case, one column per justice.
    result = result.unstack()
    # Drop the redundant top level ('VOTE') of the column MultiIndex.
    result.columns = result.columns.droplevel(0)
    # Tally each side's votes across justices.
    result['PET_VOTES'] = result.apply(get_petitioner_votes, axis=1)
    result['RES_VOTES'] = result.apply(get_respondent_votes, axis=1)
    # Ties go to the respondent (arbitrary, as in the original).
    winner_mask = result['PET_VOTES'] > result['RES_VOTES']
    result['VICTOR'] = winner_mask.map({True: 'Petitioner',
                                        False: 'Respondent'})
    return result
# Aggregate the per-justice votes into per-case results.
result = process_votes(votes)
try:
# Drop docket 15-1112 (presumably excluded from predictions — confirm).
result = result.drop('15-1112')
except Exception:
# Best-effort: ignore if the docket is absent. NOTE(review): the bare
# Exception swallow also hides unrelated errors.
pass
result
Out[72]:
In [74]:
# Write results to file.
result_csv_path = os.path.join(DATA_FOLDER, 'case_results.csv')
result.to_csv(result_csv_path, encoding='utf-8')
In [75]:
# Tally predicted victors across all cases.
result['VICTOR'].value_counts()
Out[75]: