In [87]:
# Imports
import dateutil.parser
import matplotlib.pyplot as plt
import pandas
import sys

In [128]:
# Load the three input tables from the results directory.

# Sentences: pipe-delimited with no header row. The first field is named
# 'nan' only so that it can be dropped immediately after loading.
sentence_data = pandas.read_table(
    '/data/workspace/wellsettled-research/scratch/results/ws_match_sentences.txt',
    sep='|', header=None)
sentence_data.columns = ['nan', 'sentence_id', 'case_caption', 'case_date', 'sentence']
sentence_data = sentence_data.drop('nan', axis=1)

# Sentence-to-phrase matches: CSV whose first column is used as the row index.
sentence_match_data = pandas.read_csv(
    '/data/workspace/wellsettled-research/scratch/results/ws_matches.csv',
    index_col=0)
sentence_match_data.columns = ['sentence_id', 'case_caption', 'case_date', 'phrase_id']

# Phrase id -> stemmed phrase mapping: read with read_table's default
# delimiter and no header row (despite the .csv extension).
phrase_data = pandas.read_table(
    '/data/workspace/wellsettled-research/scratch/results/ws_phrase_mapping.csv',
    header=None)
phrase_data.columns = ['phrase_id', 'phrase_stem']

In [89]:
# Preview the first rows of the sentence table as a sanity check on the load.
sentence_data.head()


Out[89]:
sentence_id case_caption case_date sentence
0 5d8c2fa4-4260-49e1-8ef1-e8d40c9339b2 PENNELL v. SAN JOSE, 485 U.S. 1 (1988) 1988-02-24 As appellants point out, ""[i]t is axiomatic t...
1 02da221e-2b90-4a4e-90fe-ebc010f30b2d EDENFIELD v. FANE, 507 U.S. 761 (1993) 1993-04-26 It is well established that ""[t]he party seek...
2 26fc841a-e7d3-4377-8b01-2a7dc461cc35 LEWIS v. CITY OF NEW ORLEANS, 415 U.S. 130 (1974) 1974-02-20 There are certain well-defined and narrowly li...
3 7e7a37cc-01a4-4756-9c82-835c227cda64 WEEKS v. ANGELONE, 528 U.S. 225 (2000) 2000-01-19 A jury is presumed to follow its instructions....
4 2b436cbc-b29f-42ae-b7a9-4bdffb7d69c1 BOARD OF TRUSTEES, UNIVERSITY OF ALABAMA v. GA... 2001-02-21 The ultimate guarantee of the Eleventh Amendme...

In [90]:
# Preview the first rows of the sentence-to-phrase match table.
sentence_match_data.head()


Out[90]:
sentence_id case_caption case_date phrase_id
0 5d8c2fa4-4260-49e1-8ef1-e8d40c9339b2 PENNELL v. SAN JOSE, 485 U.S. 1 (1988) 1988-02-24 7551
1 02da221e-2b90-4a4e-90fe-ebc010f30b2d EDENFIELD v. FANE, 507 U.S. 761 (1993) 1993-04-26 3196
2 26fc841a-e7d3-4377-8b01-2a7dc461cc35 LEWIS v. CITY OF NEW ORLEANS, 415 U.S. 130 (1974) 1974-02-20 2069
3 7e7a37cc-01a4-4756-9c82-835c227cda64 WEEKS v. ANGELONE, 528 U.S. 225 (2000) 2000-01-19 6212
4 2b436cbc-b29f-42ae-b7a9-4bdffb7d69c1 BOARD OF TRUSTEES, UNIVERSITY OF ALABAMA v. GA... 2001-02-21 3692

In [92]:
# Preview the first rows of the phrase_id -> phrase_stem mapping.
phrase_data.head()


Out[92]:
phrase_id phrase_stem
0 0 defend seek revers basi insuffici evid swim up...
1 1 right assist counsel waiv
2 2 must consid state prior art shown refer employ...
3 3 privat properti land extend low-wat mark
4 4 order warrant find neglig proxim caus injuri m...

In [94]:
# to_date
def to_date(value):
    """Parse ``value`` into a ``datetime.date``, best effort.

    Args:
        value: A date representation accepted by ``dateutil.parser.parse``
            (in this notebook, the values of the ``case_date`` column).

    Returns:
        The ``date()`` of the parsed datetime, or ``None`` when parsing
        fails for any ordinary reason (unparseable text, wrong type, ...).
    """
    try:
        return dateutil.parser.parse(value).date()
    except Exception:
        # Deliberately best-effort: a bad value becomes None instead of
        # aborting the whole column conversion. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt / SystemExit
        # propagate, so a long-running apply can still be interrupted.
        return None

# Merge all data
# Attach the phrase stem to every match, then the sentence text.
merge_data = sentence_match_data.merge(phrase_data, on='phrase_id')
merge_data = merge_data.merge(sentence_data, on='sentence_id')

# Both sides of the second merge carry case_caption / case_date, so pandas
# suffixes them with _x / _y. Keep the _x copies (from the match table) and
# give them back their plain names.
merge_data = merge_data[['sentence_id', 'case_caption_x', 'case_date_x',
                         'phrase_id', 'phrase_stem', 'sentence']]
merge_data = merge_data.rename(columns={'case_caption_x': 'case_caption',
                                        'case_date_x': 'case_date'})

# Convert the case dates to date objects (None where parsing fails).
merge_data['case_date'] = merge_data['case_date'].map(to_date)

In [99]:
# Now sort/check
# Order rows by phrase and, within each phrase, by case date, then save a
# snapshot of the merged table.
sort_keys = ['phrase_id', 'case_date']
if hasattr(merge_data, 'sort_values'):
    # DataFrame.sort(columns=...) no longer exists in current pandas;
    # sort_values(by=...) is its replacement.
    merge_data = merge_data.sort_values(by=sort_keys, ascending=True)
else:
    # Fallback for old pandas releases that only provide DataFrame.sort.
    merge_data = merge_data.sort(columns=sort_keys, ascending=True)
merge_data.to_csv('merged_wsr_matches.csv')

In [67]:
def check_sentence_match(stems):
    """Check if the sentence stem sequence matches our desired patterns.

    A sequence matches when it has at least three stems, starts with "it",
    has a second stem beginning with "well", and then either:

    * the second stem is hyphenated and the part right after the first
      hyphen is "settl" or "establish" (e.g. "well-settl"), or
    * the second stem has no hyphen and the third stem is "settl" or
      "establish" (e.g. "well", "settl").

    Args:
        stems: Sequence of stem strings for one sentence.

    Returns:
        bool: True if the sequence matches, False otherwise. Every path
        returns a real bool, including a hyphenated "well-..." stem whose
        suffix is not a target (e.g. "well-known").
    """
    target_stems = (u'settl', u'establish')

    # Check minimum length
    if len(stems) < 3:
        return False

    # Require initial "it"
    if stems[0] != "it":
        return False

    # Check for "well*"
    if not stems[1].startswith(u'well'):
        return False

    # Now handle hyphen vs. space
    if '-' in stems[1]:
        # The hyphen guarantees split() yields at least two parts.
        return stems[1].split(u'-')[1] in target_stems

    return stems[2] in target_stems

In [100]:
# Handle imports
sys.path.append('/data/workspace/wellsettled-research/src/')
from wsr.process.stem import process_sentence

# Define nested function for sentence
def is_sentence_ws(sentence):
    """Return whether ``sentence`` matches the well-settled pattern.

    The sentence is passed through ``process_sentence`` (imported from
    ``wsr.process.stem``; presumably it tokenizes and stems the text --
    confirm against that module) and the result is checked with
    ``check_sentence_match``.

    Args:
        sentence: The sentence as UTF-8 encoded bytes (a Python 2 ``str``)
            or as already-decoded text.

    Returns:
        Whatever ``check_sentence_match`` returns for the processed sentence.
    """
    # Only bytes need decoding. Text input has no usable decode(): it raises
    # AttributeError on Python 3 and can raise UnicodeEncodeError for
    # non-ASCII unicode on Python 2.
    if isinstance(sentence, bytes):
        sentence = sentence.decode("utf-8")
    return check_sentence_match(process_sentence(sentence))

# Apply is_sentence_ws to every value of the 'sentence' column and store
# the result per row in a new 'is_ws' column.
merge_data['is_ws'] = merge_data['sentence'].apply(is_sentence_ws)

In [109]:
# Get the first well-settled instances
# Keep only the rows flagged as well-settled, then take the first such row
# of every phrase group (in the frame's current row order).
ws_only = merge_data[merge_data['is_ws'] == True]
first_ws_instances = ws_only.groupby('phrase_id').head(1)

# Row labels of those first instances, used later to flag them in merge_data.
first_ws_id = first_ws_instances.index
first_ws_instances.to_csv('first_ws_instances.csv')

In [112]:
# Update original merge DF
# Flag the rows whose index labels are in first_ws_id: default every row to
# False, then switch the selected rows to True.
merge_data.loc[:, 'is_first_ws'] = False
# Label-based .loc replaces .ix, which has been removed from pandas;
# first_ws_id holds index labels taken from merge_data itself.
merge_data.loc[first_ws_id, 'is_first_ws'] = True

# Reorder the columns for the export and overwrite the earlier snapshot.
merge_data = merge_data[['sentence_id', 'phrase_id', 'case_date', 'is_ws', 'is_first_ws', 'case_caption', 'phrase_stem', 'sentence']]
merge_data.to_csv('merged_wsr_matches.csv')

In [114]:
# Preview the first rows of the merged table, including the is_ws and
# is_first_ws flags.
merge_data.head()


Out[114]:
sentence_id phrase_id case_date is_ws is_first_ws case_caption phrase_stem sentence
63676 5548d8e7-8305-4ccc-9b58-997f9692114e 0 1997-08-15 True True U.S. v. MULDERIG, 120 F.3d 534 (5th Cir. 1997) defend seek revers basi insuffici evid swim up... It is by now well settled that a defendant see...
63678 36fdb9a7-938d-4378-88db-1f11e89caafa 0 1998-03-09 False False U.S. v. CIHAK, 137 F.3d 252 (5th Cir. 1998) defend seek revers basi insuffici evid swim up... However, ""[i]t is by now well settled that a ...
63680 7e7c69f4-fa65-4e04-858e-972fcfdab309 0 2005-04-06 True False U.S. v. HOLMES, 406 F.3d 337 (5th Cir. 2005) defend seek revers basi insuffici evid swim up... It is by now well-settled that a defendant see...
63677 9472f1b6-4baa-402a-806f-62f8f85907a1 0 2009-06-10 False False U.S. v. STEPHENS, 571 F.3d 401 (5th Cir. 2009) defend seek revers basi insuffici evid swim up... Bartholomew challenges the sufficiency of the ...
63679 2e56a55e-6160-461e-8da5-0e2880e4b97d 0 2011-09-07 False False U.S. v. AGUILAR, 10-50817 (5th Cir. 9-7-2011) defend seek revers basi insuffici evid swim up... As a result, ""`a defendant seeking reversal o...

In [132]:
# Store data
no_ws_phrase_list = []
ws_phrase_counts = []

# Iterate over all phrases and count # before and after first WS
for phrase_id, phrase_group_data in merge_data.groupby('phrase_id'):
    # Get the counts
    total_usage = phrase_group_data.shape[0]
    total_ws_usage = int(phrase_group_data['is_ws'].sum())

    # Position, within this phrase's rows, of the first well-settled usage;
    # -1 when the phrase never has one.
    try:
        first_ws_usage = phrase_group_data['is_ws'].tolist().index(True)
    except ValueError:
        # list.index raises ValueError when True is absent. Anything else
        # (e.g. a missing column) is a real error and should surface.
        first_ws_usage = -1

    ws_phrase_counts.append((phrase_id, first_ws_usage, total_ws_usage, total_usage))

# One row per phrase, joined with its stem text, then exported.
ws_phrase_df = pandas.DataFrame(ws_phrase_counts,
                                columns=['phrase_id', 'first_ws_index', 'total_ws', 'total'])
ws_phrase_df = ws_phrase_df.merge(phrase_data, on='phrase_id')
ws_phrase_df.to_csv('ws_phrase_metadata.csv')