In [87]:
# Imports
import dateutil.parser
import matplotlib.pyplot as plt
import pandas
import sys
In [128]:
# Read raw sentence data
# NOTE(review): absolute scratch paths — assumes this exact workspace layout.
sentence_data = pandas.read_table('/data/workspace/wellsettled-research/scratch/results/ws_match_sentences.txt',
sep='|',
header=None)
# The extra first column (named 'nan' then dropped) is presumably produced by a
# leading '|' delimiter on each row — confirm against the raw file.
sentence_data.columns = ['nan', 'sentence_id', 'case_caption', 'case_date', 'sentence']
del sentence_data['nan']
# Read sentence match data
# First CSV column is the row index.
sentence_match_data = pandas.read_csv('/data/workspace/wellsettled-research/scratch/results/ws_matches.csv',
index_col=0)
sentence_match_data.columns = ['sentence_id', 'case_caption', 'case_date', 'phrase_id']
# Read phrase data
# NOTE(review): read_table defaults to a tab separator even though the file is
# named .csv — confirm the file really is tab-delimited.
phrase_data = pandas.read_table('/data/workspace/wellsettled-research/scratch/results/ws_phrase_mapping.csv',
header=None)
phrase_data.columns = ['phrase_id', 'phrase_stem']
In [89]:
sentence_data.head()
Out[89]:
In [90]:
sentence_match_data.head()
Out[90]:
In [92]:
phrase_data.head()
Out[92]:
In [94]:
# to_date
def to_date(value):
    """Parse a free-form date string into a ``datetime.date``.

    Parameters
    ----------
    value : str
        Date string as read from the source file (e.g. a case decision date).

    Returns
    -------
    datetime.date or None
        The parsed calendar date, or None when the value cannot be parsed.
    """
    try:
        return dateutil.parser.parse(value).date()
    except (TypeError, ValueError, OverflowError):
        # Only swallow the documented parse failures (bad/None input,
        # unparseable string, out-of-range date) — a bare except here would
        # also hide KeyboardInterrupt and genuine programming errors.
        return None
# Merge all data
# Join matches -> phrase stems -> sentence text on their shared key columns.
merge_data = sentence_match_data.merge(phrase_data, on='phrase_id').merge(sentence_data, on='sentence_id')
# Both inputs carry case_caption/case_date, so merge suffixes them _x/_y;
# keep the match-side (_x) copies and restore the plain names.
merge_data = merge_data[['sentence_id', 'case_caption_x', 'case_date_x', 'phrase_id', 'phrase_stem', 'sentence']]
merge_data.columns = ['sentence_id', 'case_caption', 'case_date', 'phrase_id', 'phrase_stem', 'sentence']
# Parse date strings into datetime.date; unparseable values become None.
merge_data['case_date'] = merge_data['case_date'].apply(to_date)
In [99]:
# Sort by phrase then date so the earliest usage of each phrase comes first;
# the per-phrase analysis below relies on this ordering.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported equivalent.
merge_data.sort_values(by=['phrase_id', 'case_date'], ascending=True, inplace=True)
merge_data.to_csv('merged_wsr_matches.csv')
In [67]:
def check_sentence_match(stems):
    """Check if the sentence stem sequence matches our desired patterns.

    Accepted patterns (stemmed tokens):
      * ``it well-settl ...`` / ``it well-establish ...`` (hyphenated form)
      * ``it well settl ...`` / ``it well establish ...`` (two-token form)

    Parameters
    ----------
    stems : sequence of unicode
        Stemmed tokens of a sentence, in order.

    Returns
    -------
    bool
        True when the sentence opens with an "it is well settled/established"
        pattern, False otherwise.  (The original implementation fell off the
        end and returned None for hyphenated non-matches like "well-known";
        it now returns False explicitly so the result is always boolean.)
    """
    target_stems = (u'settl', u'establish')
    # Need at least "it" + "well*" + one more token.
    if len(stems) < 3:
        return False
    # Require initial "it".
    if stems[0] != u'it':
        return False
    # Second stem must start with "well" (covers "well" and "well-*").
    if not stems[1].startswith(u'well'):
        return False
    if u'-' in stems[1]:
        # Hyphenated form: the part after the first hyphen must be a target.
        # Guard len() so a bare "well-" cannot raise IndexError.
        tokens = stems[1].split(u'-')
        return len(tokens) > 1 and tokens[1] in target_stems
    # Space-separated form: the third stem must be a target.
    return stems[2] in target_stems
In [100]:
# Handle imports
# NOTE(review): sys.path hack + mid-notebook import — assumes the wsr package
# lives at this absolute path.
sys.path.append('/data/workspace/wellsettled-research/src/')
from wsr.process.stem import process_sentence
# Define nested function for sentence
# Stems/tokenizes the raw sentence, then tests it against the
# "it is well settled/established" pattern via check_sentence_match.
# NOTE(review): .decode("utf-8") implies Python 2 byte strings; under
# Python 3 str has no .decode and this would raise — TODO confirm runtime.
def is_sentence_ws(sentence):
return check_sentence_match(process_sentence(sentence.decode("utf-8")))
# Do it live
merge_data['is_ws'] = merge_data['sentence'].apply(is_sentence_ws)
In [109]:
# Earliest well-settled usage of each phrase: merge_data is sorted by
# phrase_id/case_date above, so head(1) per group picks the first occurrence.
# The explicit "== True" is kept deliberately (not simplified to a bare mask):
# is_ws may contain None, and equality comparison filters those rows out.
ws_mask = merge_data['is_ws'] == True
first_ws_instances = merge_data.loc[ws_mask].groupby('phrase_id').head(1)
first_ws_instances.to_csv('first_ws_instances.csv')
# Index labels of the first-instance rows, used to flag them in merge_data.
first_ws_id = first_ws_instances.index
In [112]:
# Update original merge DF: flag the rows that are the first well-settled
# usage of their phrase.
merge_data.loc[:, 'is_first_ws'] = False
# first_ws_id holds index labels, so label-based .loc is the exact
# replacement for the deprecated/removed .ix indexer.
merge_data.loc[first_ws_id, 'is_first_ws'] = True
# Reorder columns so the flags sit next to the keys before exporting.
merge_data = merge_data[['sentence_id', 'phrase_id', 'case_date', 'is_ws', 'is_first_ws', 'case_caption', 'phrase_stem', 'sentence']]
merge_data.to_csv('merged_wsr_matches.csv')
In [114]:
merge_data.head()
Out[114]:
In [132]:
# Store data
no_ws_phrase_list = []  # kept: appears unused here, but later cells may rely on it
ws_phrase_counts = []
# Iterate over all phrases and count usages before/after the first WS usage.
# Relies on merge_data being sorted by phrase_id, case_date (see sort cell).
for phrase_id, phrase_group_data in merge_data.groupby('phrase_id'):
    # Get the counts
    total_usage = phrase_group_data.shape[0]
    total_ws_usage = int(phrase_group_data['is_ws'].sum())
    try:
        # 0-based position (in date order) of the first well-settled usage.
        first_ws_usage = phrase_group_data['is_ws'].tolist().index(True)
    except ValueError:
        # list.index raises ValueError when True is absent, i.e. the phrase
        # is never used in a well-settled sentence.  Catch only that —
        # a bare except would mask real errors.
        first_ws_usage = -1
    ws_phrase_counts.append((phrase_id, first_ws_usage, total_ws_usage, total_usage))
# Assemble per-phrase metadata, attach the phrase text, and export.
ws_phrase_df = pandas.DataFrame(ws_phrase_counts,
                                columns=['phrase_id', 'first_ws_index', 'total_ws', 'total'])
ws_phrase_df = ws_phrase_df.merge(phrase_data, on='phrase_id')
ws_phrase_df.to_csv('ws_phrase_metadata.csv')