In [87]:
# Imports
import dateutil.parser
import matplotlib.pyplot as plt
import pandas
import sys
In [128]:
# Read raw sentence data
# NOTE(review): absolute scratch paths — assumes this exact workspace layout.
sentence_data = pandas.read_table('/data/workspace/wellsettled-research/scratch/results/ws_match_sentences.txt',
sep='|',
header=None)
# The extra first column (named 'nan' then dropped) is presumably produced by a
# leading '|' delimiter on each row — confirm against the raw file.
sentence_data.columns = ['nan', 'sentence_id', 'case_caption', 'case_date', 'sentence']
del sentence_data['nan']
# Read sentence match data
# First CSV column is the row index.
sentence_match_data = pandas.read_csv('/data/workspace/wellsettled-research/scratch/results/ws_matches.csv',
index_col=0)
sentence_match_data.columns = ['sentence_id', 'case_caption', 'case_date', 'phrase_id']
# Read phrase data
# NOTE(review): read_table defaults to a tab separator even though the file is
# named .csv — confirm the file really is tab-delimited.
phrase_data = pandas.read_table('/data/workspace/wellsettled-research/scratch/results/ws_phrase_mapping.csv',
header=None)
phrase_data.columns = ['phrase_id', 'phrase_stem']
In [89]:
sentence_data.head()
Out[89]:
In [90]:
sentence_match_data.head()
Out[90]:
In [92]:
phrase_data.head()
Out[92]:
In [94]:
# to_date
def to_date(value):
    """Parse a free-form date string into a ``datetime.date``.

    Parameters
    ----------
    value : str
        Date string as read from the source file (e.g. a case decision date).

    Returns
    -------
    datetime.date or None
        The parsed calendar date, or None when the value cannot be parsed.
    """
    try:
        return dateutil.parser.parse(value).date()
    except (TypeError, ValueError, OverflowError):
        # Only swallow the documented parse failures (bad/None input,
        # unparseable string, out-of-range date) — a bare except here would
        # also hide KeyboardInterrupt and genuine programming errors.
        return None
# Merge all data
# Join matches -> phrase stems -> sentence text on their shared key columns.
merge_data = sentence_match_data.merge(phrase_data, on='phrase_id').merge(sentence_data, on='sentence_id')
# Both inputs carry case_caption/case_date, so merge suffixes them _x/_y;
# keep the match-side (_x) copies and restore the plain names.
merge_data = merge_data[['sentence_id', 'case_caption_x', 'case_date_x', 'phrase_id', 'phrase_stem', 'sentence']]
merge_data.columns = ['sentence_id', 'case_caption', 'case_date', 'phrase_id', 'phrase_stem', 'sentence']
# Parse date strings into datetime.date; unparseable values become None.
merge_data['case_date'] = merge_data['case_date'].apply(to_date)
In [99]:
# Sort by phrase then date so the earliest usage of each phrase comes first;
# the per-phrase analysis below relies on this ordering.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported equivalent.
merge_data.sort_values(by=['phrase_id', 'case_date'], ascending=True, inplace=True)
merge_data.to_csv('merged_wsr_matches.csv')
In [67]:
def check_sentence_match(stems):
    """Check if the sentence stem sequence matches our desired patterns.

    Accepted patterns (stemmed tokens):
      * ``it well-settl ...`` / ``it well-establish ...`` (hyphenated form)
      * ``it well settl ...`` / ``it well establish ...`` (two-token form)

    Parameters
    ----------
    stems : sequence of unicode
        Stemmed tokens of a sentence, in order.

    Returns
    -------
    bool
        True when the sentence opens with an "it is well settled/established"
        pattern, False otherwise.  (The original implementation fell off the
        end and returned None for hyphenated non-matches like "well-known";
        it now returns False explicitly so the result is always boolean.)
    """
    target_stems = (u'settl', u'establish')
    # Need at least "it" + "well*" + one more token.
    if len(stems) < 3:
        return False
    # Require initial "it".
    if stems[0] != u'it':
        return False
    # Second stem must start with "well" (covers "well" and "well-*").
    if not stems[1].startswith(u'well'):
        return False
    if u'-' in stems[1]:
        # Hyphenated form: the part after the first hyphen must be a target.
        # Guard len() so a bare "well-" cannot raise IndexError.
        tokens = stems[1].split(u'-')
        return len(tokens) > 1 and tokens[1] in target_stems
    # Space-separated form: the third stem must be a target.
    return stems[2] in target_stems
In [100]:
# Handle imports
# NOTE(review): sys.path hack + mid-notebook import — assumes the wsr package
# lives at this absolute path.
sys.path.append('/data/workspace/wellsettled-research/src/')
from wsr.process.stem import process_sentence
# Define nested function for sentence
# Stems/tokenizes the raw sentence, then tests it against the
# "it is well settled/established" pattern via check_sentence_match.
# NOTE(review): .decode("utf-8") implies Python 2 byte strings; under
# Python 3 str has no .decode and this would raise — TODO confirm runtime.
def is_sentence_ws(sentence):
return check_sentence_match(process_sentence(sentence.decode("utf-8")))
# Do it live
merge_data['is_ws'] = merge_data['sentence'].apply(is_sentence_ws)
In [109]:
# Earliest well-settled usage of each phrase: merge_data is sorted by
# phrase_id/case_date above, so head(1) per group picks the first occurrence.
# The explicit "== True" is kept deliberately (not simplified to a bare mask):
# is_ws may contain None, and equality comparison filters those rows out.
ws_mask = merge_data['is_ws'] == True
first_ws_instances = merge_data.loc[ws_mask].groupby('phrase_id').head(1)
first_ws_instances.to_csv('first_ws_instances.csv')
# Index labels of the first-instance rows, used to flag them in merge_data.
first_ws_id = first_ws_instances.index
In [112]:
# Update original merge DF: flag the rows that are the first well-settled
# usage of their phrase.
merge_data.loc[:, 'is_first_ws'] = False
# first_ws_id holds index labels, so label-based .loc is the exact
# replacement for the deprecated/removed .ix indexer.
merge_data.loc[first_ws_id, 'is_first_ws'] = True
# Reorder columns so the flags sit next to the keys before exporting.
merge_data = merge_data[['sentence_id', 'phrase_id', 'case_date', 'is_ws', 'is_first_ws', 'case_caption', 'phrase_stem', 'sentence']]
merge_data.to_csv('merged_wsr_matches.csv')
In [114]:
merge_data.head()
Out[114]:
In [132]:
# Store data
no_ws_phrase_list = []  # kept: appears unused here, but later cells may rely on it
ws_phrase_counts = []
# Iterate over all phrases and count usages before/after the first WS usage.
# Relies on merge_data being sorted by phrase_id, case_date (see sort cell).
for phrase_id, phrase_group_data in merge_data.groupby('phrase_id'):
    # Get the counts
    total_usage = phrase_group_data.shape[0]
    total_ws_usage = int(phrase_group_data['is_ws'].sum())
    try:
        # 0-based position (in date order) of the first well-settled usage.
        first_ws_usage = phrase_group_data['is_ws'].tolist().index(True)
    except ValueError:
        # list.index raises ValueError when True is absent, i.e. the phrase
        # is never used in a well-settled sentence.  Catch only that —
        # a bare except would mask real errors.
        first_ws_usage = -1
    ws_phrase_counts.append((phrase_id, first_ws_usage, total_ws_usage, total_usage))
# Assemble per-phrase metadata, attach the phrase text, and export.
ws_phrase_df = pandas.DataFrame(ws_phrase_counts,
                                columns=['phrase_id', 'first_ws_index', 'total_ws', 'total'])
ws_phrase_df = ws_phrase_df.merge(phrase_data, on='phrase_id')
ws_phrase_df.to_csv('ws_phrase_metadata.csv')