In [16]:
import pandas as pd
%matplotlib inline
In [2]:
# Download the Supreme Court dialogs corpus archive and save it as scotus.zip.
# --fail makes curl exit with an error on an HTTP failure instead of writing the
# server's error page into scotus.zip; --location follows any redirect the
# server issues for the attachment URL.
!curl --fail --location "https://confluence.cornell.edu/download/attachments/172918779/supreme_court_dialogs_corpus_v1.01.zip?version=1&modificationDate=1351805307000&api=v2" -o scotus.zip
In [3]:
# Extract the downloaded archive into the working directory.
# -o overwrites files left by a previous run without prompting, so re-running
# the notebook does not stall on unzip's interactive "replace?" question.
!unzip -o scotus.zip
In [4]:
# Remove the __MACOSX directory from the corpus folder (presumably macOS
# archive metadata that is not part of the corpus itself -- confirm).
!rm -rf supreme_court_dialogs_corpus_v1.01/__MACOSX/
In [5]:
# List the files in the corpus directory to see what is available to load.
!ls supreme_court_dialogs_corpus_v1.01/
In [6]:
# Read the whole conversations file into memory as a single string; later cells
# split it into lines and fields.
conversations_path = "supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt"
with open(conversations_path) as corpus_file:
    text = corpus_file.read()
In [7]:
# Show the fields of the second line of the file. Splitting at most twice is
# enough to isolate that line; its fields are delimited by " +++$+++ ".
text.split("\n", 2)[1].split(" +++$+++ ")
Out[7]:
In [8]:
# Number of newline-separated pieces in the file (one more than the number of
# newline characters, so a trailing newline contributes a final empty piece).
text.count("\n") + 1
Out[8]:
In [9]:
# Start from an empty frame that only declares the corpus columns, in the
# order they should appear.
corpus_columns = [
    "case_id",
    "after_previous",
    "speaker",
    "is_justice",
    "justice_vote",
    "presentation_side",
    "utterance",
]
corpus_df = pd.DataFrame(columns=corpus_columns)
corpus_df
Out[9]:
In [10]:
# Parse every line of the corpus text into one row of corpus_df.
#
# Each line holds fields separated by " +++$+++ ". Field 1 becomes the row
# label; fields 0 and 2-7 fill the seven columns below. Rows are collected in
# plain Python lists and the DataFrame is built once at the end: appending to
# a DataFrame row by row copies the whole frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.x.
field_separator = " +++$+++ "
column_names = ["case_id", "after_previous", "speaker", "is_justice",
                "justice_vote", "presentation_side", "utterance"]
expected_field_count = len(column_names) + 1  # seven columns plus the row label

lines = text.split("\n")
total_lines = len(lines)  # computed once instead of on every progress print

row_labels = []
records = []
for line_number, line in enumerate(lines, start=1):
    # Skip blank lines (such as the empty string left after a trailing
    # newline); they carry no fields and would otherwise fail on indexing.
    if not line.strip():
        continue

    values = line.split(field_separator)
    if len(values) < expected_field_count:
        raise ValueError(
            "line %d has %d fields, expected at least %d: %r"
            % (line_number, len(values), expected_field_count, line)
        )

    row_labels.append(values[1])
    records.append({
        "case_id": values[0],
        "after_previous": values[2],
        "speaker": values[3],
        "is_justice": values[4],
        "justice_vote": values[5],
        "presentation_side": values[6],
        "utterance": values[7],
    })

    # Progress report every 1000 parsed rows (value comparison, not identity).
    if len(records) % 1000 == 0:
        print(len(records), len(records) / total_lines * 100, "% ")

corpus_df = pd.DataFrame(records, index=row_labels, columns=column_names)
In [14]:
# Preview the first five parsed rows.
corpus_df.head(5)
Out[14]:
In [ ]:
In [ ]: