In [16]:
import pandas as pd
%matplotlib inline

In [2]:
!curl "https://confluence.cornell.edu/download/attachments/172918779/supreme_court_dialogs_corpus_v1.01.zip?version=1&modificationDate=1351805307000&api=v2" -o scotus.zip


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9914k  100 9914k    0     0  1959k      0  0:00:05  0:00:05 --:--:-- 2132k

In [3]:
!unzip scotus.zip


Archive:  scotus.zip
   creating: supreme_court_dialogs_corpus_v1.01/
  inflating: supreme_court_dialogs_corpus_v1.01/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/supreme_court_dialogs_corpus_v1.01/
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._.DS_Store  
  inflating: supreme_court_dialogs_corpus_v1.01/echoes_of_power.pdf  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._echoes_of_power.pdf  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._supreme.conversations.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.gender.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.outcome.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.README.v1.01.txt  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._supreme.README.v1.01.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.votes.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/timothy_hawes_thesis.pdf  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._timothy_hawes_thesis.pdf  

In [4]:
!rm -rf supreme_court_dialogs_corpus_v1.01/__MACOSX/

In [5]:
!ls supreme_court_dialogs_corpus_v1.01/


echoes_of_power.pdf	   supreme.outcome.txt	     timothy_hawes_thesis.pdf
supreme.conversations.txt  supreme.README.v1.01.txt
supreme.gender.txt	   supreme.votes.txt

In [6]:
with open("supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt") as f:
    text = f.read()

In [7]:
text.split("\n")[1].split(" +++$+++ ")


Out[7]:
['02-1472',
 '4',
 'FALSE',
 'MR. MILLER',
 'NOT JUSTICE',
 'NA',
 'PETITIONER',
 'Justice Stevens, and may it please the Court: These two contract cases concern whether the Government is liable in money damages under the Contract Disputes Act and section 110 of the Indian Self-Determination Act when the Secretary fails to fully pay a contract price for the --']

In [8]:
len(text.split("\n"))


Out[8]:
51499

In [9]:
corpus_df = pd.DataFrame(columns=["case_id", "after_previous", "speaker", "is_justice", "justice_vote", "presentation_side", "utterance"])
corpus_df


Out[9]:
case_id after_previous speaker is_justice justice_vote presentation_side utterance

In [10]:
for line in text.split("\n"):
    values = line.split(" +++$+++ ")
    if values is not '':
        line_dict = {"case_id" : values[0], "after_previous" : values[2], "speaker" : values[3], "is_justice" : values[4], "justice_vote" : values[5], "presentation_side" : values[6], "utterance" : values[7] } 
        line_series = pd.Series(line_dict, name=values[1])
        corpus_df = corpus_df.append(line_series)
        if len(corpus_df) % 1000 is 0:
            print(len(corpus_df), len(corpus_df) / len(text.split("\n")) * 100, "% ")


1000 1.941785277384027 % 
2000 3.883570554768054 % 
3000 5.825355832152081 % 
4000 7.767141109536108 % 
5000 9.708926386920135 % 
6000 11.650711664304161 % 
7000 13.592496941688188 % 
8000 15.534282219072216 % 
9000 17.476067496456242 % 
10000 19.41785277384027 % 
11000 21.359638051224294 % 
12000 23.301423328608323 % 
13000 25.24320860599235 % 
14000 27.184993883376375 % 
15000 29.126779160760403 % 
16000 31.06856443814443 % 
17000 33.010349715528456 % 
18000 34.952134992912484 % 
19000 36.89392027029651 % 
20000 38.83570554768054 % 
21000 40.77749082506457 % 
22000 42.71927610244859 % 
23000 44.66106137983262 % 
24000 46.602846657216645 % 
25000 48.54463193460067 % 
26000 50.4864172119847 % 
27000 52.42820248936872 % 
28000 54.36998776675275 % 
29000 56.31177304413678 % 
30000 58.253558321520806 % 
31000 60.19534359890484 % 
32000 62.13712887628886 % 
33000 64.07891415367288 % 
34000 66.02069943105691 % 
35000 67.96248470844094 % 
36000 69.90426998582497 % 
37000 71.846055263209 % 
38000 73.78784054059302 % 
39000 75.72962581797705 % 
40000 77.67141109536108 % 
41000 79.6131963727451 % 
42000 81.55498165012914 % 
43000 83.49676692751315 % 
44000 85.43855220489718 % 
45000 87.3803374822812 % 
46000 89.32212275966523 % 
47000 91.26390803704926 % 
48000 93.20569331443329 % 
49000 95.14747859181732 % 
50000 97.08926386920135 % 
51000 99.03104914658537 % 
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-10-20c704fcbf40> in <module>()
      2     values = line.split(" +++$+++ ")
      3     if values is not '':
----> 4         line_dict = {"case_id" : values[0], "after_previous" : values[2], "speaker" : values[3], "is_justice" : values[4], "justice_vote" : values[5], "presentation_side" : values[6], "utterance" : values[7] }
      5         line_series = pd.Series(line_dict, name=values[1])
      6         corpus_df = corpus_df.append(line_series)

IndexError: list index out of range

In [14]:
corpus_df[:5]


Out[14]:
case_id after_previous speaker is_justice justice_vote presentation_side utterance
2 02-1472 FALSE JUSTICE STEVENS JUSTICE PETITIONER We will now hear argument in the Cherokee Nati...
4 02-1472 FALSE MR. MILLER NOT JUSTICE NA PETITIONER Justice Stevens, and may it please the Court: ...
5 02-1472 TRUE JUSTICE O'CONNOR JUSTICE PETITIONER PETITIONER Would you mind explaining to us how these two ...
6 02-1472 TRUE MR. MILLER NOT JUSTICE NA PETITIONER No, Justice O'Connor. They're -- they're not o...
7 02-1472 TRUE JUSTICE O'CONNOR JUSTICE PETITIONER PETITIONER But they're certainly at odds on the legal the...

In [ ]:


In [ ]: