notebook.community

Edit and run



In [16]:

    
import pandas as pd
%matplotlib inline



In [2]:

    
!curl "https://confluence.cornell.edu/download/attachments/172918779/supreme_court_dialogs_corpus_v1.01.zip?version=1&modificationDate=1351805307000&api=v2" -o scotus.zip









    



  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9914k  100 9914k    0     0  1959k      0  0:00:05  0:00:05 --:--:-- 2132k



In [3]:

    
!unzip scotus.zip









    



Archive:  scotus.zip
   creating: supreme_court_dialogs_corpus_v1.01/
  inflating: supreme_court_dialogs_corpus_v1.01/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/supreme_court_dialogs_corpus_v1.01/
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._.DS_Store  
  inflating: supreme_court_dialogs_corpus_v1.01/echoes_of_power.pdf  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._echoes_of_power.pdf  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._supreme.conversations.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.gender.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.outcome.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.README.v1.01.txt  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._supreme.README.v1.01.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/supreme.votes.txt  
  inflating: supreme_court_dialogs_corpus_v1.01/timothy_hawes_thesis.pdf  
  inflating: __MACOSX/supreme_court_dialogs_corpus_v1.01/._timothy_hawes_thesis.pdf



In [4]:

    
!rm -rf supreme_court_dialogs_corpus_v1.01/__MACOSX/



In [5]:

    
!ls supreme_court_dialogs_corpus_v1.01/









    



echoes_of_power.pdf	   supreme.outcome.txt	     timothy_hawes_thesis.pdf
supreme.conversations.txt  supreme.README.v1.01.txt
supreme.gender.txt	   supreme.votes.txt



In [6]:

    
with open("supreme_court_dialogs_corpus_v1.01/supreme.conversations.txt") as f:
    text = f.read()



In [7]:

    
text.split("\n")[1].split(" +++$+++ ")









    Out[7]:





['02-1472',
 '4',
 'FALSE',
 'MR. MILLER',
 'NOT JUSTICE',
 'NA',
 'PETITIONER',
 'Justice Stevens, and may it please the Court: These two contract cases concern whether the Government is liable in money damages under the Contract Disputes Act and section 110 of the Indian Self-Determination Act when the Secretary fails to fully pay a contract price for the --']



In [8]:

    
len(text.split("\n"))









    Out[8]:





51499



In [9]:

    
corpus_df = pd.DataFrame(columns=["case_id", "after_previous", "speaker", "is_justice", "justice_vote", "presentation_side", "utterance"])
corpus_df









    Out[9]:






  
    
      
      case_id
      after_previous
      speaker
      is_justice
      justice_vote
      presentation_side
      utterance



In [10]:

    
for line in text.split("\n"):
    values = line.split(" +++$+++ ")
    if values is not '':
        line_dict = {"case_id" : values[0], "after_previous" : values[2], "speaker" : values[3], "is_justice" : values[4], "justice_vote" : values[5], "presentation_side" : values[6], "utterance" : values[7] } 
        line_series = pd.Series(line_dict, name=values[1])
        corpus_df = corpus_df.append(line_series)
        if len(corpus_df) % 1000 is 0:
            print(len(corpus_df), len(corpus_df) / len(text.split("\n")) * 100, "% ")









    



1000 1.941785277384027 % 
2000 3.883570554768054 % 
3000 5.825355832152081 % 
4000 7.767141109536108 % 
5000 9.708926386920135 % 
6000 11.650711664304161 % 
7000 13.592496941688188 % 
8000 15.534282219072216 % 
9000 17.476067496456242 % 
10000 19.41785277384027 % 
11000 21.359638051224294 % 
12000 23.301423328608323 % 
13000 25.24320860599235 % 
14000 27.184993883376375 % 
15000 29.126779160760403 % 
16000 31.06856443814443 % 
17000 33.010349715528456 % 
18000 34.952134992912484 % 
19000 36.89392027029651 % 
20000 38.83570554768054 % 
21000 40.77749082506457 % 
22000 42.71927610244859 % 
23000 44.66106137983262 % 
24000 46.602846657216645 % 
25000 48.54463193460067 % 
26000 50.4864172119847 % 
27000 52.42820248936872 % 
28000 54.36998776675275 % 
29000 56.31177304413678 % 
30000 58.253558321520806 % 
31000 60.19534359890484 % 
32000 62.13712887628886 % 
33000 64.07891415367288 % 
34000 66.02069943105691 % 
35000 67.96248470844094 % 
36000 69.90426998582497 % 
37000 71.846055263209 % 
38000 73.78784054059302 % 
39000 75.72962581797705 % 
40000 77.67141109536108 % 
41000 79.6131963727451 % 
42000 81.55498165012914 % 
43000 83.49676692751315 % 
44000 85.43855220489718 % 
45000 87.3803374822812 % 
46000 89.32212275966523 % 
47000 91.26390803704926 % 
48000 93.20569331443329 % 
49000 95.14747859181732 % 
50000 97.08926386920135 % 
51000 99.03104914658537 % 






    



---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-10-20c704fcbf40> in <module>()
      2     values = line.split(" +++$+++ ")
      3     if values is not '':
----> 4         line_dict = {"case_id" : values[0], "after_previous" : values[2], "speaker" : values[3], "is_justice" : values[4], "justice_vote" : values[5], "presentation_side" : values[6], "utterance" : values[7] }
      5         line_series = pd.Series(line_dict, name=values[1])
      6         corpus_df = corpus_df.append(line_series)

IndexError: list index out of range



In [14]:

    
corpus_df[:5]









    Out[14]:






  
    
      
      case_id
      after_previous
      speaker
      is_justice
      justice_vote
      presentation_side
      utterance
    
  
  
    
      2
      02-1472
      FALSE
      JUSTICE STEVENS
      JUSTICE
      PETITIONER
      
      We will now hear argument in the Cherokee Nati...
    
    
      4
      02-1472
      FALSE
      MR. MILLER
      NOT JUSTICE
      NA
      PETITIONER
      Justice Stevens, and may it please the Court: ...
    
    
      5
      02-1472
      TRUE
      JUSTICE O'CONNOR
      JUSTICE
      PETITIONER
      PETITIONER
      Would you mind explaining to us how these two ...
    
    
      6
      02-1472
      TRUE
      MR. MILLER
      NOT JUSTICE
      NA
      PETITIONER
      No, Justice O'Connor. They're -- they're not o...
    
    
      7
      02-1472
      TRUE
      JUSTICE O'CONNOR
      JUSTICE
      PETITIONER
      PETITIONER
      But they're certainly at odds on the legal the...



In [ ]:



In [ ]:

	case_id	after_previous	speaker	is_justice	justice_vote	presentation_side	utterance
2	02-1472	FALSE	JUSTICE STEVENS	JUSTICE	PETITIONER		We will now hear argument in the Cherokee Nati...
4	02-1472	FALSE	MR. MILLER	NOT JUSTICE	NA	PETITIONER	Justice Stevens, and may it please the Court: ...
5	02-1472	TRUE	JUSTICE O'CONNOR	JUSTICE	PETITIONER	PETITIONER	Would you mind explaining to us how these two ...
6	02-1472	TRUE	MR. MILLER	NOT JUSTICE	NA	PETITIONER	No, Justice O'Connor. They're -- they're not o...
7	02-1472	TRUE	JUSTICE O'CONNOR	JUSTICE	PETITIONER	PETITIONER	But they're certainly at odds on the legal the...