In [1]:
import pandas as pd
In [2]:
from IPython.core.display import HTML
HTML('<style>{}</style>'.format(open('style-notebook.css').read()))
Out[2]:
In [3]:
col_names = ['index', 'name', 'citation', 'author', 'number', 'date', 'court', 'coram', 'counsel', 'catchwords']
df = pd.read_table('raw.tsv', encoding='utf-8', header=None, names=col_names, index_col=0, parse_dates=True)
In [4]:
df.head()
Out[4]:
In [5]:
df.describe()
Out[5]:
There are only 5391 unique citations when there are 5399 judgments in the data set. This suggests that there may be duplicate entries.
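As a quick cross-check (a sketch, not one of the cells below, which instead list the duplicated rows), the two figures can be compared directly:

df.citation.nunique(), df.citation.duplicated().sum()  # (distinct citations, rows repeating an earlier citation)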
In [6]:
df[df.duplicated('citation')]
Out[6]:
In [7]:
df[df.citation.str.contains('\[2014\] SGHCR 4')]
Out[7]:
We drop the duplicate rows from the data set.
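Since the find-then-drop pattern repeats over the next several cells, a small helper along these lines could guard each drop (a hypothetical sketch, not used in the notebook; drop_duplicate_citation is an invented name):

def drop_duplicate_citation(df, idx):
    # Only drop row idx if at least one other row shares its citation.
    citation = df.loc[idx, 'citation']
    assert (df.citation == citation).sum() > 1, 'row is not a duplicate citation'
    return df.drop(idx)

# e.g. df = drop_duplicate_citation(df, 15509)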
In [8]:
df = df.drop(15509)
In [9]:
df[df.citation.str.contains('\[2014\] SGHC 192')]
Out[9]:
In [10]:
df = df.drop(15746)
In [11]:
df[df.citation.str.contains('\[2014\] SGHC 207')]
Out[11]:
In [12]:
df = df.drop(15751)
In [13]:
df[df.citation.str.contains('\[2014\] SGHC 213')]
Out[13]:
In [14]:
df = df.drop(15755)
In [15]:
df[df.citation.str.contains('\[2014\] SGHC 262')]
Out[15]:
These are not duplicates but are instead different entries with identical citations.
In [16]:
df[df.citation.str.contains('\[2014\] SGHC 242')]
Out[16]:
In [17]:
df[df.name.str.contains('Ong Kian Hoy')]
Out[17]:
A check on http://commonlii.org shows that the citation for judgment 15800 should be [2014] SGHC 242. It turns out there is already an identical judgment, 15814, with the correct citation in the data set, so 15800 is dropped in favour of 15814.
In [18]:
df = df.drop(15800)
In [19]:
df[df.citation.str.contains('\[2015\] SGHC 134')]
Out[19]:
In [20]:
df = df.drop(16030)
In [21]:
df[df.citation.str.contains('\[2015\] SGCA 59')]
Out[21]:
In [22]:
df = df.drop(18274)
In [23]:
df[df.citation.str.contains('\[2015\] SGCA 60')]
Out[23]:
In [24]:
df = df.drop(18283)
A quick way to check whether any values in the citation column are in the wrong format is to split them.
In [25]:
cits = df.citation.str.split(';', expand=True)
cits.columns = ['neutral', 'slr']
cits.head(20)
Out[25]:
In [26]:
swap_idx = cits.slr.notnull()
cits.loc[swap_idx, ['neutral', 'slr']] = cits.loc[swap_idx, ['slr', 'neutral']].values
cits.head(20)
Out[26]:
Check whether the neutral citation has the right format.
In [27]:
cits[cits.neutral.str.split().str.len()!=3]
Out[27]:
Judgment 15265 did not split correctly because its citation is missing a space.
In [28]:
df.loc[15265, 'citation'] = '[2013] SGHC 115'
In [29]:
df.loc[[17912, 17913]]
Out[29]:
In [30]:
df[df.name.str.contains('ABJ')]
Out[30]:
Judgments 17912 and 17913 appear to be erroneous records, so they are dropped.
In [31]:
df = df.drop([17912, 17913])
Quick check of the different components of the neutral citation.
In [32]:
for i in range(3):
    print(cits.neutral.str.split(expand=True)[i].value_counts())
We know that the highest number of cases in a year was 427 (in 2010), so we shouldn't be seeing 4-digit numbers in the final part of the neutral citation.
In [33]:
cits[cits.neutral.str.split(expand=True)[2].str.len() > 3]
Out[33]:
In [34]:
df.loc[[14786]]
Out[34]:
In [35]:
df[df.name.str.contains('Erin Brooke')]
Out[35]:
In [36]:
df = df.drop(14786)
In [37]:
df.loc[[17915]]
Out[37]:
In [38]:
df[df.name.str.contains('Ashik bin Aris')]
Out[38]:
In [39]:
df = df.drop(17915)
In [40]:
df.loc[[17916]]
Out[40]:
In [41]:
df[df.name.str.contains('Ferrero SPA')]
Out[41]:
In [42]:
df = df.drop(17916)
In [43]:
df.loc[[17921]]
Out[43]:
In [44]:
df[df.number.str.contains('591 of 2011')]
Out[44]:
In [45]:
df = df.drop(17921)
In [46]:
df.loc[[18026]]
Out[46]:
In [47]:
df[df.name.str.contains('Woo Kah Wai')]
Out[47]:
In [48]:
df = df.drop(18026)
In [49]:
df.loc[[18039]]
Out[49]:
In [50]:
df[df.name.str.contains('Muthukumaran s/o Varthan')]
Out[50]:
In [51]:
df = df.drop(18039)
In [52]:
df.loc[[18040]]
Out[52]:
In [53]:
df[df.name.str.contains('Grains and Industrial')]
Out[53]:
In [54]:
df = df.drop(18040)
In [55]:
df.loc[[18144]]
Out[55]:
In [56]:
df[df.name.str.contains('Malini Ventura')]
Out[56]:
In [57]:
df = df.drop(18144)
In [58]:
df.describe()
Out[58]:
In [59]:
len(df[df.author.isnull()])
Out[59]:
In [60]:
len(df[df.catchwords.isnull()])
Out[60]:
There are 5382 judgments in the dataset. The 'author' column has 888 missing values and the 'catchwords' column has 104 missing values.
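The same counts can be read off in one call (a sketch equivalent to the two cells above):

df[['author', 'catchwords']].isnull().sum()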
In [61]:
df.info()
In [62]:
try:
    df.date = pd.to_datetime(df.date)
except Exception as e:
    print(e)
Converting the date column into datetime raises an error, so we check whether the dates are in the expected day, month and year format.
In [63]:
df[df.date.str.split().str.len() != 3]
Out[63]:
The day and month for the date column in 15157 are joined together. This is fixed by adding the missing space.
In [64]:
df.loc[15157, 'date'] = '11 March 2013'
In [65]:
try:
    df.date = pd.to_datetime(df.date)
except Exception as e:
    print(e)
Attempting to convert the date column into datetime still raises the same error, so we split the date values and check the day, month and year components.
In [66]:
dates = df.date.str.split(' ', expand=True)
In [67]:
dates[0].value_counts()
Out[67]:
The day values appear to be fine.
In [68]:
dates[1].value_counts()
Out[68]:
For the month values, we see that there are 4 instances where February was misspelled as Febuary.
In [69]:
dates[2].value_counts()
Out[69]:
For the year values, there are 6 instances where there is a trailing dot.
In [70]:
df.date = df.date.str.replace('Febuary', 'February').str.strip('.')
The errors are fixed by correcting the misspelled months and removing the trailing dots.
In [71]:
try:
    df.date = pd.to_datetime(df.date)
except Exception as e:
    print(e)
The conversion is successful this time.
In [72]:
df.date.describe()
Out[72]:
In [73]:
%matplotlib inline
df[df.court.str.contains('Appeal')].groupby(df.date.dt.year).size().plot(kind='bar')
Out[73]:
In [74]:
df[df.court.str.contains('High')].groupby(df.date.dt.year).size().plot(kind='bar')
Out[74]:
In [75]:
df[df.court.str.contains('Appeal|High')].groupby([df.court, df.date.dt.year]).size().unstack('court').plot(grid=True)
Out[75]:
In [76]:
df.court.value_counts()
Out[76]:
There is one inconsistent value which shows 'CA/Court of Appeal' instead of the usual 'Court of Appeal'. This is corrected.
In [77]:
df.court = df.court.str.replace('CA/', '')
df.court.value_counts()
Out[77]:
We check the coram column to ensure that the data is clean. As there is a sizeable number of missing values in the author column, it may be useful to fill in these missing values from the coram values where possible.
In [78]:
df.coram.describe()
Out[78]:
In [79]:
df[df.coram.str.contains('and')].head()
Out[79]:
In [80]:
df[df.coram.str.contains(';')].head()
Out[80]:
In [81]:
df[df.coram.str.contains(',')].head()
Out[81]:
It appears that there is no one consistent delimiter for all cases. The delimiter could be a comma, semicolon, or the word 'and'.
The coram values are first split using the delimiters: comma, semicolon, or the word 'and'. The results are then combined into a single dataframe for review.
In [82]:
judges = pd.DataFrame([judge for judge_list in df.coram.str.split(
    '\s*,\s*|\s*and\s*|\s*;\s*').tolist() for judge in judge_list])
If the split was done correctly, the judicial titles such as 'J', 'JA', or 'JC' should appear at the end of each judge name.
In [83]:
try:
    judges[0].str.split().str[-1].value_counts()
except Exception as e:
    print(e)
However, attempting to split each value into 'name' and 'title' (on whitespace) raises a 'list index out of range' error when the last element is accessed, which means there are empty strings in the series.
In [84]:
len(judges[judges[0]==''])
Out[84]:
There are 5 instances of empty strings, which means there are delimiters in unexpected places (beginning or end of string, or next to one another).
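One way to surface all of these at once (a sketch covering leading, trailing and doubled commas or semicolons; the cells below check each case separately) would be:

df[df.coram.str.contains(r'^\s*[;,]|[;,]\s*$|[;,]\s*[;,]', na=False)]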
In [85]:
df[df.coram.str.startswith(';', na=False) | df.coram.str.endswith(';', na=False)]
Out[85]:
There is one instance of semicolons incorrectly appearing at the beginning and/or end of the judge's name. This is fixed by stripping the semicolons.
In [86]:
df['coram']= df.coram.str.strip('; ')
df.loc[[15878]]
Out[86]:
In [87]:
df[df.coram.str.contains(', and', na=False)]
Out[87]:
There are two instances of a comma appearing just before the word 'and'. This is fixed by replacing them with a semicolon.
In [88]:
df.coram = df.coram.str.replace(', and', ';')
df.loc[[15319, 15900]]
Out[88]:
In [89]:
df[df.coram.str.contains('; and', na=False)]
Out[89]:
There are two instances of a semicolon appearing just before the word 'and'. This is fixed by removing the redundant 'and'.
In [90]:
df.coram = df.coram.str.replace('; and', ';')
After fixing the incorrectly placed delimiters, the split is attempted again to extract the individual judge names and judicial titles.
In [91]:
judges = pd.DataFrame([judge for judge_list in df.coram.str.split(
    ',\s*|\s*and\s*|\s*;\s').tolist() for judge in judge_list])
In [92]:
judges[0].str.split().str[-1].value_counts()
Out[92]:
The split is successful for the majority of cases. However, there are 23 instances where the judge's name and title were not split correctly because of the additional words '(as he then was)' added to the judge's name. A number of other names were also split incorrectly due to commas appearing within the names.
In [93]:
df[df.coram.str.contains('was\)') | df.coram.str.contains('Yi-Ling,') | df.coram.str.contains('Sern,')].head()
Out[93]:
This is fixed by removing the words '(as he then was)' and the commas from the affected names.
In [94]:
df.coram = df.coram.str.replace(' \(as he then was\)', '').str.replace('Yi-Ling,','Yi-Ling').str.replace('Sern,', 'Sern')
Some names and judicial titles were separated by a semicolon instead of a space. This is fixed by restoring the separating space.
In [95]:
df.coram = df.coram.str.replace('Loh;J', 'Loh J')
There is one instance where the words 'plaintiff' and 'defendant.' unexpectedly appear in the coram column.
In [96]:
df[df.coram.str.contains('plaintiff|defendant')]
Out[96]:
It turns out that the values for the coram, counsel and catchwords columns were misaligned. This is fixed by shifting the values to their correct positions.
In [97]:
df.loc[[14912], ['coram', 'counsel', 'catchwords']] = df.loc[[14912], ['coram', 'counsel', 'catchwords']].shift(1, axis=1)
df.loc[[14912]]
Out[97]:
To fill in the missing values in the 'author' and 'coram' columns, the judge's name was obtained from http://commonlii.org/sg/cases/SGHC/2012/
In [98]:
df.loc[14912,['author','coram']] = 'Lai Siu Chiu J'
The handful of remaining cases that did not split correctly into 'name' and 'title' format are eyeballed to identify the errors or inconsistencies. We first check whether the judicial titles are missing from the coram values by checking the end-of-string values.
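A more general check (a sketch, assuming the valid judicial titles are limited to J, JA, JC, CJ and AR; the set would need extending if other titles appear in the data) is to flag every coram entry containing a name that does not end in a known title:

import re

titles = {'J', 'JA', 'JC', 'CJ', 'AR'}  # assumed set of valid judicial titles

def has_untitled_judge(coram):
    # Split on the same delimiters used above and flag any name whose last token is not a title.
    names = [n for n in re.split(r'\s*[;,]\s*|\s+and\s+', coram) if n]
    return any(n.split()[-1] not in titles for n in names)

df[df.coram.fillna('').apply(has_untitled_judge)]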
In [99]:
df[df.coram.str.contains('Teck$|Elaine$|Ping$|Lee$|Peng$|Chua$|Boon$|Tung$|Abdullah$|Lionel$')]
Out[99]:
The coram values listed above are either missing the judicial title or have it placed before the name. This is fixed by placing the judicial titles after the names.
In [100]:
df.loc[14695, ['author', 'coram']] = 'Leo Zhen Wei Lionel AR'
df.loc[14967, ['author', 'coram']] = df.loc[14988, ['author', 'coram']] = 'Chew Yi-Ling Elaine AR'
df.loc[14977, ['author', 'coram']] = 'Eunice Chua AR'
df.loc[15146, ['author', 'coram']] = 'Chee Min Ping AR'
df.loc[15199, 'coram'] = df.loc[17937, 'coram'] = 'Choo Han Teck J'
df.loc[15954, ['author', 'coram']] = 'Hoo Sheau Peng JC'
df.loc[15955, ['author', 'coram']] = 'Aedit Abdullah JC'
df.loc[17969, ['author', 'coram']] = 'Amy Tung AR'
df.loc[18212, ['author', 'coram']] = 'James Elisha Lee AR'
The incorrectly split 'Boon' noted above did not turn up in this end-of-string check, so we check the other possibility: a misplaced semicolon immediately after the word 'Boon'.
In [101]:
df[df.coram.str.contains('Boon;')]
Out[101]:
We find the instances where a misplaced semicolon appears in place of a space (the semicolons are likely a side effect of preprocessor.py, which replaces '\n' characters with ';' when parsing the html so that the catchwords are delimited correctly). The semicolons are replaced with a space.
In [102]:
df.coram = df.coram.str.replace('Boon;\s*', 'Boon ')
Finally for consistency, all comma and 'and' delimiters throughout the coram column are replaced with semicolons.
In [103]:
df.coram = df.coram.str.replace(',| and', ';')
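As a quick sanity check (a sketch, not part of the original run), we could confirm that no comma or ' and ' delimiters remain:

df.coram.str.contains(',| and ', na=False).sum()  # expected to be 0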
It may be useful to write a function that gets the judge names since this code is repeated.
In [104]:
def get_judges(df):
    judges = pd.DataFrame([judge for judge_list in df.coram.str.split('; ').tolist() for judge in judge_list])
    judges.drop_duplicates(inplace=True)
    judges = judges[0].str.rsplit(' ', expand=True, n=1)
    judges.columns = ['name', 'title']
    return judges
After fixing the names and using a consistent delimiter, the split is attempted again to extract the judge names and titles.
In [105]:
judges = get_judges(df)
judges.title.value_counts()
Out[105]:
The judge names and judicial titles appear to be splitting correctly now. The next step is to verify that the names are correct and consistent.
In [106]:
judges[judges.name.str.match('^A|B')].sort_values(by='name')
Out[106]:
In [107]:
judges[judges.name.str.match('^C|D|E')].sort_values(by='name')
Out[107]:
In [108]:
judges[judges.name.str.match('^F|G|H|I|J')].sort_values(by='name')
Out[108]:
In [109]:
judges[judges.name.str.match('^K|L|M|N')].sort_values(by='name')
Out[109]:
In [110]:
judges[judges.name.str.match('^P|Q|R|S')].sort_values(by='name')
Out[110]:
In [111]:
judges[judges.name.str.match('^T|U|V|W|X|Y|Z')].sort_values(by='name')
Out[111]:
Errors (missing spaces) are fixed and some names are changed for consistency (e.g. 'Quentin Loh Sze-On' to 'Quentin Loh').
In [112]:
df.coram = df.coram.str.replace('BoonLeong', 'Boon Leong').str.replace('AndrewAng', 'Andrew Ang')
df.coram = df.coram.str.replace('AndrewPhang', 'Andrew Phang').str.replace('ChanSeng', 'Chan Seng').str.replace('Chao;', 'Chao ')
df.coram = df.coram.str.replace('George;?Wei', 'George Wei').str.replace('Judith;', 'Judith ').str.replace('Lai;', 'Lai ')
df.coram = df.coram.str.replace('LeeKim', 'Lee Kim').str.replace('LeeSeiu', 'Lee Seiu')
df.coram = df.coram.str.replace('Lionel;?Yee', 'Lionel Yee').str.replace(' Sze-On', '').str.replace(' Li Shiong', '')
df.coram = df.coram.str.replace('CJAndrew', 'CJ; Andrew').str.replace('SiongThye', 'Siong Thye')
df.coram = df.coram.str.replace('TayYong', 'Tay Yong').str.replace('V;?K', 'V K').str.replace('WooBih', 'Woo Bih')
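The same corrections could also be driven by a single mapping, which keeps the list of known typos easier to maintain (a sketch showing a hypothetical subset of the fixes above):

coram_fixes = {
    'BoonLeong': 'Boon Leong',  # missing space
    'AndrewAng': 'Andrew Ang',  # missing space
    ' Sze-On': '',              # drop suffix for consistency
}
for wrong, right in coram_fixes.items():
    df.coram = df.coram.str.replace(wrong, right)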
In [113]:
judges = get_judges(df)
judges.name.value_counts()
Out[113]:
The coram lists will need to be sorted so that we can compare them across different cases.
In [114]:
df.coram = df.coram.str.split('; ').apply(lambda x: "; ".join(sorted(x)))
In [115]:
df.coram.value_counts().head(20)
Out[115]:
Check whether any Court of Appeal judgment has only one judge in the coram column, as we expect to see more than one judge.
In [116]:
df[(~df.coram.str.contains(';')) & (df.court=='Court of Appeal')]
Out[116]:
Checking the html file shows that judgment 21949 was in fact heard by two judges, so its author and coram values are corrected.
In [117]:
df.loc[21949, 'author'] = 'Chao Hick Tin JA'
df.loc[21949, 'coram'] = 'Chao Hick Tin JA; Tan Lee Meng J'
Next, we check for Court of Appeal cases with only two judges in the coram.
In [118]:
df[~(df.coram.str.contains(';.+;')) & (df.court=='Court of Appeal')]
Out[118]:
Check how many CA and HC judgments have a missing value in the author column.
In [119]:
df[df.author.isnull()].court.value_counts()
Out[119]:
It is possible to fill in the missing values in the 'author' column using the 'coram' values, particularly for cases where there is only one judge in the coram.
In [120]:
df[df.author.isnull() & (~df.coram.str.contains(';'))].head()
Out[120]:
Copy values from coram column into author column if there is only one judge in the coram (that judge must have been the author).
In [121]:
df.loc[df.author.isnull() & (~df.coram.str.contains(';')),'author'] = df.loc[df.author.isnull() & (~df.coram.str.contains(';')),'coram'].values
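The same fill can be written with the mask computed once; because both sides of the assignment use the same boolean index, the values align without needing .values (a sketch equivalent to the cell above):

single_judge = df.author.isnull() & (~df.coram.str.contains(';'))
df.loc[single_judge, 'author'] = df.loc[single_judge, 'coram']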
In [122]:
df[df.author.isnull()].court.value_counts()
Out[122]:
This reduced the number of missing values in High Court judgments from 679 to 11.
In [123]:
df[df.author.isnull()].coram.value_counts()
Out[123]:
If there are multiple judges in the coram, then one of them must be the author.
In [124]:
possible_authors = df[df.author.isnull()].coram.str.split('; ')
possible_authors.head()
Out[124]:
Look in the source html files for the words "delivered by" to find which of the possible authors delivered the judgment.
In [125]:
import bs4
import re

p_tags = bs4.SoupStrainer('p')
for index, row in possible_authors.iteritems():
    soup = bs4.BeautifulSoup(open('html/' + str(index) + '.html', 'r', encoding='utf-8').read(),
                             'lxml', parse_only=p_tags)
    texts = soup(string=re.compile('\xa0'))
    for t in texts:
        text = t.replace('\xa0', ' ')
        t.replace_with(text)
    for judge in row:
        if soup.find(string=re.compile("Delivered by\s+" + judge + "|" + judge + "\s+\(delivering the\s+")):
            df.loc[index, 'author'] = judge
        elif soup.find(string=re.compile("Delivered by")):
            tag = soup.find(string=re.compile("Delivered by"))
            if judge in (tag.next_element.string, tag.next_element.next_element.string):
                df.loc[index, 'author'] = judge
Check how many remaining judgments have a missing value in the author column.
In [126]:
df[df.author.isnull()].court.value_counts()
Out[126]:
The number of missing values in Court of Appeal judgments has dropped from 207 to 161.
Check how many judgments with a missing value in the author column were delivered after 2004.
In [127]:
df[df.author.isnull() & (pd.to_datetime(df.date).dt.year > 2004)]
Out[127]:
Look up the missing values at http://commonlii.org, which has a database of Singapore judgments from 2005 onwards.
In [128]:
df.loc[13540, 'author'] = 'Chan Sek Keong CJ'
df.loc[15284, 'author'] = 'V K Rajah JA'
In [129]:
df[df.author.isnull()].court.value_counts()
Out[129]:
In [130]:
len(df[df.catchwords.isnull()])
Out[130]:
In [131]:
cases = df[df.catchwords.isnull() & (df.date.dt.year > 2012)]
cases
Out[131]:
In [132]:
df.loc[15337, 'catchwords'] = 'Civil Procedure — Foreign Judgments — Reciprocal Enforcement of Commonwealth Judgments Act'
df.loc[15415, 'catchwords'] = 'Civil Procedure — Striking Out'
df.loc[15606, 'catchwords'] = ('Civil Procedure — Jurisdiction;Civil Procedure — Service;Choses in Action — Assignment;'
                               'Conflict of Laws — Natural Forum;Conflict of Laws — Jurisdiction;Insolvency Law — Bankruptcy;'
                               'Res Judicata— Issue Estoppel')
df.loc[15607, 'catchwords'] = 'Administrative Law— Judicial Review'
df.loc[15815, 'catchwords'] = 'Family Law— Custody— Care and Control'
df.loc[15948, 'catchwords'] = 'Criminal law— Offences — Property— Criminal breach of trust'
df.loc[18028, 'catchwords'] = 'Family Law— Matrimonial Assets'
df.loc[18060, 'catchwords'] = 'Arbitration — Arbitrability and public policy;Arbitration — Arbitral tribunal — Competence'
df.loc[18134, 'catchwords'] = 'Injunctions — Interlocutory injunctions'
df.loc[18542, 'catchwords'] = ('Criminal Procedure and Sentencing— Sentencing— Rape;Criminal Procedure and Sentencing— '
                               'Sentencing— Aggravated outrage of modesty;Criminal Procedure and Sentencing— Sentencing— '
                               'Criminal intimidation')
df.loc[18546, 'catchwords'] = 'Unincorporated associations and trade unions — Societies'
df.loc[22600, 'catchwords'] = 'Employment Law — Pay — Recovery'
In [133]:
len(df[df.catchwords.isnull()])
Out[133]:
In [134]:
df.catchwords.describe()
Out[134]:
In [135]:
df.catchwords.head(10)
Out[135]:
In [136]:
df.catchwords.tail(10)
Out[136]:
In [137]:
df.catchwords = df.catchwords.str.lower().str.replace('\s*[—–-]+\s*', '-').str.replace('\s+', '_')
In [138]:
df.catchwords.tail(10)
Out[138]:
In [139]:
df.catchwords.str.split(';', expand=True).head(10)
Out[139]:
In [140]:
import numpy as np
In [141]:
def get_L1_catchwords(df):
    ds = df.catchwords.str.split(';').str[0].str.split('-').str[0].fillna('')
    for i in range(1, 11):
        ds = ds + ';' + df.catchwords.str.split(';').str[i].str.split('-').str[0].fillna('')
    return ds.str.split(';').apply(lambda x: ";".join(sorted(set(filter(None, x))))).replace('', np.nan)
In [142]:
def get_split_counts(ds):
    return pd.Series(
        [word for word_list in ds.fillna('').str.split(';').tolist() for word in word_list]
    ).replace('', np.nan).value_counts()
In [143]:
L1_catchwords = get_L1_catchwords(df)
L1_catchwords.tail(10)
Out[143]:
In [144]:
L1_counts = get_split_counts(get_L1_catchwords(df)).sort_index()
len(L1_counts)
Out[144]:
In [145]:
L1_counts[:20]
Out[145]:
In [146]:
df[df.catchwords.str.contains('_recourse_against_award', na=False)]
Out[146]:
In [147]:
df.loc[14873, 'catchwords'] = 'arbitration-award-additional_award;arbitration-award-recourse_against_award'
In [148]:
df[df.catchwords.str.contains('admiralty', na=False) & (~df.catchwords.str.contains('admiralty_', na=False))]
Out[148]:
In [149]:
df.loc[14774, 'catchwords'] = 'admiralty_and_shipping;conflict_of_laws-forum_non_conveniens'
In [150]:
df[df.catchwords.str.contains(';bail$', na=False)]
Out[150]:
In [151]:
df.loc[14897, 'catchwords'] = 'criminal_procedure_and_sentencing-extradition-bail'
In [152]:
df.catchwords = df.catchwords.str.replace('adminstrative', 'administrative').str.replace('_−', '-').str.replace('_–_', '-')
In [153]:
L1_counts[21:40]
Out[153]:
In [154]:
df[df.catchwords.str.contains('^caveats', na=False)]
Out[154]:
In [155]:
df.loc[14773, 'catchwords'] = 'land-caveats;equity'
In [156]:
df.catchwords = df.catchwords.str.replace('construction_contracts,_contractors’_duties', 'construction_law-contractors’_duties')
df.catchwords = df.catchwords.str.replace('building_and_construction_contracts', 'building_and_construction_law')
df.catchwords = df.catchwords.str.replace('industry_security_of_payment_act', 'law-security_of_payment_act')
df.catchwords = df.catchwords.str.replace('civil_procedue', 'civil_procedure')
In [157]:
df[df.catchwords.str.contains('civil_procedure_', na=False)]
Out[157]:
In [158]:
df[df.catchwords.str.contains('\xad', na=False)]
Out[158]:
In [159]:
df.catchwords = df.catchwords.str.replace('\xad', '').str.replace('_-', '-')
In [160]:
L1_counts[41:60]
Out[160]:
In [161]:
df.catchwords = df.catchwords.str.replace('company_law', 'companies').str.replace('conflicts_of_laws', 'conflict_of_laws')
df.catchwords = df.catchwords.str.replace('constitutional_interpretation', 'constitutional_law')
df.catchwords = df.catchwords.str.replace('conflict_of_laws_forum_non_conveniens', 'conflict_of_laws-forum_non_conveniens')
In [162]:
L1_counts[61:100]
Out[162]:
In [163]:
df[df.catchwords.str.startswith('contracts', na=False)]
Out[163]:
In [164]:
df.loc[14876, 'catchwords'] = 'contract-building_contracts'
In [165]:
df[df.catchwords.str.contains(';damage-', na=False)]
Out[165]:
In [166]:
df.loc[14933, 'catchwords']
Out[166]:
In [167]:
df[df.catchwords.str.contains('criminal_procedure-', na=False)]
Out[167]:
In [168]:
df[df.catchwords.str.contains('criminal_procedure$', na=False)]
Out[168]:
In [169]:
df.loc[15098, 'catchwords'] = 'criminal_procedure_and_sentencing'
In [170]:
df[df.catchwords.str.contains('election_of_remedies', na=False)]
Out[170]:
In [171]:
df.loc[14290, 'catchwords'] = 'civil_procedure-amendment_of_pleadings-election_of_remedies'
In [172]:
df[df.catchwords.str.contains(';estoppel-', na=False)]
Out[172]:
In [173]:
df.loc[18364, 'catchwords']
Out[173]:
In [174]:
df.loc[18364, 'catchwords'] = ('civil_procedure-originating_processes;companies-memorandum_and_articles_of_association-effect;'
                               'companies-accounts;companies-capacity-pre-incorporation_contracts;contract-collateral_contracts;'
                               'contract-consideration;contract-ratification;equity-estoppel-estoppel_by_representation')
In [175]:
df.catchwords = df.catchwords.str.replace('credit_&_security', 'credit_and_security')
df.catchwords = df.catchwords.str.replace('credit_and_securities', 'credit_and_security')
df.catchwords = df.catchwords.str.replace('criminal_procedure-', 'criminal_procedure_and_sentencing-')
df.catchwords = df.catchwords.str.replace(';damage-', ';damages-').str.replace('evidence_limitation', 'evidence-limitation')
df.catchwords = df.catchwords.str.replace('family_law,_insolvency_law', 'family_law;insolvency_law')
In [176]:
L1_counts[101:140]
Out[176]:
In [177]:
df[df.catchwords.str.contains('^injunction-', na=False)]
Out[177]:
In [178]:
df.loc[18122, 'catchwords'] = 'injunctions-interlocutory_injunction;injunctions-springboard_injunction'
In [179]:
df[df.catchwords.str.contains('^insolvency-', na=False)]
Out[179]:
In [180]:
df.catchwords = df.catchwords.str.replace('^insolvency-', 'insolvency_law-')
In [181]:
df[df.catchwords.str.contains('^intellectual_property', na=False)]
Out[181]:
In [182]:
df.loc[18545, 'catchwords']
Out[182]:
In [183]:
df[df.catchwords.str.contains('copyright_infringement', na=False)]
Out[183]:
In [184]:
df[df.catchwords.str.contains('-groundless_threat', na=False)]
Out[184]:
In [185]:
df.loc[18545, 'catchwords'] = ('tort-passing_off;copyright-copyright_infringement;copyright-groundless_threat')
In [186]:
df[df.catchwords.str.contains('interest_of_the_public', na=False)]
Out[186]:
In [187]:
df.loc[17938, 'catchwords'] = 'legal_profession-reinstatement-interest_of_the_public'
In [188]:
df[df.catchwords.str.contains(';international_arbitration_act_\(cap_143a\)', na=False)]
Out[188]:
In [189]:
df.catchwords = df.catchwords.str.replace(';international_arbitration_act_\(cap_143a\)', ';arbitration')
In [190]:
df[df.catchwords.str.contains('^offences', na=False)]
Out[190]:
In [191]:
df.loc[15068, 'catchwords'] = 'criminal_law-offences-property-cheating'
In [192]:
df[df.catchwords.str.contains('^procedure', na=False)]
Out[192]:
In [193]:
df.loc[18090, 'catchwords'] = 'civil_procedure-summary_judgment'
In [194]:
df[df.catchwords.str.contains(';pre', na=False)]
Out[194]:
In [195]:
df.loc[14598, 'catchwords'] = 'civil_procedure-discovery_of_documents-pre-action_discovery'
In [196]:
df[df.catchwords.str.contains('^rape', na=False)]
Out[196]:
In [197]:
df.loc[17993, 'catchwords'] = 'criminal_law-offences-rape'
In [198]:
df[df.catchwords.str.contains(';ratification', na=False)]
Out[198]:
In [199]:
df.catchwords = df.catchwords.str.replace(';ratification', ';agency-ratification')
In [200]:
df.catchwords = df.catchwords.str.replace('land_law', 'land').str.replace('legal_professional', 'legal_profession')
df.catchwords = df.catchwords.str.replace('probate_&_administration', 'probate_and_administration')
In [201]:
L1_counts[141:]
Out[201]:
In [202]:
df[df.catchwords.str.contains(';sale_of_goods', na=False)]
Out[202]:
In [203]:
df.loc[18561, 'catchwords'] = 'contract-misrepresentation;commercial_transactions-sale_of_goods-implied_terms_as_to_quality'
In [204]:
df[df.catchwords.str.contains(';sentencing', na=False)]
Out[204]:
In [205]:
df.loc[18599, 'catchwords'] = 'criminal_procedure_and_sentencing-sentencing'
In [206]:
df[df.catchwords.str.contains('^trade_marks-', na=False)]
Out[206]:
In [207]:
df.loc[18213, 'catchwords'] = 'trade_marks_and_trade_names-infringement-assessment_of_damages-statutory_damages'
In [208]:
df[df.catchwords.str.contains('trademarks', na=False)]
Out[208]:
In [209]:
df.loc[14860, 'catchwords'] = ('trade_marks_and_trade_names-infringement-defence_of_prior_use;'
                               'trade_marks_and_trade_names-infringement-well_known_marks;tort-passing_off-goodwill')
In [210]:
df[df.catchwords.str.contains('unincorporated_associations$', na=False)]
Out[210]:
In [211]:
df.loc[14906, 'catchwords'] = 'civil_procedure-striking_out;unincorporated_associations_and_trade_unions'
In [212]:
df[df.catchwords.str.contains('work_injury_compensation_act', na=False)]
Out[212]:
In [213]:
df.loc[15035, 'catchwords'] = 'employment_law-work_injury_compensation_act'
In [214]:
L1_counts = get_split_counts(get_L1_catchwords(df)).sort_index()
L1_counts.head(20)
Out[214]:
In [215]:
L1_counts[21:40]
Out[215]:
In [216]:
L1_counts[41:60]
Out[216]:
In [217]:
L1_counts[61:]
Out[217]:
In [218]:
len(L1_counts)
Out[218]:
In [219]:
def get_L2_catchwords(df):
    ds = df.catchwords.str.split(';').str[0].str.split('-').str[:2].str.join('-').fillna('')
    for i in range(1, 11):
        ds = ds + ';' + df.catchwords.str.split(';').str[i].str.split('-').str[:2].str.join('-').fillna('')
    return ds.str.split(';').apply(lambda x: ";".join(sorted(set(filter(None, x))))).replace('', np.nan)
In [220]:
L2_counts = get_split_counts(get_L2_catchwords(df)).sort_index()
L2_counts.sort_values(ascending=False).head(20)
Out[220]:
In [221]:
df.describe()
Out[221]:
In [222]:
df.sort_values(by='date').tail(10)
Out[222]:
In [223]:
L1_catchwords = get_L1_catchwords(df)
L1_catchwords.head()
Out[223]:
In [224]:
L1_catchwords.tail()
Out[224]:
In [225]:
L1_catchwords = L1_catchwords.str.replace(';', ' ')
L1_catchwords.tail()
Out[225]:
In [252]:
pd.options.display.max_colwidth = 1000
with open('cats.txt', 'w') as f:
    f.write(
        '\n'.join(' '.join(line.split()) for line in L1_catchwords.to_string(na_rep='uncategorised', header=False).split('\n'))
    )
pd.options.display.max_colwidth = 50