In [2]:
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
CORPUS_ROOT = r"\Users\howkh\Desktop\Data Science\Final Project\judgments"
CAT_FILE = CORPUS_ROOT + r"\cats.txt"
judgments = CategorizedPlaintextCorpusReader(root=CORPUS_ROOT, fileids='.*\\d+', cat_file=CAT_FILE, encoding='utf-8')
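# Sanity-check sketch (not part of the original run): cats.txt is assumed to hold
# one line per document, the fileid followed by its whitespace-delimited categories.
# Peek at the first few mapping lines and confirm the reader resolves them.
with open(CAT_FILE, encoding='utf-8') as f:
    for line in list(f)[:3]:
        print(line.rstrip())
sample_id = judgments.fileids()[0]
print(sample_id, judgments.categories(sample_id))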
In [2]:
len(judgments.fileids())
Out[2]:
In [3]:
len(judgments.categories())
Out[3]:
In [4]:
from collections import Counter
def get_top_categories(num_categories):
    # Return the num_categories most frequent categories across the whole corpus.
    category_counter = []
    for fileid in judgments.fileids():
        category_counter.extend(judgments.categories(fileid))
    return [pair[0] for pair in Counter(category_counter).most_common(num_categories)]
In [5]:
NUM_CATEGORIES = 10
selected_categories = get_top_categories(NUM_CATEGORIES)
selected_categories
Out[5]:
In [6]:
train_docs_ids = [
    fileid for fileid in judgments.fileids() if fileid.startswith('training')
    # and any(category in judgments.categories(fileid) for category in selected_categories)
]
len(train_docs_ids)
Out[6]:
In [7]:
test_docs_ids = [
    fileid for fileid in judgments.fileids() if fileid.startswith('test')
    # and any(category in judgments.categories(fileid) for category in selected_categories)
]
len(test_docs_ids)
Out[7]:
In [8]:
selected_docs_ids = [
    fileid for fileid in judgments.fileids()
    if any(category in judgments.categories(fileid) for category in selected_categories)
]
len(selected_docs_ids)
Out[8]:
In [9]:
import numpy as np
# One-hot multi-label matrices: one row per document, one column per selected category.
train_labels = np.zeros((len(train_docs_ids), NUM_CATEGORIES))
test_labels = np.zeros((len(test_docs_ids), NUM_CATEGORIES))
for idx, docid in enumerate(train_docs_ids):
    for category in judgments.categories(docid):
        if category in selected_categories:
            train_labels[idx, selected_categories.index(category)] = 1
for idx, docid in enumerate(test_docs_ids):
    for category in judgments.categories(docid):
        if category in selected_categories:
            test_labels[idx, selected_categories.index(category)] = 1
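# Cross-check sketch: the same top-10 matrices can be built with scikit-learn's
# MultiLabelBinarizer by fixing `classes` to selected_categories; labels outside
# the top 10 are then ignored (with a warning), and the column order matches the
# manual matrices above (up to dtype). Variable names here are illustrative only.
from sklearn.preprocessing import MultiLabelBinarizer
mlb_top = MultiLabelBinarizer(classes=selected_categories)
train_labels_top = mlb_top.fit_transform(
    [judgments.categories(doc_id) for doc_id in train_docs_ids])
test_labels_top = mlb_top.transform(
    [judgments.categories(doc_id) for doc_id in test_docs_ids])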
In [10]:
len(train_labels)
Out[10]:
In [11]:
len(test_labels)
Out[11]:
In [12]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))  # renamed so the nltk `stopwords` module is not shadowed
extra_stop_words = set("would also may could whether".split())
lemmatizer = WordNetLemmatizer()

def get_tokens(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [t for t in tokens if t not in extra_stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)]
    return tokens
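# Quick smoke test of the tokenizer on a made-up sentence; the exact output depends
# on the installed NLTK data, but should look roughly like ['judge', 'allow', 'appeal'].
get_tokens("The judges would not allow the appeals in 2009.")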
In [12]:
train_docs = [judgments.raw(doc_id) for doc_id in train_docs_ids]
test_docs = [judgments.raw(doc_id) for doc_id in test_docs_ids]
In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# Fit the TF-IDF vocabulary on the training documents, then transform both splits with it
vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=get_tokens)
vectorized_train_docs = vectorizer.fit_transform(train_docs)
vectorized_test_docs = vectorizer.transform(test_docs)
In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer
# Binarize the multi-label targets: one column per category seen in the training set
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([judgments.categories(doc_id) for doc_id in train_docs_ids])
test_labels = mlb.transform([judgments.categories(doc_id) for doc_id in test_docs_ids])
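# The columns of these label matrices follow mlb.classes_ (every category seen in
# the training set, not just the top 10), so check the shapes before fitting.
print(len(mlb.classes_))
print(train_labels.shape, test_labels.shape)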
In [53]:
# One-vs-rest linear SVM on the TF-IDF features
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorized_train_docs, train_labels)
Out[53]:
In [54]:
predictions = classifier.predict(vectorized_test_docs)
predictions.shape
Out[54]:
In [55]:
# Evaluation
# Micro-average: every (document, label) assignment counts equally, so frequent categories dominate the aggregate score.
# Macro-average: each category's score is computed independently and then averaged, so all categories carry equal weight regardless of size.
# Top 20 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [26]:
# Evaluation
# Top 15 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [36]:
# Evaluation
# Top 10 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [41]:
# Evaluation
# Top 7 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [31]:
# Evaluation
# Top 5 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [48]:
# Evaluation
# Top 3 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [56]:
train_docs_ids[0]
Out[56]:
In [13]:
import gensim
from gensim.models.doc2vec import TaggedDocument
assert gensim.models.doc2vec.FAST_VERSION > -1
train_tagged_docs = [TaggedDocument(get_tokens(judgments.raw(fileid)), [fileid.split("/")[1]]) for fileid in train_docs_ids]
In [14]:
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=3, workers=4, window=8, alpha=0.025, min_alpha=0.025)
model.build_vocab(train_tagged_docs)
# Old-style gensim training: 10 passes with a manually decayed learning rate.
for epoch in range(10):
    model.train(train_tagged_docs)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
model.save('trained.doc2vec')
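# The cell above uses the pre-1.0 gensim API (size=, train() without an epoch count,
# manual alpha decay). A roughly equivalent setup under gensim 4.x would be:
from gensim.models.doc2vec import Doc2Vec
model = Doc2Vec(vector_size=50, min_count=3, workers=4, window=8)
model.build_vocab(train_tagged_docs)
model.train(train_tagged_docs, total_examples=model.corpus_count, epochs=10)
model.save('trained.doc2vec')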
In [1]:
import gensim
model = gensim.models.doc2vec.Doc2Vec.load('trained.doc2vec')
In [15]:
model.docvecs[0].shape
Out[15]:
In [16]:
test_doc = model.infer_vector(get_tokens(judgments.raw('test/22821')))
In [17]:
test_doc.shape
Out[17]:
In [18]:
model.docvecs.most_similar([test_doc])
Out[18]:
In [20]:
import pandas as pd
df = pd.read_csv('judgments/data.tsv', sep='\t', encoding='utf-8')
# Look up the metadata rows of the most similar training judgments by their doc tags
df[df['index'].isin([tag for tag, similarity in model.docvecs.most_similar([test_doc])])]
Out[20]:
In [21]:
test_doc2vecs = [model.infer_vector(get_tokens(judgments.raw(docid))) for docid in test_docs_ids]
len(test_doc2vecs)
Out[21]:
In [22]:
test_doc2vecs = np.array(test_doc2vecs)
In [23]:
training_doc2vecs = model.docvecs.doctag_syn0
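# doctag_syn0 is the old gensim attribute for the matrix of trained document vectors
# (gensim >= 4 exposes it as model.dv.vectors). An alternative that makes the row
# order explicit is to look the vectors up by the tags assigned earlier:
training_doc2vecs_by_tag = np.array(
    [model.docvecs[fileid.split("/")[1]] for fileid in train_docs_ids])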
In [25]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# Same one-vs-rest linear SVM, now trained on the doc2vec features
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(training_doc2vecs, train_labels)
Out[25]:
In [27]:
predictions = classifier.predict(test_doc2vecs)
predictions.shape
Out[27]:
In [105]:
# Doc2Vec, 300-dimensional vectors
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [28]:
# Doc2Vec, 50-dimensional vectors
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [13]:
import gensim
import pyLDAvis.gensim
tokenized_docs = [get_tokens(judgments.raw(docid)) for docid in selected_docs_ids]
In [14]:
corpus_vocab = gensim.corpora.Dictionary(tokenized_docs)
len(corpus_vocab)
Out[14]:
In [15]:
bow_corpus = [corpus_vocab.doc2bow(doc) for doc in tokenized_docs]
lda_model = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics=NUM_CATEGORIES, id2word=corpus_vocab, passes=15)
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, corpus_vocab, sort_topics=False)
pyLDAvis.display(lda_display)
Out[15]:
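# Text-only summary alongside the interactive pyLDAvis view: top terms per topic.
for topic_id, terms in lda_model.show_topics(num_topics=NUM_CATEGORIES, num_words=8, formatted=False):
    print(topic_id, [term for term, weight in terms])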
In [20]:
import pickle
corpus_vocab.save('top-10-categories.dictionary')
with open("top-10-bow_corpus.list", "wb") as f:
    pickle.dump(bow_corpus, f)
In [21]:
lda_model.save('10-topics.ldamodel')
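# Reload sketch for a later session, using the artefacts saved above.
import pickle
import gensim
corpus_vocab = gensim.corpora.Dictionary.load('top-10-categories.dictionary')
with open('top-10-bow_corpus.list', 'rb') as f:
    bow_corpus = pickle.load(f)
lda_model = gensim.models.ldamodel.LdaModel.load('10-topics.ldamodel')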
In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
selected_docs = [judgments.raw(docid) for docid in selected_docs_ids]
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=get_tokens)
tfidf_vectors = tfidf_vectorizer.fit_transform(selected_docs)
In [27]:
import gensim
w2v_model = gensim.models.word2vec.Word2Vec(tokenized_docs, size=100)
In [ ]:
w2v_model.save('top-10-w2vmodel')
In [31]:
w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))
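# wv.index2word / wv.syn0 are likewise pre-4.0 attribute names; under gensim 4.x the
# same word-to-vector mapping would be built as:
# w2v = dict(zip(w2v_model.wv.index_to_key, w2v_model.wv.vectors))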
In [33]:
selected_train_docs_ids = [docid for docid in selected_docs_ids if docid.startswith('training')]
selected_test_docs_ids = [docid for docid in selected_docs_ids if docid.startswith('test')]
In [34]:
from collections import Counter
# Number of training documents per category
c = Counter()
for docid in selected_train_docs_ids:
    c.update(judgments.categories(docid))
In [40]:
# Histogram: how many categories each training document carries
c1 = Counter()
for docid in selected_train_docs_ids:
    c1.update([len(judgments.categories(docid))])
c1
Out[40]:
In [ ]: