In [2]:
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

CORPUS_ROOT = r"\Users\howkh\Desktop\Data Science\Final Project\judgments"
CAT_FILE = CORPUS_ROOT + r"\cats.txt"
judgments = CategorizedPlaintextCorpusReader(root=CORPUS_ROOT, fileids=r'.*\d+', cat_file=CAT_FILE, encoding='utf-8')
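
In [ ]:
# Sanity-check the category file. Assuming the standard NLTK cats.txt layout,
# each line maps a fileid to its space-separated categories.
with open(CAT_FILE, encoding='utf-8') as f:
    for line in list(f)[:3]:
        print(line.rstrip())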

In [2]:
len(judgments.fileids())


Out[2]:
5303

In [3]:
len(judgments.categories())


Out[3]:
87

In [4]:
from collections import Counter

def get_top_categories(num_categories):
    # Count every category assignment across the corpus and keep the most common
    all_categories = []
    for fileid in judgments.fileids():
        all_categories.extend(judgments.categories(fileid))
    return [pair[0] for pair in Counter(all_categories).most_common(num_categories)]

In [5]:
NUM_CATEGORIES = 10

selected_categories = get_top_categories(NUM_CATEGORIES)
selected_categories


Out[5]:
['civil_procedure',
 'contract',
 'criminal_procedure_and_sentencing',
 'criminal_law',
 'tort',
 'family_law',
 'companies',
 'evidence',
 'land',
 'damages']

In [6]:
train_docs_ids = [
    fileid for fileid in judgments.fileids() if fileid.startswith('training') 
    #and any(category in judgments.categories(fileid) for category in selected_categories)
]
len(train_docs_ids)


Out[6]:
3901

In [7]:
test_docs_ids = [
    fileid for fileid in judgments.fileids() if fileid.startswith('test') 
    #and any(category in judgments.categories(fileid) for category in selected_categories)
]
len(test_docs_ids)


Out[7]:
1301

In [8]:
selected_docs_ids = [
    fileid for fileid in judgments.fileids() if any(category in judgments.categories(fileid) for category in selected_categories)
]
len(selected_docs_ids)


Out[8]:
3841

In [9]:
import numpy as np

# Multi-hot label matrices: one row per document, one column per category
train_labels = np.zeros((len(train_docs_ids), NUM_CATEGORIES))
test_labels = np.zeros((len(test_docs_ids), NUM_CATEGORIES))

for idx, docid in enumerate(train_docs_ids):
    for category in judgments.categories(docid):
        if category in selected_categories:
            train_labels[idx, selected_categories.index(category)] = 1

for idx, docid in enumerate(test_docs_ids):
    for category in judgments.categories(docid):
        if category in selected_categories:
            test_labels[idx, selected_categories.index(category)] = 1

In [10]:
len(train_labels)


Out[10]:
3901

In [11]:
len(test_labels)


Out[11]:
1301

In [12]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))  # avoid shadowing the stopwords module
lemmatizer = WordNetLemmatizer()  # create once, not per token

def get_tokens(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [t for t in tokens if t not in "would also may could whether".split()]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)]
    return tokens
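
In [ ]:
# Quick sanity check of the token pipeline on a made-up sentence; the exact
# output depends on the installed NLTK data, but should resemble
# ['plaintiff', 'claimed', 'damage', 'contract']
get_tokens("The plaintiffs claimed damages of $12,000 under the 2003 contracts")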

In [12]:
train_docs = [judgments.raw(doc_id) for doc_id in train_docs_ids]
test_docs = [judgments.raw(doc_id) for doc_id in test_docs_ids]

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Tokenization, learn and transform training docs, transform test docs
vectorizer = TfidfVectorizer(stop_words=list(stop_words), tokenizer=get_tokens)  # get_tokens already strips stop words; passing them again is harmless
vectorized_train_docs = vectorizer.fit_transform(train_docs)
vectorized_test_docs = vectorizer.transform(test_docs)

In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer

# Binarize the multilabel targets: one indicator column per category
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([judgments.categories(doc_id) for doc_id in train_docs_ids])
test_labels = mlb.transform([judgments.categories(doc_id) for doc_id in test_docs_ids])
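
In [ ]:
# Toy illustration of MultiLabelBinarizer (hypothetical labels): columns are
# the sorted classes ['contract', 'land', 'tort']
demo_mlb = MultiLabelBinarizer()
demo_mlb.fit_transform([['contract', 'tort'], ['tort'], ['land']])
# -> array([[1, 0, 1],
#           [0, 0, 1],
#           [0, 1, 0]])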

In [53]:
# Classifier
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorized_train_docs, train_labels)


Out[53]:
OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [54]:
predictions = classifier.predict(vectorized_test_docs)
predictions.shape


Out[54]:
(1301, 20)

In [55]:
# Evaluation
# Micro-average: every (document, label) assignment carries the same weight, so
# frequent categories dominate the aggregate score.
# Macro-average: the score for each category is computed independently and then
# averaged, so all categories count equally regardless of size.

# Top 20 categories

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.8065, Recall: 0.5544, F1-measure: 0.6571
Macro-average quality numbers
Precision: 0.7501, Recall: 0.4920, F1-measure: 0.5759
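
In [ ]:
# Toy check of the micro/macro distinction (hypothetical labels): label 0 is
# common, label 1 is rare and never predicted (sklearn warns that precision
# is ill-defined for label 1 and scores it 0)
import numpy as np
from sklearn.metrics import f1_score

y_true = np.array([[1, 0], [1, 0], [1, 1]])
y_pred = np.array([[1, 0], [1, 0], [0, 0]])

print(f1_score(y_true, y_pred, average='micro'))  # 2*TP/(2*TP+FP+FN) = 4/6 ~ 0.667
print(f1_score(y_true, y_pred, average='macro'))  # (0.8 + 0.0)/2 = 0.4: the rare label drags it down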

In [26]:
# Evaluation (micro- and macro-averages as defined above)

# Top 15 categories

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.8062, Recall: 0.5663, F1-measure: 0.6652
Macro-average quality numbers
Precision: 0.7434, Recall: 0.5140, F1-measure: 0.5910

In [36]:
# Evaluation (micro- and macro-averages as defined above)

# Top 10 categories

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.8030, Recall: 0.5723, F1-measure: 0.6683
Macro-average quality numbers
Precision: 0.8183, Recall: 0.5475, F1-measure: 0.6389

In [41]:
# Evaluation (micro- and macro-averages as defined above)

# Top 7 categories

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.7948, Recall: 0.6068, F1-measure: 0.6882
Macro-average quality numbers
Precision: 0.8067, Recall: 0.6144, F1-measure: 0.6884

In [31]:
# Evaluation (micro- and macro-averages as defined above)

# Top 5 categories

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.7778, Recall: 0.5950, F1-measure: 0.6742
Macro-average quality numbers
Precision: 0.7735, Recall: 0.5828, F1-measure: 0.6590

In [48]:
# Evaluation (micro- and macro-averages as defined above)

# Top 3 categories

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.7773, Recall: 0.6066, F1-measure: 0.6814
Macro-average quality numbers
Precision: 0.7700, Recall: 0.6065, F1-measure: 0.6767

In [56]:
train_docs_ids[0]


Out[56]:
'training/12481'

In [13]:
import gensim

from gensim.models.doc2vec import TaggedDocument
assert gensim.models.doc2vec.FAST_VERSION > -1  # compiled (fast) training routines are available

train_tagged_docs = [TaggedDocument(get_tokens(judgments.raw(fileid)), [fileid.split("/")[1]]) for fileid in train_docs_ids]


C:\Users\howkh\Anaconda3\envs\py35\lib\site-packages\gensim\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [14]:
# Old gensim (< 1.0) API: manual learning-rate decay across 10 epochs
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=3, workers=4, window=8, alpha=0.025, min_alpha=0.025)
model.build_vocab(train_tagged_docs)
for epoch in range(10):
    model.train(train_tagged_docs)
    model.alpha -= 0.002           # decay the learning rate
    model.min_alpha = model.alpha  # freeze the rate for this pass
model.save('trained.doc2vec')
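
In [ ]:
# Hedged sketch: the equivalent single-call training under gensim >= 4, where
# `size` became `vector_size` and epoch handling moved into train()
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=3, workers=4, window=8, epochs=10)
model.build_vocab(train_tagged_docs)
model.train(train_tagged_docs, total_examples=model.corpus_count, epochs=model.epochs)
model.save('trained.doc2vec')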

In [1]:
import gensim  # this cell ran after a kernel restart, so gensim must be re-imported

model = gensim.models.doc2vec.Doc2Vec.load('trained.doc2vec')

In [15]:
model.docvecs[0].shape


Out[15]:
(50,)

In [16]:
test_doc = model.infer_vector(get_tokens(judgments.raw('test/22821')))
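
In [ ]:
# infer_vector is stochastic (a few gradient passes on the unseen document),
# so repeated calls differ slightly; more passes give a more stable vector
# (the keyword is `steps` in old gensim, `epochs` in gensim >= 4)
stable_vec = model.infer_vector(get_tokens(judgments.raw('test/22821')), steps=50)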

In [17]:
test_doc.shape


Out[17]:
(50,)

In [64]:
model.docvecs.most_similar([test_doc])


Out[64]:
[('22744', 0.7519214749336243),
 ('15103', 0.7250633835792542),
 ('22723', 0.7186082601547241),
 ('22602', 0.7177111506462097),
 ('15775', 0.7088651657104492),
 ('20446', 0.7039706110954285),
 ('15388', 0.6877029538154602),
 ('18014', 0.6870896816253662),
 ('22770', 0.6866070628166199),
 ('18182', 0.6831321716308594)]

In [85]:
df[df['index'].isin([idx for idx, sim in model.docvecs.most_similar([test_doc])])]


Out[85]:
index name citation author case_no date court coram counsel reserved catchwords-1 catchwords-2 url
2611 15103 Public Prosecutor v Hang Tuah bin Jumaat [2013] SGHC 28 Choo Han Teck J Criminal Case No 4 of 2012 29 January 2013 High Court Choo Han Teck J Kavita Uthrapathy and Adrian Loo Yu Hao (Attor... NaN criminal_law NaN http://www.singaporelaw.sg/sglaw/laws-of-singa...
2831 15388 Public Prosecutor v Chum Tat Suan [2013] SGHC 221 Choo Han Teck J Criminal Case No 1 of 2012 24 October 2013 High Court Choo Han Teck J Mohamed Faizal and Qiu Huixiang (Attorney-Gene... Judgment reserved criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...
3153 15775 Public Prosecutor v Purushothaman a/l Subramaniam [2014] SGHC 231 Tan Siong Thye J Criminal Case No 27 of 2014 10 November 2014 High Court Tan Siong Thye J Lau Wing Yum and Seraphina Fong (Attorney-Gene... NaN criminal_procedure_and_sentencing criminal_procedure_and_sentencing-sentencing http://www.singaporelaw.sg/sglaw/laws-of-singa...
3477 18014 Public Prosecutor v Mahesvaran a/l Sivalingam [2014] SGHC 182 Tan Siong Thye J Criminal Case No 30 of 2014 17 September 2014 High Court Tan Siong Thye J Tan Wen Hsien and N K Anitha (Attorney General... NaN criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...
3644 18182 Public Prosecutor v Hamidah bte Awang and another [2015] SGHC 270 Lee Seiu Kin J Criminal Case No 32 of 2014 16 October 2015 High Court Lee Seiu Kin J Ng Cheng Thiam and Chee Min Ping (Attorney-Gen... NaN criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...
4023 20446 Public Prosecutor v Sibeko Lindiwe Mary-Jane [2016] SGHC 228 Lee Seiu Kin J Criminal Case No 24 of 2016 19 October 2016 High Court Lee Seiu Kin J Ma Hanfeng and Kenny Yang (Attorney-General's ... NaN criminal_procedure_and_sentencing criminal_procedure_and_sentencing-sentencing http://www.singaporelaw.sg/sglaw/laws-of-singa...
5112 22602 Public Prosecutor v Mohamad Noor bin Abdullah [2016] SGHC 257 Lee Seiu Kin J Criminal Case No 54 of 2016 18 November 2016 High Court Lee Seiu Kin J Lau Wing Yum and Tan Yanying (Attorney-General... NaN evidence;criminal_law;criminal_procedure_and_s... evidence-proof_of_evidence;criminal_law-statut... http://www.singaporelaw.sg/sglaw/laws-of-singa...
5199 22723 Public Prosecutor v Razak bin Bashir [2017] SGHC 33 Woo Bih Li J Criminal Case No 7 of 2017 24 February 2017 High Court Woo Bih Li J Tan YanYing and Terence Chua (Attorney-General... NaN criminal_procedure_and_sentencing criminal_procedure_and_sentencing-sentencing http://www.singaporelaw.sg/sglaw/laws-of-singa...
5220 22744 Public Prosecutor v Pandian a/l Subramaniam [2017] SGHC 55 Pang Khang Chau JC Criminal Case No 21 of 2017 14 March 2017 High Court Pang Khang Chau JC Lu Zhuoren John and Chan Yi Cheng (Attorney-Ge... NaN criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...
5246 22770 Public Prosecutor v Alagesan Nathan and another [2017] SGHC 67 Audrey Lim JC Criminal Case No 20 of 2017 03 April 2017 High Court Audrey Lim JC Wong Woon Kwong and Star Chen (Attorney-Genera... NaN criminal_procedure_and_sentencing criminal_procedure_and_sentencing-sentencing http://www.singaporelaw.sg/sglaw/laws-of-singa...

In [18]:
model.docvecs.most_similar([test_doc])


Out[18]:
[('20446', 0.769403338432312),
 ('14429', 0.6676273345947266),
 ('22163', 0.6563906669616699),
 ('22744', 0.6544111967086792),
 ('13793', 0.6285304427146912),
 ('18014', 0.6267184019088745),
 ('18182', 0.6263490319252014),
 ('18458', 0.6246256828308105),
 ('13789', 0.6238721609115601),
 ('13510', 0.6082495450973511)]

In [20]:
import pandas as pd
df = pd.read_csv('judgments/data.tsv', sep='\t', encoding='utf-8')
df[df['index'].isin([idx for idx, sim in model.docvecs.most_similar([test_doc])])]


Out[20]:
index name citation author case_no date court coram counsel reserved catchwords-1 catchwords-2 url
1030 13510 Tay Siew Gek Rachelgina Jasmine v Public Prose... [2008] SGHC 176 Choo Han Teck J MA 198/2008 20 Oct 2008 High Court Choo Han Teck J Terence Teo Chee Seng (Able Law Practice LLC) ... Judgment reserved criminal_procedure_and_sentencing NaN http://www.singaporelaw.sg/sglaw/laws-of-singa...
1309 13789 Public Prosecutor v ABJ [2009] SGHC 185 Choo Han Teck J CC 29/2009 14 Aug 2009 High Court Choo Han Teck J Gordon Oh (Deputy Public Prosecutor) for the p... NaN criminal_procedure_and_sentencing NaN http://www.singaporelaw.sg/sglaw/laws-of-singa...
1313 13793 Public Prosecutor v Tan Chin Hock [2009] SGHC 189 Choo Han Teck J CC 36/2009 25 Aug 2009 High Court Choo Han Teck J Amarjit Singh and Tan Boon Khai (Deputy Public... NaN criminal_law NaN http://www.singaporelaw.sg/sglaw/laws-of-singa...
1943 14429 Choi Byeongkuk v Public Prosecutor [2011] SGHC 6 Choo Han Teck J Magistrate’s Appeal No 393 of 2010 (DAC 42066 ... 11 January 2011 High Court Choo Han Teck J S K Kumar (S K Kumar & Associates) for the app... NaN criminal_procedure_and_sentencing NaN http://www.singaporelaw.sg/sglaw/laws-of-singa...
3477 18014 Public Prosecutor v Mahesvaran a/l Sivalingam [2014] SGHC 182 Tan Siong Thye J Criminal Case No 30 of 2014 17 September 2014 High Court Tan Siong Thye J Tan Wen Hsien and N K Anitha (Attorney General... NaN criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...
3644 18182 Public Prosecutor v Hamidah bte Awang and another [2015] SGHC 270 Lee Seiu Kin J Criminal Case No 32 of 2014 16 October 2015 High Court Lee Seiu Kin J Ng Cheng Thiam and Chee Min Ping (Attorney-Gen... NaN criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...
3845 18458 Public Prosecutor v Azahari bin Ahmad and another [2016] SGHC 101 Hoo Sheau Peng JC Criminal Case No 23 of 2016 23 May 2016 High Court Hoo Sheau Peng JC John Lu Zhuoren and Nicholas Wuan Kin Lek (Att... NaN criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...
4023 20446 Public Prosecutor v Sibeko Lindiwe Mary-Jane [2016] SGHC 228 Lee Seiu Kin J Criminal Case No 24 of 2016 19 October 2016 High Court Lee Seiu Kin J Ma Hanfeng and Kenny Yang (Attorney-General's ... NaN criminal_procedure_and_sentencing criminal_procedure_and_sentencing-sentencing http://www.singaporelaw.sg/sglaw/laws-of-singa...
4676 22163 Public Prosecutor v Jamal anak Nyalau and Others [2002] SGHC 78 NaN CC 22/2002 19 Apr 2002 High Court MPH Rubin J Daniel Yong and Hwong Meng Jet (Deputy Public ... NaN criminal_law;criminal_procedure_and_sentencing criminal_law-offences;criminal_procedure_and_s... http://www.singaporelaw.sg/sglaw/laws-of-singa...
5220 22744 Public Prosecutor v Pandian a/l Subramaniam [2017] SGHC 55 Pang Khang Chau JC Criminal Case No 21 of 2017 14 March 2017 High Court Pang Khang Chau JC Lu Zhuoren John and Chan Yi Cheng (Attorney-Ge... NaN criminal_law criminal_law-statutory_offences http://www.singaporelaw.sg/sglaw/laws-of-singa...

In [21]:
test_doc2vecs = [model.infer_vector(get_tokens(judgments.raw(docid))) for docid in test_docs_ids]
len(test_doc2vecs)


Out[21]:
1301

In [22]:
test_doc2vecs = np.array(test_doc2vecs)

In [23]:
# Rows of doctag_syn0 follow tag insertion order, which here matches
# train_docs_ids (newer gensim exposes this array as model.dv.vectors)
training_doc2vecs = model.docvecs.doctag_syn0

In [25]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(training_doc2vecs, train_labels)


Out[25]:
OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [27]:
predictions = classifier.predict(test_doc2vecs)
predictions.shape


Out[27]:
(1301, 10)

In [105]:
# Doc2Vec, 300-dimensional vectors

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.2595, Recall: 0.6324, F1-measure: 0.3680
Macro-average quality numbers
Precision: 0.2452, Recall: 0.6167, F1-measure: 0.3462

In [28]:
# Doc2Vec, 50-dimensional vectors

from sklearn.metrics import f1_score, precision_score, recall_score
 
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
 
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))


Micro-average quality numbers
Precision: 0.6632, Recall: 0.4804, F1-measure: 0.5572
Macro-average quality numbers
Precision: 0.6758, Recall: 0.4712, F1-measure: 0.5320

In [68]:
import pandas as pd
df = pd.read_csv('judgments/data.tsv', sep='\t', encoding='utf-8')

In [13]:
import gensim
import pyLDAvis.gensim

tokenized_docs = [get_tokens(judgments.raw(docid)) for docid in selected_docs_ids]


C:\Users\howkh\Anaconda3\envs\py35\lib\site-packages\gensim\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [14]:
corpus_vocab = gensim.corpora.Dictionary(tokenized_docs)
len(corpus_vocab)


Out[14]:
209884

In [15]:
bow_corpus = [corpus_vocab.doc2bow(doc) for doc in tokenized_docs]

lda_model = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics=NUM_CATEGORIES, id2word=corpus_vocab, passes=15)
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, corpus_vocab, sort_topics=False)
pyLDAvis.display(lda_display)


C:\Users\howkh\Anaconda3\envs\py35\lib\site-packages\pyLDAvis\_prepare.py:387: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  topic_term_dists = topic_term_dists.ix[topic_order]
Out[15]:
[interactive pyLDAvis topic map was rendered here]
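
In [ ]:
# The interactive view aside, the top words per topic can be printed directly
for topic_id, words in lda_model.show_topics(num_topics=NUM_CATEGORIES, num_words=8):
    print(topic_id, words)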

In [20]:
import pickle
corpus_vocab.save('top-10-categories.dictionary')
with open("top-10-bow_corpus.list", "wb") as f:
    pickle.dump(bow_corpus, f)

In [21]:
lda_model.save('10-topics.ldamodel')

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

selected_docs = [judgments.raw(docid) for docid in selected_docs_ids]
tfidf_vectorizer = TfidfVectorizer(stop_words=list(stop_words), tokenizer=get_tokens)
tfidf_vectors = tfidf_vectorizer.fit_transform(selected_docs)

In [27]:
import gensim

# old gensim API: `size`; newer gensim renames it to `vector_size`
w2v_model = gensim.models.word2vec.Word2Vec(tokenized_docs, size=100)

In [ ]:
w2v_model.save('top-10-w2vmodel')

In [31]:
# word -> vector lookup (newer gensim: w2v_model.wv.key_to_index / .vectors)
w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))
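
In [ ]:
# The w2v lookup is presumably meant for a mean-embedding document
# representation; a minimal sketch (the helper name is hypothetical)
import numpy as np

def mean_w2v(tokens, dim=100):
    # average the vectors of in-vocabulary tokens (zeros if none survive)
    vecs = [w2v[t] for t in tokens if t in w2v]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

mean_w2v(get_tokens(judgments.raw(selected_docs_ids[0]))).shape  # (100,)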

In [33]:
selected_train_docs_ids = [docid for docid in selected_docs_ids if docid.startswith('training')]
selected_test_docs_ids = [docid for docid in selected_docs_ids if docid.startswith('test')]

In [34]:
from collections import Counter

c = Counter()

for docid in selected_train_docs_ids:
    c.update(judgments.categories(docid))

In [40]:
# Distribution of the number of labels per selected training document
# (counting the int directly; updating a Counter with str(len(...)) would
# count characters and break for counts >= 10)
c1 = Counter()

for docid in selected_train_docs_ids:
    c1[len(judgments.categories(docid))] += 1
c1


Out[40]:
Counter({1: 1814, 2: 763, 3: 229, 4: 55, 5: 13, 6: 5, 7: 1})
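
In [ ]:
# Share of selected training documents that carry exactly one label
c1[1] / sum(c1.values())  # ~ 0.63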

In [ ]: