In [2]:
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
CORPUS_ROOT = r"\Users\howkh\Desktop\Data Science\Final Project\judgments"
CAT_FILE = CORPUS_ROOT + r"\cats.txt"
judgments = CategorizedPlaintextCorpusReader(root=CORPUS_ROOT, fileids='.*\\d+', cat_file=CAT_FILE, encoding='utf-8')
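# Sanity-check sketch (not part of the original run): cats.txt is assumed to hold
# one line per document, the fileid followed by its whitespace-delimited categories.
# Peek at the first few mapping lines and confirm the reader resolves them.
with open(CAT_FILE, encoding='utf-8') as f:
    for line in list(f)[:3]:
        print(line.rstrip())
sample_id = judgments.fileids()[0]
print(sample_id, judgments.categories(sample_id))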
In [2]:
len(judgments.fileids())
Out[2]:
In [3]:
len(judgments.categories())
Out[3]:
In [4]:
from collections import Counter
def get_top_categories(num_categories):
    # Return the num_categories most frequent categories across the whole corpus.
    category_counter = []
    for fileid in judgments.fileids():
        category_counter.extend(judgments.categories(fileid))
    return [pair[0] for pair in Counter(category_counter).most_common(num_categories)]
In [5]:
NUM_CATEGORIES = 10
selected_categories = get_top_categories(NUM_CATEGORIES)
selected_categories
Out[5]:
In [6]:
train_docs_ids = [
    fileid for fileid in judgments.fileids() if fileid.startswith('training')
    # and any(category in judgments.categories(fileid) for category in selected_categories)
]
len(train_docs_ids)
Out[6]:
In [7]:
test_docs_ids = [
    fileid for fileid in judgments.fileids() if fileid.startswith('test')
    # and any(category in judgments.categories(fileid) for category in selected_categories)
]
len(test_docs_ids)
Out[7]:
In [8]:
selected_docs_ids = [
    fileid for fileid in judgments.fileids()
    if any(category in judgments.categories(fileid) for category in selected_categories)
]
len(selected_docs_ids)
Out[8]:
In [9]:
import numpy as np
# One-hot multi-label matrices: one row per document, one column per selected category.
train_labels = np.zeros((len(train_docs_ids), NUM_CATEGORIES))
test_labels = np.zeros((len(test_docs_ids), NUM_CATEGORIES))
for idx, docid in enumerate(train_docs_ids):
    for category in judgments.categories(docid):
        if category in selected_categories:
            train_labels[idx, selected_categories.index(category)] = 1
for idx, docid in enumerate(test_docs_ids):
    for category in judgments.categories(docid):
        if category in selected_categories:
            test_labels[idx, selected_categories.index(category)] = 1
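# Cross-check sketch: the same top-10 matrices can be built with scikit-learn's
# MultiLabelBinarizer by fixing `classes` to selected_categories; labels outside
# the top 10 are then ignored (with a warning), and the column order matches the
# manual matrices above (up to dtype). Variable names here are illustrative only.
from sklearn.preprocessing import MultiLabelBinarizer
mlb_top = MultiLabelBinarizer(classes=selected_categories)
train_labels_top = mlb_top.fit_transform(
    [judgments.categories(doc_id) for doc_id in train_docs_ids])
test_labels_top = mlb_top.transform(
    [judgments.categories(doc_id) for doc_id in test_docs_ids])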
In [10]:
len(train_labels)
Out[10]:
In [11]:
len(test_labels)
Out[11]:
In [12]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))  # renamed so the nltk `stopwords` module is not shadowed
extra_stop_words = set("would also may could whether".split())
lemmatizer = WordNetLemmatizer()

def get_tokens(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [t for t in tokens if t not in extra_stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if not any(c.isdigit() for c in t)]
    return tokens
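# Quick smoke test of the tokenizer on a made-up sentence; the exact output depends
# on the installed NLTK data, but should look roughly like ['judge', 'allow', 'appeal'].
get_tokens("The judges would not allow the appeals in 2009.")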
In [12]:
train_docs = [judgments.raw(doc_id) for doc_id in train_docs_ids]
test_docs = [judgments.raw(doc_id) for doc_id in test_docs_ids]
In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# Fit the TF-IDF vocabulary on the training documents, then transform both splits with it
vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=get_tokens)
vectorized_train_docs = vectorizer.fit_transform(train_docs)
vectorized_test_docs = vectorizer.transform(test_docs)
In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer
# Binarize the multi-label targets: one column per category seen in the training set
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([judgments.categories(doc_id) for doc_id in train_docs_ids])
test_labels = mlb.transform([judgments.categories(doc_id) for doc_id in test_docs_ids])
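# The columns of these label matrices follow mlb.classes_ (every category seen in
# the training set, not just the top 10), so check the shapes before fitting.
print(len(mlb.classes_))
print(train_labels.shape, test_labels.shape)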
In [53]:
# One-vs-rest linear SVM on the TF-IDF features
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorized_train_docs, train_labels)
Out[53]:
In [54]:
predictions = classifier.predict(vectorized_test_docs)
predictions.shape
Out[54]:
In [55]:
# Evaluation
# Micro-average: every (document, label) assignment counts equally, so frequent categories dominate the aggregate score.
# Macro-average: each category's score is computed independently and then averaged, so all categories carry equal weight regardless of size.
# Top 20 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [26]:
# Evaluation
# Top 15 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [36]:
# Evaluation
# Top 10 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [41]:
# Evaluation
# Top 7 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [31]:
# Evaluation
# Top 5 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [48]:
# Evaluation
# Top 3 categories
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [56]:
train_docs_ids[0]
Out[56]:
In [13]:
import gensim
from gensim.models.doc2vec import TaggedDocument
assert gensim.models.doc2vec.FAST_VERSION > -1
train_tagged_docs = [TaggedDocument(get_tokens(judgments.raw(fileid)), [fileid.split("/")[1]]) for fileid in train_docs_ids]
In [14]:
model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=3, workers=4, window=8, alpha=0.025, min_alpha=0.025)
model.build_vocab(train_tagged_docs)
# Old-style gensim training: 10 passes with a manually decayed learning rate.
for epoch in range(10):
    model.train(train_tagged_docs)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
model.save('trained.doc2vec')
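# The cell above uses the pre-1.0 gensim API (size=, train() without an epoch count,
# manual alpha decay). A roughly equivalent setup under gensim 4.x would be:
from gensim.models.doc2vec import Doc2Vec
model = Doc2Vec(vector_size=50, min_count=3, workers=4, window=8)
model.build_vocab(train_tagged_docs)
model.train(train_tagged_docs, total_examples=model.corpus_count, epochs=10)
model.save('trained.doc2vec')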
In [1]:
import gensim
model = gensim.models.doc2vec.Doc2Vec.load('trained.doc2vec')
In [15]:
model.docvecs[0].shape
Out[15]:
In [16]:
test_doc = model.infer_vector(get_tokens(judgments.raw('test/22821')))
In [17]:
test_doc.shape
Out[17]:
In [18]:
model.docvecs.most_similar([test_doc])
Out[18]:
In [20]:
import pandas as pd
df = pd.read_csv('judgments/data.tsv', sep='\t', encoding='utf-8')
# Look up the metadata rows of the most similar training judgments by their doc tags
df[df['index'].isin([tag for tag, similarity in model.docvecs.most_similar([test_doc])])]
Out[20]:
In [21]:
test_doc2vecs = [model.infer_vector(get_tokens(judgments.raw(docid))) for docid in test_docs_ids]
len(test_doc2vecs)
Out[21]:
In [22]:
test_doc2vecs = np.array(test_doc2vecs)
In [23]:
training_doc2vecs = model.docvecs.doctag_syn0
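# doctag_syn0 is the old gensim attribute for the matrix of trained document vectors
# (gensim >= 4 exposes it as model.dv.vectors). An alternative that makes the row
# order explicit is to look the vectors up by the tags assigned earlier:
training_doc2vecs_by_tag = np.array(
    [model.docvecs[fileid.split("/")[1]] for fileid in train_docs_ids])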
In [25]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
# Same one-vs-rest linear SVM, now trained on the doc2vec features
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(training_doc2vecs, train_labels)
Out[25]:
In [27]:
predictions = classifier.predict(test_doc2vecs)
predictions.shape
Out[27]:
In [105]:
# Doc2Vec, 300-dimensional vectors
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [28]:
# Doc2Vec, 50-dimensional vectors
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
In [13]:
import gensim
import pyLDAvis.gensim
tokenized_docs = [get_tokens(judgments.raw(docid)) for docid in selected_docs_ids]
In [14]:
corpus_vocab = gensim.corpora.Dictionary(tokenized_docs)
len(corpus_vocab)
Out[14]:
In [15]:
bow_corpus = [corpus_vocab.doc2bow(doc) for doc in tokenized_docs]
lda_model = gensim.models.ldamodel.LdaModel(bow_corpus, num_topics=NUM_CATEGORIES, id2word=corpus_vocab, passes=15)
lda_display = pyLDAvis.gensim.prepare(lda_model, bow_corpus, corpus_vocab, sort_topics=False)
pyLDAvis.display(lda_display)
Out[15]:
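# Text-only summary alongside the interactive pyLDAvis view: top terms per topic.
for topic_id, terms in lda_model.show_topics(num_topics=NUM_CATEGORIES, num_words=8, formatted=False):
    print(topic_id, [term for term, weight in terms])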
In [20]:
import pickle
corpus_vocab.save('top-10-categories.dictionary')
with open("top-10-bow_corpus.list", "wb") as f:
    pickle.dump(bow_corpus, f)
In [21]:
lda_model.save('10-topics.ldamodel')
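# Reload sketch for a later session, using the artefacts saved above.
import pickle
import gensim
corpus_vocab = gensim.corpora.Dictionary.load('top-10-categories.dictionary')
with open('top-10-bow_corpus.list', 'rb') as f:
    bow_corpus = pickle.load(f)
lda_model = gensim.models.ldamodel.LdaModel.load('10-topics.ldamodel')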
In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
selected_docs = [judgments.raw(docid) for docid in selected_docs_ids]
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=get_tokens)
tfidf_vectors = tfidf_vectorizer.fit_transform(selected_docs)
In [27]:
import gensim
w2v_model = gensim.models.word2vec.Word2Vec(tokenized_docs, size=100)
In [ ]:
w2v_model.save('top-10-w2vmodel')
In [31]:
w2v = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))
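# wv.index2word / wv.syn0 are likewise pre-4.0 attribute names; under gensim 4.x the
# same word-to-vector mapping would be built as:
# w2v = dict(zip(w2v_model.wv.index_to_key, w2v_model.wv.vectors))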
In [33]:
selected_train_docs_ids = [docid for docid in selected_docs_ids if docid.startswith('training')]
selected_test_docs_ids = [docid for docid in selected_docs_ids if docid.startswith('test')]
In [34]:
from collections import Counter
# Number of training documents per category
c = Counter()
for docid in selected_train_docs_ids:
    c.update(judgments.categories(docid))
In [40]:
# Histogram: how many categories each training document carries
c1 = Counter()
for docid in selected_train_docs_ids:
    c1.update([len(judgments.categories(docid))])
c1
Out[40]:
In [ ]: