In [67]:
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import nltk
import string
import math
import sklearn.cluster
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
Define the documents here and set the vocabulary.
In [68]:
TOKENIZER = TreebankWordTokenizer()
LEMMATIZER = WordNetLemmatizer()
d1 = "Frodo and Sam were trembling in the darkness, surrounded in darkness by hundreds of blood-thirsty orc. Sam was certain these beasts were about to taste the scent of their flesh."
d2 = "The faceless black beast then stabbed Frodo. He felt like every nerve in his body was hurting. Suddenly, he thought of Sam and his calming smile. Frodo had betrayed him."
d3 = "Frodo sword was radiating blue, stronger and stronger every second. Orc were getting closer. And these weren’t just regular orc either, Uruk-Hai were among them. Frodo had killed regular orc before, but he had never stabbed an Uruk-Hai, not with the blue stick."
d4 = "Sam was carrying a small lamp, shedding some blue light. He was afraid that orc might spot him, but it was the only way to avoid deadly pitfalls of Mordor."
docs = [d1.lower(), d2.lower(), d3.lower(), d4.lower()]
ORDERED_VOC = [t.lower() for t in ["Frodo", "Sam", "beast", "orc", "blue"]]
VOC = set(ORDERED_VOC)
Define some helper functions here that will be needed later on.
In [69]:
# stolen from http://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
def get_wordnet_pos(treebank_tag):
    # map Penn Treebank POS tags to the corresponding WordNet POS constants
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def reduce_to_voc(tokens):
    # keep only the tokens that are part of the vocabulary
    t_rel = []
    for t in tokens:
        if t in VOC:
            t_rel.append(t)
    return t_rel

def lemmatize(tokens):
    # lemmatize each token using its POS tag; tokens whose tag has no
    # WordNet equivalent make lemmatize() raise a KeyError and are skipped
    tagged = nltk.pos_tag(tokens)
    for t in tagged:
        try:
            yield LEMMATIZER.lemmatize(t[0], get_wordnet_pos(t[1]))
        except KeyError:
            pass

def tokenize(doc):
    # replace punctuation with spaces, collapse whitespace, then tokenize and lowercase
    doc = " ".join("".join([" " if ch in string.punctuation else ch for ch in doc]).split())
    result = []
    for token in TOKENIZER.tokenize(doc):
        result.append(token.lower())
    return result

def preprocess(doc):
    tokens = tokenize(doc)
    tokens = lemmatize(tokens)
    tokens = reduce_to_voc(tokens)
    return tokens
Do the preprocessing; the following tokens remain. The documents are listed in order, starting from doc 1.
In [70]:
docs_prep = []
for doc in docs:
    docs_prep.append(preprocess(doc))
pprint(docs_prep)
compute the idf matrix
In [71]:
def compute_idf(docs):
    # diagonal matrix: document frequency per term, then inverted to N/df
    # (plain ratio, no log damping)
    mat = np.zeros((len(ORDERED_VOC), len(ORDERED_VOC)))
    no_docs = len(docs)
    for doc in docs:
        for i in range(len(ORDERED_VOC)):
            term = ORDERED_VOC[i]
            mat[i, i] += 1 if term in doc else 0
    for i in range(len(ORDERED_VOC)):
        mat[i, i] = no_docs / mat[i, i] if mat[i, i] != 0 else 0
    return mat
idf = compute_idf(docs_prep)
pprint(ORDERED_VOC)
pprint(idf)
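As a quick sanity check (assuming the lemmatizer maps "beasts" to "beast"): the idf used here is the plain ratio N/df without log damping, so "frodo" with a document frequency of 3 out of 4 should get the diagonal entry 4/3 ≈ 1.33, while "beast" and "blue" appear in 2 documents each, giving 4/2 = 2.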
compute the tf matrix
In [72]:
def compute_tf(docs):
    # raw term counts: one row per document, one column per vocabulary term
    mat = np.zeros((len(docs), len(ORDERED_VOC)))
    for d in range(len(docs)):
        doc = docs[d]
        for t in range(len(ORDERED_VOC)):
            term = ORDERED_VOC[t]
            mat[d][t] = doc.count(term)
    return mat
tf = compute_tf(docs_prep)
pprint(tf)
compute the tf-idf matrix, then transpose it so it is in term-document format
In [73]:
tf_idf = np.matmul(tf, idf)
term_doc_tfidf = np.transpose(tf_idf)
print('TF-IDF matrix in document-term format')
pprint(tf_idf)
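Since idf is diagonal, the matrix product simply scales each term column of tf by the corresponding idf value; a minimal equivalence check (a sketch, not part of the original pipeline):
In [ ]:
tf_idf_alt = tf * np.diag(idf)  # np.diag extracts the diagonal vector, which broadcasts over the columns
assert np.allclose(tf_idf_alt, tf_idf)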
In [74]:
def svd(mat):
    return np.linalg.svd(mat)

# note: np.linalg.svd returns the third factor already transposed, i.e. v here is V^T
u, sigma, v = svd(term_doc_tfidf)
sigma = np.diag(sigma)
pprint([u, sigma, v])
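A small sanity check (sketch): with full_matrices=True (the numpy default) u is 5x5 here while sigma is 4x4, so only the first four columns of u are needed to reconstruct the input.
In [ ]:
k = sigma.shape[0]  # number of singular values
assert np.allclose(np.matmul(np.matmul(u[:, :k], sigma), v), term_doc_tfidf)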
In [75]:
sigma_2 = sigma[:2,:]
u_2 = u[:,:2]
v_2 = v[:2,:]  # first two rows, since v is already V^T
print('Latent matrices with k=2')
pprint([u_2, sigma_2, v_2])
# document coordinates in the latent space: Sigma_2 V^T (v is already transposed)
dense_vec = np.matmul(sigma_2, v)
print('\ndoc vectors in 2-dimensional space')
pprint(dense_vec)
x = dense_vec[:1,:]
y = dense_vec[1:2,:]
plt.scatter(x, y)
plt.show()
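Optionally, the scatter plot can be redrawn with document labels for readability (a sketch reusing dense_vec from above):
In [ ]:
plt.scatter(dense_vec[0, :], dense_vec[1, :])
for i in range(dense_vec.shape[1]):
    plt.annotate('d%d' % (i + 1), (dense_vec[0, i], dense_vec[1, i]))
plt.show()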
In [76]:
query = ['sam', 'blue', 'orc']
tf_query = compute_tf([query])
tf_idf_query = np.transpose(np.matmul(tf_query, idf))
print('TF-IDF vector query')
pprint([tf_idf_query])
print('\nQuery vector in latent space')
# fold the query into the latent space: U_2^T q lives in the same space as Sigma_2 V^T
dense_query = np.matmul(np.transpose(u_2), tf_idf_query)
pprint(dense_query)

def cosine(doc1, doc2):
    # cosine similarity between two equal-length vectors
    if not len(doc1) == len(doc2):
        raise ValueError('Vectors of different length')
    length_doc1 = 0
    length_doc2 = 0
    scalar = 0.0
    for i in range(len(doc1)):
        scalar += doc1[i] * doc2[i]
        length_doc1 += math.pow(doc1[i], 2)
        length_doc2 += math.pow(doc2[i], 2)
    length_doc1, length_doc2 = math.sqrt(length_doc1), math.sqrt(length_doc2)
    return scalar / (length_doc1 * length_doc2)

def rank(doc_term, query_term):
    # rank all documents by cosine similarity to the query vector
    ranking = []
    for i, doc in enumerate(doc_term):
        ranking.append({'doc': str(i + 1),
                        'similarity': cosine(doc, query_term)})
    return sorted(ranking, key=lambda x: x['similarity'], reverse=True)

ranked = rank(np.transpose(dense_vec), np.transpose(dense_query)[0])
print('\nranked documents:')
pprint(ranked)
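As an optional cross-check (a sketch, not part of the original pipeline), sklearn's pairwise cosine_similarity should agree with the hand-rolled cosine() above:
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
sims = cosine_similarity(np.transpose(dense_vec), np.transpose(dense_query))
pprint(sims)  # one similarity per document, in document order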
In [77]:
doc_term_idfvector = [
[0.17, 0.21, 0.35, 0.44, 0.49, 0.39, 0.09, 0.07, 0.37, 0.24],
[0.49, 0.48, 0.44, 0.09, 0.24, 0.2, 0.41, 0.16, 0.1, 0.15],
[0.41, 0.36, 0.27, 0.19, 0.15, 0.42, 0.23, 0.42, 0.02, 0.42],
[0.31, 0.41, 0.21, 0.19, 0.47, 0.28, 0.21, 0.39, 0.16, 0.38],
[0.46, 0.12, 0.21, 0.25, 0.38, 0.38, 0.46, 0.23, 0.31, 0.14],
[0.13, 0.33, 0.28, 0.42, 0.07, 0.13, 0.58, 0.15, 0.0, 0.49],
[0.21, 0.09, 0.07, 0.09, 0.3, 0.54, 0.24, 0.43, 0.51, 0.21],
[0.18, 0.39, 0.42, 0.05, 0.41, 0.1, 0.52, 0.12, 0.14, 0.38],
[0.4, 0.51, 0.01, 0.1, 0.12, 0.22, 0.26, 0.34, 0.42, 0.38]
]
def avg_cluster_similarity(cluster, doc):
    # mean cosine similarity between doc and all cluster members
    similarity = 0.0
    for member in cluster:
        similarity += cosine(member, doc)
    return similarity / len(cluster)

def max_cluster_similarity(cluster, doc):
    # highest cosine similarity between doc and any single cluster member
    max_similarity = 0.0
    for member in cluster:
        current_sim = cosine(member, doc)
        if current_sim > max_similarity:
            max_similarity = current_sim
    return max_similarity
def single_pass_clustering(docs, lamda, cluster_similarity):
    """
    returns: clusters; a list of clusters, each of which is a list of documents
    """
    docs = list(docs)  # work on a copy so pop() does not mutate the caller's list
    clusters = []
    clusters.append([docs.pop(-1)])  # seed with the last document
    # loop over each remaining doc and try to assign it to a cluster
    for doc in docs:
        max_similarity = 0
        best_cluster = None
        for cluster in clusters:
            # determine the best matching cluster; lamda is checked afterwards
            sim = cluster_similarity(cluster, doc)
            if sim > max_similarity:
                max_similarity = sim
                best_cluster = cluster
        if lamda > max_similarity:
            # best similarity below the threshold: open a new cluster
            clusters.append([doc])
        else:
            # append to the best existing cluster
            best_cluster.append(doc)
    return clusters
def print_clusters(clusters):
    for i, cluster in enumerate(clusters):
        print('Cluster %s:' % str(i))
        print('\t' + "; ".join([str(doc) for doc in cluster]))

spc0_6 = single_pass_clustering(doc_term_idfvector, 0.6, max_cluster_similarity)
spc0_8 = single_pass_clustering(doc_term_idfvector, 0.8, max_cluster_similarity)
print('Single pass clustering with lambda=0.6')
print_clusters(spc0_6)
print('Single pass clustering with lambda=0.8')
print_clusters(spc0_8)
In general, a lower lambda leads to fewer clusters, since the algorithm will then assign a document to an existing cluster even when the similarity is low.
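To make this visible, one could sweep lambda (a sketch; single_pass_clustering copies its input, so the document list is not consumed):
In [ ]:
for lam in (0.2, 0.4, 0.6, 0.8):
    clusters = single_pass_clustering(doc_term_idfvector, lam, max_cluster_similarity)
    print('lambda=%.1f -> %d clusters' % (lam, len(clusters)))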
In [78]:
spc0_8_reversed = single_pass_clustering([d for d in reversed(doc_term_idfvector)], 0.8, max_cluster_similarity)
print('Single pass clustering with lambda=0.8 and reversed streaming order')
print_clusters(spc0_8_reversed)
The clusters change, which follows from the different seed cluster the algorithm starts with.
In [79]:
centroids = [
[0.33, 0.33, 0.42, 0.12, 0.2, 0.34, 0.58, 0.19, 0.07, 0.24],
[0.29, 0.16, 0.38, 0.48, 0.43, 0.11, 0.12, 0.33, 0.03, 0.44],
[0.01, 0.17, 0.11, 0.27, 0.23, 0.37, 0.35, 0.48, 0.54, 0.24]
]
centroids_np = np.array(centroids)
kmeans = sklearn.cluster.KMeans(n_clusters=3, init=centroids_np, n_init=1).fit(np.array(doc_term_idfvector))
centers = kmeans.cluster_centers_
print('Cluster centers')
pprint(centers)
# assign each document to its most similar center by cosine similarity
for i, doc in enumerate(doc_term_idfvector):
    best_cluster = None
    max_sim = 0.0
    for center in range(len(centers)):
        sim = avg_cluster_similarity([centers[center]], doc)
        if max_sim < sim:
            max_sim = sim
            best_cluster = center
    print('Doc %s %s assigned to cluster %s' % (str(i + 1), doc, str(best_cluster + 1)))
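Note that sklearn itself assigns points by Euclidean distance, while the loop above uses cosine similarity, so the two assignments can disagree for some documents; the built-in labels can be printed for comparison (sketch):
In [ ]:
print('sklearn labels (0-indexed):', kmeans.labels_)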
In [80]:
relevant_docs = set([i for i in range(1, 20) if i % 2 != 0])
results = {
    'r1': set([1, 2, 5, 6, 13]),
    'r2': set([1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 14, 17, 3, 15, 16, 18, 20]),
    'r3': set([1, 2, 4, 5, 9, 10, 12, 13, 14, 15, 20])
}
def compute_precision(relevant, retrieved):
    tp = len(set.intersection(relevant, retrieved))
    return tp / len(retrieved)

def compute_recall(relevant, retrieved):
    tp = len(set.intersection(relevant, retrieved))
    return tp / len(relevant)

def compute_fmeasure(f, relevant, retrieved):
    # weighted harmonic mean of precision and recall; f=0.5 gives the F1 measure
    precision = compute_precision(relevant, retrieved)
    recall = compute_recall(relevant, retrieved)
    return 1 / ((f * (1 / precision)) + (1 - f) * (1 / recall))

def compute_f1measure(relevant, retrieved):
    precision = compute_precision(relevant, retrieved)
    recall = compute_recall(relevant, retrieved)
    return 2 * ((precision * recall) / (precision + recall))

for k, v in results.items():
    evaluation = {
        'precision': compute_precision(relevant_docs, v),
        'recall': compute_recall(relevant_docs, v),
        'f1': compute_f1measure(relevant_docs, v)
    }
    results[k] = evaluation
pprint(results)
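As a worked check for r1: the relevant documents are the odd numbers 1-19, so the true positives are {1, 5, 13}, giving precision = 3/5 = 0.6, recall = 3/10 = 0.3 and f1 = 2 * (0.6 * 0.3) / (0.6 + 0.3) = 0.4.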
These measures are only suitable for binary (set-based) evaluation; they cannot evaluate rankings.
In [84]:
relevant_docs = set([i for i in range(1, 20) if i % 2 != 0])
results = {
    'r1': [1, 2, 5, 6, 13],
    'r2': [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 14, 17, 3, 15, 16, 18, 20],
    'r3': [1, 2, 4, 5, 9, 10, 12, 13, 14, 15, 20]
}
def average_precision(relevant_docs, ranking):
    # precision at the rank of each retrieved relevant document, averaged over
    # the relevant documents that were actually retrieved (relevant documents
    # missing from the ranking are skipped, not counted as zero)
    ranks = []
    for item in relevant_docs:
        try:
            ranks.append(compute_precision(relevant_docs, ranking[:ranking.index(item) + 1]))
        except ValueError:
            pass
    return sum(ranks) / len(ranks)

def compute_rprecision(relevant, retrieved):
    tp = len(set.intersection(relevant, retrieved))
    return tp / len(relevant)

for k, v in results.items():
    evaluation = {
        'p@5': compute_precision(relevant_docs, set(v[:5])),
        'r-precision': compute_rprecision(relevant_docs, set(v[:len(relevant_docs)])),
        'average precision': average_precision(relevant_docs, v)
    }
    results[k] = evaluation
pprint(results)
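Worked through for r1 = [1, 2, 5, 6, 13]: the relevant documents 1, 5 and 13 are retrieved at ranks 1, 3 and 5, so average precision = (1/1 + 2/3 + 3/5) / 3 ≈ 0.756. Note that this implementation averages only over the relevant documents that were actually retrieved; the more common convention divides by the total number of relevant documents (10 here), which would give ≈ 0.227 instead.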
In [82]:
# each list gives the ranks at which the relevant documents of one query were retrieved
q1 = [1, 6, 9, 17, 21]
q2 = [1, 3, 4]
q3 = [2, 5, 8, 9, 10]
q4 = [4]
q5 = [1, 2, 6]
queries = [q1, q2, q3, q4, q5]

def precision(no_rel, rank):
    # precision at a given rank: number of relevant documents seen so far / rank
    return no_rel / rank

def average_precision(ranks):
    return sum([precision(i + 1, r) for i, r in enumerate(ranks)]) / len(ranks)

def mean_average_precision(queries):
    return sum([average_precision(query) for query in queries]) / len(queries)

map_ = mean_average_precision(queries)
print('Mean average precision is \n %s' % map_)
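For example, q1 lists the ranks at which its five relevant documents were retrieved, so its average precision is (1/1 + 2/6 + 3/9 + 4/17 + 5/21) / 5 ≈ 0.428; the mean average precision is then simply the mean of the five per-query values.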