IRWS Homework 03

Sebastian Wagner

Exercise 1

a)


In [67]:
from pprint import pprint
import matplotlib.pyplot as plt
import numpy as np
import nltk
import string
import math
import sklearn.cluster

from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

defining the documents here and set the vocabulary


In [68]:
TOKENIZER = TreebankWordTokenizer()
LEMMATIZER = WordNetLemmatizer()


d1 = "Frodo and Sam were trembling in the darkness, surrounded in darkness by hundreds of blood-thirsty orc. Sam was certain these beasts were about to taste the scent of their flesh."
d2 = "The faceless black beast then stabbed Frodo. He felt like every nerve in his body was hurting. Suddenly, he thought of Sam and his calming smile. Frodo had betrayed him."
d3 = "Frodo sword was radiating blue, stronger and stronger every second. Orc were getting closer. And these weren’t just regular orc either, Uruk-Hai were among them. Frodo had killed regular orc before, but he had never stabbed an Uruk-Hai, not with the blue stick."
d4 = "Sam was carrying a small lamp, shedding some blue light. He was afraid that orc might spot him, but it was the only way to avoid deadly pitfalls of Mordor."

docs = [d1.lower(), d2.lower(), d3.lower(), d4.lower()]


ORDERED_VOC = [t.lower() for t in ["Frodo", "Sam", "beast", "orc", "blue"]]
VOC = set(ORDERED_VOC)

defining some functions here that will be needed later on


In [69]:
# stolen from http://stackoverflow.com/questions/15586721/wordnet-lemmatization-and-pos-tagging-in-python
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''

def reduce_to_voc(tokens):
    t_rel = []
    for t in tokens:
        if t in VOC:
            t_rel.append(t)
    return t_rel

def lemmatize(tokens):
    tagged = nltk.pos_tag(tokens)
    for t in tagged:
        try:
            yield LEMMATIZER.lemmatize(t[0], get_wordnet_pos(t[1]))
        except KeyError:
            pass
        
def tokenize(doc):
    doc = " ".join("".join([" " if ch in string.punctuation else ch for ch in doc]).split())
    result = []
    for token in TOKENIZER.tokenize(doc):
        result.append(token.lower())
    return result

def preprocess(doc):
    tokens = tokenize(doc)
    tokens = lemmatize(tokens)
    tokens = reduce_to_voc(tokens)
    return tokens

do preprocessing, the follwoing tokens remain documents are numbered horizontally, commencing from doc1


In [70]:
docs_prep = []
for doc in docs:
    docs_prep.append(preprocess(doc))
pprint(docs_prep)


[['frodo', 'sam', 'orc', 'sam', 'beast'],
 ['beast', 'frodo', 'sam', 'frodo'],
 ['frodo', 'blue', 'orc', 'orc', 'frodo', 'orc', 'blue'],
 ['sam', 'blue', 'orc']]

compute the idf matrix


In [71]:
def compute_idf(docs):
    mat = np.zeros((len(ORDERED_VOC), len(ORDERED_VOC)))
    no_docs = len(docs)
    for doc in docs:
        for i in range(len(ORDERED_VOC)):
            term = ORDERED_VOC[i]
            mat[i,i] += 1 if term in doc else 0
    for i in range(len(ORDERED_VOC)):
        mat[i,i] = no_docs/(mat[i,i]) if mat[i,i] !=0 else 0
    return mat

idf = compute_idf(docs_prep)
pprint(ORDERED_VOC)
pprint(idf)


['frodo', 'sam', 'beast', 'orc', 'blue']
array([[ 1.33333333,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  1.33333333,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  2.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.33333333,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  2.        ]])

compute the tf matrix


In [72]:
def compute_tf(docs):
    mat = np.zeros((len(docs) ,len(ORDERED_VOC)))
    for d in range(len(docs)):
        doc = docs[d]
        for t in range(len(ORDERED_VOC)):
            term = ORDERED_VOC[t]
            mat[d][t] = doc.count(term)
    return mat

tf = compute_tf(docs_prep)
pprint(tf)


array([[ 1.,  2.,  1.,  1.,  0.],
       [ 2.,  1.,  1.,  0.,  0.],
       [ 2.,  0.,  0.,  3.,  2.],
       [ 0.,  1.,  0.,  1.,  1.]])

compute the tf-idf matrix, then transpose it so it is in document-term format


In [73]:
tf_idf = np.matmul(tf, idf)
term_doc_tfidf = np.transpose(tf_idf)
print('TF-IDF matrix in document term format')
pprint(tf_idf)


TF-IDF matrix in document term format
array([[ 1.33333333,  2.66666667,  2.        ,  1.33333333,  0.        ],
       [ 2.66666667,  1.33333333,  2.        ,  0.        ,  0.        ],
       [ 2.66666667,  0.        ,  0.        ,  4.        ,  4.        ],
       [ 0.        ,  1.33333333,  0.        ,  1.33333333,  2.        ]])

b)


In [74]:
def svd(mat):
    return np.linalg.svd(mat)
    
u, sigma, v = svd(term_doc_tfidf)
sigma = np.diag(sigma)
    
pprint([u, sigma, v])


[array([[ -4.85279372e-01,  -3.20298863e-01,  -7.05471070e-01,
          9.51352972e-02,  -3.93919299e-01],
       [ -2.41792427e-01,  -5.50491549e-01,   6.63994195e-01,
          2.05997018e-01,  -3.93919299e-01],
       [ -1.75793034e-01,  -5.83892823e-01,  -6.20129059e-02,
         -6.02554933e-02,   7.87838597e-01],
       [ -5.98256394e-01,   2.34463181e-01,   2.05534526e-01,
         -7.38154362e-01,   5.96744876e-16],
       [ -5.63228595e-01,   4.45492851e-01,   1.23823405e-01,
          6.32464953e-01,   2.62612866e-01]]),
 array([[ 7.08256944,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  4.34281253,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  2.12423778,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.82886997]]),
 array([[-0.34466026, -0.27787293, -0.83868126, -0.31719029],
       [-0.6332787 , -0.63458983,  0.42960496,  0.10813604],
       [ 0.46136234, -0.52722742, -0.26542436,  0.66236391],
       [-0.51702242,  0.49205043, -0.20396868,  0.67005296]])]

c)


In [75]:
sigma_2 = sigma[:2,:]
u_2 = u[:,:2]
v_2 = v[:,:2]
print('Latent matrices with K2')
pprint([u_2, sigma_2, v_2])

dense_vec = np.matmul(sigma_2, np.transpose(v))

print('\ndoc vector in 2 dimensional space')
pprint(dense_vec)

x = dense_vec[:1,:]
y = dense_vec[1:2,:]

plt.scatter(x, y)
plt.show()


Latent matrices with K2
[array([[-0.48527937, -0.32029886],
       [-0.24179243, -0.55049155],
       [-0.17579303, -0.58389282],
       [-0.59825639,  0.23446318],
       [-0.56322859,  0.44549285]]),
 array([[ 7.08256944,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  4.34281253,  0.        ,  0.        ]]),
 array([[-0.34466026, -0.27787293],
       [-0.6332787 , -0.63458983],
       [ 0.46136234, -0.52722742],
       [-0.51702242,  0.49205043]])]

doc vector in 2 dimensional space
array([[-2.44108023, -4.48524039,  3.26763082, -3.66184718],
       [-1.20675003, -2.75590468, -2.28964985,  2.13688279]])

d)


In [76]:
query= ['sam', 'blue', 'orc']
tf_query = compute_tf([query])
tf_idf_query = np.transpose(np.matmul(tf_query, idf))
print('TF-IDF vector query')
pprint([tf_idf_query])

print('\nQuery Vector in latent space ')
dense_query = np.matmul(np.transpose(u_2), tf_idf_query)
pprint(dense_query)

def cosine(doc1, doc2):
    length_doc1 = 0
    length_doc2 = 0
    scalar = 0.0
    if not len(doc1) == len(doc2):
        raise ValueError('Vectors of different length')
    for i in range(len(doc1)):
        scalar += doc1[i] * doc2[i]
        length_doc1 += math.pow(doc1[i], 2) 
        length_doc2 += math.pow(doc2[i], 2)
    length_doc1, length_doc2 = math.sqrt(length_doc1), math.sqrt(length_doc2)
    return scalar / (length_doc1 * length_doc2)

def rank(doc_term, query_term):
    ranking = []
    i = 0
    for doc in (doc_term):
        sim = {'doc':  str(i+1),
               'similarity' : cosine(doc, query_term)
              }
        i += 1
        ranking.append(sim)
    return sorted(ranking, key=lambda x: x['similarity'], reverse=True)


    
ranked = rank(np.transpose(dense_vec), np.transpose(dense_query)[0])
print('\nranked documents:')
pprint(ranked)


TF-IDF vector query
[array([[ 0.        ],
       [ 1.33333333],
       [ 0.        ],
       [ 1.33333333],
       [ 2.        ]])]

Query Vector in latent space 
array([[-2.24652228],
       [ 0.46961454]])

ranked documents:
[{'doc': '4', 'similarity': 0.94855206456837704},
 {'doc': '1', 'similarity': 0.7867987390042479},
 {'doc': '2', 'similarity': 0.72687085886729397},
 {'doc': '3', 'similarity': -0.91905247180497585}]

Exercise 2

a)


In [77]:
doc_term_idfvector = [
    [0.17, 0.21, 0.35, 0.44, 0.49, 0.39, 0.09, 0.07, 0.37, 0.24],
    [0.49, 0.48, 0.44, 0.09, 0.24, 0.2, 0.41, 0.16, 0.1, 0.15],
    [0.41, 0.36, 0.27, 0.19, 0.15, 0.42, 0.23, 0.42, 0.02, 0.42],
    [0.31, 0.41, 0.21, 0.19, 0.47, 0.28, 0.21, 0.39, 0.16, 0.38],
    [0.46, 0.12, 0.21, 0.25, 0.38, 0.38, 0.46, 0.23, 0.31, 0.14],
    [0.13, 0.33, 0.28, 0.42, 0.07, 0.13, 0.58, 0.15, 0.0, 0.49],
    [0.21, 0.09, 0.07, 0.09, 0.3, 0.54, 0.24, 0.43, 0.51, 0.21],
    [0.18, 0.39, 0.42, 0.05, 0.41, 0.1, 0.52, 0.12, 0.14, 0.38],
    [0.4, 0.51, 0.01, 0.1, 0.12, 0.22, 0.26, 0.34, 0.42, 0.38]
]


def avg_cluster_similarity(cluster, doc):
    similarity = 0.0
    for member in cluster:
        similarity += cosine(member, doc)
    sim = similarity/len(cluster)
    return sim

def max_cluster_similarity(cluster, doc):
    max_similarity = 0.0
    for member in cluster:
        current_sim = cosine(member, doc)
        if current_sim > max_similarity:
            max_similarity = current_sim
    return max_similarity
    
def single_pass_clustering(docs, lamda, cluster_similarity):
    """
    returns: clusters; first array is the list of clusters, which then contains a list of documents
    """
    clusters = []
    clusters.append([docs.pop(-1)])
    # loop over each doc and try to assign to cluster
    for doc in docs:
        max_similarity = 0
        best_cluster = None
        for cluster in clusters:
            # determine the best matching cluster, lamda will be checked later on
            sim = cluster_similarity(cluster, doc)
            if sim > max_similarity:
                max_similarity = sim
                best_cluster = cluster
        if lamda > max_similarity:
            # create a new cluster
            clusters.append([doc])
        else:
            # append to existing cluster
            best_cluster.append(doc)
    return clusters


def print_clusters(clusters):
    i = 0
    for cluster in clusters:
        print('Cluster %s:' % (str(i)))
        i += 1
        print('\t' + "; ".join([str(doc) for doc in cluster]))
    
spc0_6 = single_pass_clustering(doc_term_idfvector, 0.6, max_cluster_similarity)
spc0_8 = single_pass_clustering(doc_term_idfvector, 0.8, max_cluster_similarity)

print('Single pass clustering with lamda=0.6')
print_clusters(spc0_6)

print('Single pass clustering with lamda=0.8')
print_clusters(spc0_8)


Single pass clustering with lamda=0.6
Cluster 0:
	[0.4, 0.51, 0.01, 0.1, 0.12, 0.22, 0.26, 0.34, 0.42, 0.38]; [0.17, 0.21, 0.35, 0.44, 0.49, 0.39, 0.09, 0.07, 0.37, 0.24]; [0.49, 0.48, 0.44, 0.09, 0.24, 0.2, 0.41, 0.16, 0.1, 0.15]; [0.41, 0.36, 0.27, 0.19, 0.15, 0.42, 0.23, 0.42, 0.02, 0.42]; [0.31, 0.41, 0.21, 0.19, 0.47, 0.28, 0.21, 0.39, 0.16, 0.38]; [0.46, 0.12, 0.21, 0.25, 0.38, 0.38, 0.46, 0.23, 0.31, 0.14]; [0.13, 0.33, 0.28, 0.42, 0.07, 0.13, 0.58, 0.15, 0.0, 0.49]; [0.21, 0.09, 0.07, 0.09, 0.3, 0.54, 0.24, 0.43, 0.51, 0.21]; [0.18, 0.39, 0.42, 0.05, 0.41, 0.1, 0.52, 0.12, 0.14, 0.38]
Single pass clustering with lamda=0.8
Cluster 0:
	[0.18, 0.39, 0.42, 0.05, 0.41, 0.1, 0.52, 0.12, 0.14, 0.38]; [0.49, 0.48, 0.44, 0.09, 0.24, 0.2, 0.41, 0.16, 0.1, 0.15]; [0.41, 0.36, 0.27, 0.19, 0.15, 0.42, 0.23, 0.42, 0.02, 0.42]; [0.31, 0.41, 0.21, 0.19, 0.47, 0.28, 0.21, 0.39, 0.16, 0.38]; [0.46, 0.12, 0.21, 0.25, 0.38, 0.38, 0.46, 0.23, 0.31, 0.14]; [0.13, 0.33, 0.28, 0.42, 0.07, 0.13, 0.58, 0.15, 0.0, 0.49]; [0.21, 0.09, 0.07, 0.09, 0.3, 0.54, 0.24, 0.43, 0.51, 0.21]
Cluster 1:
	[0.17, 0.21, 0.35, 0.44, 0.49, 0.39, 0.09, 0.07, 0.37, 0.24]

In general a lower lamda will lead to less clusters as the algorithm will assign a document with lower similarities to an existing cluster


In [78]:
spc0_8_reversed = single_pass_clustering([d for d in reversed(doc_term_idfvector)], 0.8, max_cluster_similarity)
print('Single pass clustering with lamda=0.8 and reversed streamlining')
print_clusters(spc0_8_reversed)


Single pass clustering with lamda=0.8 and reversed streamlining
Cluster 0:
	[0.17, 0.21, 0.35, 0.44, 0.49, 0.39, 0.09, 0.07, 0.37, 0.24]
Cluster 1:
	[0.21, 0.09, 0.07, 0.09, 0.3, 0.54, 0.24, 0.43, 0.51, 0.21]; [0.46, 0.12, 0.21, 0.25, 0.38, 0.38, 0.46, 0.23, 0.31, 0.14]; [0.31, 0.41, 0.21, 0.19, 0.47, 0.28, 0.21, 0.39, 0.16, 0.38]; [0.41, 0.36, 0.27, 0.19, 0.15, 0.42, 0.23, 0.42, 0.02, 0.42]; [0.49, 0.48, 0.44, 0.09, 0.24, 0.2, 0.41, 0.16, 0.1, 0.15]
Cluster 2:
	[0.13, 0.33, 0.28, 0.42, 0.07, 0.13, 0.58, 0.15, 0.0, 0.49]

The clusters change, which is entailed by the different start cluster the algorithm has chosen

b)


In [79]:
centroids = [
    [0.33, 0.33, 0.42, 0.12, 0.2, 0.34, 0.58, 0.19, 0.07, 0.24],
    [0.29, 0.16, 0.38, 0.48, 0.43, 0.11, 0.12, 0.33, 0.03, 0.44],
    [0.01, 0.17, 0.11, 0.27, 0.23, 0.37, 0.35, 0.48, 0.54, 0.24]
]
centroids_np = np.array(centroids)

kmeans = sklearn.cluster.KMeans(n_clusters=3, init=centroids_np).fit(np.array(doc_term_idfvector))
centers = kmeans.cluster_centers_

print('Cluster centers')
pprint(centers)

i = 0
for doc in doc_term_idfvector:
    clusters = []
    best_cluster = None
    max_sim = 0.0
    for center in range(len(centers)):
        sim = avg_cluster_similarity([centers[center]], doc)
        if max_sim < sim:
            max_sim = sim
            best_cluster = center
    print('Doc %s %s assigned to cluster %s' %(str(i+1), doc, str(best_cluster+1))) 
    i += 1


Cluster centers
array([[ 0.3725,  0.3225,  0.3   ,  0.2375,  0.21  ,  0.2825,  0.42  ,
         0.24  ,  0.1075,  0.3   ],
       [ 0.24  ,  0.31  ,  0.28  ,  0.315 ,  0.48  ,  0.335 ,  0.15  ,
         0.23  ,  0.265 ,  0.31  ],
       [ 0.21  ,  0.09  ,  0.07  ,  0.09  ,  0.3   ,  0.54  ,  0.24  ,
         0.43  ,  0.51  ,  0.21  ]])
Doc 1 [0.17, 0.21, 0.35, 0.44, 0.49, 0.39, 0.09, 0.07, 0.37, 0.24] assigned to cluster 2
Doc 2 [0.49, 0.48, 0.44, 0.09, 0.24, 0.2, 0.41, 0.16, 0.1, 0.15] assigned to cluster 1
Doc 3 [0.41, 0.36, 0.27, 0.19, 0.15, 0.42, 0.23, 0.42, 0.02, 0.42] assigned to cluster 1
Doc 4 [0.31, 0.41, 0.21, 0.19, 0.47, 0.28, 0.21, 0.39, 0.16, 0.38] assigned to cluster 2
Doc 5 [0.46, 0.12, 0.21, 0.25, 0.38, 0.38, 0.46, 0.23, 0.31, 0.14] assigned to cluster 1
Doc 6 [0.13, 0.33, 0.28, 0.42, 0.07, 0.13, 0.58, 0.15, 0.0, 0.49] assigned to cluster 1
Doc 7 [0.21, 0.09, 0.07, 0.09, 0.3, 0.54, 0.24, 0.43, 0.51, 0.21] assigned to cluster 3
/usr/local/lib/python3.5/dist-packages/sklearn/cluster/k_means_.py:889: RuntimeWarning: Explicit initial center position passed: performing only one init in k-means instead of n_init=10
  return_n_iter=True)

Exercise 3

a)


In [80]:
relevant_docs = set([i for i in range(1,20) if i % 2 !=0])

results = {
    'r1' : set([1, 2, 5, 6, 13]),
    'r2' : set([1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 14, 17, 3, 15, 16, 18, 20]),
    'r3' : set([1, 2, 4, 5, 9, 10, 12, 13, 14, 15, 20])
}


def compute_precision(relevant, retrieved):
    tp = len(set.intersection(relevant, retrieved))
    return tp/len(retrieved)

def compute_recall(relevant, retrieved):
    tp = len(set.intersection(relevant, retrieved))
    return tp/len(relevant)

def compute_fmeasure(f, relevant, retrieved):
    precision = compute_precision(relevant, retrieved)
    recall = compute_recall(relevant, retrieved)
    return 1/((f*(1/precision))+(1-f)*(1/recall))


def compute_f1measure(relevant, retrieved):
    precision = compute_precision(relevant, retrieved)
    recall = compute_recall(relevant, retrieved)
    return 2 * ( (precision *recall) / (precision + recall))
    
for k,v in results.items():
    evaluation = {
        'precision' : compute_precision(relevant_docs, v),
        'recall' : compute_recall(relevant_docs, v),
        'f1' : compute_f1measure(relevant_docs, v)
    }
    results[k] = evaluation
pprint(results)


{'r1': {'f1': 0.4, 'precision': 0.6, 'recall': 0.3},
 'r2': {'f1': 0.6666666666666666, 'precision': 0.5, 'recall': 1.0},
 'r3': {'f1': 0.47619047619047616,
        'precision': 0.45454545454545453,
        'recall': 0.5}}

these measures are only suitable for binary evaluation but cannot evaluate rankings

b)


In [84]:
relevant_docs = set([i for i in range(1,20) if i % 2 !=0])
results = {
    'r1' : [1, 2, 5, 6, 13],
    'r2' : [1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 14, 17, 3, 15, 16, 18, 20],
    'r3' : [1, 2, 4, 5, 9, 10, 12, 13, 14, 15, 20]
}


def average_precision(relevant_docs, ranking):
    ranks = []
    for item in relevant_docs:
        try:
            ranks.append(compute_precision(relevant_docs, ranking[:ranking.index(item)+1]))
        except ValueError:
            pass
    return sum([add for add in ranks]) / len(ranks)

def compute_rprecision(relevant, retrieved):
    tp = len(set.intersection(relevant, retrieved))
    return tp/len(relevant)
    

for k,v in results.items():
    evaluation = {
        'p@5' : compute_precision(relevant_docs, set(v[:5])),
        'r-precision' : compute_rprecision(relevant_docs, set(v[:len(relevant_docs)])),
        'average precision' : average_precision(relevant_docs, v)
    }
    results[k] = evaluation

pprint(results)


{'r1': {'average precision': 0.7555555555555555,
        'p@5': 0.6,
        'r-precision': 0.3},
 'r2': {'average precision': 0.5722530165912518,
        'p@5': 0.4,
        'r-precision': 0.5},
 'r3': {'average precision': 0.62, 'p@5': 0.6, 'r-precision': 0.5}}

c)


In [82]:
q1 = [1, 6, 9, 17, 21]
q2 = [1, 3, 4]
q3 = [2, 5, 8, 9, 10]
q4 = [4]
q5 = [1, 2, 6]

queries = [q1, q2, q3, q4, q5]

def precision(no_rel, rank):
    return no_rel/rank
    

def average_precision(ranks):
    return sum([precision(ranks.index(r)+1,  r) for r in ranks])/len(ranks)

def mean_average_precision(queries):
    return sum([average_precision(query) for query in queries])/len(queries)

map_ = mean_average_precision(queries)
print('Mean average precision is \n %s' % map_)


Mean average precision is 
 0.5521577964519141