In [1]:
%matplotlib inline
from __future__ import print_function, division

import csv
import numpy as np
import matplotlib.pyplot as plt
import os

from scipy.spatial.distance import jaccard
from sklearn.metrics import jaccard_similarity_score
from sklearn.preprocessing import normalize

In [2]:
def load_mallet_matrix(model_name, valid_app_ids, filename="doc_matrix.tsv", mallet_dir="mallet"):
    """Load a tab-separated document/feature file and return pairwise cosine similarities.

    Each non-empty row is read as follows: column 1 is a "/"-separated path
    whose last component is the integer app id, and columns 2 onward are the
    row's float feature values. Column 0 is ignored.

    Parameters
    ----------
    model_name : str
        Sub-directory of ``mallet_dir`` that contains ``filename``.
    valid_app_ids : container or None
        When not None, rows whose app id is not ``in`` this container are
        skipped. Only membership is tested, so a set or a dict keyed by app id
        both work.
    filename : str
        Name of the file inside the model directory.
    mallet_dir : str
        Root directory holding one sub-directory per model.

    Returns
    -------
    app_ids : numpy.ndarray
        The kept app ids, sorted ascending.
    similarity : numpy.ndarray
        Square matrix of dot products between the L2-normalised row vectors,
        ordered like ``app_ids``. Shape ``(0, 0)`` when no row was kept.
    """
    full_path = os.path.join(mallet_dir, model_name, filename)
    app_ids = []
    vectors = []
    with open(full_path, "r") as f:
        for line in csv.reader(f, delimiter="\t"):
            if not line:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            app_id = int(line[1].split("/")[-1])
            if valid_app_ids is not None and app_id not in valid_app_ids:
                continue
            # A list comprehension (not map) so this is a real numeric array on
            # both Python 2 and Python 3.
            vector = np.array([float(value) for value in line[2:]])
            vector /= np.linalg.norm(vector)
            app_ids.append(app_id)
            vectors.append(vector)

    if not vectors:
        # Keep the return shape consistent instead of collapsing to a scalar.
        return np.array([], dtype=int), np.zeros((0, 0))

    app_ids, vectors = np.array(app_ids), np.array(vectors)
    app_order = np.argsort(app_ids)
    app_ids = app_ids[app_order]
    vectors = vectors[app_order]
    # Rows are unit length, so the Gram matrix is the cosine similarity.
    return app_ids, vectors.dot(vectors.T)

def load_doc_tag_matrix(valid_app_ids, filename="doc_tag_matrix.npy", data_dir="data"):
    """Load a saved app/tag array and return a smoothed Jaccard similarity matrix.

    The ``.npy`` file is read as a 2-D array whose first column holds app ids
    and whose remaining columns hold the per-tag values for that app.

    Parameters
    ----------
    valid_app_ids : container or None
        When not None, only rows whose app id is ``in`` this container are
        kept. Only membership is tested.
    filename : str
        Name of the ``.npy`` file inside ``data_dir``.
    data_dir : str
        Directory containing the file.

    Returns
    -------
    app_ids : numpy.ndarray
        The kept app ids, sorted ascending.
    jaccard_sim : numpy.ndarray
        Square float matrix ordered like ``app_ids``. Off-diagonal entries are
        ``row_i . row_j / (|nonzero(row_i) OR nonzero(row_j)| + 1)``; the
        ``+ 1`` is the smoothing term. Diagonal entries are set to 1.0.
    """
    with open(os.path.join(data_dir, filename), "rb") as f:
        arr = np.load(f)
    app_ids, jaccard_matrix = arr[:, 0], arr[:, 1:]
    if valid_app_ids is not None:
        # A boolean mask keeps indexing valid even when nothing matches
        # (an empty index list would become a float array and fail).
        keep = np.array([app_id in valid_app_ids for app_id in app_ids], dtype=bool)
        app_ids = app_ids[keep]
        jaccard_matrix = jaccard_matrix[keep]
    app_order = np.argsort(app_ids)
    app_ids = app_ids[app_order]
    jaccard_matrix = jaccard_matrix[app_order]

    # Numerator: plain dot product of the tag rows.
    intersection = jaccard_matrix.dot(jaccard_matrix.T).astype(float)

    # Denominator: number of tags that are non-zero in either row, computed for
    # all pairs at once via |A or B| = |A| + |B| - |A and B|.
    has_tag = (jaccard_matrix != 0).astype(np.int64)
    tags_per_app = has_tag.sum(axis=1)
    union = (tags_per_app[:, np.newaxis] + tags_per_app[np.newaxis, :]
             - has_tag.dot(has_tag.T))

    jaccard_sim = intersection / (union + 1)  # Smoothing
    diagonal = np.arange(len(app_ids))
    jaccard_sim[diagonal, diagonal] = 1.0
    return app_ids, jaccard_sim

In [3]:
# Load the descriptions model without any filter (valid_app_ids=None); the
# app ids it returns define the row/column index space used below.
app_ids, descriptions_sim = load_mallet_matrix("40_features_descriptions", None)
# `normalize` is sklearn.preprocessing.normalize, called with its default arguments.
descriptions_sim = normalize(descriptions_sim)

# Some games are missing from this matrix, so we have to remove them
app_id_to_index = {app_id: i for i, app_id in enumerate(app_ids)}

# Each review model is loaded with app_id_to_index as the filter; the loader
# only tests membership, so the dict acts as the set of allowed app ids.
# NOTE(review): the row indices of these matrices match descriptions_sim only
# if every file contains all ids in app_id_to_index -- confirm.
_, reviews_sim = load_mallet_matrix("40_features", app_id_to_index)
_, reviews_sim_40_1000_2000 = load_mallet_matrix("40_features_1000_tokens_2000_iterations", app_id_to_index)
_, reviews_sim_40_2000_2000 = load_mallet_matrix("40_features_2000_tokens_2000_iterations", app_id_to_index)
_, reviews_sim_40_2000_2000_new = load_mallet_matrix("40_features_2000_tokens_2000_iterations_new_data", app_id_to_index)
_, reviews_sim_40_2000_500_new = load_mallet_matrix("40_features_2000_tokens_500_iterations_new_data", app_id_to_index)
# Disabled model variants, kept for reference:
# _, reviews_sim_40_5000_2000 = load_mallet_matrix("40_features_5000_tokens_2000_iterations", app_id_to_index)
# _, reviews_sim = load_mallet_matrix("40_features", app_id_to_index)
# _, reviews_sim_100 = load_mallet_matrix("100_features", app_id_to_index)
# _, reviews_sim_25_2000 = load_mallet_matrix("25_features_2000_tokens", app_id_to_index)
# _, reviews_sim_50_2000 = load_mallet_matrix("50_features_2000_tokens", app_id_to_index)
# _, reviews_sim_30_1000_2000 = load_mallet_matrix("30_features_1000_tokens_2000_iterations", app_id_to_index)

In [4]:
# Tag-based Jaccard similarity, restricted to the app ids in app_id_to_index;
# the returned (sorted) app ids are discarded.
_, jaccard_sim = load_doc_tag_matrix(app_id_to_index)

In [ ]:


In [5]:
# Display name -> similarity matrix for every model that is evaluated below.
# Commented-out entries correspond to model variants that are not loaded.
models = {
    "40_features": reviews_sim,
    "40_features_1000_tokens_2000_iterations": reviews_sim_40_1000_2000,
    "40_features_2000_tokens_2000_iterations": reviews_sim_40_2000_2000,
    "40_features_2000_tokens_2000_iterations_new_data": reviews_sim_40_2000_2000_new,
    "40_features_2000_tokens_500_iterations_new_data": reviews_sim_40_2000_500_new,
#     "40_features_5000_tokens_2000_iterations": reviews_sim_40_5000_2000,
#     "100_features": reviews_sim_100,
#     "25_features_2000_tokens": reviews_sim_25_2000,
#     "50_features_2000_tokens": reviews_sim_50_2000,
#     "30_features_1000_tokens_2000_iterations": reviews_sim_30_1000_2000,
    "Jaccard": jaccard_sim
}

In [6]:
def get_ground_truth(app_index, ground_truth_matrix, lim=100):
    """Return, as a set, the column indices of the ``lim`` largest values in
    row ``app_index`` of ``ground_truth_matrix``.
    """
    ascending = np.argsort(ground_truth_matrix[app_index])
    best_first = ascending[::-1]
    return set(best_first[:lim])

def get_ranking(app_index, sim_matrix):
    """Return the column indices of row ``app_index`` of ``sim_matrix``,
    ordered from the largest value to the smallest.
    """
    ascending = np.argsort(sim_matrix[app_index])
    return ascending[::-1]

def precision_recall(app_index, sim_matrix, ground_truth_matrix):
    """Compute precision@k and recall@k for one query row, for every cutoff k.

    The ranking is taken from ``sim_matrix`` via ``get_ranking`` and the set of
    relevant items from ``ground_truth_matrix`` via ``get_ground_truth`` (with
    its default ``lim``).

    Parameters
    ----------
    app_index : int
        Row index of the query in both matrices.
    sim_matrix : array-like
        Similarity matrix of the model under test.
    ground_truth_matrix : array-like
        Similarity matrix that defines which items count as relevant.

    Returns
    -------
    precision, recall : numpy.ndarray
        Two float arrays with one entry per ranked item. Entry ``k - 1`` is the
        number of relevant items among the first ``k`` ranked items divided by
        ``k`` (precision) or by the number of relevant items (recall). Recall
        is all zeros when there are no relevant items.
    """
    # From assignment 3
    ranking = get_ranking(app_index, sim_matrix)
    relevant = get_ground_truth(app_index, ground_truth_matrix)
    num_relevant = len(relevant)

    # np.isin needs an array, not a set (a set would be treated as one object).
    relevant_ids = np.array(list(relevant), dtype=ranking.dtype)
    is_hit = np.isin(ranking, relevant_ids)

    # Running count of relevant items seen so far; float so the divisions below
    # are true divisions regardless of the interpreter's integer semantics.
    hits_at_k = np.cumsum(is_hit).astype(float)
    cutoffs = np.arange(1, len(ranking) + 1)

    precision = hits_at_k / cutoffs
    if num_relevant > 0:
        recall = hits_at_k / num_relevant
    else:
        recall = np.zeros(len(ranking))
    return precision, recall

# Display name -> row index for the games plotted as individual curves.
# Each numeric literal is an app id looked up in app_id_to_index, so a
# KeyError here means that id is absent from the loaded descriptions matrix.
queries = {
    "Dark Souls II": app_id_to_index[211420],
    "Subnautica": app_id_to_index[264710],
    "Portal 2": app_id_to_index[620],
    "Ark: Survival Evolved": app_id_to_index[346110],
    "Borderlands 2": app_id_to_index[49520],
}

In [7]:
def plot_precision_recall(axis,
                          test_matrix, test_model_name,
                          ground_truth_matrix, ground_truth_model_name):
    """Draw precision/recall curves for one model on ``axis``.

    One curve is drawn per entry of the module-level ``queries`` dict (in
    sorted name order), plus a thick black "Global Mean" curve that averages
    the precision and recall arrays over every row of ``test_matrix``.

    Parameters
    ----------
    axis : matplotlib axes
        Axes to draw on; title, axis labels and legend are set here.
    test_matrix : numpy.ndarray
        Square similarity matrix of the model being evaluated.
    test_model_name : str
        Name of the evaluated model, used in the title.
    ground_truth_matrix : numpy.ndarray
        Similarity matrix defining the relevant items for each row.
    ground_truth_model_name : str
        Name of the ground-truth model, used in the title.

    Returns
    -------
    total_precision, total_recall : numpy.ndarray
        The mean precision and mean recall at every cutoff.
    """
    title = "Precision recall for %s model using %s model as ground truth"%(test_model_name,
                                                                            ground_truth_model_name)
    axis.set_title(title)
    axis.set_xlabel("Recall@k")
    axis.set_ylabel("Precision@k")

    for name, app_index in sorted(queries.items()):
        precision, recall = precision_recall(app_index, test_matrix, ground_truth_matrix)
        axis.plot(recall, precision, label=name)

    # Size everything from the matrix being evaluated rather than from a
    # notebook-level variable, and accumulate running sums so memory stays
    # O(n) instead of holding an (n, 2, n) buffer.
    num_apps, num_ranked = test_matrix.shape[0], test_matrix.shape[1]
    precision_sum = np.zeros(num_ranked)
    recall_sum = np.zeros(num_ranked)
    for app_index in range(num_apps):
        precision, recall = precision_recall(app_index, test_matrix, ground_truth_matrix)
        precision_sum += precision
        recall_sum += recall
    total_precision = precision_sum / num_apps
    total_recall = recall_sum / num_apps

    axis.plot(total_recall,
              total_precision,
              linewidth=3,
              label="Global Mean",
              color="k")
    axis.legend()
    return total_precision, total_recall

In [8]:
# One panel per model, each evaluated against the descriptions similarity
# matrix as ground truth. squeeze=False + ravel() gives a 1-D array of axes
# even when there is only a single model.
f, axes = plt.subplots(len(models), figsize=(12, 6 * len(models)), squeeze=False)
axes = axes.ravel()

# model name -> (mean precision, mean recall) as returned by plot_precision_recall
model_PRs = {}

# .items() works on both Python 2 and Python 3; panels follow dict iteration order.
for axis, (model_name, model) in zip(axes, models.items()):
    print(model_name)
    model_PRs[model_name] = plot_precision_recall(axis,
                                                  model, model_name,
                                                  descriptions_sim, "Mallet Descriptions")
plt.show()


Jaccard
40_features_1000_tokens_2000_iterations
40_features_2000_tokens_2000_iterations
40_features_2000_tokens_500_iterations_new_data
40_features
40_features_2000_tokens_2000_iterations_new_data

In [15]:
# Overlay the mean precision/recall curve of every model on a single figure.
fig = plt.figure(figsize=(8, 5))
# .items() works on both Python 2 and Python 3.
for model_name, (precision, recall) in model_PRs.items():
    plt.plot(recall, precision, label=model_name, linewidth=2)
plt.xlabel("Recall@k")
plt.ylabel("Precision@k")
plt.title("Average Precision Recall")
plt.xlim(0, 1)
plt.ylim(0, 0.6)
plt.legend()
plt.show()



In [16]:
# Histogram of the second comma-separated field (parsed as an int) of each line.
with open("app_id_to_num_reviews.csv") as f:
    # Lines read from a file keep their trailing newline, so a length check
    # never filters blank lines; strip() first so they are actually skipped.
    counts = [int(line.split(",")[1]) for line in f if line.strip()]
fig = plt.figure(figsize=(12, 6))
plt.hist(counts)
plt.xlabel("Number of Reviews")
plt.ylabel("Number of Games")
plt.title("Number of Reviews per Game")
plt.show()



In [ ]: