In [1]:
import numpy as np

In [2]:
# Containers filled by the loading cells: the *_X lists receive the integers
# parsed from before the ';' on each line, the *_y lists those after it.
train_X = []
train_y = []
test_X = []
test_y = []

In [3]:
def unique(seq):
    """Return the items of ``seq`` without repeats, in first-occurrence order.

    Membership is checked with ``in`` against the list built so far, so the
    items only need to support equality (they do not have to be hashable).
    """
    deduped = []
    for item in seq:
        if item in deduped:
            continue
        deduped.append(item)
    return deduped

In [4]:
# Parse the training sessions file. Every line has the form "<ids>;<ids>",
# where each side is a comma-separated list of integers and the right-hand
# side may be empty. The left side is appended to train_X, the right side to
# train_y, so the two lists stay aligned by position.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes (or fails) instead of being left for the garbage collector.
with open('coursera_sessions_train.txt') as train_file:
    for line in train_file:
        x, y = line.split(';')
        train_X.append([int(value) for value in x.split(',')])

        # strip() removes the trailing newline; an empty remainder means the
        # line has nothing after ';' and maps to an empty list.
        y = y.strip()
        y = [int(value) for value in y.split(',')] if y else []
        train_y.append(y)

In [5]:
# Parse the test sessions file. Every line has the form "<ids>;<ids>", where
# each side is a comma-separated list of integers and the right-hand side may
# be empty. The left side is appended to test_X, the right side to test_y, so
# the two lists stay aligned by position.
# The file is opened in a `with` block so the handle is closed as soon as
# parsing finishes (or fails) instead of being left for the garbage collector.
with open('coursera_sessions_test.txt') as test_file:
    for line in test_file:
        x, y = line.split(';')
        test_X.append([int(value) for value in x.split(',')])

        # strip() removes the trailing newline; an empty remainder means the
        # line has nothing after ';' and maps to an empty list.
        y = y.strip()
        y = [int(value) for value in y.split(',')] if y else []
        test_y.append(y)

In [6]:
from collections import Counter
# Count how many times each id occurs over all training sessions: once across
# the train_X lists and once across the train_y lists.
watched_counter = Counter()
for session_values in train_X:
    watched_counter.update(session_values)

bought_counter = Counter()
for session_values in train_y:
    bought_counter.update(session_values)

In [7]:
def precision(y_pred, y_true, k):
    """Share of the first ``k`` predictions that are present in ``y_true``.

    The denominator is always ``k``, even when ``y_pred`` contains fewer than
    ``k`` items. Raises ``ZeroDivisionError`` when ``k`` is 0.
    """
    top_k = y_pred[:k]
    hits = len([item for item in top_k if item in y_true])
    return hits / k

In [8]:
def recall(y_pred, y_true, k):
    """Share of the items in ``y_true`` found among the first ``k`` predictions.

    Every entry of ``y_true`` is counted individually, so repeated entries
    contribute more than once. Raises ``ZeroDivisionError`` when ``y_true``
    is empty.
    """
    top_k = y_pred[:k]
    found = [item for item in y_true if item in top_k]
    return len(found) / len(y_true)

In [9]:
# Keep only the training sessions whose train_y entry is non-empty. Both lists
# are filtered on the same condition, so they remain aligned by position.
X_train_not_empty = [session for idx, session in enumerate(train_X) if train_y[idx]]
y_train_not_empty = [targets for targets in train_y if targets]

In [10]:
# Keep only the test sessions whose test_y entry is non-empty. Both lists are
# filtered on the same condition, so they remain aligned by position.
X_test_not_empty = [session for idx, session in enumerate(test_X) if test_y[idx]]
y_test_not_empty = [targets for targets in test_y if targets]

In [11]:
def sort_key(counter):
    """Build a sort-key function that maps a value to its count in ``counter``.

    Values that are missing from ``counter`` get a key of 0.
    """
    return lambda x: counter.get(x, 0)

# For every retained training session, order its ids by descending count
# (sorted() is stable, so ties keep their in-session order) and drop repeats.
# One ranking uses watched_counter, the other bought_counter.
_watched_key = sort_key(watched_counter)
_bought_key = sort_key(bought_counter)

top_watched_train = []
top_bought_train = []
for session in X_train_not_empty:
    top_watched_train.append(unique(sorted(session, key=_watched_key, reverse=True)))
    top_bought_train.append(unique(sorted(session, key=_bought_key, reverse=True)))

In [12]:
from itertools import cycle

def get_metric(y_pred, y_true, metric, k):
    """Average ``metric(pred, true, k)`` over paired predictions and targets.

    ``y_pred`` and ``y_true`` are walked in parallel; pairing stops at the
    shorter of the two. The per-sample values are averaged with ``np.mean``.
    """
    per_sample = [metric(pred, true, k) for pred, true in zip(y_pred, y_true)]
    return np.mean(per_sample)

In [13]:
def get_metrics_values(y_pred, y_true, file):
    """Write the averaged recall and precision for k=1 and k=5 to ``file``.

    Values are written in the order recall@1, precision@1, recall@5,
    precision@5. Each one is rounded to two decimals and followed by a single
    space. ``file`` is opened in write mode, so existing content is replaced.
    """
    with open(file, 'w') as out:
        # Flatten the (k, metric) grid up front; k is the outer dimension.
        combos = [(k, metric) for k in [1, 5] for metric in [recall, precision]]
        for k, metric in combos:
            score = get_metric(y_pred, y_true, metric, k)
            out.write(str(round(score, 2)) + ' ')

In [14]:
# Training sessions ranked with watched_counter -> ans1.txt
get_metrics_values(
    y_pred=top_watched_train,
    y_true=y_train_not_empty,
    file='ans1.txt',
)

In [15]:
# Training sessions ranked with bought_counter -> ans3.txt
get_metrics_values(
    y_pred=top_bought_train,
    y_true=y_train_not_empty,
    file='ans3.txt',
)

In [16]:
# For every retained test session, order its ids by descending count
# (sorted() is stable, so ties keep their in-session order) and drop repeats.
# The counts come from watched_counter and bought_counter, which are built
# from the training data.
_watched_key = sort_key(watched_counter)
_bought_key = sort_key(bought_counter)

top_watched_test = []
top_bought_test = []
for session in X_test_not_empty:
    top_watched_test.append(unique(sorted(session, key=_watched_key, reverse=True)))
    top_bought_test.append(unique(sorted(session, key=_bought_key, reverse=True)))

In [17]:
# Test sessions ranked with watched_counter -> ans2.txt
get_metrics_values(
    y_pred=top_watched_test,
    y_true=y_test_not_empty,
    file='ans2.txt',
)

In [18]:
# Test sessions ranked with bought_counter -> ans4.txt
get_metrics_values(
    y_pred=top_bought_test,
    y_true=y_test_not_empty,
    file='ans4.txt',
)

In [ ]: