In [1]:
import numpy as np
In [2]:
# Containers for the parsed sessions; the loading cells append one list of
# integer ids per input line to each of them.
train_X, train_y = [], []
test_X, test_y = [], []
In [3]:
def unique(seq):
    """Return the items of ``seq`` without repeats, keeping first occurrences.

    The relative order of the surviving items is the order in which they
    first appear in ``seq``. Membership is tested with ``in`` on a list, so
    items only need to support equality (they do not have to be hashable).
    """
    kept = []
    for item in seq:
        if item in kept:
            continue
        kept.append(item)
    return kept
In [4]:
# Parse the training sessions. Each line has the form "<ids>;<ids>" with
# comma-separated integer ids; the part after ';' may be empty.
# The part before ';' goes to train_X, the part after it to train_y.
#
# The lists are emptied first so that re-running this cell does not append
# every session a second time.
train_X.clear()
train_y.clear()
# `with` guarantees the file handle is closed, even if a line fails to parse.
with open('coursera_sessions_train.txt') as sessions_file:
    for line in sessions_file:
        viewed, bought = line.split(';')
        train_X.append([int(value) for value in viewed.split(',')])
        bought = bought.strip()  # drops the trailing newline
        train_y.append([int(value) for value in bought.split(',')] if bought else [])
In [5]:
# Parse the test sessions. Each line has the form "<ids>;<ids>" with
# comma-separated integer ids; the part after ';' may be empty.
# The part before ';' goes to test_X, the part after it to test_y.
#
# The lists are emptied first so that re-running this cell does not append
# every session a second time.
test_X.clear()
test_y.clear()
# `with` guarantees the file handle is closed, even if a line fails to parse.
with open('coursera_sessions_test.txt') as sessions_file:
    for line in sessions_file:
        viewed, bought = line.split(';')
        test_X.append([int(value) for value in viewed.split(',')])
        bought = bought.strip()  # drops the trailing newline
        test_y.append([int(value) for value in bought.split(',')] if bought else [])
In [6]:
from collections import Counter
# Occurrence counts of every id over all training sessions: one counter for
# the train_X lists and one for the train_y lists. Repeats inside a single
# session are counted each time they occur.
watched_counter = Counter(
    item
    for session in train_X
    for item in session
)
bought_counter = Counter(
    item
    for session in train_y
    for item in session
)
In [7]:
def precision(y_pred, y_true, k):
    """Precision@k: share of the first ``k`` predictions found in ``y_true``.

    The denominator is always ``k``, even when ``y_pred`` holds fewer than
    ``k`` items, so short prediction lists are penalised.

    Raises ZeroDivisionError when ``k`` is 0.
    """
    hits = 0
    for item in y_pred[:k]:
        if item in y_true:
            hits += 1
    return hits / k
In [8]:
def recall(y_pred, y_true, k):
    """Recall@k: share of ``y_true`` items present in the first ``k`` predictions.

    Every element of ``y_true`` is counted, so a value repeated in ``y_true``
    contributes once per repetition to both numerator and denominator.

    Raises ZeroDivisionError when ``y_true`` is empty.
    """
    top_k = y_pred[:k]
    found = [item for item in y_true if item in top_k]
    return len(found) / len(y_true)
In [9]:
# Keep only the training sessions whose train_y entry is non-empty; the two
# lists stay aligned position by position.
X_train_not_empty = [
    session
    for idx, session in enumerate(train_X)
    if train_y[idx]
]
y_train_not_empty = [purchases for purchases in train_y if purchases]
In [10]:
# Keep only the test sessions whose test_y entry is non-empty; the two lists
# stay aligned position by position.
X_test_not_empty = [
    session
    for idx, session in enumerate(test_X)
    if test_y[idx]
]
y_test_not_empty = [purchases for purchases in test_y if purchases]
In [11]:
def sort_key(counter):
    """Build a sort key that maps an item to its count in ``counter``.

    Items missing from ``counter`` get the key 0. ``counter`` only needs a
    dict-style ``get`` method.
    """
    return lambda item: counter.get(item, 0)
# Rank the ids of every non-empty training session, once by watched_counter
# and once by bought_counter (highest count first). sorted() is stable, also
# with reverse=True, so ids with equal counts keep their order of appearance
# in the session; unique() then drops repeated ids.
top_watched_train = [
    unique(sorted(viewed, key=sort_key(watched_counter), reverse=True))
    for viewed in X_train_not_empty
]
top_bought_train = [
    unique(sorted(viewed, key=sort_key(bought_counter), reverse=True))
    for viewed in X_train_not_empty
]
In [12]:
from itertools import cycle
def get_metric(y_pred, y_true, metric, k):
    """Average ``metric(pred, true, k)`` over paired samples.

    ``y_pred`` and ``y_true`` are paired positionally; pairing stops at the
    shorter of the two. Returns the ``np.mean`` of the per-sample values.
    """
    per_sample = [metric(pred, true, k) for pred, true in zip(y_pred, y_true)]
    return np.mean(per_sample)
In [13]:
def get_metrics_values(y_pred, y_true, file):
    """Write the averaged recall and precision at k=1 and k=5 to ``file``.

    The file is opened for writing (truncating any existing content) and
    receives four numbers rounded to two decimals, each followed by a single
    space, in the order: recall@1, precision@1, recall@5, precision@5.
    """
    with open(file, 'w') as fp:
        for k in (1, 5):
            for metric in (recall, precision):
                score = get_metric(y_pred, y_true, metric, k)
                fp.write(str(round(score, 2)) + ' ')
In [14]:
# Train sessions, ranking by watched_counter -> ans1.txt
get_metrics_values(
    y_pred=top_watched_train,
    y_true=y_train_not_empty,
    file='ans1.txt',
)
In [15]:
# Train sessions, ranking by bought_counter -> ans3.txt
get_metrics_values(
    y_pred=top_bought_train,
    y_true=y_train_not_empty,
    file='ans3.txt',
)
In [16]:
# Rank the ids of every non-empty test session using the counters built from
# the training data (highest count first). sorted() is stable, also with
# reverse=True, so ids with equal counts keep their order of appearance in
# the session; unique() then drops repeated ids.
top_watched_test = [
    unique(sorted(viewed, key=sort_key(watched_counter), reverse=True))
    for viewed in X_test_not_empty
]
top_bought_test = [
    unique(sorted(viewed, key=sort_key(bought_counter), reverse=True))
    for viewed in X_test_not_empty
]
In [17]:
# Test sessions, ranking by watched_counter -> ans2.txt
get_metrics_values(
    y_pred=top_watched_test,
    y_true=y_test_not_empty,
    file='ans2.txt',
)
In [18]:
# Test sessions, ranking by bought_counter -> ans4.txt
get_metrics_values(
    y_pred=top_bought_test,
    y_true=y_test_not_empty,
    file='ans4.txt',
)
In [ ]: