In [ ]:
# Copyright 2016 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
#
# Notebook to process the ratings and produce plots and tables.
In [ ]:
# IPython setup: select the interactive "notebook" matplotlib backend and load
# the autoreload extension (mode 2) so edited modules are picked up again
# without restarting the kernel.
%matplotlib notebook
%load_ext autoreload
%autoreload 2
In [ ]:
from __future__ import division
import collections
import csv
import itertools
import jsonpickle
import math
import numpy as np
import multiprocessing
import os
import pandas as pd
import pickle
import scipy
import seaborn as sns
import sys
import sklearn.cross_validation
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname('__file__'), os.path.pardir)))
import logs_processing.click_model as click_model
from logs_processing.create_tasks import Action, LogItem
In [ ]:
# Fetch seaborn's current color palette and draw it as a strip of swatches.
current_palette = sns.color_palette()
sns.palplot(current_palette)
In [ ]:
# Placeholder for the dataset directory; set it to a real path before running.
# Every file name below is joined onto this directory when opened.
CF = '<DIRECTORY_WITH_THE_ANONYMIZED_DATASET>'
# One worker id per line; ratings from these workers are counted as spam.
SPAMMER_FILENAME = 'spammers_anonymized.txt'
# CSV files with the D and R ratings, respectively.
RESULTS_D = 'results_D_anonymized.csv'
RESULTS_R = 'results_R_anonymized.csv'
# CSV file with the SERP items, logged actions and satisfaction feedback.
TASK_FILE = 'serps_anonymized.csv'
In [ ]:
# When True, each rating is weighted by its row's 'cf_worker_trust' value;
# otherwise every rating gets weight 1.
USE_CF_TRUST = True
In [ ]:
spammers = set()
with open(os.path.join(CF, SPAMMER_FILENAME)) as f:
for worker_id in f:
spammers.add(worker_id.rstrip())
print '%d spammers' % len(spammers)
In [ ]:
log_id_to_rel = collections.defaultdict(click_model.RelContainer)
log_id_to_query = {}
good_worker_ratings = 0
total_ratings = 0
all_workers = set()
with open(os.path.join(CF, RESULTS_D)) as results_D:
for row in csv.DictReader(results_D):
worker_id = row['cas_worker_id']
all_workers.add(worker_id)
total_ratings += 1
if worker_id not in spammers:
good_worker_ratings += 1
trust = float(row['cf_worker_trust']) if USE_CF_TRUST else 1
log_id = row['cas_log_id']
click_model.RelContainer.add_rel(log_id_to_rel[log_id].Ds, row['D'], trust)
log_id_to_query[log_id] = row['cas_query_id']
print '(D) %.1f%% ratings form spammers' % (100 - 100 * good_worker_ratings / total_ratings)
In [ ]:
good_worker_ratings = 0
total_ratings = 0
yes_detailed = []
with open(os.path.join(CF, RESULTS_R)) as results_R:
for row in csv.DictReader(results_R):
worker_id = row['cas_worker_id']
all_workers.add(worker_id)
total_ratings += 1
if worker_id not in spammers:
good_worker_ratings +=1
trust = float(row['cf_worker_trust']) if USE_CF_TRUST else 1
log_id = row['cas_log_id']
click_model.RelContainer.add_rel(log_id_to_rel[log_id].Rs, row['R'], trust)
query = row['cas_query_id']
if row['yes_detailed']:
yes_detailed.append(row['yes_detailed'])
old_query = log_id_to_query.setdefault(log_id, query)
if old_query != query:
print >>sys.stderr, ('The same log_id '
'(%s) maps to two different queries: [%s] and [%s]' % (
log_id, old_query, query))
sys.exit(1)
print '%d items with complete relevance' % sum(
1 for r in log_id_to_rel.itervalues() if r)
print '%d queries with at least one completely judged document' % len(set(
log_id_to_query[k] for k, r in log_id_to_rel.iteritems() if r))
print '%d workers in total' % len(all_workers)
print '(R) %.1f%% ratings form spammers' % (100 - 100 * good_worker_ratings / total_ratings)
In [ ]:
def percentages(counter):
    """Format the relative frequency of every key of a Counter.

    Args:
      counter: a `collections.Counter`.

    Returns:
      A list of 'key: NN.N%' strings, most common key first. Relies on the
      file-level `from __future__ import division` for true division.
    """
    total = sum(counter.values())
    formatted = []
    for key, count in counter.most_common():
        formatted.append('%s: %.1f%%' % (key, count / total * 100))
    return formatted
In [ ]:
# Distribution of the non-empty 'yes_detailed' answers gathered from the R ratings.
print percentages(collections.Counter(yes_detailed))
In [ ]:
# Tally the first element of every entry stored in the Ds / Rs collections of
# all relevance containers (presumably the rating label -- TODO confirm against
# click_model.RelContainer.add_rel).
Ds = collections.Counter(x[0] for rel in log_id_to_rel.itervalues() for x in rel.Ds)
Rs = collections.Counter(x[0] for rel in log_id_to_rel.itervalues() for x in rel.Rs)
In [ ]:
# Relative frequencies of the tallied D and R values, most common first.
print percentages(Ds)
print percentages(Rs)
In [ ]:
# For every rated item keep the result of click_model.rel_most_common for its
# D and R ratings, then tabulate the pairs (columns 'D' and 'R').
most_common_rel_labels = [
    {'D': click_model.rel_most_common(rel.Ds),
     'R': click_model.rel_most_common(rel.Rs)}
    for rel in log_id_to_rel.itervalues()
]
mc_rels = pd.DataFrame(most_common_rel_labels)
In [ ]:
# Pearson correlation between the per-item most common R and D labels.
scipy.stats.pearsonr(mc_rels['R'], mc_rels['D'])
In [ ]:
# Spearman rank correlation between the per-item most common R and D labels.
scipy.stats.spearmanr(mc_rels['R'], mc_rels['D'])
In [ ]:
# Regression plot of D against R; jitter is added on both axes (x_jitter and
# y_jitter). The figure is written to the current
# working directory (the path is not joined with any output directory).
ax = sns.regplot(x='R', y='D', data=mc_rels, x_jitter=.1, y_jitter=.1)
ax.figure.savefig('R_D_correlation.pdf')
In [ ]:
data = []
with open(os.path.join(CF, TASK_FILE)) as task_file:
sat_labels = []
num_skipped = 0
num_sat_true = 0
num_total = 0
reader = csv.DictReader(task_file)
for key, query_rows_iter in itertools.groupby(reader,
key=lambda row: (row['cas_log_id'].split('_')[:-1], # SERP id
row['cas_query_id'],
row['sat_feedback'])):
sat = key[2]
if sat == 'undefined':
print >>sys.stderr, 'Undefined sat label for query [%s]' % key[1]
sat_labels.append(sat)
sat = click_model.parse_sat(sat)
if sat is None:
num_skipped += 1
continue
elif sat:
num_sat_true += 1
data_row = {'query': key[1], 'sat': sat, 'session': [], 'serp': []}
for row in query_rows_iter:
data_row['session'].append(jsonpickle.decode(row['actions']))
data_row['serp'].append(click_model.Snippet(emup=row['emup'],
cas_item_type=row['cas_item_type'],
is_complex=row['is_complex']))
data.append(data_row)
num_total += 1
print collections.Counter(sat_labels)
print 'Skipped %d rows out of %d' % (num_skipped, num_total + num_skipped)
print '%.1f%% of SAT labels in the data' % (num_sat_true / num_total * 100)
In [ ]:
# Size of the parsed dataset: number of entries in `data`.
print '%d queries left' % len(data)
# Number of session items whose relevance container is truthy.
# NOTE(review): log_id_to_rel is a defaultdict, so indexing it with a log_id
# that has no ratings inserts a fresh RelContainer as a side effect of this
# count -- confirm that this is acceptable for the models built from it.
print '%d SERP items w/ ratings' % sum(sum(1 for l in row['session'] if log_id_to_rel[l.log_id]) for row in data)
In [ ]:
# Registry of the models under comparison, keyed by the short name used in the
# result tables and plots. Every model except 'random' is constructed from the
# collected relevance labels; the CAS variants differ only in the keyword
# argument that is overridden.
MODELS = {
    'CAS': click_model.CAS(log_id_to_rel),
    'CASnod': click_model.CAS(log_id_to_rel, use_D=False),
    'CASnosat': click_model.CAS(log_id_to_rel, sat_term_weight=0),
    'CASnoreg': click_model.CAS(log_id_to_rel, reg_coeff=0),
    'random': click_model.RandomSatModel(),
    'PBM': click_model.PyClickModel('PBM', log_id_to_rel),
    'UBM': click_model.PyClickModel('UBM', log_id_to_rel),
    'DCG': click_model.DCG(log_id_to_rel),
    'uUBM': click_model.uUBM(log_id_to_rel),
}
In [ ]:
def compute_performance(index, train_data, test_data, result_queue):
    """Train every model in MODELS on one split and report its test metrics.

    Used as a `multiprocessing.Process` target: nothing is returned, the
    outcome is put on `result_queue` as `(index, result)`.

    Args:
      index: identifier of the split; echoed back unchanged with the result.
      train_data: items handed to `model.train`.
      test_data: items with 'session', 'serp' and 'sat' entries.
      result_queue: queue that receives the `(index, result)` tuple.

    `result` maps each model name to a dict of metrics ('full', 'click',
    'sat', 'utility', 'sat pearson') or, if anything failed for that model,
    to a tuple of strings `(exception type name, repr of the exception,
    formatted traceback)`.
    """
    import traceback  # only needed on the failure path

    result = {}
    for name, model in MODELS.iteritems():
        try:
            params = model.train(train_data)
            ll_values_test = [
                model.log_likelihood(params,
                                     d['session'], d['serp'], d['sat'],
                                     f_only=True
                ) for d in test_data
            ]
            result[name] = {}
            result[name]['full'] = np.average([l.full for l in ll_values_test])
            result[name]['click'] = np.average([l.clicks for l in ll_values_test])
            result[name]['sat'] = np.average([l.sat for l in ll_values_test])
            result[name]['utility'] = [model.utility(params, d['session'], d['serp']) for d in test_data]
            result[name]['sat pearson'] = scipy.stats.pearsonr(
                [int(d['sat']) for d in test_data],
                result[name]['utility']
            )[0]
        except Exception as e:
            # The failure is shipped through a multiprocessing queue, which
            # pickles its items. sys.exc_info() contains a traceback object
            # that cannot be pickled, so send plain strings instead. It stays
            # a tuple because flatten() recognizes failures by that type.
            result[name] = (type(e).__name__, repr(e), traceback.format_exc())
    result_queue.put((index, result))
In [ ]:
# Cross-validation setup: number of repetitions (the repetition index is used
# as the shuffle seed) and number of folds per repetition.
N_REPETITIONS = 1
N_FOLDS = 3
N = len(data)
# Stored as an ndarray so it can be indexed with the train/test index arrays.
data = np.array(data)
In [ ]:
# Launch one worker process per (repetition, fold) pair. Each worker trains and
# evaluates all models on its split and reports through `result_queue`.
result_queue = multiprocessing.Queue()
workers = []
for rep_index in xrange(N_REPETITIONS):
    folds = sklearn.cross_validation.KFold(
        N, n_folds=N_FOLDS, shuffle=True, random_state=rep_index)
    for fold_num, (train_index, test_index) in enumerate(folds):
        split_id = (rep_index, fold_num)
        w = multiprocessing.Process(
            target=compute_performance,
            args=(split_id, data[train_index], data[test_index], result_queue))
        workers.append(w)
        w.start()
In [ ]:
results = []
for i in xrange(len(workers)):
try:
results.append(result_queue.get(timeout=300))
print >>sys.stderr, i,
except multiprocessing.TimeoutError:
print >>sys.stderr, '..',
print len(results)
In [ ]:
# Wait for every worker process to terminate.
for worker in workers:
    worker.join()
In [ ]:
def flatten(results):
    """Turn nested per-split results into a flat list of records.

    Args:
      results: iterable of `(idx, result)` pairs, where `idx[0]` is the
        repetition, `idx[1]` the fold and `result` maps a model name either
        to a dict of metrics or to a tuple describing a failure.

    Returns:
      A list of dicts with keys 'rep', 'fold', 'model', 'metric' and 'value',
      one per metric of every successful model. Failure tuples are written to
      stderr and produce no records.
    """
    records = []
    for idx, per_model in results:
        for model, outcome in per_model.items():
            if isinstance(outcome, tuple):
                sys.stderr.write('%s\n' % (outcome,))
                continue
            for metric, value in outcome.items():
                records.append({'rep': idx[0], 'fold': idx[1], 'model': model,
                                'metric': metric, 'value': value})
    return records
In [ ]:
# Tabulate the collected results (one row per rep/fold/model/metric) and
# persist the table to 'results.df' in the current working directory.
d = pd.DataFrame(flatten(results))
d.to_pickle('results.df')
In [ ]:
def utility(rep, fold, model):
    """Return the stored 'utility' value for one (rep, fold, model) triple.

    The value is looked up in the global results frame `d`.

    Args:
      rep: repetition index.
      fold: fold index.
      model: model name.

    Returns:
      The 'value' cell of the first matching row whose metric is 'utility'.

    Raises:
      IndexError: if no row matches.

    Example: utility(0, 0, 'CAS')
    """
    # One combined mask instead of chained d[...][...][...] lookups: in the
    # chained form each mask is computed on the full frame but applied to an
    # already-filtered one, so pandas has to reindex it at every step.
    mask = ((d['rep'] == rep) & (d['fold'] == fold) &
            (d['model'] == model) & (d['metric'] == 'utility'))
    return d[mask].iloc[0]['value']
In [ ]:
# Pairwise Pearson correlation between the utilities of two models, averaged
# over all (repetition, fold) splits. Only pairs with m2 listed before m1 are
# filled in; the final frame drops the last model from the rows and the first
# from the columns.
model_names = ['CASnod', 'CASnosat', 'CASnoreg',
               'CAS',
               'UBM', 'PBM',
               'DCG', 'uUBM']
correlations = {}
for i, m1 in enumerate(model_names):
    correlations[m1] = {}
    for m2 in model_names[:i]:
        vals = []
        for rep, fold in itertools.product(xrange(N_REPETITIONS), xrange(N_FOLDS)):
            try:
                m1_utility = utility(rep, fold, m1)
                m2_utility = utility(rep, fold, m2)
                vals.append(scipy.stats.pearsonr(m1_utility, m2_utility)[0])
            except IndexError:
                # utility() found no row for one of the two models.
                print >>sys.stderr, 'Missing value: rep=%d, fold=%d, m1=%s, m2=%s' % (rep, fold, m1, m2)
        correlations[m1][m2] = np.mean(vals)
correlations = pd.DataFrame(correlations, index=model_names[:-1], columns=model_names[1:])
In [ ]:
# Emit the correlation table as LaTeX, three decimals per cell and '---' for NaN.
print correlations.to_latex(float_format=lambda x: '---' if math.isnan(x) else '%.3f' % x)
In [ ]:
def is_complex(serp):
    """Return True if at least one snippet of `serp` has a truthy `is_complex`.

    Args:
      serp: iterable of objects exposing an `is_complex` attribute.
    """
    for snippet in serp:
        if snippet.is_complex:
            return True
    return False
In [ ]:
def apply_mask(iterable, mask, inverted=False):
    """Select the items of `iterable` according to a parallel mask.

    Args:
      iterable: the items to filter.
      mask: flags paired with the items position by position; pairing stops
        at the shorter of the two.
      inverted: if truthy, keep the items whose flag is falsy instead.

    Returns:
      A list with the selected items in their original order.
    """
    selected = []
    for item, flag in zip(iterable, mask):
        keep = not flag if inverted else bool(flag)
        if keep:
            selected.append(item)
    return selected
In [ ]:
# Repeated stratified hold-out evaluation restricted to complex SERPs: for each
# split, train the selected models on the training part and record utilities
# and satisfaction labels of the complex SERPs in the test part.
N_REPETITIONS_COMPLEX = 20
model_names = [
#    'CASnod', 'CASnosat', 'CASnoreg',
#    'CAS',
#    'UBM', 'PBM',
    'random', 'DCG', 'uUBM']
num_complex_serps = {}
results = []
data = np.array(data)
# Stratify the splits by whether the SERP is complex.
complex_serps = [is_complex(x['serp']) for x in data]
# test_size=1/24 relies on the file-level `from __future__ import division`.
for rep_index, (train_index, test_index) in enumerate(sklearn.cross_validation.StratifiedShuffleSplit(
        complex_serps, N_REPETITIONS_COMPLEX, test_size=1/24, random_state=0)):
    train_data = data[train_index]
    test_data = data[test_index]
    complex_serp_mask = [is_complex(x['serp']) for x in test_data]
    sat_labels = [int(x['sat']) for x in test_data]
    sat_labels_complex = apply_mask(sat_labels, complex_serp_mask)
    # Number of complex SERPs in this test split. The `{}` placeholder that
    # used to be stored first was always overwritten here and has been dropped.
    num_complex_serps[rep_index] = len(sat_labels_complex)
    for m in model_names:
        try:
            model = MODELS[m]
            params = model.train(train_data)
            m_utility = [model.utility(params, x['session'], x['serp']) for x in test_data]
            results.append({'rep': rep_index, 'model': m,
                            'utility': apply_mask(m_utility, complex_serp_mask), 'sat': sat_labels_complex})
        except Exception as e:
            # Best effort: report the failing model and go on with the next one.
            print >>sys.stderr, 'Exception at rep=%d, m=%s: %s' % (rep_index, m, str(e))
            continue
In [ ]:
per_m_results = collections.defaultdict(lambda: {'u': [], 's': []})
for d in [pd.read_pickle('out_heterogeneous/%d.df' % i) for i in xrange(20)]:
for c in d:
r = d[c]
u = r.utility
s = r.sat
assert len(u) == 1
assert len(s) == 1
per_m_results[r.name]['u'].append(u[0])
per_m_results[r.name]['s'].append(s[0])
for m, res in per_m_results.iteritems():
print m, scipy.stats.pearsonr(res['u'], res['s'])[0]
In [ ]:
# Tabulate whatever `results` currently holds. When the notebook is run top to
# bottom this is the list built by the complex-SERP experiment (records with
# 'rep', 'model', 'utility' and 'sat').
sat_pearson = pd.DataFrame(results)
In [ ]:
# Display the frame as the cell output.
sat_pearson
In [ ]:
# Placeholder for the directory the figures below are saved into; set it to a
# real path before running.
FIGS = '<DIRECTORY_TO_OUTPUT_FIGURES>'
In [ ]:
# Fixed model -> color mapping so every plot colors a given model identically:
# one desaturated 'Set1' color per model, in the order listed here.
model_names = [
    'CASnod', 'CASnosat', 'CASnoreg', 'CAS',
    'UBM', 'PBM',
    'random', 'DCG', 'uUBM',
]
colors = sns.color_palette('Set1', n_colors=len(model_names), desat=0.3)
pal = dict(zip(model_names, colors))
In [ ]:
def restyle(ax):
    """Apply the common look to a plot's axes.

    Clears both axis labels, draws the x-axis grid in white and re-applies the
    major x tick labels rotated by 30 degrees.

    Args:
      ax: the axes object to modify in place.
    """
    ax.set_xlabel('')
    ax.set_ylabel('')
    x_axis = ax.xaxis
    x_axis.grid(color='white')
    tick_labels = x_axis.get_majorticklabels()
    ax.set_xticklabels(tick_labels, rotation=30)
In [ ]:
# Models shown in the log-likelihood box plots, in display order (passed as
# `order=` below). 'DCG' is not part of this list.
models = ['CASnod', 'CASnosat', 'CASnoreg',
          'CAS',
          'UBM', 'PBM',
          'random', 'uUBM',
          ]
In [ ]:
# Box plot of the 'click' metric per model, with a fixed y-range, saved as
# 'll_click.pdf' under FIGS.
ax = sns.boxplot(x='model', y='value', data=d[d['metric'] == 'click'], order=models, palette=pal)
restyle(ax)
ax.set_ylim([-4.5, -1.4])
ax.figure.savefig(os.path.join(FIGS, 'll_click.pdf'))
In [ ]:
# Rows of the 'sat' metric for the plotted models, indexed by
# (model, rep, fold); verify_integrity makes duplicate index entries an error.
# NOTE(review): the second mask is computed on the full frame `d` but applied
# to the already-filtered one -- confirm pandas aligns it as intended.
sat_data = d[d['metric'] == 'sat'][d['model'].isin(models)]
sat_data.set_index(['model', 'rep', 'fold'], inplace=True, verify_integrity=True)
sat_data.sort_index(inplace=True)
# Overwrite the 'value' of the CASnosat rows with NaN (presumably so that model
# shows no box in the plot -- TODO confirm).
# NOTE(review): set_value is called with a tuple of lists as the row key;
# verify this is supported by the installed pandas version.
sat_data = sat_data.set_value(('CASnosat', range(N_REPETITIONS), range(N_FOLDS)), 'value', float('NaN'))
# Turn the 'model' index level back into a column for plotting.
sat_data.reset_index(level=0, inplace=True)
In [ ]:
# Box plot of the 'sat' metric per model, with a fixed y-range, saved as
# 'll_sat.pdf' under FIGS.
ax = sns.boxplot(x='model', y='value', data=sat_data, order=models, palette=pal)
restyle(ax)
ax.set_ylim([-0.8, -0.2])
ax.figure.savefig(os.path.join(FIGS, 'll_sat.pdf'))
In [ ]:
# Models compared in the attention plots, in display order. The first three
# have no entry in `pal`, so they get their own desaturated 'Set2' colors on
# top of a copy of the base palette.
models_attention = [
    'CASrank', 'CASnogeom', 'CASnoclass',
    'CASnod', 'CAS',
]
colors2 = sns.color_palette('Set2', n_colors=3, desat=0.3)
pal2 = dict(pal)
pal2.update(zip(models_attention[:3], colors2))
In [ ]:
# Box plot of the 'click' metric for the attention models, saved as
# 'll_click_attention.pdf' under FIGS with a tight bounding box.
# NOTE(review): `d_att` is not defined in the visible part of this notebook --
# confirm where it is created.
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'click'], order=models_attention, palette=pal2)
restyle(ax)
ax.set_aspect(8)
ax.figure.savefig(os.path.join(FIGS, 'll_click_attention.pdf'), bbox_inches='tight')
In [ ]:
# Box plot of the 'sat' metric for the attention models, saved as
# 'll_sat_attention.pdf' under FIGS with a tight bounding box.
# NOTE(review): `d_att` is not defined in the visible part of this notebook --
# confirm where it is created.
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'sat'], order=models_attention, palette=pal2)
restyle(ax)
ax.set_aspect(16)
ax.figure.savefig(os.path.join(FIGS, 'll_sat_attention.pdf'), bbox_inches='tight')
In [ ]:
# Box plot of the 'sat pearson' metric for the attention models, saved as
# 'sat_pearson_attention.pdf' under FIGS with a tight bounding box.
# NOTE(review): `d_att` is not defined in the visible part of this notebook --
# confirm where it is created.
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'sat pearson'], order=models_attention, palette=pal2)
restyle(ax)
ax.figure.savefig(os.path.join(FIGS, 'sat_pearson_attention.pdf'), bbox_inches='tight')
In [ ]:
def picklable_pyclick_model(pyclick_model):
    """Extract the two parameter groups of a model into a plain dict.

    Args:
      pyclick_model: object with a `params` mapping and a `param_names`
        attribute exposing the keys `attr` and `exam`.

    Returns:
      {'attr': ..., 'exam': ...} holding the corresponding `params` entries.
    """
    names = pyclick_model.param_names
    params = pyclick_model.params
    return {'attr': params[names.attr], 'exam': params[names.exam]}
In [ ]:
# CAS variants (all built with use_D=False and trec_style=True) that are
# trained on the full dataset; the trained parameters of each are pickled to
# '<name>.params' in the current working directory.
TREC_MODELS = {
#    'CAS': click_model.CAS(log_id_to_rel),
#    'CAST': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True),
#    'CASTnoreg': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, reg_coeff=0),
    'CASTnosat': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, sat_term_weight=0),
    'CASTnosatnoreg': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, sat_term_weight=0, reg_coeff=0),
}
for name, model in TREC_MODELS.iteritems():
    params = model.train(data)
    # Pickles are byte streams: open the file in binary mode so the output is
    # not subject to newline translation on any platform.
    with open('%s.params' % name, 'wb') as f:
        pickle.dump(params, f)