In [ ]:
# Copyright 2016 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
#
# Notebook to process the ratings and produce plots and tables.

In [ ]:
%matplotlib notebook
%load_ext autoreload
%autoreload 2

In [ ]:
from __future__ import division

import collections
import csv
import itertools
import jsonpickle
import math
import numpy as np
import multiprocessing
import os
import pandas as pd
import pickle
import Queue  # needed to catch Queue.Empty raised by multiprocessing queues
import scipy
import seaborn as sns
import sys

import sklearn.cross_validation

# In a notebook __file__ is undefined; os.path.dirname('__file__') is '',
# so this appends the parent of the current working directory.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname('__file__'), os.path.pardir)))

import logs_processing.click_model as click_model
from logs_processing.create_tasks import Action, LogItem

In [ ]:
current_palette = sns.color_palette()
sns.palplot(current_palette)

In [ ]:
CF = '<DIRECTORY_WITH_THE_ANONYMIZED_DATASET>'
SPAMMER_FILENAME = 'spammers_anonymized.txt'
RESULTS_D = 'results_D_anonymized.csv'
RESULTS_R = 'results_R_anonymized.csv'
TASK_FILE = 'serps_anonymized.csv'

In [ ]:
USE_CF_TRUST = True

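With USE_CF_TRUST on, each rating is weighted by the worker's CrowdFlower trust score; with it off, all ratings count equally. The actual aggregation lives in click_model.RelContainer; the cell below is only a hypothetical sketch of trust-weighted majority voting over (label, trust) pairs, not the real API.

In [ ]:
# Hypothetical sketch, NOT the RelContainer implementation: sum each
# label's trust weights and return the label with the largest total.
def weighted_majority(ratings):
    totals = collections.defaultdict(float)
    for label, trust in ratings:
        totals[label] += trust
    return max(totals, key=totals.get)
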
Read the data

Read spammer data


In [ ]:
spammers = set()
with open(os.path.join(CF, SPAMMER_FILENAME)) as f:
    for worker_id in f:
        spammers.add(worker_id.rstrip())

print '%d spammers' % len(spammers)

Read ratings


In [ ]:
log_id_to_rel = collections.defaultdict(click_model.RelContainer)
log_id_to_query = {}
good_worker_ratings = 0
total_ratings = 0
all_workers = set()
with open(os.path.join(CF, RESULTS_D)) as results_D:
    for row in csv.DictReader(results_D):
        worker_id = row['cas_worker_id']
        all_workers.add(worker_id)
        total_ratings += 1
        if worker_id not in spammers:
            good_worker_ratings += 1
            trust = float(row['cf_worker_trust']) if USE_CF_TRUST else 1
            log_id = row['cas_log_id']
            click_model.RelContainer.add_rel(log_id_to_rel[log_id].Ds, row['D'], trust)
            log_id_to_query[log_id] = row['cas_query_id']
print '(D) %.1f%% of ratings from spammers' % (100 - 100 * good_worker_ratings / total_ratings)

In [ ]:
good_worker_ratings = 0
total_ratings = 0
yes_detailed = []
with open(os.path.join(CF, RESULTS_R)) as results_R:
    for row in csv.DictReader(results_R):
        worker_id = row['cas_worker_id']
        all_workers.add(worker_id)
        total_ratings += 1
        if worker_id not in spammers:
            good_worker_ratings += 1
            trust = float(row['cf_worker_trust']) if USE_CF_TRUST else 1
            log_id = row['cas_log_id']
            click_model.RelContainer.add_rel(log_id_to_rel[log_id].Rs, row['R'], trust)
            query = row['cas_query_id']
            if row['yes_detailed']:
                yes_detailed.append(row['yes_detailed'])
            old_query = log_id_to_query.setdefault(log_id, query)
            if old_query != query:
                print >>sys.stderr, ('The same log_id '
                        '(%s) maps to two different queries: [%s] and [%s]' % (
                                log_id, old_query, query))
                sys.exit(1)

print '%d SERP items with complete relevance judgments' % sum(
        1 for r in log_id_to_rel.itervalues() if r)

print '%d queries with at least one completely judged document' % len(set(
        log_id_to_query[k] for k, r in log_id_to_rel.iteritems() if r))

print '%d workers in total' % len(all_workers)

print '(R) %.1f%% of ratings from spammers' % (100 - 100 * good_worker_ratings / total_ratings)

In [ ]:
def percentages(counter):
    s = sum(counter.values())
    return ['%s: %.1f%%' % (k, v / s * 100) for k, v in counter.most_common()]

In [ ]:
print percentages(collections.Counter(yes_detailed))

In [ ]:
Ds = collections.Counter(x[0] for rel in log_id_to_rel.itervalues() for x in rel.Ds)
Rs = collections.Counter(x[0] for rel in log_id_to_rel.itervalues() for x in rel.Rs)

In [ ]:
print percentages(Ds)
print percentages(Rs)

Correlation between R and D


In [ ]:
most_common_rel_labels = []
for rel in log_id_to_rel.itervalues():
    most_common_rel_labels.append({'D': click_model.rel_most_common(rel.Ds),
                                   'R': click_model.rel_most_common(rel.Rs)})
mc_rels = pd.DataFrame(most_common_rel_labels)

In [ ]:
scipy.stats.pearsonr(mc_rels['R'], mc_rels['D'])

In [ ]:
scipy.stats.spearmanr(mc_rels['R'], mc_rels['D'])

In [ ]:
ax = sns.regplot(x='R', y='D', data=mc_rels, x_jitter=.1, y_jitter=.1)
ax.figure.savefig('R_D_correlation.pdf')

Read SERPs and logs


In [ ]:
data = []
with open(os.path.join(CF, TASK_FILE)) as task_file:
    sat_labels = []
    num_skipped = 0
    num_sat_true = 0
    num_total = 0
    reader = csv.DictReader(task_file)
    for key, query_rows_iter in itertools.groupby(reader,
                    key=lambda row: (row['cas_log_id'].split('_')[:-1], # SERP id
                                     row['cas_query_id'],
                                     row['sat_feedback'])):
        sat = key[2]
        if sat == 'undefined':
            print >>sys.stderr, 'Undefined sat label for query [%s]' % key[1]
        sat_labels.append(sat)
        sat = click_model.parse_sat(sat)
        if sat is None:
            num_skipped += 1
            continue
        elif sat:
            num_sat_true += 1
        data_row = {'query': key[1], 'sat': sat, 'session': [], 'serp': []}
        for row in query_rows_iter:
            data_row['session'].append(jsonpickle.decode(row['actions']))
            data_row['serp'].append(click_model.Snippet(emup=row['emup'],
                                                        cas_item_type=row['cas_item_type'],
                                                        is_complex=row['is_complex']))
        data.append(data_row)
        num_total += 1
    print collections.Counter(sat_labels)
    print 'Skipped %d queries out of %d' % (num_skipped, num_total + num_skipped)
    print '%.1f%% of queries labeled SAT' % (num_sat_true / num_total * 100)

In [ ]:
print '%d queries left' % len(data)
print '%d SERP items w/ ratings' % sum(sum(1 for l in row['session'] if log_id_to_rel[l.log_id]) for row in data)

Do the heavy lifting


In [ ]:
MODELS = {
    'CAS': click_model.CAS(log_id_to_rel),
    # CAS ablations: drop the detailed relevance (D) labels, the
    # satisfaction term, or the regularizer, respectively.
    'CASnod': click_model.CAS(log_id_to_rel, use_D=False),
    'CASnosat': click_model.CAS(log_id_to_rel, sat_term_weight=0),
    'CASnoreg': click_model.CAS(log_id_to_rel, reg_coeff=0),
    # Baselines.
    'random': click_model.RandomSatModel(),
    'PBM': click_model.PyClickModel('PBM', log_id_to_rel),
    'UBM': click_model.PyClickModel('UBM', log_id_to_rel),
    'DCG': click_model.DCG(log_id_to_rel),
    'uUBM': click_model.uUBM(log_id_to_rel),
}

In [ ]:
def compute_performance(index, train_data, test_data, result_queue):
    result = {}
    for name, model in MODELS.iteritems():
        try:
            params = model.train(train_data)
            ll_values_test = [
                    model.log_likelihood(params,
                                         d['session'], d['serp'], d['sat'],
                                         f_only=True
                    ) for d in test_data
            ]
            result[name] = {}
            result[name]['full'] = np.average([l.full for l in ll_values_test])
            result[name]['click'] = np.average([l.clicks for l in ll_values_test])
            result[name]['sat'] = np.average([l.sat for l in ll_values_test])
            result[name]['utility'] = [model.utility(params, d['session'], d['serp']) for d in test_data]
            result[name]['sat pearson'] = scipy.stats.pearsonr(
                    [int(d['sat']) for d in test_data],
                    result[name]['utility']
            )[0]
        except Exception:
            # Store the exception info so the parent process can report it.
            result[name] = sys.exc_info()
    result_queue.put((index, result))

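compute_performance trains every model on the fold's training split, averages the per-query test log-likelihoods (full, clicks-only, sat-only), and correlates each model's utility with the binary SAT labels; exceptions are stored as sys.exc_info() tuples so one failing model does not kill the whole fold. Before launching the worker processes, it can be exercised serially on a small slice; a minimal smoke test (sketch):

In [ ]:
# Serial smoke test (sketch): run one tiny pseudo-fold in-process and
# list which models produced results (failures are stored as tuples).
debug_queue = multiprocessing.Queue()
compute_performance((0, 0), data[:20], data[20:30], debug_queue)
_, debug_result = debug_queue.get()
print sorted(debug_result.keys())
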
In [ ]:
N_REPETITIONS = 1
N_FOLDS = 3
N = len(data)
data = np.array(data)

In [ ]:
result_queue = multiprocessing.Queue()
workers = []
for rep_index in xrange(N_REPETITIONS):
    for fold_num, (train_index, test_index) in enumerate(sklearn.cross_validation.KFold(N, n_folds=N_FOLDS,
                                                                                        shuffle=True,
                                                                                        random_state=rep_index)):
        w = multiprocessing.Process(target=compute_performance,
                    args=((rep_index, fold_num), data[train_index], data[test_index], result_queue))
        workers.append(w)
        w.start()

In [ ]:
results = []
for i in xrange(len(workers)):
    try:
        results.append(result_queue.get(timeout=300))
        print >>sys.stderr, i,
    except Queue.Empty:  # Queue.get raises Queue.Empty on timeout, not TimeoutError
        print >>sys.stderr, '..',
print len(results)

In [ ]:
for w in workers:
    w.join()

Save the results


In [ ]:
def flatten(results):
    out = []
    for idx, result in results:
        for model, r in result.iteritems():
            if isinstance(r, tuple):
                print >>sys.stderr, r
            else:
                out += [{'rep': idx[0], 'fold': idx[1], 'model': model, 'metric': k, 'value': v} for (k, v) in r.iteritems()]
    return out

In [ ]:
d = pd.DataFrame(flatten(results))
d.to_pickle('results.df')

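The pickled frame can be restored in a later session with pd.read_pickle, so the cross-validation does not need to be re-run before redoing the plots below:

In [ ]:
# Reload previously saved cross-validation results (optional).
d = pd.read_pickle('results.df')
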
Metric-metric correlation


In [ ]:
def utility(rep, fold, model):
    mask = ((d['rep'] == rep) & (d['fold'] == fold) &
            (d['model'] == model) & (d['metric'] == 'utility'))
    return d[mask].iloc[0]['value']
#utility(0, 0, 'CAS')

In [ ]:
correlations = {}
model_names = ['CASnod', 'CASnosat', 'CASnoreg',
          'CAS',
          'UBM', 'PBM',
          'DCG', 'uUBM']
for i in xrange(len(model_names)):
    m1 = model_names[i]
    correlations[m1] = {}
    for m2 in model_names[:i]:
        vals = []
        for rep in xrange(N_REPETITIONS):
            for fold in xrange(N_FOLDS):
                try:
                    m1_utility = utility(rep, fold, m1)
                    m2_utility = utility(rep, fold, m2)
                    vals.append(scipy.stats.pearsonr(m1_utility, m2_utility)[0])
                except IndexError:
                    print >>sys.stderr, 'Missing value: rep=%d, fold=%d, m1=%s, m2=%s' % (rep, fold, m1, m2)
                    continue
        correlations[m1][m2] = np.mean(vals)
correlations = pd.DataFrame(correlations, index=model_names[:-1], columns=model_names[1:])

In [ ]:
print correlations.to_latex(float_format=lambda x: '---' if math.isnan(x) else '%.3f' % x)

Complex SERPs


In [ ]:
def is_complex(serp):
    return any(snippet.is_complex for snippet in serp)

In [ ]:
def apply_mask(iterable, mask, inverted=False):
    """Keep elements whose mask value is true (or false if inverted), e.g.
    apply_mask([1, 2, 3], [True, False, True]) == [1, 3]."""
    return [x for x, m in zip(iterable, mask) if (m if not inverted else not m)]

In [ ]:
N_REPETITIONS_COMPLEX = 20

model_names = [
#           'CASnod', 'CASnosat', 'CASnoreg',
#           'CAS',
#           'UBM', 'PBM',
          'random', 'DCG', 'uUBM']

num_complex_serps = {}
results = []

data = np.array(data)

complex_serps = [is_complex(x['serp']) for x in data]

for rep_index, (train_index, test_index) in enumerate(sklearn.cross_validation.StratifiedShuffleSplit(
        complex_serps, N_REPETITIONS_COMPLEX, test_size=1/24, random_state=0)):
    num_complex_serps[rep_index] = {}
    train_data = data[train_index]
    test_data = data[test_index]
    complex_serp_mask = [is_complex(x['serp']) for x in test_data]
    sat_labels = [int(x['sat']) for x in test_data]
    sat_labels_complex = apply_mask(sat_labels, complex_serp_mask)
    num_complex_serps[rep_index] = len(sat_labels_complex)
    for m in model_names:
        try:
            model = MODELS[m]
            params = model.train(train_data)
            m_utility = [model.utility(params, x['session'], x['serp']) for x in test_data]
            results.append({'rep': rep_index, 'model': m,
                            'utility': apply_mask(m_utility, complex_serp_mask), 'sat': sat_labels_complex})
        except Exception as e:
            print >>sys.stderr, 'Exception at rep=%d, m=%s: %s' % (rep_index, m, str(e))
            continue

In [ ]:
per_m_results = collections.defaultdict(lambda: {'u': [], 's': []})
# Use a local name for the loop variable so the main results frame `d`
# (used by the plotting cells below) is not clobbered.
for df in [pd.read_pickle('out_heterogeneous/%d.df' % i) for i in xrange(20)]:
    for c in df:
        r = df[c]
        u = r.utility
        s = r.sat
        assert len(u) == 1
        assert len(s) == 1
        per_m_results[r.name]['u'].append(u[0])
        per_m_results[r.name]['s'].append(s[0])

for m, res in per_m_results.iteritems():
    print m, scipy.stats.pearsonr(res['u'], res['s'])[0]

In [ ]:
sat_pearson = pd.DataFrame(results)

In [ ]:
sat_pearson

Plot Results


In [ ]:
FIGS = '<DIRECTORY_TO_OUTPUT_FIGURES>'

In [ ]:
model_names = ['CASnod', 'CASnosat', 'CASnoreg', 'CAS', 'UBM', 'PBM', 'random', 'DCG', 'uUBM']
colors = sns.color_palette('Set1', n_colors=len(model_names), desat=0.3)

pal = {m: colors[k] for k, m in enumerate(model_names)}

In [ ]:
def restyle(ax):
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.xaxis.grid(color='white')
    ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=30)

In [ ]:
models = ['CASnod', 'CASnosat', 'CASnoreg',
          'CAS',
          'UBM', 'PBM',
          'random', 'uUBM',
]

Clicks LL


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d[d['metric'] == 'click'], order=models, palette=pal)
restyle(ax)
ax.set_ylim([-4.5, -1.4])
ax.figure.savefig(os.path.join(FIGS, 'll_click.pdf'))

Satisfaction LL


In [ ]:
sat_data = d[(d['metric'] == 'sat') & d['model'].isin(models)].copy()
sat_data.set_index(['model', 'rep', 'fold'], inplace=True, verify_integrity=True)
sat_data.sort_index(inplace=True)
# CASnosat has no satisfaction term, so blank out its sat log-likelihood.
sat_data.loc['CASnosat', 'value'] = float('nan')
sat_data.reset_index(level=0, inplace=True)

In [ ]:
ax = sns.boxplot(x='model', y='value', data=sat_data, order=models, palette=pal)
restyle(ax)
ax.set_ylim([-0.8, -0.2])
ax.figure.savefig(os.path.join(FIGS, 'll_sat.pdf'))

Attention feature analysis


In [ ]:
models_attention = ['CASrank', 'CASnogeom', 'CASnoclass',
    'CASnod', 'CAS',
]
colors2 = sns.color_palette('Set2', n_colors=3, desat=0.3)
pal2 = pal.copy()
pal2.update({m: c for m, c in zip(models_attention[:3], colors2)})

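The frame d_att used in the next cells is not built in this notebook; it is assumed to come from the same cross-validation pipeline as d, with the attention-feature ablations (CASrank, CASnogeom, CASnoclass) added to MODELS. A sketch with a hypothetical file name:

In [ ]:
# Hypothetical: load attention-ablation results saved by an earlier run.
d_att = pd.read_pickle('results_attention.df')
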
Clicks


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'click'], order=models_attention, palette=pal2)
restyle(ax)
ax.set_aspect(8)
ax.figure.savefig(os.path.join(FIGS, 'll_click_attention.pdf'), bbox_inches='tight')

Satisfaction


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'sat'], order=models_attention, palette=pal2)
restyle(ax)
ax.set_aspect(16)
ax.figure.savefig(os.path.join(FIGS, 'll_sat_attention.pdf'), bbox_inches='tight')

Pearson


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'sat pearson'], order=models_attention, palette=pal2)
restyle(ax)
ax.figure.savefig(os.path.join(FIGS, 'sat_pearson_attention.pdf'), bbox_inches='tight')

Train on the whole dataset (to be used with TREC)


In [ ]:
def picklable_pyclick_model(pyclick_model):
    return {'attr': pyclick_model.params[pyclick_model.param_names.attr],
            'exam': pyclick_model.params[pyclick_model.param_names.exam]}

In [ ]:
TREC_MODELS = {
#      'CAS': click_model.CAS(log_id_to_rel),
#      'CAST': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True),
#      'CASTnoreg': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, reg_coeff=0),
     'CASTnosat': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, sat_term_weight=0),
     'CASTnosatnoreg': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, sat_term_weight=0, reg_coeff=0),
}
for name, model in TREC_MODELS.iteritems():
    params = model.train(data)
    with open('%s.params' % name, 'wb') as f:  # binary mode for pickle
        pickle.dump(params, f)
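
The saved parameter files can be reloaded later, e.g. by a TREC evaluation script; a minimal sketch:

In [ ]:
# Reload trained parameters for downstream evaluation (sketch).
with open('CASTnosat.params', 'rb') as f:
    trec_params = pickle.load(f)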