In [ ]:
# Copyright 2016 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
#
# Notebook to process the ratings and produce plots and tables.

In [ ]:
%matplotlib notebook
%load_ext autoreload
%autoreload 2

In [ ]:
from __future__ import division

import collections
import csv
import itertools
import jsonpickle
import math
import numpy as np
import multiprocessing
import os
import pandas as pd
import pickle
import Queue  # needed to catch Queue.Empty raised by multiprocessing queues
import scipy
import seaborn as sns
import sys

import sklearn.cross_validation

# In a notebook __file__ is undefined; os.path.dirname('__file__') is '',
# so this appends the parent of the current working directory.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname('__file__'), os.path.pardir)))

import logs_processing.click_model as click_model
from logs_processing.create_tasks import Action, LogItem

In [ ]:
current_palette = sns.color_palette()
sns.palplot(current_palette)

In [ ]:
CF = '<DIRECTORY_WITH_THE_ANONYMIZED_DATASET>'
SPAMMER_FILENAME = 'spammers_anonymized.txt'
RESULTS_D = 'results_D_anonymized.csv'
RESULTS_R = 'results_R_anonymized.csv'
TASK_FILE = 'serps_anonymized.csv'

In [ ]:
USE_CF_TRUST = True

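With USE_CF_TRUST on, each rating is weighted by the worker's CrowdFlower trust score; with it off, all ratings count equally. The actual aggregation lives in click_model.RelContainer; the cell below is only a hypothetical sketch of trust-weighted majority voting over (label, trust) pairs, not the real API.

In [ ]:
# Hypothetical sketch, NOT the RelContainer implementation: sum each
# label's trust weights and return the label with the largest total.
def weighted_majority(ratings):
    totals = collections.defaultdict(float)
    for label, trust in ratings:
        totals[label] += trust
    return max(totals, key=totals.get)
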
Read the data

Read spammer data


In [ ]:
spammers = set()
with open(os.path.join(CF, SPAMMER_FILENAME)) as f:
    for worker_id in f:
        spammers.add(worker_id.rstrip())

print '%d spammers' % len(spammers)

Read ratings


In [ ]:
log_id_to_rel = collections.defaultdict(click_model.RelContainer)
log_id_to_query = {}
good_worker_ratings = 0
total_ratings = 0
all_workers = set()
with open(os.path.join(CF, RESULTS_D)) as results_D:
    for row in csv.DictReader(results_D):
        worker_id = row['cas_worker_id']
        all_workers.add(worker_id)
        total_ratings += 1
        if worker_id not in spammers:
            good_worker_ratings += 1
            trust = float(row['cf_worker_trust']) if USE_CF_TRUST else 1
            log_id = row['cas_log_id']
            click_model.RelContainer.add_rel(log_id_to_rel[log_id].Ds, row['D'], trust)
            log_id_to_query[log_id] = row['cas_query_id']
print '(D) %.1f%% of ratings from spammers' % (100 - 100 * good_worker_ratings / total_ratings)

In [ ]:
good_worker_ratings = 0
total_ratings = 0
yes_detailed = []
with open(os.path.join(CF, RESULTS_R)) as results_R:
    for row in csv.DictReader(results_R):
        worker_id = row['cas_worker_id']
        all_workers.add(worker_id)
        total_ratings += 1
        if worker_id not in spammers:
            good_worker_ratings += 1
            trust = float(row['cf_worker_trust']) if USE_CF_TRUST else 1
            log_id = row['cas_log_id']
            click_model.RelContainer.add_rel(log_id_to_rel[log_id].Rs, row['R'], trust)
            query = row['cas_query_id']
            if row['yes_detailed']:
                yes_detailed.append(row['yes_detailed'])
            old_query = log_id_to_query.setdefault(log_id, query)
            if old_query != query:
                print >>sys.stderr, ('The same log_id '
                        '(%s) maps to two different queries: [%s] and [%s]' % (
                                log_id, old_query, query))
                sys.exit(1)

print '%d SERP items with complete relevance judgments' % sum(
        1 for r in log_id_to_rel.itervalues() if r)

print '%d queries with at least one completely judged document' % len(set(
        log_id_to_query[k] for k, r in log_id_to_rel.iteritems() if r))

print '%d workers in total' % len(all_workers)

print '(R) %.1f%% of ratings from spammers' % (100 - 100 * good_worker_ratings / total_ratings)

In [ ]:
def percentages(counter):
    s = sum(counter.values())
    return ['%s: %.1f%%' % (k, v / s * 100) for k, v in counter.most_common()]

In [ ]:
print percentages(collections.Counter(yes_detailed))

In [ ]:
Ds = collections.Counter(x[0] for rel in log_id_to_rel.itervalues() for x in rel.Ds)
Rs = collections.Counter(x[0] for rel in log_id_to_rel.itervalues() for x in rel.Rs)

In [ ]:
print percentages(Ds)
print percentages(Rs)

Correlation between R and D


In [ ]:
most_common_rel_labels = []
for rel in log_id_to_rel.itervalues():
    most_common_rel_labels.append({'D': click_model.rel_most_common(rel.Ds),
                                   'R': click_model.rel_most_common(rel.Rs)})
mc_rels = pd.DataFrame(most_common_rel_labels)

In [ ]:
scipy.stats.pearsonr(mc_rels['R'], mc_rels['D'])

In [ ]:
scipy.stats.spearmanr(mc_rels['R'], mc_rels['D'])

In [ ]:
ax = sns.regplot(x='R', y='D', data=mc_rels, x_jitter=.1, y_jitter=.1)
ax.figure.savefig('R_D_correlation.pdf')

Read SERPs and logs


In [ ]:
data = []
with open(os.path.join(CF, TASK_FILE)) as task_file:
    sat_labels = []
    num_skipped = 0
    num_sat_true = 0
    num_total = 0
    reader = csv.DictReader(task_file)
    for key, query_rows_iter in itertools.groupby(reader,
                    key=lambda row: (row['cas_log_id'].split('_')[:-1], # SERP id
                                     row['cas_query_id'],
                                     row['sat_feedback'])):
        sat = key[2]
        if sat == 'undefined':
            print >>sys.stderr, 'Undefined sat label for query [%s]' % key[1]
        sat_labels.append(sat)
        sat = click_model.parse_sat(sat)
        if sat is None:
            num_skipped += 1
            continue
        elif sat:
            num_sat_true += 1
        data_row = {'query': key[1], 'sat': sat, 'session': [], 'serp': []}
        for row in query_rows_iter:
            data_row['session'].append(jsonpickle.decode(row['actions']))
            data_row['serp'].append(click_model.Snippet(emup=row['emup'],
                                                        cas_item_type=row['cas_item_type'],
                                                        is_complex=row['is_complex']))
        data.append(data_row)
        num_total += 1
    print collections.Counter(sat_labels)
    print 'Skipped %d queries out of %d' % (num_skipped, num_total + num_skipped)
    print '%.1f%% of queries labeled SAT' % (num_sat_true / num_total * 100)

In [ ]:
print '%d queries left' % len(data)
print '%d SERP items w/ ratings' % sum(sum(1 for l in row['session'] if log_id_to_rel[l.log_id]) for row in data)

Do the heavy lifting


In [ ]:
MODELS = {
    'CAS': click_model.CAS(log_id_to_rel),
    # CAS ablations: drop the detailed relevance (D) labels, the
    # satisfaction term, or the regularizer, respectively.
    'CASnod': click_model.CAS(log_id_to_rel, use_D=False),
    'CASnosat': click_model.CAS(log_id_to_rel, sat_term_weight=0),
    'CASnoreg': click_model.CAS(log_id_to_rel, reg_coeff=0),
    # Baselines.
    'random': click_model.RandomSatModel(),
    'PBM': click_model.PyClickModel('PBM', log_id_to_rel),
    'UBM': click_model.PyClickModel('UBM', log_id_to_rel),
    'DCG': click_model.DCG(log_id_to_rel),
    'uUBM': click_model.uUBM(log_id_to_rel),
}

In [ ]:
def compute_performance(index, train_data, test_data, result_queue):
    result = {}
    for name, model in MODELS.iteritems():
        try:
            params = model.train(train_data)
            ll_values_test = [
                    model.log_likelihood(params,
                                         d['session'], d['serp'], d['sat'],
                                         f_only=True
                    ) for d in test_data
            ]
            result[name] = {}
            result[name]['full'] = np.average([l.full for l in ll_values_test])
            result[name]['click'] = np.average([l.clicks for l in ll_values_test])
            result[name]['sat'] = np.average([l.sat for l in ll_values_test])
            result[name]['utility'] = [model.utility(params, d['session'], d['serp']) for d in test_data]
            result[name]['sat pearson'] = scipy.stats.pearsonr(
                    [int(d['sat']) for d in test_data],
                    result[name]['utility']
            )[0]
        except Exception:
            # Store the exception info so the parent process can report it.
            result[name] = sys.exc_info()
    result_queue.put((index, result))

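compute_performance trains every model on the fold's training split, averages the per-query test log-likelihoods (full, clicks-only, sat-only), and correlates each model's utility with the binary SAT labels; exceptions are stored as sys.exc_info() tuples so one failing model does not kill the whole fold. Before launching the worker processes, it can be exercised serially on a small slice; a minimal smoke test (sketch):

In [ ]:
# Serial smoke test (sketch): run one tiny pseudo-fold in-process and
# list which models produced results (failures are stored as tuples).
debug_queue = multiprocessing.Queue()
compute_performance((0, 0), data[:20], data[20:30], debug_queue)
_, debug_result = debug_queue.get()
print sorted(debug_result.keys())
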
In [ ]:
N_REPETITIONS = 1
N_FOLDS = 3
N = len(data)
data = np.array(data)

In [ ]:
result_queue = multiprocessing.Queue()
workers = []
for rep_index in xrange(N_REPETITIONS):
    for fold_num, (train_index, test_index) in enumerate(sklearn.cross_validation.KFold(N, n_folds=N_FOLDS,
                                                                                        shuffle=True,
                                                                                        random_state=rep_index)):
        w = multiprocessing.Process(target=compute_performance,
                    args=((rep_index, fold_num), data[train_index], data[test_index], result_queue))
        workers.append(w)
        w.start()

In [ ]:
results = []
for i in xrange(len(workers)):
    try:
        results.append(result_queue.get(timeout=300))
        print >>sys.stderr, i,
    except Queue.Empty:  # Queue.get raises Queue.Empty on timeout, not TimeoutError
        print >>sys.stderr, '..',
print len(results)

In [ ]:
for w in workers:
    w.join()

Save the results


In [ ]:
def flatten(results):
    out = []
    for idx, result in results:
        for model, r in result.iteritems():
            if isinstance(r, tuple):
                print >>sys.stderr, r
            else:
                out += [{'rep': idx[0], 'fold': idx[1], 'model': model, 'metric': k, 'value': v} for (k, v) in r.iteritems()]
    return out

In [ ]:
d = pd.DataFrame(flatten(results))
d.to_pickle('results.df')

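The pickled frame can be restored in a later session with pd.read_pickle, so the cross-validation does not need to be re-run before redoing the plots below:

In [ ]:
# Reload previously saved cross-validation results (optional).
d = pd.read_pickle('results.df')
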
Metric-metric correlation


In [ ]:
def utility(rep, fold, model):
    mask = ((d['rep'] == rep) & (d['fold'] == fold) &
            (d['model'] == model) & (d['metric'] == 'utility'))
    return d[mask].iloc[0]['value']
#utility(0, 0, 'CAS')

In [ ]:
correlations = {}
model_names = ['CASnod', 'CASnosat', 'CASnoreg',
          'CAS',
          'UBM', 'PBM',
          'DCG', 'uUBM']
for i in xrange(len(model_names)):
    m1 = model_names[i]
    correlations[m1] = {}
    for m2 in model_names[:i]:
        vals = []
        for rep in xrange(N_REPETITIONS):
            for fold in xrange(N_FOLDS):
                try:
                    m1_utility = utility(rep, fold, m1)
                    m2_utility = utility(rep, fold, m2)
                    vals.append(scipy.stats.pearsonr(m1_utility, m2_utility)[0])
                except IndexError:
                    print >>sys.stderr, 'Missing value: rep=%d, fold=%d, m1=%s, m2=%s' % (rep, fold, m1, m2)
                    continue
        correlations[m1][m2] = np.mean(vals)
correlations = pd.DataFrame(correlations, index=model_names[:-1], columns=model_names[1:])

In [ ]:
print correlations.to_latex(float_format=lambda x: '---' if math.isnan(x) else '%.3f' % x)

Complex SERPs


In [ ]:
def is_complex(serp):
    return any(snippet.is_complex for snippet in serp)

In [ ]:
def apply_mask(iterable, mask, inverted=False):
    """Keep elements whose mask value is true (or false if inverted), e.g.
    apply_mask([1, 2, 3], [True, False, True]) == [1, 3]."""
    return [x for x, m in zip(iterable, mask) if (m if not inverted else not m)]

In [ ]:
N_REPETITIONS_COMPLEX = 20

model_names = [
#           'CASnod', 'CASnosat', 'CASnoreg',
#           'CAS',
#           'UBM', 'PBM',
          'random', 'DCG', 'uUBM']

num_complex_serps = {}
results = []

data = np.array(data)

complex_serps = [is_complex(x['serp']) for x in data]

for rep_index, (train_index, test_index) in enumerate(sklearn.cross_validation.StratifiedShuffleSplit(
        complex_serps, N_REPETITIONS_COMPLEX, test_size=1/24, random_state=0)):
    num_complex_serps[rep_index] = {}
    train_data = data[train_index]
    test_data = data[test_index]
    complex_serp_mask = [is_complex(x['serp']) for x in test_data]
    sat_labels = [int(x['sat']) for x in test_data]
    sat_labels_complex = apply_mask(sat_labels, complex_serp_mask)
    num_complex_serps[rep_index] = len(sat_labels_complex)
    for m in model_names:
        try:
            model = MODELS[m]
            params = model.train(train_data)
            m_utility = [model.utility(params, x['session'], x['serp']) for x in test_data]
            results.append({'rep': rep_index, 'model': m,
                            'utility': apply_mask(m_utility, complex_serp_mask), 'sat': sat_labels_complex})
        except Exception as e:
            print >>sys.stderr, 'Exception at rep=%d, m=%s: %s' % (rep_index, m, str(e))
            continue

In [ ]:
per_m_results = collections.defaultdict(lambda: {'u': [], 's': []})
# Use a local name for the loop variable so the main results frame `d`
# (used by the plotting cells below) is not clobbered.
for df in [pd.read_pickle('out_heterogeneous/%d.df' % i) for i in xrange(20)]:
    for c in df:
        r = df[c]
        u = r.utility
        s = r.sat
        assert len(u) == 1
        assert len(s) == 1
        per_m_results[r.name]['u'].append(u[0])
        per_m_results[r.name]['s'].append(s[0])

for m, res in per_m_results.iteritems():
    print m, scipy.stats.pearsonr(res['u'], res['s'])[0]

In [ ]:
sat_pearson = pd.DataFrame(results)

In [ ]:
sat_pearson

Plot Results


In [ ]:
FIGS = '<DIRECTORY_TO_OUTPUT_FIGURES>'

In [ ]:
model_names = ['CASnod', 'CASnosat', 'CASnoreg', 'CAS', 'UBM', 'PBM', 'random', 'DCG', 'uUBM']
colors = sns.color_palette('Set1', n_colors=len(model_names), desat=0.3)

pal = {m: colors[k] for k, m in enumerate(model_names)}

In [ ]:
def restyle(ax):
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.xaxis.grid(color='white')
    ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=30)

In [ ]:
models = ['CASnod', 'CASnosat', 'CASnoreg',
          'CAS',
          'UBM', 'PBM',
          'random', 'uUBM',
]

Clicks LL


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d[d['metric'] == 'click'], order=models, palette=pal)
restyle(ax)
ax.set_ylim([-4.5, -1.4])
ax.figure.savefig(os.path.join(FIGS, 'll_click.pdf'))

Satisfaction LL


In [ ]:
sat_data = d[(d['metric'] == 'sat') & d['model'].isin(models)].copy()
sat_data.set_index(['model', 'rep', 'fold'], inplace=True, verify_integrity=True)
sat_data.sort_index(inplace=True)
# CASnosat has no satisfaction term, so blank out its sat log-likelihood.
sat_data.loc['CASnosat', 'value'] = float('nan')
sat_data.reset_index(level=0, inplace=True)

In [ ]:
ax = sns.boxplot(x='model', y='value', data=sat_data, order=models, palette=pal)
restyle(ax)
ax.set_ylim([-0.8, -0.2])
ax.figure.savefig(os.path.join(FIGS, 'll_sat.pdf'))

Attention feature analysis


In [ ]:
models_attention = ['CASrank', 'CASnogeom', 'CASnoclass',
    'CASnod', 'CAS',
]
colors2 = sns.color_palette('Set2', n_colors=3, desat=0.3)
pal2 = pal.copy()
pal2.update({m: c for m, c in zip(models_attention[:3], colors2)})

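The frame d_att used in the next cells is not built in this notebook; it is assumed to come from the same cross-validation pipeline as d, with the attention-feature ablations (CASrank, CASnogeom, CASnoclass) added to MODELS. A sketch with a hypothetical file name:

In [ ]:
# Hypothetical: load attention-ablation results saved by an earlier run.
d_att = pd.read_pickle('results_attention.df')
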
Clicks


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'click'], order=models_attention, palette=pal2)
restyle(ax)
ax.set_aspect(8)
ax.figure.savefig(os.path.join(FIGS, 'll_click_attention.pdf'), bbox_inches='tight')

Satisfaction


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'sat'], order=models_attention, palette=pal2)
restyle(ax)
ax.set_aspect(16)
ax.figure.savefig(os.path.join(FIGS, 'll_sat_attention.pdf'), bbox_inches='tight')

Pearson


In [ ]:
ax = sns.boxplot(x='model', y='value', data=d_att[d_att['metric'] == 'sat pearson'], order=models_attention, palette=pal2)
restyle(ax)
ax.figure.savefig(os.path.join(FIGS, 'sat_pearson_attention.pdf'), bbox_inches='tight')

Train on the whole dataset (to be used with TREC)


In [ ]:
def picklable_pyclick_model(pyclick_model):
    return {'attr': pyclick_model.params[pyclick_model.param_names.attr],
            'exam': pyclick_model.params[pyclick_model.param_names.exam]}

In [ ]:
TREC_MODELS = {
#      'CAS': click_model.CAS(log_id_to_rel),
#      'CAST': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True),
#      'CASTnoreg': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, reg_coeff=0),
     'CASTnosat': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, sat_term_weight=0),
     'CASTnosatnoreg': click_model.CAS(log_id_to_rel, use_D=False, trec_style=True, sat_term_weight=0, reg_coeff=0),
}
for name, model in TREC_MODELS.iteritems():
    params = model.train(data)
    with open('%s.params' % name, 'wb') as f:  # binary mode for pickle
        pickle.dump(params, f)
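
The saved parameter files can be reloaded later, e.g. by a TREC evaluation script; a minimal sketch:

In [ ]:
# Reload trained parameters for downstream evaluation (sketch).
with open('CASTnosat.params', 'rb') as f:
    trec_params = pickle.load(f)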