In [1]:
%load_ext autoreload
%autoreload 2

import os
import json
import tabulate
from collections import Counter
from IPython.display import HTML, display

Load wordstat JSON logs


In [3]:
# Enter the path to your wordstat_files directory here.
# os.listdir() and open() take '~' literally, so expand it to the home directory first.
models_dir = os.path.expanduser('~/ParlAI/data/controllable_dialogue/wordstat_files')
wordstat_suffix = '.wordstats.json'
# Match on the full suffix (not a substring) so stray files such as backups or editor swap
# files are skipped, and so the suffix can always be stripped to recover the model name.
wordstat_files = [fname for fname in os.listdir(models_dir) if fname.endswith(wordstat_suffix)]
mf2data = {} # master dict mapping model file name to its data dict

print('Loading %i files...' % len(wordstat_files), end='')
for idx, json_file in enumerate(sorted(wordstat_files)):
    # The model file name is the json file name without the wordstats suffix
    mf = json_file[:-len(wordstat_suffix)]
    print('%i, ' % idx, end='')
    with open(os.path.join(models_dir, json_file), "r") as f:
        data = json.load(f)
    mf2data[mf] = data
print('\nFinished loading files')


Loading 54 files...0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 
Finished loading files

Make table of automatic metrics


In [4]:
# This cell makes Table 6 from the paper

# Attributes reported in the table, one column each, in display order.
columns = [
    'extrep_2gram', 'extrep_nonstopword',
    'intrep_2gram', 'intrep_nonstopword',
    'partnerrep_2gram',
    'avg_nidf', 'lastuttsim', 'question',
]

# The first column holds the model name; the remaining ones follow `columns`.
header_row = ['model name', *columns]

# Model file names to show in the table, one table row each, in display order.
# Every entry is used as a key into mf2data when the table is built, so each must
# match a loaded wordstats file name (minus its '.wordstats.json' suffix) exactly.
# NOTE(review): "WD" and "CT" in the group headings below presumably stand for the
# paper's weighted-decoding and conditional-training control methods -- confirm.
rows = [
    # gold data and baselines
    'goldresponse',
    'convai2_finetuned_baseline.valid.usemodelreply.beam1',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10',

    # repetition control (WD)
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-0.5',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-1.25',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-1e+20',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    
    # question control (CT)
    'control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question0.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question1.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question4.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question7.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question10.beamreorder_best_extrep2gram_qn.WDfeatures:extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',

    # specificity control (CT)
    'control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf0.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf2.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf4.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf7.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',
    'control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf9.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20',

    # specificity control (WD)
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf-10.0',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf-4.0',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf4.0',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf6.0',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf8.0',
    
    # response-related control (WD)
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim-10.0_partnerrep_2gram-1e+20',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim0.0_partnerrep_2gram-1e+20',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim5.0_partnerrep_2gram-1e+20',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim10.0_partnerrep_2gram-1e+20',
    'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim13.0_partnerrep_2gram-1e+20',

]

def mean(l):
    """Return the arithmetic mean of the numbers in l.

    Raises ZeroDivisionError if l is empty.
    """
    return sum(l) / len(l)


def model2row(mf, data, attrs=None):
    """Given the data from a json file, make a row of data for the table.

    Args:
        mf: model file name; becomes the first cell of the row.
        data: dict loaded from the model's wordstats json. Its 'sent_attrs'
            entry maps an attribute name to a list of numeric values.
        attrs: attribute names to report, one cell each, in order. Defaults
            to the module-level ``columns`` list.

    Returns:
        A list of strings: ``mf`` followed by one cell per attribute. Each
        cell is the mean of that attribute's values, formatted with four
        decimals for 'avg_nidf' and 'lastuttsim' and as a percentage with two
        decimals for everything else. The cell is '' when the attribute is
        absent from 'sent_attrs' or has no values.
    """
    if attrs is None:
        attrs = columns
    # Loop-invariant, so look it up once rather than once per column.
    sent_attrs = data['sent_attrs']
    row = [mf]
    for attr in attrs:
        values = sent_attrs.get(attr)
        if not values:
            # Missing attribute, or nothing to average: leave the cell blank
            # instead of letting mean() divide by zero.
            row.append('')
            continue
        attr_mean = mean(values)
        if attr in ('avg_nidf', 'lastuttsim'):
            # These two are shown as raw values rather than percentages.
            row.append("%.4f" % attr_mean)
        else:
            row.append("%.2f%%" % (attr_mean * 100))
    return row

# Build table: the header first, then one formatted row per model listed in `rows`
table = [header_row]
for mf in rows:
    table.append(model2row(mf, mf2data[mf]))

# Render with centered string cells, then rewrite that style to left alignment
# before wrapping the markup for display (fix left-alignment).
table_markup = tabulate.tabulate(table, tablefmt='html', stralign='center')
html = HTML(table_markup.replace("text-align: center;", "text-align: left;"))
display(html)


model name extrep_2gram extrep_nonstopword intrep_2gram intrep_nonstopword partnerrep_2gram avg_nidf lastuttsim question
goldresponse 4.65% 9.62% 0.38% 0.97% 5.10% 0.2119 0.1691 28.80%
convai2_finetuned_baseline.valid.usemodelreply.beam1 35.88% 36.31% 8.08% 10.59% 12.20% 0.1688 0.1850 6.46%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10 46.85% 44.15% 0.32% 0.61% 12.90% 0.1662 0.0957 80.87%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-0.5 19.70% 16.85% 0.26% 0.62% 11.93% 0.1730 0.1348 73.04%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-1.25 4.62% 4.79% 0.40% 0.89% 10.61% 0.1763 0.1504 61.22%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5 0.75% 4.61% 0.47% 0.94% 9.89% 0.1771 0.1681 48.89%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-1e+20 0.00% 4.74% 0.51% 1.05% 9.56% 0.1780 0.1711 45.98%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.73% 0.00% 0.17% 0.00% 9.55% 0.1766 0.1676 49.98%
control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question0.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.06% 0.00% 0.19% 0.00% 9.20% 0.1871 0.1753 2.01%
control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question1.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.09% 0.00% 0.19% 0.00% 8.66% 0.1844 0.1722 17.33%
control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question4.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.40% 0.00% 0.25% 0.00% 8.53% 0.1794 0.1713 48.88%
control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question7.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.80% 0.00% 0.17% 0.00% 8.48% 0.1771 0.1724 65.65%
control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 1.27% 0.00% 0.16% 0.00% 8.48% 0.1761 0.1728 79.67%
control_questionb11e10.valid.usemodelreply.beam20.beamminnbest10.setcontrols:question10.beamreorder_best_extrep2gram_qn.WDfeatures:extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 7.64% 0.00% 0.03% 0.00% 10.76% 0.1701 0.1651 99.54%
control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf0.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.60% 0.00% 0.20% 0.00% 9.05% 0.1478 0.1522 48.75%
control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf2.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.28% 0.00% 0.10% 0.00% 8.37% 0.1772 0.1833 50.57%
control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf4.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.12% 0.00% 0.08% 0.00% 7.90% 0.1921 0.1877 29.46%
control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf7.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.02% 0.00% 0.14% 0.00% 8.17% 0.2156 0.1955 16.51%
control_avgnidf10b10e.valid.usemodelreply.beam20.beamminnbest10.setcontrols:avg_nidf9.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20 0.01% 0.00% 0.11% 0.00% 8.01% 0.2462 0.1990 8.50%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf-10.0 0.14% 0.00% 10.59% 0.00% 8.70% 0.1107 0.0994 33.55%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf-4.0 0.65% 0.00% 1.98% 0.00% 9.95% 0.1501 0.1398 44.92%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf4.0 0.15% 0.00% 0.19% 0.00% 7.54% 0.2121 0.1972 45.53%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf6.0 0.07% 0.00% 0.13% 0.00% 6.50% 0.2546 0.2040 39.37%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_nonstopword-1e+20_nidf8.0 0.01% 0.00% 0.10% 0.00% 3.40% 0.4035 0.1436 26.68%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim-10.0_partnerrep_2gram-1e+20 0.13% 0.00% 0.00% 0.00% 0.00% 0.1914 -0.0921 25.71%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim0.0_partnerrep_2gram-1e+20 0.24% 0.00% 0.00% 0.00% 0.00% 0.1785 0.1414 44.55%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim5.0_partnerrep_2gram-1e+20 0.15% 0.00% 0.00% 0.00% 0.00% 0.1973 0.4360 39.78%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim10.0_partnerrep_2gram-1e+20 0.05% 0.00% 0.00% 0.00% 0.00% 0.2535 0.6653 27.56%
convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10.WDfeatures:extrep_2gram-3.5_extrep_nonstopword-1e+20_intrep_2gram-1e+20_intrep_nonstopword-1e+20_lastuttsim13.0_partnerrep_2gram-1e+20 0.02% 0.00% 0.00% 0.00% 0.00% 0.2999 0.7251 20.47%

Show predictions of a model


In [7]:
# Model whose predictions to inspect; it is looked up as a key of mf2data by show_preds
mf = 'convai2_finetuned_baseline.valid.usemodelreply.beam20.beamminnbest10' # beam search baseline
# Passed to Counter.most_common(), so it caps how many distinct utterances are listed
num_show = 100  # Show the top 100 most common utterances

def show_preds(mf, num_show=None):
    """Print how often each distinct predicted utterance occurs for one model.

    Args:
        mf: model file name; must be a key of the global ``mf2data`` dict.
        num_show: how many of the most frequent utterances to list. ``None``
            lists all of them.

    Prints the share of predictions whose text occurs exactly once, followed
    by a COUNT / UTTERANCE listing ordered from most to least frequent.
    Returns nothing.
    """
    # this is the normalized version; use pure_pred_list for unnormalized
    preds = mf2data[mf]['word_statistics']['pred_list']
    counter = Counter(preds)
    num_total = sum(counter.values())  # computed once, reused below
    num_unique = sum(1 for count in counter.values() if count == 1)
    # An empty prediction list would otherwise divide by zero; report 0% instead.
    pct_unique = num_unique * 100 / num_total if num_total else 0.0
    print("%% of utterances that are unique: %.2f%% (%i/%i)\n" % (pct_unique, num_unique, num_total))
    print("COUNT   UTTERANCE")
    for p, count in counter.most_common(num_show):
        print("%5i   %s" % (count, p))

# Print the uniqueness summary and the num_show most common predictions of the model selected in mf
show_preds(mf, num_show)


% of utterances that are unique: 14.77% (1152/7801)

COUNT   UTTERANCE
 2945   what city are you from
 1245   what do you do for living
  205   i am good how are you
  190   do you have any pets
  104   what kind of music do you like
   97   hi how are you today
   80   do you have any hobbies
   71   i am great how are you
   51   no i do not do you
   44   do you play any instruments
   42   what kind of music do you play
   41   hello how are you today
   34   i am in cali
   31   i am good thanks for asking
   30   i am stay at home mom
   29   that sounds like lot of fun
   28   i am well how are you
   27   i am doing well how are you
   27   what kind of work do you do
   26   what do you do for work
   25   what is your favorite food
   23   what is your favorite color
   22   i do not have any pets
   22   i am from los angeles
   21   what languages do you speak
   19   what kind of dog do you have
   19   what kind of dogs do you have
   19   what kind of car do you drive
   17   i am in los angeles
   16   i m doing well how are you
   16   i am doing well how about you
   16   how are you doing today
   15   i do not do you
   15   i am sorry to hear that
   15   i am from united states
   14   how many kids do you have
   14   i am in 3rd grade
   14   i like all kinds of music
   13   what are you going to school for
   13   i was forced to marry when i was young
   12   what kind of food do you like
   12   what color is your hair
   12   do you have any children
   12   do you have any kids
   12   what is your favorite food mine is mexican
   11   i live in los angeles
   11   what kind of music do you listen to
   10   my ex cheated on me and left me for lawyer
   10   i m good how are you
    9   what instrument do you play
    9   hi how are you doing
    9   i used to be painter but now i am retired
    9   what do you do for fun
    9   i like to listen to music
    9   what is your favorite color mine is red
    8   i m doing well how about you
    8   not too bad how about you
    8   i m sorry to hear that
    8   no i don t do you
    8   i like to spend time with my family
    8   i love red reminds me of summer time
    7   sorry to hear that what do you do for living
    7   good morning how are you
    7   what is your favorite thing to buy
    7   i live in alabama where do you live
    6   what kind of car do you have
    6   what is your favorite color mine is yellow
    6   do you have any siblings
    6   sure what do you do for living
    6   no i have not do you
    6   no i am stay at home mom
    6   i work as construction worker
    6   i love pizza what about you
    6   do you have any family
    6   do you play any sports
    6   my wife left me and took care of my children
    6   what is your favorite season mine is winter
    6   i was raised on horse farm
    5   i walk dogs for living
    5   thank you what do you do for living
    5   that sounds like plan
    5   i like all kinds what about you
    5   what do you do in your spare time
    5   hi there how are you
    5   i am pregnant with my first child
    5   yes it is what do you do for living
    5   yes i do what do you do for living
    5   my dad taught me everything i know he taught me everything i know
    5   i do not have any pets do you
    5   happy birthday what do you like to do for fun
    5   i grew up on farm
    5   i like to cook and cook
    5   i am country music singer
    5   what is your favorite book
    4   pretty good thanks and you
    4   what kind of games do you play
    4   sorry to hear that what happened
    4   no i don t have any kids
    4   i play piano and sing folk music
    4   yes i do what do you do