Comparing Encoder-Decoder Models

Model Architecture


In [ ]:
report_files = [
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing6_200_512_04drb/encdec_noing6_200_512_04drb.json',
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing10_200_512_04drb/encdec_noing10_200_512_04drb.json',
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing15_200_512_04drb/encdec_noing15_200_512_04drb.json',
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing23_200_512_04drb/encdec_noing23_200_512_04drb.json',
]
log_files = [
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing6_200_512_04drb/encdec_noing6_200_512_04drb_logs.json',
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing10_200_512_04drb/encdec_noing10_200_512_04drb_logs.json',
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing15_200_512_04drb/encdec_noing15_200_512_04drb_logs.json',
    '/Users/bking/IdeaProjects/LanguageModelRNN/experiment_results/encdec_noing23_200_512_04drb/encdec_noing23_200_512_04drb_logs.json',
]
reports = []
logs = []
import json
import matplotlib.pyplot as plt
import numpy as np

for report_file in report_files:
    with open(report_file) as f:
        reports.append((report_file.split('/')[-1].split('.json')[0], json.loads(f.read())))
for log_file in log_files:
    with open(log_file) as f:
        logs.append((log_file.split('/')[-1].split('.json')[0], json.loads(f.read())))
        
for report_name, report in reports:
    print('\n' + report_name + '\n')
    print('Encoder:')
    print(report['architecture']['encoder'])
    print('Decoder:')
    print(report['architecture']['decoder'])

Perplexity on Each Dataset


In [ ]:
%matplotlib inline
from IPython.display import HTML, display

def display_table(data):
    # Render a list of rows as an HTML table in the notebook.
    display(HTML(
        '<table><tr>{}</tr></table>'.format(
            '</tr><tr>'.join(
                '<td>{}</td>'.format('</td><td>'.join(str(_) for _ in row)) for row in data)
            )
    ))

def bar_chart(data):
    n_groups = len(data)
    
    train_perps = [d[1] for d in data]
    valid_perps = [d[2] for d in data]
    test_perps = [d[3] for d in data]
    
    fig, ax = plt.subplots(figsize=(10,8))
    
    index = np.arange(n_groups)
    bar_width = 0.3

    opacity = 0.4
    error_config = {'ecolor': '0.3'}

    train_bars = plt.bar(index, train_perps, bar_width,
                     alpha=opacity,
                     color='b',
                     error_kw=error_config,
                     label='Training Perplexity')

    valid_bars = plt.bar(index + bar_width, valid_perps, bar_width,
                     alpha=opacity,
                     color='r',
                     error_kw=error_config,
                     label='Valid Perplexity')
    test_bars = plt.bar(index + 2*bar_width, test_perps, bar_width,
                     alpha=opacity,
                     color='g',
                     error_kw=error_config,
                     label='Test Perplexity')

    plt.xlabel('Model')
    plt.ylabel('Perplexity')
    plt.title('Perplexity by Model and Dataset')
    # center each group label under the middle (validation) bar
    plt.xticks(index + bar_width, [d[0] for d in data])
    plt.legend()

    plt.tight_layout()
    plt.show()

data = [['<b>Model</b>', '<b>Train Perplexity</b>', '<b>Valid Perplexity</b>', '<b>Test Perplexity</b>']]

for rname, report in reports:
    data.append([rname, report['train_perplexity'], report['valid_perplexity'], report['test_perplexity']])

display_table(data)
bar_chart(data[1:])

Loss vs. Epoch


In [ ]:
%matplotlib inline
plt.figure(figsize=(10, 8))
for rname, l in logs:
    for k in l.keys():
        # each log series stores per-epoch curves: [epochs, train loss, valid loss, train perplexity, valid perplexity]
        plt.plot(l[k][0], l[k][1], label=str(k) + ' ' + rname + ' (train)')
        plt.plot(l[k][0], l[k][2], label=str(k) + ' ' + rname + ' (valid)')
plt.title('Loss vs. Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

Perplexity vs. Epoch


In [ ]:
%matplotlib inline
plt.figure(figsize=(10, 8))
for rname, l in logs:
    for k in l.keys():
        plt.plot(l[k][0], l[k][3], label=str(k) + ' ' + rname + ' (train)')
        plt.plot(l[k][0], l[k][4], label=str(k) + ' ' + rname + ' (valid)')
plt.title('Perplexity vs. Epoch')
plt.xlabel('Epoch')
plt.ylabel('Perplexity')
plt.legend()
plt.show()

Generations


In [ ]:
def print_sample(sample, best_bleu=None):
    # Strip padding and mask tokens before displaying the input and reference.
    enc_input = ' '.join([w for w in sample['encoder_input'].split(' ') if w != '<pad>'])
    gold = ' '.join([w for w in sample['gold'].split(' ') if w != '<mask>'])
    print('Input: ' + enc_input + '\n')
    print('Generated: ' + sample['generated'] + '\n')
    print('True: ' + gold + '\n')
    if best_bleu is not None:
        cbm = ' '.join([w for w in best_bleu['best_match'].split(' ') if w != '<mask>'])
        print('Closest BLEU Match: ' + cbm + '\n')
        print('Closest BLEU Score: ' + str(best_bleu['best_score']) + '\n')
    print('\n')
    
def display_sample(samples, best_bleu=False):
    for enc_input in samples:
        data = []
        gold = ''
        for rname, sample in samples[enc_input]:
            gold = ' '.join([w for w in sample['gold'].split(' ') if w != '<mask>'])
            data.append([rname, '<b>Generated: </b>' + sample['generated']])
            if best_bleu:
                cbm = ' '.join([w for w in sample['best_match'].split(' ') if w != '<mask>'])
                data.append([rname, '<b>Closest BLEU Match: </b>' + cbm + ' (Score: ' + str(sample['best_score']) + ')'])
        data.insert(0, ['<u><b>' + enc_input + '</b></u>', '<b>True: ' + gold + '</b>'])
        display_table(data)

def process_samples(samples):
    # consolidate samples with identical inputs
    result = {}
    for rname, t_samples, t_cbms in samples:
        for i, sample in enumerate(t_samples):
            enc_input = ' '.join([w for w in sample['encoder_input'].split(' ') if w != '<pad>'])
            if t_cbms is not None:
                sample.update(t_cbms[i])
            if enc_input in result:
                result[enc_input].append((rname, sample))
            else:
                result[enc_input] = [(rname, sample)]
    return result

In [ ]:
samples = process_samples([(rname, r['train_samples'], r.get('best_bleu_matches_train')) for (rname, r) in reports])
display_sample(samples, best_bleu=all('best_bleu_matches_train' in r for (_, r) in reports))

In [ ]:
samples = process_samples([(rname, r['valid_samples'], r.get('best_bleu_matches_valid')) for (rname, r) in reports])
display_sample(samples, best_bleu=all('best_bleu_matches_valid' in r for (_, r) in reports))

In [ ]:
samples = process_samples([(rname, r['test_samples'], r.get('best_bleu_matches_test')) for (rname, r) in reports])
display_sample(samples, best_bleu=all('best_bleu_matches_test' in r for (_, r) in reports))

BLEU Analysis


In [ ]:
def print_bleu(bleu_structs):
    data = [['<b>Model</b>', '<b>Overall Score</b>', '<b>1-gram Score</b>', '<b>2-gram Score</b>', '<b>3-gram Score</b>', '<b>4-gram Score</b>']]
    for rname, bleu_struct in bleu_structs:
        data.append([rname, bleu_struct['score'], bleu_struct['components']['1'], bleu_struct['components']['2'], bleu_struct['components']['3'], bleu_struct['components']['4']])
    display_table(data)

In [ ]:
# Training Set BLEU Scores
print_bleu([(rname, report['train_bleu']) for (rname, report) in reports])

In [ ]:
# Validation Set BLEU Scores
print_bleu([(rname, report['valid_bleu']) for (rname, report) in reports])

In [ ]:
# Test Set BLEU Scores
print_bleu([(rname, report['test_bleu']) for (rname, report) in reports])

In [ ]:
# All Data BLEU Scores
print_bleu([(rname, report['combined_bleu']) for (rname, report) in reports])

N-pairs BLEU Analysis

This analysis randomly samples 1000 pairs of generations (and, as a baseline, 1000 pairs of ground truths), treats one sentence in each pair as a translation of the other, and averages the BLEU scores. The ground-truth pairs should score very low; high scores among generated pairs expose hyper-common, low-diversity generations. A small sketch of the procedure follows.
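
The cell below is a minimal sketch of the n-pairs BLEU idea as described above; it is an illustration, not the code that produced the report['n_pairs_bleu_*'] values, and it assumes NLTK is available (function name and smoothing choice are mine).


In [ ]:
# Hedged sketch: average BLEU over randomly drawn pairs from one pool of sentences
# (all generations, or all ground truths). A high average indicates that many
# sentences in the pool look alike ("hyper-common" generations).
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def n_pairs_bleu(pool, n_pairs=1000, seed=0):
    rng = random.Random(seed)
    smooth = SmoothingFunction().method1
    scores = []
    for _ in range(n_pairs):
        ref, hyp = rng.choice(pool).split(), rng.choice(pool).split()
        scores.append(sentence_bleu([ref], hyp, smoothing_function=smooth))
    return sum(scores) / len(scores)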


In [ ]:
# Training Set n-pairs BLEU Scores
print_bleu([(rname, report['n_pairs_bleu_train']) for (rname, report) in reports])

In [ ]:
# Validation Set n-pairs BLEU Scores
print_bleu([(rname, report['n_pairs_bleu_valid']) for (rname, report) in reports])

In [ ]:
# Test Set n-pairs BLEU Scores
print_bleu([(rname, report['n_pairs_bleu_test']) for (rname, report) in reports])

In [ ]:
# Combined n-pairs BLEU Scores
print_bleu([(rname, report['n_pairs_bleu_all']) for (rname, report) in reports])

In [ ]:
# Ground Truth n-pairs BLEU Scores
print_bleu([(rname, report['n_pairs_bleu_gold']) for (rname, report) in reports])

Alignment Analysis

This analysis computes the average Smith-Waterman alignment score between generations, with the same intuition as the n-pairs BLEU analysis: we expect low scores for the ground truth, while hyper-common generations push the average up. A reference implementation sketch follows.
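
For reference, a token-level Smith-Waterman scorer might look like the cell below. This is only an assumption about how the average_alignment_* values could be computed; the match/mismatch/gap weights are placeholders, not the experiment's actual settings.


In [ ]:
# Hedged sketch: local-alignment (Smith-Waterman) score between two token lists.
import numpy as np  # already imported above; repeated here so the cell stands alone

def smith_waterman(a, b, match=2, mismatch=-1, gap=-1):
    H = np.zeros((len(a) + 1, len(b) + 1))
    for i in range(1, len(a) + 1):
        for j in range(1, len(b) + 1):
            diag = H[i - 1, j - 1] + (match if a[i - 1] == b[j - 1] else mismatch)
            H[i, j] = max(0, diag, H[i - 1, j] + gap, H[i, j - 1] + gap)
    return H.max()

# Example usage: average pairwise alignment over a pool of generated sentences, e.g.
# np.mean([smith_waterman(x.split(), y.split()) for x in gens for y in gens if x is not y])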


In [ ]:
def print_align(reports):
    data= [['<b>Model</b>', '<b>Average (Train) Generated Score</b>','<b>Average (Valid) Generated Score</b>','<b>Average (Test) Generated Score</b>','<b>Average (All) Generated Score</b>', '<b>Average (Gold) Score</b>']]
    for rname, report in reports:
        data.append([rname, report['average_alignment_train'], report['average_alignment_valid'], report['average_alignment_test'], report['average_alignment_all'], report['average_alignment_gold']])
    display_table(data)

print_align(reports)