In [ ]:
import pandas as pd
import logging
import glob
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 500)
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

Download Data


In [ ]:
# Ensure that the github-issues-data volume is mounted in /mnt
!ls -la /mnt

In [ ]:
# Set path for data dir
%env DATA_DIR=/mnt/github-issues-data

In [ ]:
# Download the github-issues.zip training data to /mnt/github-issues-data
!wget --directory-prefix=${DATA_DIR} https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip

# Unzip the file into /mnt/github-issues-data directory
!unzip ${DATA_DIR}/github-issues.zip -d ${DATA_DIR}

In [ ]:
# Create a symlink from <current_directory>/github-issues-data to /mnt/github-issues-data
!ln -sf ${DATA_DIR} github-issues-data

In [ ]:
# Make sure that the github-issues-data symlink is created
!ls -lh github-issues-data/github_issues.csv

Process Data

Split the data into train and test sets and preview the data


In [ ]:
data_file='github-issues-data/github_issues.csv'

# Read in the data, sampling 2,000 rows for tutorial speed.
# Set use_sample_data to False to train on the entire dataset.
use_sample_data=True

if use_sample_data:
    training_data_size=2000
    traindf, testdf = train_test_split(pd.read_csv(data_file).sample(n=training_data_size),
                                       test_size=.10)
else:
    traindf, testdf = train_test_split(pd.read_csv(data_file), test_size=.10)


# Print stats about the shape of the data
print(f'Train: {traindf.shape[0]:,} rows {traindf.shape[1]:,} columns')
print(f'Test: {testdf.shape[0]:,} rows {testdf.shape[1]:,} columns')

# preview data
traindf.head(3)

Convert to lists in preparation for modeling


In [ ]:
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()
#preview output of first element
train_body_raw[0]

Pre-Process Data For Deep Learning

See the ktext repository for documentation on this preprocessing package


In [ ]:
%reload_ext autoreload
%autoreload 2
from ktext.preprocess import processor

In [ ]:
%%time
# Clean, tokenize, and apply padding / truncation so that each document length = 70.
#  Also, retain only the top 8,000 words in the vocabulary and map the remaining words
#  to index 1, which becomes the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)
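
As a quick sanity check, you can confirm the padded shape and the range of token ids. This is a small sketch added for illustration; it assumes fit_transform returns a NumPy integer array (which the ktext processor does).


In [ ]:
# Each row should be a fixed-length (70) sequence of integer token ids;
# index 1 is the shared rare-word index (see the comment above), and padding is typically index 0.
print(train_body_vecs.shape)
print(train_body_vecs.min(), train_body_vecs.max())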

Look at one example of processed issue bodies


In [ ]:
print('\noriginal string:\n', train_body_raw[0], '\n')
print('after pre-processing:\n', train_body_vecs[0], '\n')

In [ ]:
# Instantiate a text processor for the titles, with some different parameters:
#  append_indicators=True appends the tokens '_start_' and '_end_' to each
#                         document
#  padding='post' means that zero padding is appended to the end of the
#                 document (as opposed to the default, which is 'pre')
title_pp = processor(append_indicators=True, keep_n=4500,
                     padding_maxlen=12, padding='post')

# process the title data
train_title_vecs = title_pp.fit_transform(train_title_raw)

Look at one example of processed issue titles


In [ ]:
print('\noriginal string:\n', train_title_raw[0])
print('after pre-processing:\n', train_title_vecs[0])

Serialize all of this to disk for later use


In [ ]:
import dill as dpickle
import numpy as np

# Save the preprocessor
with open('body_pp.dpkl', 'wb') as f:
    dpickle.dump(body_pp, f)

with open('title_pp.dpkl', 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data
np.save('train_title_vecs.npy', train_title_vecs)
np.save('train_body_vecs.npy', train_body_vecs)
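
If you want to verify the round trip before moving on, the pickled preprocessors can be loaded back with dill and the arrays with NumPy. This is an optional sketch, not part of the original pipeline.


In [ ]:
# Optional: confirm the serialized artifacts load back correctly (sketch)
with open('body_pp.dpkl', 'rb') as f:
    body_pp_reloaded = dpickle.load(f)

reloaded_body_vecs = np.load('train_body_vecs.npy')
print(type(body_pp_reloaded), reloaded_body_vecs.shape)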

Define Model Architecture

Load the data from disk into variables


In [ ]:
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

In [ ]:
encoder_input_data, doc_length = load_encoder_inputs('train_body_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs('train_title_vecs.npy')

In [ ]:
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')
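
The seq2seq_utils helpers encapsulate a standard teacher-forcing setup. Roughly, the decoder loader is expected to behave like the sketch below (an illustration of the expected behavior, not the library code itself): the decoder input drops the last token and the target is shifted left by one, so at each step the decoder learns to predict the next word.


In [ ]:
# Illustration only: the expected relationship between decoder inputs and targets
vectorized_title = np.load('train_title_vecs.npy')
sketch_decoder_input = vectorized_title[:, :-1]   # '_start_' token first, last token dropped
sketch_decoder_target = vectorized_title[:, 1:]   # shifted left by one position
print(sketch_decoder_input.shape, sketch_decoder_target.shape)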

Define Model Architecture


In [ ]:
%matplotlib inline
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from keras import optimizers

In [ ]:
# Arbitrarily set the latent dimension for the embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for the encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# Intermediate GRU layer (optional)
#x = GRU(latent_dim, name='Encoder-Intermediate-GRU', return_sequences=True)(x)
#x = BatchNormalization(name='Encoder-Batchnorm-2')(x)

# We do not need the `encoder_output` just the hidden state.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate entity so we can just 
#  encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word Embedding For Decoder (ex: Issue Titles)
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder GRU, using the encoder's final hidden state as its initial state.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####

#seq2seq_decoder_out = decoder_model([decoder_inputs, seq2seq_encoder_out])
seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)


seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

Examine Model Architecture Summary


In [ ]:
from seq2seq_utils import viz_model_architecture
seq2seq_Model.summary()
viz_model_architecture(seq2seq_Model)

Train Model


In [ ]:
from keras.callbacks import CSVLogger, ModelCheckpoint

script_name_base = 'tutorial_seq2seq'
csv_logger = CSVLogger('{:}.log'.format(script_name_base))
model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                   save_best_only=True)

batch_size = 1200
epochs = 7
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.12, callbacks=[csv_logger, model_checkpoint])
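
To see how training progressed, you can plot the loss curves from the History object returned by fit. This is a small added sketch; it assumes matplotlib is available in the image (the %matplotlib inline magic was already run above).


In [ ]:
# Plot training vs. validation loss (sketch)
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.ylabel('sparse categorical crossentropy')
plt.legend()
plt.show()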

In [ ]:
# Save the trained model to disk
seq2seq_Model.save('seq2seq_model_tutorial.h5')
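
The saved HDF5 file can be reloaded later (for example, in a separate serving or evaluation step) with Keras' load_model; a minimal sketch:


In [ ]:
# Reload the trained model from disk (sketch)
from keras.models import load_model
reloaded_model = load_model('seq2seq_model_tutorial.h5')
reloaded_model.summary()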

See Example Results On Holdout Set

It is useful to see examples of real predictions on a holdout set to get a sense of the model's performance. We will also evaluate the model numerically in the following section.


In [ ]:
from seq2seq_utils import Seq2Seq_Inference
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=title_pp,
                                 seq2seq_model=seq2seq_Model)

In [ ]:
# this method displays the predictions on random rows of the holdout set
seq2seq_inf.demo_model_predictions(n=50, issue_df=testdf)

Evaluate Model: BLEU Score

For machine-translation tasks such as this one, it is common to measure the accuracy of results using the BLEU score. The convenience function illustrated below uses NLTK's corpus_bleu; its output is the average of BLEU-1, BLEU-2, BLEU-3, and BLEU-4.

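For reference, the underlying metric can also be computed directly with NLTK. The sketch below uses toy tokenized references and hypotheses (not actual model output) purely to illustrate how corpus_bleu yields BLEU-1 through BLEU-4 and how they can be averaged.


In [ ]:
# Sketch: averaging BLEU-1..BLEU-4 with NLTK's corpus_bleu on toy data
from nltk.translate.bleu_score import corpus_bleu

references = [[['fix', 'the', 'broken', 'link', 'in', 'the', 'readme']]]  # list of reference lists per example
hypotheses = [['fix', 'the', 'broken', 'link', 'in', 'readme']]           # one predicted token list per example

bleu_n = [corpus_bleu(references, hypotheses, weights=tuple([1.0 / n] * n)) for n in range(1, 5)]
print(sum(bleu_n) / 4)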

In [ ]:
# Convenience function that generates predictions on the holdout set and calculates the BLEU score

bleu_score = seq2seq_inf.evaluate_model(holdout_bodies=testdf.body.tolist(), 
                                        holdout_titles=testdf.issue_title.tolist(), 
                                        max_len_title=12)

In [2]:
print(f'BLEU Score (avg of BLEU 1-4) on Holdout Set: {bleu_score * 100}')