In [1]:
cd ..


/home/kevin/Documents/github/article-tagging/lib

In [2]:
import os
import tagnews
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import json
import requests
import keras


Using TensorFlow backend.

In [3]:
glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')

In [4]:
with open('tagnews/data/training.txt', encoding='utf-8') as f:
    our_training_data = f.read()
    
training_df = pd.DataFrame([x.split() for x in our_training_data.split('\n')],
                           columns=['word', 'tag'])
training_df.iloc[:,1] = training_df.iloc[:,1].apply(int)
training_df['all_tags'] = 'NA'

# If you want to join our data w/ kaggle data, you can do this.
# ner = tagnews.load_ner_data('tagnews/data/')
# pd.concat([training_df, ner]).reset_index(drop=True)

# If you just want to use our data, you can do this.
ner = training_df

ner = ner[['word', 'all_tags', 'tag']]

In [5]:
ner = pd.concat([ner,
                 pd.DataFrame(ner['word'].str[0].str.isupper().values),
                 pd.DataFrame(glove.loc[ner['word'].str.lower()].values)],
                axis='columns')
ner.fillna(value=0.0, inplace=True)

data_dim = 51
timesteps = 25 # only during training, testing can take arbitrary length.
num_classes = 2

train_val_split = int(19 * ner.shape[0] / 20.)

ner_train_idxs = range(0, train_val_split - timesteps, timesteps)
x_train = np.array([ner.iloc[i:i+timesteps, 3:].values
                    for i in ner_train_idxs])
y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)
                    for i in ner_train_idxs])

ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps)
x_val = np.array([ner.iloc[i:i+timesteps, 3:].values
                  for i in ner_val_idxs])
y_val = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)
                  for i in ner_val_idxs])

In [6]:
model = Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))
model.add(LSTM(8, return_sequences=True))
model.add(TimeDistributed(Dense(2, activation='softmax')))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])
print(model.summary(100))


____________________________________________________________________________________________________
Layer (type)                                 Output Shape                            Param #        
====================================================================================================
lstm_1 (LSTM)                                (None, None, 32)                        10752          
____________________________________________________________________________________________________
lstm_2 (LSTM)                                (None, None, 8)                         1312           
____________________________________________________________________________________________________
time_distributed_1 (TimeDistributed)         (None, None, 2)                         18             
====================================================================================================
Total params: 12,082
Trainable params: 12,082
Non-trainable params: 0
____________________________________________________________________________________________________
None

In [7]:
os.makedirs('tmp', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='./tmp/weights-{epoch:02d}.hdf5',
                               monitor='val_categorical_accuracy',
                               verbose=1,
                               save_best_only=True)

class OurAUC(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        # Go to https://geo-extract-tester.herokuapp.com/ and download
        # the validation data (validation.txt).
        with open('validation.txt', encoding='utf-8') as f:
            s = f.read()

        gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\n') if w]),
                                 glove.loc[[w for w in s.split('\n') if w]].fillna(0).reset_index(drop=True)],
                               axis='columns')
        glove_time_size = 100
        preds_batched = []
        i = 0
        while gloved_data[i:i+glove_time_size].size:
            preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],
                                                              axis=0))[0][:,1])
            i += glove_time_size

        with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:
            for prob in [p for pred in preds_batched for p in pred]:
                f.write(str(prob) + '\n')

        with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f:
            url = 'https://geo-extract-tester.herokuapp.com/api/score'
            r = requests.post(url, files={'file': f})
            try:
                print('AUC: {:.5f}'.format(json.loads(r.text)['auc']))
            except KeyError:
                raise ValueError('Problem retrieving AUC from API. Is your validation set up to date?')

our_auc = OurAUC()

In [8]:
model.fit(x_train, y_train,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks=[checkpointer, our_auc],
          verbose=2)


Train on 2467 samples, validate on 129 samples
Epoch 1/20
Epoch 00000: val_categorical_accuracy improved from -inf to 0.93054, saving model to ./tmp/weights-00.hdf5
AUC: 0.88599
27s - loss: 0.3390 - categorical_accuracy: 0.9053 - val_loss: 0.2362 - val_categorical_accuracy: 0.9305
Epoch 2/20
Epoch 00001: val_categorical_accuracy did not improve
AUC: 0.93386
26s - loss: 0.2037 - categorical_accuracy: 0.9177 - val_loss: 0.1728 - val_categorical_accuracy: 0.9271
Epoch 3/20
Epoch 00002: val_categorical_accuracy did not improve
AUC: 0.94096
26s - loss: 0.1584 - categorical_accuracy: 0.9369 - val_loss: 0.1627 - val_categorical_accuracy: 0.9253
Epoch 4/20
Epoch 00003: val_categorical_accuracy did not improve
AUC: 0.94627
26s - loss: 0.1458 - categorical_accuracy: 0.9429 - val_loss: 0.1583 - val_categorical_accuracy: 0.9243
Epoch 5/20
Epoch 00004: val_categorical_accuracy did not improve
AUC: 0.94879
27s - loss: 0.1399 - categorical_accuracy: 0.9448 - val_loss: 0.1532 - val_categorical_accuracy: 0.9262
Epoch 6/20
Epoch 00005: val_categorical_accuracy did not improve
AUC: 0.95070
26s - loss: 0.1351 - categorical_accuracy: 0.9465 - val_loss: 0.1526 - val_categorical_accuracy: 0.9287
Epoch 7/20
Epoch 00006: val_categorical_accuracy did not improve
AUC: 0.95202
26s - loss: 0.1326 - categorical_accuracy: 0.9467 - val_loss: 0.1512 - val_categorical_accuracy: 0.9281
Epoch 8/20
Epoch 00007: val_categorical_accuracy did not improve
AUC: 0.95270
27s - loss: 0.1301 - categorical_accuracy: 0.9488 - val_loss: 0.1527 - val_categorical_accuracy: 0.9281
Epoch 9/20
Epoch 00008: val_categorical_accuracy did not improve
AUC: 0.95297
27s - loss: 0.1276 - categorical_accuracy: 0.9493 - val_loss: 0.1465 - val_categorical_accuracy: 0.9274
Epoch 10/20
Epoch 00009: val_categorical_accuracy did not improve
AUC: 0.95275
28s - loss: 0.1255 - categorical_accuracy: 0.9493 - val_loss: 0.1444 - val_categorical_accuracy: 0.9287
Epoch 11/20
Epoch 00010: val_categorical_accuracy did not improve
AUC: 0.95273
27s - loss: 0.1241 - categorical_accuracy: 0.9496 - val_loss: 0.1439 - val_categorical_accuracy: 0.9281
Epoch 12/20
Epoch 00011: val_categorical_accuracy did not improve
AUC: 0.95465
27s - loss: 0.1231 - categorical_accuracy: 0.9498 - val_loss: 0.1443 - val_categorical_accuracy: 0.9268
Epoch 13/20
Epoch 00012: val_categorical_accuracy did not improve
AUC: 0.95379
27s - loss: 0.1211 - categorical_accuracy: 0.9507 - val_loss: 0.1492 - val_categorical_accuracy: 0.9284
Epoch 14/20
Epoch 00013: val_categorical_accuracy did not improve
AUC: 0.95501
27s - loss: 0.1195 - categorical_accuracy: 0.9510 - val_loss: 0.1436 - val_categorical_accuracy: 0.9274
Epoch 15/20
Epoch 00014: val_categorical_accuracy did not improve
AUC: 0.95443
27s - loss: 0.1170 - categorical_accuracy: 0.9527 - val_loss: 0.1405 - val_categorical_accuracy: 0.9290
Epoch 16/20
Epoch 00015: val_categorical_accuracy did not improve
AUC: 0.95387
26s - loss: 0.1151 - categorical_accuracy: 0.9536 - val_loss: 0.1395 - val_categorical_accuracy: 0.9281
Epoch 17/20
Epoch 00016: val_categorical_accuracy did not improve
AUC: 0.95428
27s - loss: 0.1135 - categorical_accuracy: 0.9538 - val_loss: 0.1402 - val_categorical_accuracy: 0.9278
Epoch 18/20
Epoch 00017: val_categorical_accuracy did not improve
AUC: 0.95323
27s - loss: 0.1120 - categorical_accuracy: 0.9546 - val_loss: 0.1450 - val_categorical_accuracy: 0.9287
Epoch 19/20
Epoch 00018: val_categorical_accuracy improved from 0.93054 to 0.93240, saving model to ./tmp/weights-18.hdf5
AUC: 0.95366
27s - loss: 0.1107 - categorical_accuracy: 0.9557 - val_loss: 0.1386 - val_categorical_accuracy: 0.9324
Epoch 20/20
Epoch 00019: val_categorical_accuracy improved from 0.93240 to 0.93240, saving model to ./tmp/weights-19.hdf5
AUC: 0.95260
27s - loss: 0.1078 - categorical_accuracy: 0.9570 - val_loss: 0.1414 - val_categorical_accuracy: 0.9324
Out[8]:
<keras.callbacks.History at 0x7f9ca60245f8>

In [ ]:
# Go to https://geo-extract-tester.herokuapp.com/ and download
# the validation data (validation.txt).
with open('validation.txt', encoding='utf-8') as f:
    s = f.read()

gloved_data = glove.loc[[w for w in s.split('\n') if w]].fillna(0)
glove_time_size = 100
preds_batched = []
i = 0
while gloved_data[i:i+glove_time_size].size:
    preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size], axis=0))[0][:,1])
    i += glove_time_size

preds = [p for pred in preds_batched for p in pred]

print('\n'.join(['{:>15} {:>9.4f}'.format(w, p) for (w, p) in zip(words, preds)][400:500]))
    
with open('guesses.txt', 'w') as f:
    for prob in [p for pred in preds_batched for p in pred]:
        f.write(str(prob) + '\n')

# Now go to https://geo-extract-tester.herokuapp.com/ and upload `guesses.txt` to see how you did!

In [ ]: