In [1]:
cd '/Users/josh/Documents/chihack/article-tagging/lib'


/Users/josh/Documents/chihack/article-tagging/lib

In [2]:
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)


/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)

In [3]:
import os
import tagnews
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import json
import requests
import keras
import shutil

In [4]:
glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')

In [5]:
glove.shape


Out[5]:
(400000, 50)

In [6]:
glove.loc['address_vec'] = glove.loc[['street', 'avenue', 'place', 'road', 'block', 'main', 'city', 'west', 'east', 'north', 'south']].mean()
glove.loc['neighborhood_vec'] = glove.loc[['neighborhood', 'burrough', 'community', 'area']].mean()

In [7]:
with open('tagnews/data/Chicago_Street_Names.csv') as street_names:
    streets = street_names.read().splitlines()[1:]
streets = [i.lower() for i in streets]

with open('tagnews/data/chicago_neighborhoods.csv') as neighborhoods:
    hoods = neighborhoods.read().splitlines()
hoods = list(set([j.lower().replace('\"', '') for j in hoods]))

for name in streets:
    glove.loc[name] = glove.loc['address_vec']
for hood in hoods:
    glove.loc[hood] = glove.loc['neighborhood_vec']

In [8]:
with open('tagnews/data/training.txt', encoding='utf-8') as f:
    our_training_data = f.read()
    
training_df = pd.DataFrame([x.split() for x in our_training_data.split('\n') if x],
                           columns=['word', 'tag'])

training_df.iloc[:,1] = training_df.iloc[:,1].apply(int)
training_df['all_tags'] = 'NA'

# If you want to join our data w/ kaggle data, you can do this.
# ner = tagnews.load_ner_data('tagnews/data/')
# pd.concat([training_df, ner]).reset_index(drop=True)

# If you just want to use our data, you can do this.
ner = training_df

ner = ner[['word', 'all_tags', 'tag']]

In [9]:
# pd.DataFrame(glove.loc[ner.loc[ner['word'] == 'Woodlawn']['word'].str.lower()].values)

In [10]:
ner = pd.concat([ner,
                 pd.DataFrame(ner['word'].str[0].str.isupper().values),
                 pd.DataFrame(glove.loc[ner['word'].str.lower()].values),
                 pd.DataFrame(ner['word'].str.isnumeric().values),
                 pd.DataFrame(ner['word'].str.len().values)],
                 axis='columns')
ner.fillna(value=0.0, inplace=True)

data_dim = 53
timesteps = 25 # only during training, testing can take arbitrary length.
num_classes = 2

train_val_split = int(19 * ner.shape[0] / 20.)

ner_train_idxs = range(0, train_val_split - timesteps, timesteps)
x_train = np.array([ner.iloc[i:i+timesteps, 3:].values
                    for i in ner_train_idxs])
y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)
                    for i in ner_train_idxs])

ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps)
x_val = np.array([ner.iloc[i:i+timesteps, 3:].values
                  for i in ner_val_idxs])
y_val = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)
                  for i in ner_val_idxs])

In [11]:
model = Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))
model.add(LSTM(8, return_sequences=True))
model.add(TimeDistributed(Dense(2, activation='softmax')))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])
print(model.summary(100))


____________________________________________________________________________________________________
Layer (type)                                 Output Shape                            Param #        
====================================================================================================
lstm_1 (LSTM)                                (None, None, 32)                        11008          
____________________________________________________________________________________________________
lstm_2 (LSTM)                                (None, None, 8)                         1312           
____________________________________________________________________________________________________
time_distributed_1 (TimeDistributed)         (None, None, 2)                         18             
====================================================================================================
Total params: 12,338
Trainable params: 12,338
Non-trainable params: 0
____________________________________________________________________________________________________
None

In [12]:
os.makedirs('tmp', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='./tmp/weights-{epoch:02d}.hdf5',
                               monitor='val_categorical_accuracy',
                               verbose=1,
                               save_best_only=True)

class OurAUC(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        # Go to https://geo-extract-tester.herokuapp.com/ and download
        # the validation data (validation.txt).
        '''with open('validation.txt', encoding='utf-8') as f:
            s = f.read()

        gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\n') if w]),
                                 glove.loc[[w for w in s.split('\n') if w]].fillna(0).reset_index(drop=True)],
                                 axis='columns')
        glove_time_size = 100
        preds_batched = []
        i = 0
        while gloved_data[i:i+glove_time_size].size:
            preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],
                                                              axis=0))[0][:,1])
            i += glove_time_size

        with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:
            for prob in [p for pred in preds_batched for p in pred]:
                f.write(str(prob) + '\n')'''

        with open('validation.txt', encoding='utf-8') as f:
            s = f.read()

        gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in s.split('\n') if w]),
                                 glove.loc[[w for w in s.split('\n') if w]].fillna(0).reset_index(drop=True),
                                 pd.DataFrame([[w[0].isnumeric()] for w in s.split('\n') if w]),
                                 pd.DataFrame([[len(w[0])] for w in s.split('\n') if w])],
                                 axis='columns')
        glove_time_size = 100
        preds_batched = []
        i = 0
        while gloved_data[i:i+glove_time_size].size:
            preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],
                                                              axis=0))[0][:,1])
            i += glove_time_size

        with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:
            for prob in [p for pred in preds_batched for p in pred]:
                f.write(str(prob) + '\n')

        with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f:
            url = 'https://geo-extract-tester.herokuapp.com/api/score'
            r = requests.post(url, files={'file': f})
            try:
                print('AUC: {:.5f}'.format(json.loads(r.text)['auc']))
            except KeyError:
                raise ValueError('Problem retrieving AUC from API. Is your validation set up to date?')

our_auc = OurAUC()

In [13]:
model.fit(x_train, y_train,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks=[checkpointer, our_auc],
          verbose=2)


Train on 6271 samples, validate on 330 samples
Epoch 1/20
Epoch 00001: val_categorical_accuracy improved from -inf to 0.97697, saving model to ./tmp/weights-01.hdf5
AUC: 0.92249
 - 60s - loss: 0.1762 - categorical_accuracy: 0.9638 - val_loss: 0.0828 - val_categorical_accuracy: 0.9770
Epoch 2/20
Epoch 00002: val_categorical_accuracy improved from 0.97697 to 0.97867, saving model to ./tmp/weights-02.hdf5
AUC: 0.94973
 - 58s - loss: 0.0841 - categorical_accuracy: 0.9666 - val_loss: 0.0573 - val_categorical_accuracy: 0.9787
Epoch 3/20
Epoch 00003: val_categorical_accuracy improved from 0.97867 to 0.98048, saving model to ./tmp/weights-03.hdf5
AUC: 0.95489
 - 59s - loss: 0.0708 - categorical_accuracy: 0.9734 - val_loss: 0.0526 - val_categorical_accuracy: 0.9805
Epoch 4/20
Epoch 00004: val_categorical_accuracy did not improve
AUC: 0.95848
 - 59s - loss: 0.0666 - categorical_accuracy: 0.9751 - val_loss: 0.0506 - val_categorical_accuracy: 0.9792
Epoch 5/20
Epoch 00005: val_categorical_accuracy did not improve
AUC: 0.95974
 - 61s - loss: 0.0639 - categorical_accuracy: 0.9760 - val_loss: 0.0487 - val_categorical_accuracy: 0.9805
Epoch 6/20
Epoch 00006: val_categorical_accuracy did not improve
AUC: 0.96138
 - 63s - loss: 0.0623 - categorical_accuracy: 0.9765 - val_loss: 0.0483 - val_categorical_accuracy: 0.9804
Epoch 7/20
Epoch 00007: val_categorical_accuracy improved from 0.98048 to 0.98109, saving model to ./tmp/weights-07.hdf5
AUC: 0.95850
 - 61s - loss: 0.0609 - categorical_accuracy: 0.9769 - val_loss: 0.0490 - val_categorical_accuracy: 0.9811
Epoch 8/20
Epoch 00008: val_categorical_accuracy did not improve
AUC: 0.96303
 - 63s - loss: 0.0598 - categorical_accuracy: 0.9772 - val_loss: 0.0462 - val_categorical_accuracy: 0.9807
Epoch 9/20
Epoch 00009: val_categorical_accuracy did not improve
AUC: 0.96292
 - 62s - loss: 0.0589 - categorical_accuracy: 0.9774 - val_loss: 0.0468 - val_categorical_accuracy: 0.9808
Epoch 10/20
Epoch 00010: val_categorical_accuracy did not improve
AUC: 0.96326
 - 59s - loss: 0.0581 - categorical_accuracy: 0.9774 - val_loss: 0.0462 - val_categorical_accuracy: 0.9806
Epoch 11/20
Epoch 00011: val_categorical_accuracy did not improve
AUC: 0.96347
 - 63s - loss: 0.0569 - categorical_accuracy: 0.9778 - val_loss: 0.0456 - val_categorical_accuracy: 0.9800
Epoch 12/20
Epoch 00012: val_categorical_accuracy did not improve
AUC: 0.96203
 - 60s - loss: 0.0563 - categorical_accuracy: 0.9781 - val_loss: 0.0449 - val_categorical_accuracy: 0.9802
Epoch 13/20
Epoch 00013: val_categorical_accuracy did not improve
AUC: 0.96189
 - 61s - loss: 0.0553 - categorical_accuracy: 0.9784 - val_loss: 0.0458 - val_categorical_accuracy: 0.9808
Epoch 14/20
Epoch 00014: val_categorical_accuracy did not improve
AUC: 0.95982
 - 60s - loss: 0.0544 - categorical_accuracy: 0.9784 - val_loss: 0.0457 - val_categorical_accuracy: 0.9810
Epoch 15/20
Epoch 00015: val_categorical_accuracy did not improve
AUC: 0.96014
 - 64s - loss: 0.0536 - categorical_accuracy: 0.9788 - val_loss: 0.0465 - val_categorical_accuracy: 0.9806
Epoch 16/20
Epoch 00016: val_categorical_accuracy did not improve
AUC: 0.96055
 - 62s - loss: 0.0529 - categorical_accuracy: 0.9790 - val_loss: 0.0462 - val_categorical_accuracy: 0.9808
Epoch 17/20
Epoch 00017: val_categorical_accuracy did not improve
AUC: 0.96207
 - 63s - loss: 0.0522 - categorical_accuracy: 0.9793 - val_loss: 0.0464 - val_categorical_accuracy: 0.9802
Epoch 18/20
Epoch 00018: val_categorical_accuracy improved from 0.98109 to 0.98145, saving model to ./tmp/weights-18.hdf5
AUC: 0.96180
 - 64s - loss: 0.0511 - categorical_accuracy: 0.9798 - val_loss: 0.0459 - val_categorical_accuracy: 0.9815
Epoch 19/20
Epoch 00019: val_categorical_accuracy did not improve
AUC: 0.95842
 - 59s - loss: 0.0508 - categorical_accuracy: 0.9803 - val_loss: 0.0470 - val_categorical_accuracy: 0.9804
Epoch 20/20
Epoch 00020: val_categorical_accuracy did not improve
AUC: 0.95720
 - 61s - loss: 0.0498 - categorical_accuracy: 0.9802 - val_loss: 0.0467 - val_categorical_accuracy: 0.9810
Out[13]:
<keras.callbacks.History at 0x1a1731ce10>

In [ ]:


In [ ]: