In [1]:
cd '/Users/josh/Documents/chihack/article-tagging/lib'
In [2]:
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)
In [3]:
import os
import tagnews
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, TimeDistributed
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import json
import requests
import keras
import shutil
In [4]:
glove = tagnews.load_glove('tagnews/data/glove.6B.50d.txt')
In [5]:
glove.shape
Out[5]:
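load_glove returns the 50-dimensional GloVe embeddings as a pandas DataFrame indexed by (lowercase) token, so individual word vectors are plain .loc lookups. A minimal sanity check, assuming the standard 6B vocabulary (which contains 'chicago'), might look like:

glove.loc['chicago'].shape                      # a single token's vector: (50,)
glove.loc[['street', 'avenue']].mean().shape    # the row-wise mean is still (50,)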
In [6]:
glove.loc['address_vec'] = glove.loc[['street', 'avenue', 'place', 'road', 'block', 'main',
                                      'city', 'west', 'east', 'north', 'south']].mean()
glove.loc['neighborhood_vec'] = glove.loc[['neighborhood', 'burrough', 'community', 'area']].mean()
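The two synthetic rows are prototype embeddings: the mean of a handful of address-like and neighborhood-like word vectors. A quick sketch of how close the prototype sits to one of its constituent words (cosine similarity should be fairly high, since 'street' is part of the average):

from numpy.linalg import norm
v = glove.loc['address_vec'].values
s = glove.loc['street'].values
float(v.dot(s) / (norm(v) * norm(s)))   # cosine similarity between the prototype and 'street'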
In [7]:
with open('tagnews/data/Chicago_Street_Names.csv') as street_names:
    streets = street_names.read().splitlines()[1:]
    streets = [i.lower() for i in streets]
with open('tagnews/data/chicago_neighborhoods.csv') as neighborhoods:
    hoods = neighborhoods.read().splitlines()
    hoods = list(set([j.lower().replace('"', '') for j in hoods]))
for name in streets:
    glove.loc[name] = glove.loc['address_vec']
for hood in hoods:
    glove.loc[hood] = glove.loc['neighborhood_vec']
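After these loops every street name from Chicago_Street_Names.csv and every neighborhood from chicago_neighborhoods.csv resolves to the shared prototype vector, so the model sees them uniformly as location-like tokens. A quick spot check, assuming 'woodlawn' appears in the neighborhoods file:

(glove.loc['woodlawn'] == glove.loc['neighborhood_vec']).all()   # expect True if 'woodlawn' is listed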
In [8]:
with open('tagnews/data/training.txt', encoding='utf-8') as f:
    our_training_data = f.read()

training_df = pd.DataFrame([x.split() for x in our_training_data.split('\n') if x],
                           columns=['word', 'tag'])
training_df.iloc[:, 1] = training_df.iloc[:, 1].apply(int)
training_df['all_tags'] = 'NA'

# If you want to join our data with the Kaggle NER data, you can do this:
# ner = tagnews.load_ner_data('tagnews/data/')
# ner = pd.concat([training_df, ner]).reset_index(drop=True)
# If you just want to use our data, do this instead:
ner = training_df
ner = ner[['word', 'all_tags', 'tag']]
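The parsing above assumes training.txt holds one whitespace-separated "word tag" pair per line, with tag equal to 1 when the token belongs to a location mention and 0 otherwise. The commented lines below are a made-up illustration of that shape, and the check confirms only two tag values are present:

# e.g. (illustrative only, not copied from the file):
#   The 0
#   4700 1
#   block 1
#   of 1
#   Damen 1
training_df['tag'].value_counts()   # expect exactly the values 0 and 1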
In [9]:
# pd.DataFrame(glove.loc[ner.loc[ner['word'] == 'Woodlawn']['word'].str.lower()].values)
In [10]:
ner = pd.concat([ner,
                 pd.DataFrame(ner['word'].str[0].str.isupper().values),
                 pd.DataFrame(glove.loc[ner['word'].str.lower()].values),
                 pd.DataFrame(ner['word'].str.isnumeric().values),
                 pd.DataFrame(ner['word'].str.len().values)],
                axis='columns')
ner.fillna(value=0.0, inplace=True)

data_dim = 53
timesteps = 25  # only used during training; prediction can handle arbitrary lengths.
num_classes = 2
train_val_split = int(19 * ner.shape[0] / 20.)

ner_train_idxs = range(0, train_val_split - timesteps, timesteps)
x_train = np.array([ner.iloc[i:i+timesteps, 3:].values
                    for i in ner_train_idxs])
y_train = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)
                    for i in ner_train_idxs])

ner_val_idxs = range(train_val_split, ner.shape[0] - timesteps, timesteps)
x_val = np.array([ner.iloc[i:i+timesteps, 3:].values
                  for i in ner_val_idxs])
y_val = np.array([to_categorical(ner.iloc[i:i+timesteps, 2].values, 2)
                  for i in ner_val_idxs])
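Columns 0-2 of ner are word, all_tags, and tag, so iloc[:, 3:] selects exactly the 53 features that data_dim refers to: the capitalization flag, the 50 GloVe dimensions, the numeric flag, and the token length. The windowed arrays can be sanity-checked with:

x_train.shape   # expected: (len(ner_train_idxs), 25, 53)
y_train.shape   # expected: (len(ner_train_idxs), 25, 2)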
In [11]:
model = Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(None, data_dim)))
model.add(LSTM(8, return_sequences=True))
model.add(TimeDistributed(Dense(2, activation='softmax')))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])
model.summary(line_length=100)
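Because input_shape is (None, data_dim), the fitted network accepts sequences of any length at prediction time, and the TimeDistributed softmax emits a two-way probability for every token. A quick shape check on dummy input (a sketch, not part of training):

dummy = np.zeros((1, 40, data_dim))   # one fake 40-token "article"
model.predict(dummy).shape            # expected: (1, 40, 2)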
In [12]:
os.makedirs('tmp', exist_ok=True)
checkpointer = ModelCheckpoint(filepath='./tmp/weights-{epoch:02d}.hdf5',
                               monitor='val_categorical_accuracy',
                               verbose=1,
                               save_best_only=True)
class OurAUC(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        # Go to https://geo-extract-tester.herokuapp.com/ and download
        # the validation data (validation.txt).
        with open('validation.txt', encoding='utf-8') as f:
            s = f.read()
        words = [w for w in s.split('\n') if w]
        # Build the same 53 features used in training: capitalization flag,
        # the 50-d GloVe vector (looked up lowercased, as in training),
        # a numeric flag, and the token length.
        gloved_data = pd.concat([pd.DataFrame([[w[0].isupper()] for w in words]),
                                 glove.loc[[w.lower() for w in words]].fillna(0).reset_index(drop=True),
                                 pd.DataFrame([[w.isnumeric()] for w in words]),
                                 pd.DataFrame([[len(w)] for w in words])],
                                axis='columns')
        glove_time_size = 100
        preds_batched = []
        i = 0
        while gloved_data[i:i+glove_time_size].size:
            preds_batched.append(model.predict(np.expand_dims(gloved_data[i:i+glove_time_size],
                                                              axis=0))[0][:, 1])
            i += glove_time_size
        with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'w') as f:
            for prob in [p for pred in preds_batched for p in pred]:
                f.write(str(prob) + '\n')
        with open('guesses-{epoch:02d}.txt'.format(epoch=epoch), 'rb') as f:
            url = 'https://geo-extract-tester.herokuapp.com/api/score'
            r = requests.post(url, files={'file': f})
        try:
            print('AUC: {:.5f}'.format(json.loads(r.text)['auc']))
        except KeyError:
            raise ValueError('Problem retrieving AUC from API. '
                             'Is your validation set up to date?')
our_auc = OurAUC()
In [13]:
model.fit(x_train, y_train,
          epochs=20,
          validation_data=(x_val, y_val),
          callbacks=[checkpointer, our_auc],
          verbose=2)
Out[13]:
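ModelCheckpoint saves the full model whenever val_categorical_accuracy improves, so the best epoch can be reloaded after training. The epoch number in the filename below is only illustrative and depends on which checkpoint actually won:

best = keras.models.load_model('./tmp/weights-05.hdf5')   # '05' is an example epoch
best.predict(np.zeros((1, 10, 53))).shape                 # expected: (1, 10, 2)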