In [1]:
# until ttk is installed, add parent dir to path
import sys
sys.path.insert(0, '..')

In [2]:
# typical imports
import pandas as pd
import numpy as np
import re

import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

import matplotlib.pyplot as plt

import spacy
import ete3
import seaborn

from ttk.corpus import load_headline_corpus

In [3]:
%%time
# load the corpus
corpus = load_headline_corpus(verbose=True)

print('Headlines:', len(corpus.sents()))


Loading corpus from: S:\git\tacticsiege\tactictoolkit\ttk\..\env\corpus\dated\2017_08_22\corpus
Corpus loaded.
Headlines: 190447
Wall time: 10.9 s
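
Before transforming anything, it can be worth eyeballing a few raw headlines. An illustrative peek (this assumes sents() yields token lists, NLTK-style; the isinstance guard covers the case where it returns plain strings instead):

    from itertools import islice

    # illustrative sanity check: print the first few raw headlines
    for sent in islice(corpus.sents(), 3):
        print(sent if isinstance(sent, str) else ' '.join(sent))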

In [6]:
from ttk.preprocessing import Seq2WordVecTransformer
from ttk.preprocessing import SeqPaddingTransformer

from ttk.corpus import CategorizedDatedCorpusReporter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from ttk.preprocessing import Seq2IndexTransformer

label_enc = LabelEncoder()
reporter = CategorizedDatedCorpusReporter()
corpus_df = reporter.to_data_frame(corpus, categories=['Washington Post', 'CNN'])

X = corpus_df['content'].values
X = [s.split() for s in X]
y = label_enc.fit_transform(corpus_df['category'].values)


def token_filter(t, s):
    # keep only purely alphabetic tokens (drops numbers and punctuation);
    # the second argument is presumably the containing sequence, unused here
    return t.isalpha()

def token_processor(t, s):
    # lowercase each surviving token
    return t.lower()

indexer = Seq2IndexTransformer(add_delimiters=False,
                               token_mapping_func=token_processor,
                               token_filter_func=token_filter)
pad = SeqPaddingTransformer()
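
Conceptually, sequence-to-index transformation builds a vocabulary as tokens arrive and replaces each surviving token with its integer id. A minimal sketch of that logic (not the ttk implementation; the In [7] output suggests ids start at 2, with 0 reserved for padding and, presumably, 1 for unknown tokens):

    # minimal sketch of token indexing, under the assumptions above
    def index_sequences(seqs, filter_func, map_func):
        vocab = {}
        indexed = []
        for s in seqs:
            row = []
            for t in s:
                if not filter_func(t, s):
                    continue
                t = map_func(t, s)
                if t not in vocab:
                    vocab[t] = len(vocab) + 2  # reserve 0 (pad) and 1
                row.append(vocab[t])
            indexed.append(row)
        return indexed, vocab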

In [7]:
X_indexed = indexer.fit_transform(X)
for i in range(5):
    print('X[%i]:' % i, X_indexed[i])


Max index: 17565
X[0]: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
X[1]: [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
X[2]: [31, 32, 33, 34, 35, 36, 21, 37, 38, 39, 20, 24, 25, 40, 39, 27]
X[3]: [41, 42, 27, 24, 25, 26, 7, 43, 44]
X[4]: [33, 45, 46, 14, 15, 16, 47, 48, 49, 50, 21, 51, 52]

In [8]:
X_padded = pad.fit_transform(X_indexed)
for i in range(5):
    print('X[%i]' % i, X_padded[i])


X[0] [  2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.  15.  16.
  17.  18.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[1] [ 19.  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.  30.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[2] [ 31.  32.  33.  34.  35.  36.  21.  37.  38.  39.  20.  24.  25.  40.  39.
  27.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[3] [ 41.  42.  27.  24.  25.  26.   7.  43.  44.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[4] [ 33.  45.  46.  14.  15.  16.  47.  48.  49.  50.  21.  51.  52.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
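
Padding simply right-fills each sequence with zeros up to the longest length in the batch (27 here). An illustrative numpy equivalent, assuming the ttk transformer pads on the right (np.zeros defaulting to float64 would also explain why the padded rows print as floats):

    import numpy as np

    def pad_right(seqs):
        # zero-pad every sequence on the right to the longest length
        maxlen = max(len(s) for s in seqs)
        out = np.zeros((len(seqs), maxlen))
        for i, s in enumerate(seqs):
            out[i, :len(s)] = s
        return out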

In [9]:
# split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.3, random_state=0)
print('%i Training examples, %i Testing examples, Shape: %s' % (len(X_train), len(X_test), X_train.shape))


24229 Training examples, 10384 Testing examples, Shape: (24229, 27)
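
Since the two sources are unlikely to contribute equal numbers of headlines, a stratified split would keep the class balance identical in train and test. This is a drop-in variant, not what was run above:

    # stratified variant of the split (illustrative)
    X_train, X_test, y_train, y_test = train_test_split(
        X_padded, y, test_size=0.3, random_state=0, stratify=y)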

In [13]:
%%time
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

from sklearn.metrics import mean_squared_error

# reshape to (samples, timesteps, features); the LSTM expects 3D input
N, t = X_train.shape
X_train_s = X_train.reshape(N, t, 1)

model = Sequential()
model.add(LSTM(4, input_shape=(t, 1)))
model.add(Dense(1))
# note: this treats the binary label as a regression target (MSE loss)
model.compile(loss='mean_squared_error', optimizer='adam')
# batch_size=1 makes each epoch very slow (~10 min in the log below)
model.fit(X_train_s, y_train, epochs=100, batch_size=1, verbose=2)
print("Done fitting model.")


Epoch 1/100
634s - loss: 0.1979
Epoch 2/100
589s - loss: 0.1936
Epoch 3/100
577s - loss: 0.1927
Epoch 4/100
575s - loss: 0.1927
Epoch 5/100
575s - loss: 0.1924
Epoch 6/100
577s - loss: 0.1922
Epoch 7/100
579s - loss: 0.1921
Epoch 8/100
576s - loss: 0.1923
Epoch 9/100
579s - loss: 0.1920
Epoch 10/100
579s - loss: 0.1925
Epoch 11/100
579s - loss: 0.1921
Epoch 12/100
580s - loss: 0.1919
Epoch 13/100
580s - loss: 0.1916
Epoch 14/100
582s - loss: 0.1918
Epoch 15/100
579s - loss: 0.1918
Epoch 16/100
580s - loss: 0.1916
Epoch 17/100
579s - loss: 0.1916
Epoch 18/100
576s - loss: 0.1918
Epoch 19/100
577s - loss: 0.1917
Epoch 20/100
579s - loss: 0.1915
Epoch 21/100
578s - loss: 0.1915
Epoch 22/100
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<timed exec> in <module>()

S:\Anaconda3\lib\site-packages\keras\models.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
    865                               class_weight=class_weight,
    866                               sample_weight=sample_weight,
--> 867                               initial_epoch=initial_epoch)
    868 
    869     def evaluate(self, x, y, batch_size=32, verbose=1,

S:\Anaconda3\lib\site-packages\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
   1596                               initial_epoch=initial_epoch,
   1597                               steps_per_epoch=steps_per_epoch,
-> 1598                               validation_steps=validation_steps)
   1599 
   1600     def evaluate(self, x, y,

S:\Anaconda3\lib\site-packages\keras\engine\training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
   1181                     batch_logs['size'] = len(batch_ids)
   1182                     callbacks.on_batch_begin(batch_index, batch_logs)
-> 1183                     outs = f(ins_batch)
   1184                     if not isinstance(outs, list):
   1185                         outs = [outs]

S:\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py in __call__(self, inputs)
   2271         updated = session.run(self.outputs + [self.updates_op],
   2272                               feed_dict=feed_dict,
-> 2273                               **self.session_kwargs)
   2274         return updated[:len(self.outputs)]
   2275 

S:\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in run(self, fetches, feed_dict, options, run_metadata)
    776     try:
    777       result = self._run(None, fetches, feed_dict, options_ptr,
--> 778                          run_metadata_ptr)
    779       if run_metadata:
    780         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

S:\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
    980     if final_fetches or final_targets:
    981       results = self._do_run(handle, final_targets, final_fetches,
--> 982                              feed_dict_string, options, run_metadata)
    983     else:
    984       results = []

S:\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1030     if handle is None:
   1031       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
-> 1032                            target_list, options, run_metadata)
   1033     else:
   1034       return self._do_call(_prun_fn, self._session, handle, feed_dict,

S:\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _do_call(self, fn, *args)
   1037   def _do_call(self, fn, *args):
   1038     try:
-> 1039       return fn(*args)
   1040     except errors.OpError as e:
   1041       message = compat.as_text(e.message)

S:\Anaconda3\lib\site-packages\tensorflow\python\client\session.py in _run_fn(session, feed_dict, fetch_list, target_list, options, run_metadata)
   1019         return tf_session.TF_Run(session, options,
   1020                                  feed_dict, fetch_list, target_list,
-> 1021                                  status, run_metadata)
   1022 
   1023     def _prun_fn(session, handle, feed_dict, fetch_list):

KeyboardInterrupt: 
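
The loss flatlines around 0.19 from the first epoch, roughly what a constant prediction of the class mean would score on an imbalanced 0/1 label, which suggests the regression framing is learning little. A more conventional binary-classification setup would embed the integer ids, use a sigmoid output, and train with cross-entropy. A sketch under those assumptions (vocabulary size taken from the "Max index: 17565" output plus one; layer sizes are illustrative, not tuned):

    # alternative classification formulation (sketch, not the model trained above)
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense

    clf = Sequential()
    clf.add(Embedding(input_dim=17566, output_dim=32, input_length=27))
    clf.add(LSTM(32))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # clf.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

With an Embedding front end the padded 2D integer matrix can be fed directly, so no reshape to 3D is needed.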

In [ ]:
# make predictions; the model expects 3D input, so reshape the test set too
X_test_s = X_test.reshape(len(X_test), t, 1)
trainPredict = model.predict(X_train_s)
testPredict = model.predict(X_test_s)
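
Because the network was trained as a regressor, its outputs are continuous. One illustrative way to evaluate is to threshold them at 0.5 and score against the held-out labels:

    from sklearn.metrics import accuracy_score

    # threshold the continuous outputs at 0.5 to get hard class labels
    y_pred = (testPredict.ravel() > 0.5).astype(int)
    print('Test accuracy:', accuracy_score(y_test, y_pred))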