In [1]:
# until ttk is installed, add parent dir to path
import sys
sys.path.insert(0, '..')

In [2]:
# typical imports
import pandas as pd
import numpy as np
import re

import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

import matplotlib.pyplot as plt

import spacy
import ete3
import seaborn

from ttk.corpus import load_headline_corpus

In [3]:
%%time
# load the corpus
corpus = load_headline_corpus(verbose=True)

print ('Headlines:', len(corpus.sents()))


Loading corpus from: S:\git\tacticsiege\tactictoolkit\ttk\..\env\corpus\dated\2017_08_22\corpus
Corpus loaded.
Headlines: 190447
Wall time: 2.82 s

In [ ]:


In [4]:
from ttk.preprocessing import Seq2WordVecTransformer
from ttk.preprocessing import SeqPaddingTransformer

from ttk.corpus import CategorizedDatedCorpusReporter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from ttk.preprocessing import Seq2IndexTransformer

label_enc = LabelEncoder()
reporter = CategorizedDatedCorpusReporter()
corpus_df = reporter.to_data_frame(corpus, categories=['Washington Post', 'CNN'])

X = corpus_df['content'].values
X = [s.split() for s in X]
y = label_enc.fit_transform(corpus_df['category'].values)


def token_filter(t, s):
    # keep only purely alphabetic tokens (drops punctuation, quotes, numbers)
    return t.isalpha()

def token_processor(t, s):
    # lowercase each surviving token before it is indexed
    return t.lower()

indexer = Seq2IndexTransformer(add_delimiters=False,
                               token_mapping_func=token_processor,
                               token_filter_func=token_filter)
pad = SeqPaddingTransformer()


S:\Anaconda3\lib\site-packages\gensim\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
Using TensorFlow backend.
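
For reference, the two transformers amount to mapping each filtered, lowercased token to an integer id and then right-padding every sequence with zeros to a common length, as the outputs of the next cells show. A minimal plain-NumPy sketch of that idea, assuming ids start at 2 so that 0 stays free as the pad value (the actual ttk transformers may reserve ids differently):

import numpy as np

def index_and_pad(sequences, start_id=2, pad_value=0):
    # build a vocabulary: token -> integer id, ids starting at start_id
    vocab = {}
    indexed = []
    for seq in sequences:
        ids = []
        for tok in seq:
            if not tok.isalpha():          # mirror token_filter
                continue
            tok = tok.lower()              # mirror token_processor
            if tok not in vocab:
                vocab[tok] = start_id + len(vocab)
            ids.append(vocab[tok])
        indexed.append(ids)
    # right-pad with pad_value to the longest sequence
    max_len = max(len(ids) for ids in indexed)
    padded = np.full((len(indexed), max_len), pad_value, dtype=np.float32)
    for i, ids in enumerate(indexed):
        padded[i, :len(ids)] = ids
    return padded, vocab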

In [5]:
for i in range(5):
    print ('X:', X[i], 'y:', y[i])


X: ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"'] y: 1
X: ['"', 'Republican', 'candidate', 'in', 'Montana', 'race', 'allegedly', "'", 'body', '-', 'slammed', "'", 'reporter', ',', 'prompting', 'police', 'investigation', '"'] y: 1
X: ['"', 'Prepare', 'for', 'the', 'weirdest', 'Election', 'Day', 'in', 'history', ',', 'after', 'a', 'candidate', 'allegedly', 'body', '-', 'slams', 'a', 'reporter', '"'] y: 1
X: ['Audio', ':', 'Guardian', 'reporter', 'allegedly', 'body', '-', 'slammed', 'by', 'Greg', 'Gianforte'] y: 1
X: ['"', 'The', 'new', 'GOP', 'health', '-', 'care', 'bill', 'isn', "'", 't', 'any', 'better', ',', 'in', 'four', 'charts', '"'] y: 1

In [6]:
X_indexed = indexer.fit_transform(X)
for i in range(5):
    print ('X[%i]:' % i, X_indexed[i])


Max index: 17565
X[0]: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
X[1]: [19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
X[2]: [31, 32, 33, 34, 35, 36, 21, 37, 38, 39, 20, 24, 25, 40, 39, 27]
X[3]: [41, 42, 27, 24, 25, 26, 7, 43, 44]
X[4]: [33, 45, 46, 14, 15, 16, 47, 48, 49, 50, 21, 51, 52]
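
The reported maximum index (17565) is what fixes the vocabulary size any downstream embedding or one-hot layer would need. A quick way to recover it straight from the indexed data (a hypothetical snippet, not part of ttk):

# vocabulary size = largest token id + 1 (ids appear to start at 2,
# leaving 0 free for the padding value)
vocab_size = max(max(seq) for seq in X_indexed if seq) + 1
print('vocab size:', vocab_size)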

In [7]:
X_padded = pad.fit_transform(X_indexed)
for i in range(5):
    print ('X[%i]' % i, X_padded[i])


X[0] [  2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.  15.  16.
  17.  18.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[1] [ 19.  20.  21.  22.  23.  24.  25.  26.  27.  28.  29.  30.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[2] [ 31.  32.  33.  34.  35.  36.  21.  37.  38.  39.  20.  24.  25.  40.  39.
  27.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[3] [ 41.  42.  27.  24.  25.  26.   7.  43.  44.   0.   0.   0.   0.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]
X[4] [ 33.  45.  46.  14.  15.  16.  47.  48.  49.  50.  21.  51.  52.   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]

In [8]:
# split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.3, random_state=0)
print ('%i Training examples, %i Testing examples, Shape: %s' % (len(X_train), len(X_test), X_train.shape))


24229 Training examples, 10384 Testing examples, Shape: (24229, 27)

In [9]:
def reshape_and_seq_target(X, y):
    # Language-model style targets: at each time step the target is the
    # next token id; the final step's target is the sequence's class label.
    N, t = X.shape
    print ('N, t', N, t)
    Y_t = np.zeros(X.shape, dtype=np.int32)
    for n in range(N):
        for i in range(t):
            if i != t-1:
                Y_t[n, i] = X[n, (i+1)]
            else:
                Y_t[n, i] = y[n]
    # reshape inputs to (N, t, 1): one scalar feature per time step
    X_shaped = X.reshape(N, t, 1).astype(np.int32)
    return X_shaped, Y_t

def reshape_and_class_target(X, y):
    # Classification targets: repeat the class label at every time step.
    N, t = X.shape
    print ('N, t', N, t)
    Y_t = np.zeros(X.shape, dtype=np.int32)
    for n in range(N):
        for i in range(t):
            Y_t[n, i] = y[n]
    # reshape inputs to (N, t, 1): one scalar feature per time step
    X_shaped = X.reshape(N, t, 1).astype(np.int32)
    return X_shaped, Y_t
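
Both helpers loop in Python over every element; the same targets can be built with vectorized NumPy operations. A sketch of equivalent versions, assuming the same (N, t) integer inputs and producing the same outputs:

def reshape_and_seq_target_vec(X, y):
    # next-token targets, with the class label filling the final slot
    N, t = X.shape
    y = np.asarray(y, dtype=np.int32).reshape(N, 1)
    Y_t = np.concatenate([X[:, 1:].astype(np.int32), y], axis=1)
    return X.reshape(N, t, 1).astype(np.int32), Y_t

def reshape_and_class_target_vec(X, y):
    # class label repeated across all t time steps
    N, t = X.shape
    Y_t = np.repeat(np.asarray(y, dtype=np.int32).reshape(N, 1), t, axis=1)
    return X.reshape(N, t, 1).astype(np.int32), Y_t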

In [10]:
X_t, Y_t = reshape_and_class_target(X_train, y_train)
for i in range(3):
    print ('X[%i]:' % i, X_t[i], 'Y_t[%i]:' % i, Y_t[i])
    
X_t_f = X_t.astype(np.float32)
for i in range(3):
    print ('X[%i]:' % i, X_t_f[i], 'Y_t[%i]:' % i, Y_t[i])


N, t 24229 27
X[0]: [[ 743]
 [1967]
 [ 720]
 [1013]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]] Y_t[0]: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
X[1]: [[  39]
 [  45]
 [1903]
 [  59]
 [6006]
 [1847]
 [1631]
 [ 152]
 [  21]
 [  90]
 [1450]
 [4833]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]
 [   0]] Y_t[1]: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
X[2]: [[   67]
 [   83]
 [  661]
 [    5]
 [ 1150]
 [14811]
 [10816]
 [  407]
 [   21]
 [10843]
 [ 8066]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]
 [    0]] Y_t[2]: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
X[0]: [[  743.]
 [ 1967.]
 [  720.]
 [ 1013.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]] Y_t[0]: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
X[1]: [[   39.]
 [   45.]
 [ 1903.]
 [   59.]
 [ 6006.]
 [ 1847.]
 [ 1631.]
 [  152.]
 [   21.]
 [   90.]
 [ 1450.]
 [ 4833.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]] Y_t[1]: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
X[2]: [[  6.70000000e+01]
 [  8.30000000e+01]
 [  6.61000000e+02]
 [  5.00000000e+00]
 [  1.15000000e+03]
 [  1.48110000e+04]
 [  1.08160000e+04]
 [  4.07000000e+02]
 [  2.10000000e+01]
 [  1.08430000e+04]
 [  8.06600000e+03]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]] Y_t[2]: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]

In [11]:
from ttk.sandbox.udemy import SimpleRNNClassifier

import theano

# verbose exceptions make errors inside theano.scan easier to trace
theano.config.exception_verbosity = 'high'

clf = SimpleRNNClassifier(4)
clf.fit(X_t_f, Y_t, show_fig=True)


Using cuDNN version 5110 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:01:00.0)
D, K, N, M: 1 2 24229 4
iteration: 0
shape y: (27, 1, 2)
i: 0 cost: nan classification rate: 0.5298609104791778
duration: 177.09498119354248
iteration: 1
shape y: (27, 1, 2)
i: 1 cost: nan classification rate: 0.5324611003343102
duration: 175.81551504135132
iteration: 2
shape y: (27, 1, 2)
i: 2 cost: nan classification rate: 0.5324611003343102
duration: 175.8624029159546
iteration: 3
shape y: (27, 1, 2)
i: 3 cost: nan classification rate: 0.5324611003343102
duration: 175.87741804122925
iteration: 4
shape y: (27, 1, 2)
i: 4 cost: nan classification rate: 0.5324611003343102
duration: 175.4425926208496
iteration: 5
shape y: (27, 1, 2)
i: 5 cost: nan classification rate: 0.5324611003343102
duration: 176.13470792770386
iteration: 6
shape y: (27, 1, 2)
i: 6 cost: nan classification rate: 0.5324611003343102
duration: 177.26166033744812
iteration: 7
shape y: (27, 1, 2)
i: 7 cost: nan classification rate: 0.5324611003343102
duration: 178.00531554222107
iteration: 8
shape y: (27, 1, 2)
i: 8 cost: nan classification rate: 0.5324611003343102
duration: 178.00516200065613
iteration: 9
shape y: (27, 1, 2)
i: 9 cost: nan classification rate: 0.5324611003343102
duration: 178.12058520317078
iteration: 10
shape y: (27, 1, 2)
i: 10 cost: nan classification rate: 0.5324611003343102
duration: 176.07921695709229
iteration: 11
shape y: (27, 1, 2)
i: 11 cost: nan classification rate: 0.5324611003343102
duration: 175.98539209365845
iteration: 12
shape y: (27, 1, 2)
i: 12 cost: nan classification rate: 0.5324611003343102
duration: 176.25476670265198
iteration: 13
shape y: (27, 1, 2)
i: 13 cost: nan classification rate: 0.5324611003343102
duration: 176.03014731407166
iteration: 14
shape y: (27, 1, 2)
i: 14 cost: nan classification rate: 0.5324611003343102
duration: 176.04616689682007
iteration: 15
shape y: (27, 1, 2)
i: 15 cost: nan classification rate: 0.5324611003343102
duration: 176.25447249412537
iteration: 16
shape y: (27, 1, 2)
i: 16 cost: nan classification rate: 0.5324611003343102
duration: 175.82351398468018
iteration: 17
shape y: (27, 1, 2)
i: 17 cost: nan classification rate: 0.5324611003343102
duration: 175.98437356948853
iteration: 18
shape y: (27, 1, 2)
i: 18 cost: nan classification rate: 0.5324611003343102
duration: 175.81087231636047
iteration: 19
shape y: (27, 1, 2)
i: 19 cost: nan classification rate: 0.5324611003343102
duration: 175.56478786468506
iteration: 20
shape y: (27, 1, 2)
i: 20 cost: nan classification rate: 0.5324611003343102
duration: 175.7314648628235
iteration: 21
shape y: (27, 1, 2)
i: 21 cost: nan classification rate: 0.5324611003343102
duration: 175.90101218223572
iteration: 22
shape y: (27, 1, 2)
i: 22 cost: nan classification rate: 0.5324611003343102
duration: 175.89651203155518
iteration: 23
shape y: (27, 1, 2)
i: 23 cost: nan classification rate: 0.5324611003343102
duration: 176.0100758075714
iteration: 24
shape y: (27, 1, 2)
i: 24 cost: nan classification rate: 0.5324611003343102
duration: 175.88544845581055
iteration: 25
shape y: (27, 1, 2)
i: 25 cost: nan classification rate: 0.5324611003343102
duration: 175.03661608695984
iteration: 26
shape y: (27, 1, 2)
i: 26 cost: nan classification rate: 0.5324611003343102
duration: 174.6952362060547
iteration: 27
shape y: (27, 1, 2)
i: 27 cost: nan classification rate: 0.5324611003343102
duration: 176.8682734966278
iteration: 28
shape y: (27, 1, 2)
i: 28 cost: nan classification rate: 0.5324611003343102
duration: 174.73530435562134
iteration: 29
shape y: (27, 1, 2)
i: 29 cost: nan classification rate: 0.5324611003343102
duration: 175.17007422447205
iteration: 30
shape y: (27, 1, 2)
i: 30 cost: nan classification rate: 0.5324611003343102
duration: 175.14048719406128
iteration: 31
shape y: (27, 1, 2)
i: 31 cost: nan classification rate: 0.5324611003343102
duration: 175.3279356956482
iteration: 32
shape y: (27, 1, 2)
i: 32 cost: nan classification rate: 0.5324611003343102
duration: 175.4613676071167
iteration: 33
shape y: (27, 1, 2)
i: 33 cost: nan classification rate: 0.5324611003343102
duration: 175.45380878448486
iteration: 34
shape y: (27, 1, 2)
i: 34 cost: nan classification rate: 0.5324611003343102
duration: 175.084308385849
iteration: 35
shape y: (27, 1, 2)
i: 35 cost: nan classification rate: 0.5324611003343102
duration: 175.73502373695374
iteration: 36
shape y: (27, 1, 2)
i: 36 cost: nan classification rate: 0.5324611003343102
duration: 175.662926197052
iteration: 37
shape y: (27, 1, 2)
i: 37 cost: nan classification rate: 0.5324611003343102
duration: 177.029123544693
iteration: 38
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-11-a61a664af64a> in <module>()
      6 
      7 clf = SimpleRNNClassifier(4)
----> 8 clf.fit(X_t_f, Y_t, show_fig=True)

S:\git\tacticsiege\tactictoolkit\ttk\sandbox\udemy\SimpleRNNClassifier.py in fit(self, X, Y, learning_rate, mu, reg, activation, epochs, show_fig)
     98                 #print ('X[j]:', X[j], 'Y[j]:', Y[j])
     99 
--> 100                 c, p, rout = self.train_op(X[j], Y[j])
    101                 #print ('c:', c)
    102                 cost += c

S:\Anaconda3\lib\site-packages\theano\compile\function_module.py in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

S:\Anaconda3\lib\site-packages\theano\scan_module\scan_op.py in rval(p, i, o, n, allow_gc)
    987         def rval(p=p, i=node_input_storage, o=node_output_storage, n=node,
    988                  allow_gc=allow_gc):
--> 989             r = p(n, [x[0] for x in i], o)
    990             for o in node.outputs:
    991                 compute_map[o][0] = True

S:\Anaconda3\lib\site-packages\theano\scan_module\scan_op.py in p(node, args, outs)
    976                                                 args,
    977                                                 outs,
--> 978                                                 self, node)
    979         except (ImportError, theano.gof.cmodule.MissingGXX):
    980             p = self.execute

theano/scan_module/scan_perform.pyx in theano.scan_module.scan_perform.perform (C:\Users\TacticSiege\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-SP0-Intel64_Family_6_Model_158_Stepping_9_GenuineIntel-3.6.2-64\scan_perform\mod.cpp:4490)()

S:\Anaconda3\lib\site-packages\theano\gof\op.py in rval(p, i, o, n)
    869         if params is graph.NoParams:
    870             # default arguments are stored in the closure of `rval`
--> 871             def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
    872                 r = p(n, [x[0] for x in i], o)
    873                 for o in node.outputs:

KeyboardInterrupt: 
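
The run was interrupted after ~38 epochs because the cost is NaN from the very first iteration while the classification rate sits flat at ~0.53, i.e. the network is not learning. With raw word ids (up to 17565) fed in as single float features, the likely culprit is numerical overflow in the recurrent pass, which is usually tackled by shrinking the learning rate and/or rescaling the inputs. A hedged retry, assuming the learning_rate keyword from the fit signature shown in the traceback; the value below is a guess, not a tuned setting:

clf = SimpleRNNClassifier(4)
# much smaller step size; also worth trying normalized ids, e.g. X_t_f / X_t_f.max(),
# or real word vectors via the imported (but so far unused) Seq2WordVecTransformer
clf.fit(X_t_f, Y_t, learning_rate=1e-6, show_fig=True)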

In [ ]: