In [1]:
# until ttk is installed, add parent dir to path so that `import ttk`
# resolves against the local checkout instead of requiring a pip install
import sys
sys.path.insert(0, '..')

In [2]:
# typical imports
import pandas as pd
import numpy as np
import re

import matplotlib
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 8.0)

import matplotlib.pyplot as plt

import spacy
import ete3
import seaborn

from ttk.corpus import load_headline_corpus

In [3]:
%%time
# load the corpus (the resolved path is printed by the loader below)
corpus = load_headline_corpus(verbose=True)

# each "sentence" in this corpus is one tokenized headline
print ('Headlines:', len(corpus.sents()))


Loading corpus from: S:\git\tacticsiege\tactictoolkit\ttk\..\env\corpus\dated\2017_08_22\corpus
Corpus loaded.
Headlines: 190447
Wall time: 2.7 s

In [4]:
from ttk.preprocessing import Seq2WordVecTransformer
from ttk.preprocessing import SeqPaddingTransformer

from ttk.corpus import CategorizedDatedCorpusReporter

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

label_enc = LabelEncoder()

reporter = CategorizedDatedCorpusReporter()

# Restrict to two publishers so this becomes a binary classification task.
corpus_df = reporter.to_data_frame(corpus, categories=['Washington Post', 'CNN'])

# Whitespace-tokenize each headline into a list of token strings,
# giving X the shape list-of-lists-of-str.
X = corpus_df['content'].values
X = [s.split() for s in X]

# Debug output: sanity-check the nested structure of X before fitting.
print ('Fitting on len(X):', len(X))
print ('X is', type(X))
print ('Elements of X are', type(X[0]))
print ('X[0] =', X[0])
print ('Elements of the Elements of X are', type(X[0][0]))
print ('Elements:', X[0][0], X[0][1], X[0][2])

# Encode the publisher names into integer class labels (0/1).
y = label_enc.fit_transform(corpus_df['category'].values)


S:\Anaconda3\lib\site-packages\gensim\utils.py:865: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")
Using TensorFlow backend.
Fitting on len(X): 34613
X is <class 'list'>
Elements of X are <class 'list'>
X[0] = ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"']
Elements of the Elements of X are <class 'str'>
Elements: " Uninsured ranks

In [5]:
# Peek at the first few tokenized headlines.
for idx, tokens in enumerate(X[:5]):
    print(idx, tokens)

print('len(X):', len(X))


0 ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"']
1 ['"', 'Republican', 'candidate', 'in', 'Montana', 'race', 'allegedly', "'", 'body', '-', 'slammed', "'", 'reporter', ',', 'prompting', 'police', 'investigation', '"']
2 ['"', 'Prepare', 'for', 'the', 'weirdest', 'Election', 'Day', 'in', 'history', ',', 'after', 'a', 'candidate', 'allegedly', 'body', '-', 'slams', 'a', 'reporter', '"']
3 ['Audio', ':', 'Guardian', 'reporter', 'allegedly', 'body', '-', 'slammed', 'by', 'Greg', 'Gianforte']
4 ['"', 'The', 'new', 'GOP', 'health', '-', 'care', 'bill', 'isn', "'", 't', 'any', 'better', ',', 'in', 'four', 'charts', '"']
len(X): 34613

In [6]:
# Fit a word2vec model over the tokenized headlines and transform each
# sequence into word vectors (progress appears in the gensim log below).
vect = Seq2WordVecTransformer()
X_vect = vect.fit_transform(X, verbose='debug')
print ('len(X_vect):', len(X_vect))


Fitting on len(X): 34613
X is <class 'list'>
Elements of X are <class 'list'>
X[0] = ['"', 'Uninsured', 'ranks', 'still', 'to', 'grow', 'by', 'tens', 'of', 'millions', 'under', 'latest', 'House', 'health', '-', 'care', 'bill', ',', 'CBO', 'says', '"']
Elements of the Elements of X are <class 'str'>
Elements: " Uninsured ranks
2017-09-16 22:36:51,270 : INFO : collecting all words and their counts
2017-09-16 22:36:51,271 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-09-16 22:36:51,298 : INFO : PROGRESS: at sentence #10000, processed 110351 words, keeping 12012 word types
2017-09-16 22:36:51,320 : INFO : PROGRESS: at sentence #20000, processed 222094 words, keeping 16758 word types
2017-09-16 22:36:51,339 : INFO : PROGRESS: at sentence #30000, processed 333851 words, keeping 20202 word types
2017-09-16 22:36:51,349 : INFO : collected 21533 word types from a corpus of 385756 raw words and 34613 sentences
2017-09-16 22:36:51,351 : INFO : Loading a fresh vocabulary
2017-09-16 22:36:51,385 : INFO : min_count=0 retains 21533 unique words (100% of original 21533, drops 0)
2017-09-16 22:36:51,386 : INFO : min_count=0 leaves 385756 word corpus (100% of original 385756, drops 0)
2017-09-16 22:36:51,433 : INFO : deleting the raw counts dictionary of 21533 items
2017-09-16 22:36:51,435 : INFO : sample=0.001 downsamples 28 most-common words
2017-09-16 22:36:51,436 : INFO : downsampling leaves estimated 309416 word corpus (80.2% of prior 385756)
2017-09-16 22:36:51,437 : INFO : estimated required memory for 21533 words and 100 dimensions: 27992900 bytes
2017-09-16 22:36:51,485 : INFO : resetting layer weights
2017-09-16 22:36:51,692 : INFO : training model with 4 workers on 21533 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2017-09-16 22:36:52,423 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-09-16 22:36:52,425 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-09-16 22:36:52,430 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-09-16 22:36:52,435 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-09-16 22:36:52,436 : INFO : training on 1928780 raw words (1546701 effective words) took 0.7s, 2102068 effective words/s
len(X_vect): 34613

In [7]:
from ttk.preprocessing import Seq2IndexTransformer

# Map tokens to integer vocabulary indices (no start/end delimiters),
# then zero-pad every sequence to a common length.
indexer = Seq2IndexTransformer(add_delimiters=False)
pad = SeqPaddingTransformer()

X_indexed = indexer.fit_transform(X)
X_pad = pad.fit_transform(X_indexed)

# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.3, random_state=0)
print ('%i Training examples, %i Testing examples' % (len(X_train), len(X_test)))


24229 Training examples, 10384 Testing examples

In [8]:
print (X_train.shape)


(24229, 32)

In [9]:
# Inspect a single timestep element of the padded training data.
seq = X_train[0]
t0 = seq[0]
print('t0 type:', type(t0))

# isinstance is the idiomatic check; the original built a throwaway
# uninitialized array via np.ndarray(1) just to compare classes.
print(isinstance(t0, np.ndarray))
print('dim', t0.shape)


t0 type: <class 'numpy.float64'>
False
dim ()

In [10]:
from ttk.sandbox.udemy import SimpleRNNClassifier

N, t = X_train.shape

# Broadcast each example's class label across every timestep.
# NOTE: the original per-element loop had an `if i == t - 1 / else`
# whose branches were identical, so the special case was dead code;
# this vectorized assignment is exactly equivalent.
Y_t = np.zeros(X_train.shape, dtype=np.int32)
Y_t[:, :] = np.asarray(y_train, dtype=np.int32).reshape(N, 1)

# Reshape to (N, timesteps, 1) float32 — the 3-D layout the RNN expects.
X_train_shaped = X_train.reshape(N, t, 1).astype(np.float32)


Using cuDNN version 5110 on context None
Mapped name None to device cuda0: GeForce GTX 1080 Ti (0000:01:00.0)

In [14]:
def words_and_class_labels(X, y):
    """Build next-word targets with the class label at the final timestep.

    For each sequence, the target at timestep i is the token index at
    i + 1 (language-model style), and the target at the last timestep is
    the sequence's class label.

    Parameters:
        X : 2-D numpy array (N, t) of (padded) token indices.
        y : 1-D array-like of N integer class labels.

    Returns:
        X_shaped : float32 array of shape (N, t, 1) for the RNN input.
        Y_t      : int32 array of shape (N, t) of per-timestep targets.
    """
    N, t = X.shape
    Y_t = np.zeros(X.shape, dtype=np.int32)
    # Vectorized replacement for the original per-element double loop;
    # assignment into int32 truncates exactly like the old float()/int() casts.
    Y_t[:, :-1] = X[:, 1:]      # shift left: predict the next token
    Y_t[:, -1] = np.asarray(y)  # final step predicts the class label
    X_shaped = X.reshape(N, t, 1).astype(np.float32)

    return X_shaped, Y_t
    
    
def only_class_labels(X, y):
    """Build targets that repeat each sequence's class label at every timestep.

    Parameters:
        X : 2-D numpy array (N, t) of (padded) token indices.
        y : 1-D array-like of N integer class labels.

    Returns:
        X_shaped : float32 array of shape (N, t, 1) for the RNN input.
        Y_t      : int32 array of shape (N, t); row n holds y[n] everywhere.
    """
    N, t = X.shape
    Y_t = np.zeros(X.shape, dtype=np.int32)
    # Broadcast each label across its row (replaces the per-element loop).
    Y_t[:, :] = np.asarray(y, dtype=np.int32).reshape(N, 1)
    X_shaped = X.reshape(N, t, 1).astype(np.float32)
    # BUG FIX: the original fell off the end and implicitly returned None,
    # discarding both computed arrays (mirror of words_and_class_labels).
    return X_shaped, Y_t

In [15]:
# Build RNN inputs/targets from the training split.
X_t, Y_t = words_and_class_labels(X_train, y_train)

print('Y_t.shape:', Y_t.shape)
# Fixed print label: this is the reshaped X_t, not X_train.
print('X_t.shape:', X_t.shape)


Y_t.shape: (24229, 32)
X_train.shape: (24229, 32, 1)

In [17]:
# Show the first six reshaped input sequences (each is (t, 1)).
for sample in X_t[:6]:
    print(sample)


[[ 3423.]
 [ 2199.]
 [  776.]
 [ 1098.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]]
[[  160.]
 [   51.]
 [ 2123.]
 [   65.]
 [ 6957.]
 [ 2055.]
 [ 1805.]
 [  164.]
 [   24.]
 [   96.]
 [ 1587.]
 [ 5524.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]]
[[  7.30000000e+01]
 [  2.80000000e+01]
 [  8.90000000e+01]
 [  7.11000000e+02]
 [  6.00000000e+00]
 [  1.92600000e+03]
 [  1.79100000e+04]
 [  1.28570000e+04]
 [  9.25000000e+02]
 [  2.40000000e+01]
 [  2.80000000e+01]
 [  1.28900000e+04]
 [  2.80000000e+01]
 [  1.70940000e+04]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]]
[[ 1063.]
 [   46.]
 [ 1847.]
 [  789.]
 [ 2879.]
 [ 2849.]
 [    6.]
 [ 4027.]
 [  526.]
 [ 1316.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]]
[[  2.00000000e+00]
 [  1.08500000e+03]
 [  4.60000000e+01]
 [  7.30000000e+01]
 [  2.80000000e+01]
 [  8.90000000e+01]
 [  8.25000000e+02]
 [  1.00000000e+01]
 [  4.02000000e+02]
 [  1.90000000e+01]
 [  1.68840000e+04]
 [  2.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]]
[[   92.]
 [   43.]
 [  803.]
 [   10.]
 [ 4287.]
 [ 1248.]
 [   43.]
 [ 2771.]
 [   65.]
 [   37.]
 [ 1971.]
 [   10.]
 [ 7003.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]]

In [16]:
# NOTE(review): this cell crashes with an IndexError (traceback below):
# Y_t contains next-word indices as large as ~20908, but the model's
# output matrix is sized (32, 17485) — presumably a vocabulary size
# inferred from the inputs rather than the targets. Confirm how
# SimpleRNNClassifier sizes its vocabulary; all Y_t values must fit in it.
clf = SimpleRNNClassifier(4)
clf.fit(X_t, Y_t, show_fig=True)


iteration: 0
X[j]: [[  485.]
 [  146.]
 [ 1224.]
 [  146.]
 [   33.]
 [ 5952.]
 [  206.]
 [   28.]
 [ 3592.]
 [ 2747.]
 [   28.]
 [   72.]
 [ 2686.]
 [   16.]
 [ 3174.]
 [ 9365.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]
 [    0.]] Y[j]: [ 146 1224  146   33 5952  206   28 3592 2747   28   72 2686   16 3174 9365
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    1]
c: 9.76809047784846
p[-1]: 17134 Y[j,-1]: 1
X[j]: [[  2.00000000e+00]
 [  7.68900000e+03]
 [  1.90000000e+01]
 [  1.15700000e+04]
 [  1.44900000e+03]
 [  8.56000000e+02]
 [  9.32200000e+03]
 [  2.40000000e+01]
 [  2.93000000e+02]
 [  9.82000000e+02]
 [  2.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]] Y[j]: [ 7689    19 11570  1449   856  9322    24   293   982     2     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
c: 9.420943056877515
p[-1]: 0 Y[j,-1]: 0
X[j]: [[  7075.]
 [   208.]
 [   209.]
 [ 11172.]
 [    46.]
 [  6116.]
 [    94.]
 [   449.]
 [ 11173.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]
 [     0.]] Y[j]: [  208   209 11172    46  6116    94   449 11173     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
c: 8.44972147259385
p[-1]: 0 Y[j,-1]: 0
X[j]: [[  2.00000000e+00]
 [  1.06300000e+03]
 [  2.40000000e+01]
 [  3.98300000e+03]
 [  2.66000000e+02]
 [  4.30000000e+01]
 [  2.26100000e+03]
 [  6.28500000e+03]
 [  3.82000000e+02]
 [  2.09080000e+04]
 [  4.30000000e+01]
 [  3.50500000e+03]
 [  1.90000000e+01]
 [  2.01900000e+03]
 [  7.45200000e+03]
 [  2.82000000e+02]
 [  2.40000000e+01]
 [  3.70000000e+01]
 [  3.81000000e+02]
 [  2.80000000e+01]
 [  8.90000000e+01]
 [  7.22600000e+03]
 [  2.09090000e+04]
 [  2.89100000e+03]
 [  2.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]
 [  0.00000000e+00]] Y[j]: [ 1063    24  3983   266    43  2261  6285   382 20908    43  3505    19
  2019  7452   282    24    37   381    28    89  7226 20909  2891     2
     0     0     0     0     0     0     0     1]
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
S:\Anaconda3\lib\site-packages\theano\compile\function_module.py in __call__(self, *args, **kwargs)
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)

S:\Anaconda3\lib\site-packages\theano\gof\op.py in rval(p, i, o, n)
    871             def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
--> 872                 r = p(n, [x[0] for x in i], o)
    873                 for o in node.outputs:

S:\Anaconda3\lib\site-packages\theano\tensor\subtensor.py in perform(self, node, inputs, out_)
   2243         elif config.cxx:
-> 2244             inplace_increment(out[0], tuple(inputs[2:]), inputs[1])
   2245         else:

IndexError: index 20908 is out of bounds for axis 1 with size 17485

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
<ipython-input-16-a71f0b7ead28> in <module>()
      1 clf = SimpleRNNClassifier(4)
----> 2 clf.fit(X_t, Y_t, show_fig=True)

S:\git\tacticsiege\tactictoolkit\ttk\sandbox\udemy\SimpleRNNClassifier.py in fit(self, X, Y, learning_rate, mu, reg, activation, epochs, show_fig)
     93                 print ('X[j]:', X[j], 'Y[j]:', Y[j])
     94 
---> 95                 c, p, rout = self.train_op(X[j], Y[j])
     96                 print ('c:', c)
     97                 cost += c

S:\Anaconda3\lib\site-packages\theano\compile\function_module.py in __call__(self, *args, **kwargs)
    896                     node=self.fn.nodes[self.fn.position_of_error],
    897                     thunk=thunk,
--> 898                     storage_map=getattr(self.fn, 'storage_map', None))
    899             else:
    900                 # old-style linkers raise their own exceptions

S:\Anaconda3\lib\site-packages\theano\gof\link.py in raise_with_op(node, thunk, exc_info, storage_map)
    323         # extra long error message in that case.
    324         pass
--> 325     reraise(exc_type, exc_value, exc_trace)
    326 
    327 

S:\Anaconda3\lib\site-packages\six.py in reraise(tp, value, tb)
    683             value = tp()
    684         if value.__traceback__ is not tb:
--> 685             raise value.with_traceback(tb)
    686         raise value
    687 

S:\Anaconda3\lib\site-packages\theano\compile\function_module.py in __call__(self, *args, **kwargs)
    882         try:
    883             outputs =\
--> 884                 self.fn() if output_subset is None else\
    885                 self.fn(output_subset=output_subset)
    886         except Exception:

S:\Anaconda3\lib\site-packages\theano\gof\op.py in rval(p, i, o, n)
    870             # default arguments are stored in the closure of `rval`
    871             def rval(p=p, i=node_input_storage, o=node_output_storage, n=node):
--> 872                 r = p(n, [x[0] for x in i], o)
    873                 for o in node.outputs:
    874                     compute_map[o][0] = True

S:\Anaconda3\lib\site-packages\theano\tensor\subtensor.py in perform(self, node, inputs, out_)
   2242             out[0][inputs[2:]] = inputs[1]
   2243         elif config.cxx:
-> 2244             inplace_increment(out[0], tuple(inputs[2:]), inputs[1])
   2245         else:
   2246             raise NotImplementedError(

IndexError: index 20908 is out of bounds for axis 1 with size 17485
Apply node that caused the error: AdvancedIncSubtensor{inplace=False,  set_instead_of_inc=False}(Alloc.0, HostFromGpu(gpuarray).0, ARange{dtype='int64'}.0, Y)
Toposort index: 165
Inputs types: [TensorType(float64, matrix), TensorType(float64, vector), TensorType(int64, vector), TensorType(int32, vector)]
Inputs shapes: [(32, 17485), (32,), (32,), (32,)]
Inputs strides: [(139880, 8), (8,), (8,), (4,)]
Inputs values: ['not shown', 'not shown', 'not shown', 'not shown']
Outputs clients: [[GpuFromHost<None>(AdvancedIncSubtensor{inplace=False,  set_instead_of_inc=False}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1272, in access_grad_cache
    term = access_term_cache(node)[idx]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in access_term_cache
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in <listcomp>
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1272, in access_grad_cache
    term = access_term_cache(node)[idx]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in access_term_cache
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 967, in <listcomp>
    output_grads = [access_grad_cache(var) for var in node.outputs]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1272, in access_grad_cache
    term = access_term_cache(node)[idx]
  File "S:\Anaconda3\lib\site-packages\theano\gradient.py", line 1108, in access_term_cache
    new_output_grads)

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [ ]: