In [1]:
from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
import string

In [5]:
def base_filter():
    # all ASCII punctuation except the apostrophe, plus tab and newline
    f = string.punctuation
    f = f.replace("'", '')
    f += '\t\n'
    return f

In [9]:
text = "i am a boy\
\nyou are boy"

In [21]:
text_to_word_sequence(text, filters=base_filter(), lower=True, split=" ")


Out[21]:
['i', 'am', 'a', 'boy', 'you', 'are', 'boy']
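
Under these settings, text_to_word_sequence simply lowercases the text, replaces every filter character (including the '\n') with the split character, and splits on it. A rough pure-Python equivalent, reusing base_filter() from above, as an illustrative sketch rather than the library code:

def word_sequence_sketch(text, filters=base_filter(), lower=True, split=" "):
    # approximate re-implementation for illustration only
    if lower:
        text = text.lower()
    for c in filters:
        text = text.replace(c, split)
    return [w for w in text.split(split) if w]

print word_sequence_sketch(text)  # ['i', 'am', 'a', 'boy', 'you', 'are', 'boy']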

In [14]:
# hashed "one-hot" indices; collisions are expected with such a small n
t = one_hot(text, n=5, filters=base_filter(), lower=True, split=" ")
print t


[1, 1, 1, 2, 3, 2, 2]
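
Note that one_hot is not a vocabulary lookup: in this version of Keras it hashes each word into the range [1, n), so distinct words can collide ('i', 'am' and 'a' all map to 1 above). A minimal sketch of the idea, assuming Python's built-in hash as the hash function (the exact indices depend on the interpreter):

def one_hot_sketch(text, n):
    # hashing trick: index = abs(hash(word)) % (n - 1) + 1, so 0 is never produced
    words = text_to_word_sequence(text, filters=base_filter(), lower=True, split=" ")
    return [abs(hash(w)) % (n - 1) + 1 for w in words]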

In [35]:
tokenizer = Tokenizer(nb_words=10, filters=base_filter(), lower=True, split=" ")

In [23]:
tokenizer.fit_on_texts([text])  # expects a list of texts; returns None, so there is nothing to assign
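
The intended Tokenizer workflow is to fit on a list of documents and then map texts to integer index sequences with texts_to_sequences. A hedged sketch (the exact indices depend on how ties in word frequency are broken):

docs = ["i am a boy", "you are a boy"]
tok = Tokenizer(nb_words=10, filters=base_filter(), lower=True, split=" ")
tok.fit_on_texts(docs)
print tok.texts_to_sequences(docs)
# 'a' and 'boy' occur twice, so they get the lowest indices, e.g.
# [[3, 4, 1, 2], [5, 6, 1, 2]] (tie order among single-count words may vary)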

In [36]:
# sequences_to_matrix defaults to mode="binary": one row per sequence,
# nb_words columns, 1. wherever the index occurs in the sequence
a = [[1, 2, 6], [1, 2, 5]]
tokenizer.sequences_to_matrix(a)


Out[36]:
array([[ 0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.]])
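
A hand-rolled numpy equivalent of the binary mode, as a sketch of what the matrix encodes:

import numpy as np

def binary_matrix(seqs, nb_words):
    # row i gets a 1. in column j exactly when index j appears in seqs[i]
    m = np.zeros((len(seqs), nb_words))
    for i, seq in enumerate(seqs):
        for j in seq:
            m[i, j] = 1.
    return m

print binary_matrix([[1, 2, 6], [1, 2, 5]], 10)  # matches Out[36]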

In [11]:
# NB: fit_on_texts expects a list of texts; passing the bare string makes the
# tokenizer iterate over its 22 characters, so every character counts as a
# separate "document" and the "words" below are single characters
tokenizer.fit_on_texts(text)
print tokenizer.document_count
print tokenizer.filters
print tokenizer.fit_on_sequences(text)  # expects integer sequences; returns None
print tokenizer.index_docs
print tokenizer.nb_words  # None: this cell ran before the Tokenizer(nb_words=10) cell (note the In [] counters)
print tokenizer.word_counts
print tokenizer.word_docs
print tokenizer.word_index


22
!"#$%&()*+,-./:;<=>?@[\]^_`{|}~	

None
{'a': 3, ' ': 5, 'b': 2, 'e': 1, 'i': 1, '\n': 1, 'm': 1, 'o': 3, 'r': 1, 'u': 1, 'y': 3}
None
{'a': 3, 'b': 2, 'e': 1, 'i': 1, 'm': 1, 'o': 3, 'r': 1, 'u': 1, 'y': 3}
{'a': 3, 'b': 2, 'e': 1, 'i': 1, 'm': 1, 'o': 3, 'r': 1, 'u': 1, 'y': 3}
{'a': 1, 'b': 4, 'e': 5, 'i': 6, 'm': 7, 'o': 2, 'r': 8, 'u': 9, 'y': 3}
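
The per-character statistics above come from the list-vs-string pitfall: fit_on_texts iterated over the 22 characters of text as if each were a document. Wrapping the string in a list gives the intended word-level counts. A sketch of the corrected call (dict print order may vary):

tok = Tokenizer(nb_words=10, filters=base_filter(), lower=True, split=" ")
tok.fit_on_texts([text])     # a single document
print tok.document_count     # 1
print tok.word_counts        # {'boy': 2, 'i': 1, 'am': 1, 'a': 1, 'you': 1, 'are': 1}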