In [1]:
from keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
import string
In [5]:
def base_filter():
    """Characters to strip from text before tokenizing: every ASCII
    punctuation mark except the apostrophe, plus tab and newline.

    Returns:
        str: the filter string passed to Keras text utilities.
    """
    # Keep "'" so contractions like "don't" survive tokenization.
    without_apostrophe = "".join(ch for ch in string.punctuation if ch != "'")
    return without_apostrophe + "\t\n"
In [9]:
# Two short "documents" separated by a newline; same value as the original
# backslash-continued literal, written on one line for clarity.
text = "i am a boy\nyou are boy"
In [21]:
text_to_word_sequence(text, filters=base_filter(), lower=True, split=" ")
Out[21]:
In [14]:
# Hash each word of `text` into the integer range [1, n).
# NOTE(review): n=5 buckets for six distinct words likely collides — confirm
# whether that is intended for this demo.
t = one_hot(text, n=5, filters=base_filter(), lower=True, split=" ")
print(t)
In [35]:
tokenizer = Tokenizer(nb_words=10, filters=base_filter(), lower=True, split=" ")
In [23]:
a = tokenizer.fit_on_texts(text)
In [36]:
# Two example word-index sequences; sequences_to_matrix converts them to a
# matrix (presumably binary presence per index, one row per sequence — see
# the Tokenizer docs). The bare last expression displays the result.
a = [[1, 2, 6], [1, 2, 5]]
tokenizer.sequences_to_matrix(a)
Out[36]:
In [11]:
# Re-fit on the sample text, then inspect the tokenizer's state.
# fit_on_texts expects an iterable of documents; the original passed the
# bare string, which made every character its own document.
tokenizer.fit_on_texts([text])
# NOTE: the original also printed tokenizer.fit_on_sequences(text).
# fit_on_sequences takes lists of integer word indices (not raw text) and
# returns None, so that call both misused the API and printed None; it has
# been removed.
print(tokenizer.document_count)
print(tokenizer.filters)
print(tokenizer.index_docs)
print(tokenizer.nb_words)
print(tokenizer.word_counts)
print(tokenizer.word_docs)
print(tokenizer.word_index)