In [1]:
import keras  # not used in the cells below yet
import pandas as pd
import numpy as np
In [13]:
import nltk
from nltk import corpus
# nltk.download()
# print(dir(corpus))
# corp = corpus.gutenberg
files = corpus.gutenberg.fileids()
print(files)
In [18]:
# NOTE: This is only needed to open NLTK's downloads manager!
# nltk.download()
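In [ ]:
# The interactive manager isn't required; the specific corpus can be
# fetched directly as a one-time setup step.
nltk.download('gutenberg')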
In [46]:
# Get our source corpus from Gutenberg via NLTK.
emma_sents = corpus.gutenberg.sents('austen-emma.txt')
# Take the sample sentences we'll be using.
sample_sents = emma_sents[:20]
# Keep only the alphabetic tokens in each sentence and rejoin them into
# a plain string. (DataFrame.append was removed in pandas 2.0, so build
# the frame from a list of cleaned sentences instead.)
cleaned = [' '.join(filter(str.isalpha, sentence)) for sentence in sample_sents]
alpha_sentences = pd.DataFrame(cleaned)
print(alpha_sentences.head(10))
In [47]:
# Compare a raw tokenized sentence with its cleaned counterpart.
print(emma_sents[3], alpha_sentences.iloc[3, 0])
In [ ]:
# We'll need to vectorize the words into one dataframe, then build a
# second dataframe of per-character letter values; a sketch of one
# possible approach follows below.
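In [ ]:
# A minimal sketch of the plan above, assuming scikit-learn's
# CountVectorizer for the word step; the exact vectorizer (and what
# "letter values" should mean) is still an open choice, not settled.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
word_counts = vectorizer.fit_transform(alpha_sentences[0])
# One row per sentence, one column per vocabulary word.
word_df = pd.DataFrame(word_counts.toarray(),
                       columns=vectorizer.get_feature_names_out())
# For the letter-value frame, map each character to its Unicode ordinal;
# shorter sentences are NaN-padded on the right.
letter_df = pd.DataFrame([[ord(c) for c in s] for s in alpha_sentences[0]])
print(word_df.shape, letter_df.shape)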
In [44]:
from sklearn.decomposition import LatentDirichletAllocation as LDA
# LDA expects a document-term count matrix, not raw token lists, so fit
# on the word-count matrix built above rather than on emma_sents directly.
lda = LDA()
lda.fit(word_counts)
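In [ ]:
# A quick look at what the model learned: print the highest-weight words
# per topic. lda.components_ and the vectorizer vocabulary are standard
# scikit-learn attributes; showing ten words is an arbitrary choice.
words = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_words = [words[i] for i in topic.argsort()[-10:][::-1]]
    print(topic_idx, top_words)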