In [1]:
import os
# Directory containing the plain-text documents to analyse
TEXT_DIR = os.path.join(os.getcwd(), 'text/')
In [29]:
import os, os.path, codecs
import numpy as np
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
In [30]:
dir_data = "text"
file_paths = [os.path.join(dir_data, fname) for fname in os.listdir(dir_data) if fname.endswith(".txt") ]
documents = [codecs.open(file_path, 'r', encoding="utf8", errors='ignore').read() for file_path in file_paths ]
In [31]:
# Build a TF-IDF document-term matrix, keeping only terms that appear in at least 5 documents
tfidf = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, lowercase=True, strip_accents="unicode", use_idf=True, norm="l2", min_df=5)
A = tfidf.fit_transform(documents)
In [32]:
A
In [33]:
num_terms = len(tfidf.vocabulary_)
terms = [""] * num_terms
# Invert the vocabulary mapping so that terms[i] is the term for column i of A
for term in tfidf.vocabulary_.keys():
    terms[tfidf.vocabulary_[term]] = term
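Depending on the scikit-learn version installed, this index-to-term mapping is also available directly from the vectorizer, so the manual inversion above can be replaced with a single call (a sketch, assuming scikit-learn 1.0 or later):
In [ ]:
# Equivalent mapping in scikit-learn 1.0+: feature names ordered by column index
terms = list(tfidf.get_feature_names_out())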
In [34]:
# Factorise the TF-IDF matrix A into W x H with 30 topics
model = decomposition.NMF(init="nndsvd", n_components=30, max_iter=200)
W = model.fit_transform(A)   # document-topic weights
H = model.components_        # topic-term weights
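Here W holds the document-topic weights and H the topic-term weights of the factorisation. Before listing the topic terms, a quick sanity check is to look at each document's strongest topic; the sketch below uses numpy's argmax and the file_paths list built earlier:
In [ ]:
# Each row of W is a document; its largest entry indicates the dominant topic
dominant_topics = np.argmax(W, axis=1)
for file_path, topic_index in zip(file_paths[:10], dominant_topics[:10]):
    print("%s -> topic %d" % (os.path.basename(file_path), topic_index))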
In [35]:
for topic_index in range(H.shape[0]):
    # Top 10 terms with the largest weight in this topic
    top_indices = np.argsort(H[topic_index, :])[::-1][0:10]
    term_ranking = [terms[i] for i in top_indices]
    print("Topic %d: %s" % (topic_index, ", ".join(term_ranking)))
As you can see, more cleaning is needed: several of the topics above are dominated by French stop words. Even so, some of the topics are interesting; words like montmartre, bastille and eiffel appear, which is roughly what we would naively expect for Parisian locations.
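One way to deal with the French stop words is to pass an extended stop-word list to the vectorizer and re-fit it. The sketch below merges a small, purely illustrative sample of French stop words with scikit-learn's English list; a complete French list would come from elsewhere (for example NLTK's stopwords corpus):
In [ ]:
# Illustrative sample only - a real French stop-word list should be more complete
french_stop_words = {"le", "la", "les", "de", "des", "du", "et", "un", "une",
                     "est", "dans", "pour", "sur", "que", "qui", "au", "aux"}
custom_stop_words = list(ENGLISH_STOP_WORDS.union(french_stop_words))
tfidf = TfidfVectorizer(stop_words=custom_stop_words, lowercase=True, strip_accents="unicode",
                        use_idf=True, norm="l2", min_df=5)
A = tfidf.fit_transform(documents)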
In [45]:
n_samples = 20000
n_features = 1000
n_topics = 50
n_top_words = 20
from time import time

def print_top_words(model, feature_names, n_top_words):
    # Print the highest-weighted terms for each topic of a fitted model
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
# TF-IDF features (built for comparison; the LDA model below uses raw counts instead)
tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=5, #max_features=n_features,
                                   stop_words='english')
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.80, min_df=2, max_features=n_features,
stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(documents)
print("done in %0.3fs." % (time() - t0))
print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
% (n_samples, n_features))
lda = decomposition.LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
learning_method='online', learning_offset=50.,
random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)
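The fitted model can also be used to inspect individual documents: lda.transform maps the count matrix to per-document topic distributions. A short sketch, reusing the file_paths list from earlier:
In [ ]:
# Per-document topic distributions from the fitted LDA model
doc_topic = lda.transform(tf)
for file_path, dist in zip(file_paths[:5], doc_topic[:5]):
    print("%s -> topic %d (weight %.2f)" % (os.path.basename(file_path), np.argmax(dist), np.max(dist)))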
In [ ]: