In [ ]:
import pandas as pd
# Show full (untruncated) cell contents in DataFrame display.
# -1 was deprecated in pandas 1.0 and removed in 2.0 (raises ValueError);
# None is the supported way to disable truncation.
pd.set_option('display.max_colwidth', None)
from string import punctuation
from collections import defaultdict
from gensim import corpora, models, matutils
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import re
import glob
Turn on logging.
In [ ]:
import logging
# Timestamped INFO-level logging so library progress messages (e.g. gensim
# training output, presumably) are visible while the notebook runs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [ ]:
# Read every document in tmfiles/ into memory as one string per file.
# The original used open(f).read() inside a comprehension, which leaves
# file handles open until garbage collection; `with` closes each file
# deterministically.
files = glob.glob('tmfiles/*')
docs = []
for path in files:
    with open(path) as fh:
        docs.append(fh.read())
len(docs)
Remove stopwords and tokenize.
In [ ]:
# Build the English stopword collection once, as a set: membership tests are
# O(1) instead of the O(n) scan a list would need for every single token.
# (The original list comprehension was also a pointless element-wise copy.)
stoplist = set(stopwords.words('english'))
# Lowercase and whitespace-tokenize each document, dropping stopwords.
texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in docs]
Remove words that appear only once.
In [ ]:
# Count how often each token occurs across the entire corpus.
frequency = {}
for text in texts:
    for token in text:
        frequency[token] = frequency.get(token, 0) + 1

# Keep only tokens that appear more than once overall.
texts = [[token for token in text if frequency[token] > 1] for text in texts]
Filter out words from the dictionary that occur in fewer than x documents, or in more than y% of the documents.
In [ ]:
# Map every unique token to an integer id.
dictionary = corpora.Dictionary(texts)
# Drop tokens appearing in fewer than 3 documents or in more than 50% of them.
dictionary.filter_extremes(no_below=3, no_above=0.5)
Then, create a bag-of-words corpus of numerical document vectors.
In [ ]:
# Vectorize each tokenized document as a bag-of-words list of (id, count) pairs.
corpus = list(map(dictionary.doc2bow, texts))
Check dataset size.
In [ ]:
# Report the corpus dimensions before any modeling is attempted.
n_tokens = len(dictionary)
n_docs = len(corpus)
print('Number of unique tokens: %d' % n_tokens)
print('Number of documents: %d' % n_docs)
Evaluations can be run to decide optimal number of topics. This takes time, and we won't do this now.
In [ ]:
#num_topics = 500
#passes = 15 # loops through the entire corpus
#iterations = 50 # runs through each document
#eval_every = 2 # evaluate model perplexity
#lda_model = models.LdaModel(corpus=corpus,
#id2word=dictionary,
#num_topics=num_topics,
#eval_every=eval_every,
#iterations=iterations,
#passes=passes)
#lda_corpus = lda_model[corpus]
In [ ]:
# VIEW TOPICS
#topics = lda_model.show_topics(num_topics=32, num_words=20)
An example topic:
In [ ]:
## Convert the corpus into a sparse matrix,
## in scipy.sparse.csc_matrix format, with documents as columns
#matrix = matutils.corpus2csc(lda_corpus)
#matrix
In [ ]:
#lda_df = pd.DataFrame(matrix.toarray()) # convert to pandas
#lda_df = pd.DataFrame.transpose(lda_df) # flip rows / columns
##df rows are docs, cols are topics
#lda_df.to_csv("topicmodel.csv")
To reproduce the data from my original #metoo study, we load that exact precomputed topic model from disk.
In [ ]:
# Load the precomputed topic model table.
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# pd.read_csv with index_col=None is the documented replacement and yields
# the same frame here (the unnamed first column stays as 'Unnamed: 0',
# which the next cell renames to 'day').
df = pd.read_csv("topicmodel.csv", index_col=None)
df.head()
In [ ]:
# Tidy the topic-model frame: order the rows, then promote the date column
# (read in as 'Unnamed: 0') to be the index.
df = df.sort_index()
df = df.rename(columns={'Unnamed: 0': 'day'}).set_index('day')
# Parse the index labels into datetime objects.
df.index = pd.to_datetime(df.index)
df.head()