Simon #metoo step2


In [ ]:
import pandas as pd
pd.set_option('display.max_colwidth', None)  # None shows full cell contents (-1 is deprecated)

from string import punctuation
from collections import defaultdict

from gensim import corpora, models, matutils

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

import re
import glob

Turn on logging so gensim reports progress during training.


In [ ]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Document pre-processing

We start from units of text stored as individual text files in a folder.


In [ ]:
files = glob.glob('tmfiles/*')
docs = [open(f).read() for f in files]
len(docs)

Tokenize on whitespace, strip surrounding punctuation, and remove stopwords.


In [ ]:
stoplist = set(stopwords.words('english'))  # set membership checks are much faster than a list
texts = [[token for token in (word.strip(punctuation) for word in doc.lower().split())
          if token and token not in stoplist]
         for doc in docs]
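
The WordNetLemmatizer imported above is not used in this pass. If you want inflected forms to count as one word ('women' → 'woman'), a minimal sketch of an optional lemmatization step, left commented because it is not applied in the original analysis:


In [ ]:
## Optional: lemmatize tokens so inflected forms collapse into one entry.
## Sketch only; not part of the original pipeline.
#lmtzr = WordNetLemmatizer()
#texts = [[lmtzr.lemmatize(token) for token in text] for text in texts]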

Remove words that appear only once.


In [ ]:
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]

Vectorize the corpus (bag-of-words)

First create a mapping between tokens and their integer IDs.

Filter out words from the dictionary that occur in fewer than 3 documents or in more than 50% of the documents (the no_below and no_above arguments below).


In [ ]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=3, no_above=0.5)

Then, create a bag-of-words corpus of numerical document vectors.


In [ ]:
corpus = [dictionary.doc2bow(text) for text in texts]
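
To see what the vectorizer produces, we can inspect one document: each entry is a (token_id, count) pair, and the dictionary maps the ids back to words. A quick sanity check, not part of the pipeline:


In [ ]:
# Inspect the first few (token_id, count) pairs of the first document,
# then map the ids back to their tokens.
print(corpus[0][:10])
print([(dictionary[token_id], count) for token_id, count in corpus[0][:10]])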

Check dataset size.


In [ ]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Evaluations (e.g. of topic coherence) can be run to decide on an optimal number of topics. This takes time, and we won't do it here.
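
For reference, a sketch of one such evaluation using gensim's CoherenceModel, comparing coherence across candidate topic counts. The candidate values are arbitrary, and the code is left commented (like the training code below) because it trains one model per candidate:


In [ ]:
## Sketch: pick num_topics by comparing topic coherence. Slow!
#for k in [50, 100, 200, 500]:
#    m = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=k)
#    cm = models.CoherenceModel(model=m, texts=texts,
#                               dictionary=dictionary, coherence='c_v')
#    print(k, cm.get_coherence())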

Train the LDA model

The code below shows how to create the model and get a document-topic matrix. We will not run it here.


In [ ]:
#num_topics = 500
#passes = 15      # full passes through the entire corpus
#iterations = 50  # max iterations over each document
#eval_every = 2   # evaluate model perplexity every 2 updates

#lda_model = models.LdaModel(corpus=corpus,
#                            id2word=dictionary,
#                            num_topics=num_topics,
#                            eval_every=eval_every,
#                            iterations=iterations,
#                            passes=passes)

#lda_corpus = lda_model[corpus]  # transform: per-document topic distributions
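
Applying the trained model to a single bag-of-words vector gives that document's topic distribution as (topic_id, probability) pairs; left commented since the model isn't trained here:


In [ ]:
## Topic distribution for one document (requires the trained model above):
#lda_model[corpus[0]]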

In [ ]:
# View a sample of the topics: the top 20 words for each of 32 topics
#topics = lda_model.show_topics(num_topics=32, num_words=20)

An example topic:
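
show_topics returns each topic as a (topic_id, string) pair, the string listing top words with their weights. The cell below shows the format only; it is illustrative, not actual output from this model:


In [ ]:
## Illustrative format of one topic from show_topics (not real output):
## (0, '0.021*"word1" + 0.015*"word2" + 0.011*"word3" + ...')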


In [ ]:
## Convert the corpus into a sparse matrix, 
## in scipy.sparse.csc_matrix format, with documents as columns

#matrix = matutils.corpus2csc(lda_corpus)
#matrix

In [ ]:
#lda_df = pd.DataFrame(matrix.toarray())  # convert to a pandas DataFrame
#lda_df = lda_df.transpose()              # flip, so rows are docs and cols are topics

#lda_df.to_csv("topicmodel.csv")

To get the same data as in my original #metoo study, we load that exact saved topic model.


In [ ]:
df = pd.read_csv("topicmodel.csv")  # DataFrame.from_csv has been removed from pandas
df.head()

In [ ]:
df = df.sort_index()
df.rename(columns={'Unnamed: 0': 'day'}, inplace=True)
df = df.set_index('day')

# Convert the index to datetime
df.index = pd.to_datetime(df.index)

df.head()
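
With a datetime index in place, a topic's daily prevalence can be plotted directly. A sketch, assuming the topic columns kept the numeric string names pandas gave them when the CSV was written:


In [ ]:
# Plot one topic's prevalence over time.
# '0' is an assumed column name; adjust to your CSV's actual headers.
df['0'].plot(title='Topic 0 over time')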