In [32]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')  # make the graphs a bit prettier (pandas' old 'display.mpl_style' option no longer exists)
plt.rcParams['figure.figsize'] = (15, 5)
Reading at scale:
Martha Ballard's Diary: http://dohistory.org/diary/index.html
Cameron Blevins on topic modeling the diary: http://www.cameronblevins.org/posts/topic-modeling-martha-ballards-diary/
Richmond Dispatch
In [1]:
from IPython.display import Image
Image("http://journalofdigitalhumanities.org/wp-content/uploads/2013/02/blei_lda_illustration.png")
Out[1]:
In [4]:
import textmining_blackboxes as tm
##topic modeling helper functions!!
tm is our temporary helper, not a standard Python package!! Download it from my github: https://github.com/matthewljones/computingincontext
In [3]:
#see if package imported correctly
tm.icantbelieve("butter")
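If the import worked, you'll get a silly message back. For reference, a minimal sketch of what a test helper like this might look like (the real function in textmining_blackboxes may differ):
In [ ]:
# hypothetical sketch of the import-test helper
def icantbelieve(word):
    print("I can't believe it's not " + word)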
Let's keep using the remarkable narratives available from Documenting the American South (http://docsouth.unc.edu/docsouthdata/)--though the first example below works with a small corpus of British fiction.
This assumes you are storing your data in a directory in the same place as your IPython notebook.
In [61]:
our_texts, names=tm.readtextfiles("data/british-fiction-corpus")
In [62]:
names
Out[62]:
In [63]:
our_texts = tm.data_cleanse(our_texts)
# more necessary when the text is messy:
# eliminates escaped characters and the like
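data_cleanse is also a blackbox; a minimal sketch of the kind of cleanup such a function might perform (assumed, not the actual implementation):
In [ ]:
# hypothetical sketch of a data_cleanse-style helper
import re

def data_cleanse(texts):
    cleaned = []
    for text in texts:
        text = text.replace("\\n", " ").replace("\\t", " ")  # drop escaped characters
        text = re.sub(r"[^A-Za-z\s]", " ", text)             # strip punctuation and digits
        text = re.sub(r"\s+", " ", text)                     # collapse whitespace
        cleaned.append(text.lower().strip())
    return cleaned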
In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [8]:
vectorizer = TfidfVectorizer(min_df=0.5, stop_words='english', use_idf=True)
# min_df=0.5 keeps only words that appear in at least half the documents
In [9]:
document_term_matrix=vectorizer.fit_transform(our_texts)
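fit_transform returns a sparse matrix of documents by terms. To see what the vectorizer does on something small, here's a toy example (the three sentences are made up, purely illustrative):
In [ ]:
# toy illustration of TfidfVectorizer on a made-up corpus
toy = ["the cat sat on the mat",
       "the dog sat on the log",
       "cats and dogs and rain"]
toy_vectorizer = TfidfVectorizer()
toy_matrix = toy_vectorizer.fit_transform(toy)
print(toy_matrix.shape)  # (3 documents, vocabulary size)
print(toy_vectorizer.get_feature_names_out())  # get_feature_names() in older scikit-learn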
In [10]:
# now let's get our vocabulary--the names corresponding to the columns
vocab = vectorizer.get_feature_names_out()  # get_feature_names() in older scikit-learn
In [13]:
len(vocab)
Out[13]:
In [14]:
document_term_matrix.shape
Out[14]:
In [15]:
document_term_matrix_dense=document_term_matrix.toarray()
In [16]:
dtmdf=pd.DataFrame(document_term_matrix_dense, columns=vocab)
In [17]:
dtmdf
Out[17]:
In [11]:
#easy to program, but let's use a robust version from sklearn!
from sklearn.metrics.pairwise import cosine_similarity
In [12]:
similarity = cosine_similarity(document_term_matrix)
#note that `cosine_similarity` can take
#an entire matrix as its argument
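Under the hood, cosine similarity is just the dot product of two vectors divided by the product of their norms. A quick sanity check by hand with numpy:
In [ ]:
# cosine similarity by hand: dot product over the product of the norms
import numpy as np
a = np.array([1.0, 2.0, 0.0])
b = np.array([2.0, 1.0, 1.0])
print(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
print(cosine_similarity(np.array([a, b])))  # the off-diagonal entries match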
In [13]:
similarity_df = pd.DataFrame(similarity, index=names, columns=names)
similarity_df
Out[13]:
In [28]:
similarity_df.iloc[1].sort_values(ascending=False)  # .ix[1].order() in older pandas
Out[28]:
In [14]:
#here's the blackbox
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
positions = mds.fit_transform(1 - similarity)  # MDS wants dissimilarities, hence 1 - similarity
In [15]:
positions.shape
Out[15]:
It's an 11-by-2 matrix--or, simply, an (x, y) coordinate pair for each of our texts.
In [16]:
#let's plot it: I've set up a black box
tm.plot_mds(positions,names)
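plot_mds is another blackbox from the helper module; a plausible sketch of what such a function does with matplotlib (assumed, not the actual helper):
In [ ]:
# hypothetical sketch of a plot_mds-style helper:
# scatter the (x, y) positions and label each point with its text's name
def plot_mds_sketch(positions, names):
    plt.scatter(positions[:, 0], positions[:, 1])
    for (x, y), name in zip(positions, names):
        plt.annotate(name, (x, y))
    plt.show()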
In [17]:
names=[name.replace(".txt", "") for name in names]
In [18]:
tm.plot_mds(positions,names)
What has this got us?
It suggests that even this crude measure of similarity is able to capture something significant.
Note: the axes don't really mean anything.
Get the stoplist from the data directory in my github.
In [3]:
our_texts, names=tm.readtextfiles("Data/PCCIPtext")
In [5]:
our_texts=tm.data_cleanse(our_texts)
In [6]:
#improved stoplist--may be too complete
stop = []
with open('data/stoplist-multilingual') as f:
    stop = f.readlines()
stop = [word.strip('\n') for word in stop]
In [7]:
# gensim requires a list of documents, each a list of words
texts = [[word for word in document.lower().split() if word not in stop]
         for document in our_texts]
In [8]:
from gensim import corpora, models, similarities, matutils
"""gensim includes its own vectorizing tools"""
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
#doc2bow just means `doc`uments to `b`ag `o`f `w`ords
#ok, this has just vectorized our texts; it's another form
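To see the bag-of-words format concretely, feed one document to doc2bow and translate the token ids back into words:
In [ ]:
# doc2bow returns (token_id, count) pairs for a single document
example_bow = dictionary.doc2bow(texts[0])
print(example_bow[:10])
print([(dictionary[token_id], count) for (token_id, count) in example_bow[:10]])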
In [10]:
number_topics=40
model = models.LdaModel(corpus, id2word=dictionary, num_topics=number_topics, passes=10)
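If you have multiple cores, gensim also ships a parallel trainer with essentially the same interface:
In [ ]:
# optional: gensim's multicore LDA trains faster on multi-core machines
# model = models.LdaMulticore(corpus, id2word=dictionary, num_topics=number_topics, passes=10)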
In [11]:
model.show_topics()
Out[11]:
In [14]:
# with formatted=False, recent gensim returns (topic_id, [(word, weight), ...]) pairs
topics_indexed = [[word for (word, weight) in words]
                  for (topic_id, words) in model.show_topics(num_topics=number_topics, num_words=10, formatted=False)]
topics_indexed = pd.DataFrame(topics_indexed)
In [15]:
topics_indexed
Out[15]:
So which topics are most significant for each document? Pass a bag-of-words version of each document to the model.
In [16]:
model[dictionary.doc2bow(texts[1])]
Out[16]:
Let's find them for every document--with a list comprehension, of course
In [17]:
primarytopics=[model[dictionary.doc2bow(text)] for text in texts]
Make it pretty: convert each sparse topic list to a dense vector with matutils.sparse2full and collect them in a DataFrame.
In [39]:
import numpy as np
primarytopics_matrix = pd.DataFrame(
    np.array([matutils.sparse2full(primarytopic, number_topics) for primarytopic in primarytopics]))
In [54]:
primarytopics_matrix.iloc[18].plot(kind="bar")  # .ix in older pandas
Out[54]:
In [55]:
primarytopics_matrix.iloc[18].plot(kind="bar", title=names[18])
Out[55]:
In [58]:
topics_indexed.loc[[6, 21, 27, 38]]
Out[58]:
In [59]:
#FIND which documents focus on a particular topic
In [60]:
primarytopics_matrix[27].plot(kind="bar")
Out[60]:
In [53]:
primarytopics_matrix[17].plot(kind="bar", title=str(topics_indexed.loc[17]))
Out[53]:
In [ ]:
#that's ugly!
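One way to tidy it up: join the topic's top words into a single-line string for the title. A sketch:
In [ ]:
# a tidier title: join the topic's top words into one line
title = " ".join(topics_indexed.loc[17].tolist())
primarytopics_matrix[17].plot(kind="bar", title=title)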