In [ ]:
%matplotlib inline
# You can ignore the pink warning that appears
In [ ]:
import itertools
import math
import nltk
import string
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
Term Frequency: Number of appearances of a word in a document (the counts we saw already)
Document Frequency: Number of documents that contain a word in a set of docs
TF-IDF is Term Frequency times Inverse Document Frequency (so, roughly, term frequency divided by document frequency), with some extra fiddles.
Example from Manning, Raghavan, and Schuetze showing IDF of a rare term is high:
TF-IDF for a word t and document d is usually calculated as:
(word t's frequency in the doc) * log(number of docs / number of docs that contain word t)
In practice, though, it is usually computed with a "+1" smoothing term or two. You can consider it an information measure for document words (or "features") in a bag-of-words style analysis, where the order of the words doesn't matter, just the set of words. It is a "weight" for a word: the weight is highest for words that occur often in a document but in few other documents, and it drops toward zero for words that appear in nearly every document.
See the discussion in Manning, Raghavan, and Schuetze, and even more math in Wikipedia. Depending on implementation, TF-IDF may or may not be normalized. Always check to see if the implementation you use cleans stopwords or not and decide if you like that.
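To make the idf behavior concrete, here is a quick numeric sketch (added here; the terms and pretend collection size echo the Manning, Raghavan, and Schuetze rare-term example, but the exact numbers are only illustrative). It also shows the kind of "+1" smoothing scikit-learn's TfidfVectorizer applies by default, idf = ln((1 + N) / (1 + df)) + 1:
In [ ]:
# idf for a rare term vs. a very common one, in a pretend collection of a million docs
import math

N = 1000000  # pretend number of documents in the collection
for term, df in [("calpurnia", 1), ("sunday", 1000), ("the", 1000000)]:
    idf_plain = math.log10(float(N) / df)                # plain idf, log base 10
    idf_smooth = math.log((1.0 + N) / (1.0 + df)) + 1    # sklearn-style smoothed idf (natural log)
    print("%-10s df=%7d   idf=%.2f   smoothed idf=%.2f" % (term, df, idf_plain, idf_smooth))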
Some more Python references:
In languages other than Python, for instance:
In [ ]:
# code example from Building Machine Learning Systems with Python (Richert & Coelho)
# - modified slightly by Lynn
import math
def tfidf(t, d, D):
    tf = float(d.count(t)) / sum(d.count(w) for w in set(d))  # normalized
    # Note his version doesn't use +1 in denominator.
    idf = math.log(float(len(D)) / (len([doc for doc in D if t in doc])))
    return tf * idf
a, abb, abc = ["a"], ["a", "b", "b"], ["a", "b", "c"] # try adding another c to the last doc!
D = [a, abb, abc]
print(tfidf("a", a, D)) # a is in all of them
print(tfidf("a", abc, D)) # a is in all of them
print(tfidf("b", abc, D)) # b occurs only once here, but in 2 docs
print(tfidf("b", abb, D)) # b occurs more frequently in this doc
print(tfidf("c", abc, D)) # c is unique in the doc set
What if you change some of those docs, or add another one? Try adding another "c" to the last doc, for example.
In [ ]:
filelist = !ls ../data/movie_reviews/positive/*
In [ ]:
filelist
In [ ]:
from nltk.corpus import stopwords
import collections
def clean_tokens(tokens, stopwords):
    """ Lowercases, takes out punct and stopwords and short strings """
    import string
    return [token.lower() for token in tokens if (token not in string.punctuation) and (token.lower() not in stopwords) and len(token) > 2]

def makeText(filename, stopwords):
    from nltk import Text
    with open(filename) as handle:
        text = handle.read()
    return Text(clean_tokens(nltk.word_tokenize(text.decode('ascii', 'ignore')), stopwords))

def makeTextCollection(files, stopwords=stopwords.words('english')):
    from nltk import TextCollection
    texts = [makeText(filename, stopwords) for filename in files]
    collection = TextCollection(texts)
    return collection, texts

# use the data for the vocab in a single doc for a wordcloud, for instance
def compute_tfidf_by_doc(coll, texts, filenames):
    tfidf_by_doc = collections.defaultdict(list)
    for i, text in enumerate(texts):
        for word in set(text.tokens):  # just use the words in this text
            tfidfscore = coll.tf_idf(word, text)
            tf = coll.tf(word, text)  # is actually count / len(text)
            count = text.count(word)
            if tfidfscore:
                tfidf_by_doc[filenames[i]].append({
                    "word": word,
                    "tfidf": tfidfscore,
                    "tf": tf,
                    "count": count
                })
    return tfidf_by_doc
In [ ]:
# We need to make the text collection, then use it to compute the tf-idf for the words in the docs.
res = makeTextCollection(filelist)
coll = res[0]
texts = res[1]
In [ ]:
coll.tf_idf("woman", texts[0])
In [ ]:
tfidfs = compute_tfidf_by_doc(coll, texts, filelist)
In [ ]:
tfidfs[tfidfs.keys()[0]] # the first filename is the first key... it contains a list of words and scores
In [ ]:
import json
jsonified = json.dumps(tfidfs)
with open('../outputdata/pos_movies_tfidf.json', 'w') as handle:
    handle.write(jsonified)
In [ ]:
!ls -al ../outputdata/pos_movies_tfidf.json
Now we can look at these reviews as little wordclouds, using different measures to size our words. Let's work with word_clouds_tfidf.html and compare how our clouds look using regular word counts, term frequencies (count / length of the document), and tf-idf across all the documents.
For movie review cv681_tok-28559.txt, the cloud sized by raw counts is useless:
By tf (term frequency, normalized to document length), it's a bit better, in that some of the smaller words are getting larger:
It's still not stellar with tf-idf, but the more unique words are popping a bit better, and the words shared across many docs, like "film" and "movie", are disappearing:
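As a quick sanity check on those clouds, here is a small sketch (added here, using the tfidfs dictionary built above) that ranks one review's words by each of the three measures and prints the top ten:
In [ ]:
# compare the top words for one review under each measure
fname = list(tfidfs.keys())[0]   # pick any one review
for measure in ("count", "tf", "tfidf"):
    top = sorted(tfidfs[fname], key=lambda d: d[measure], reverse=True)[:10]
    print("%s: %s" % (measure, [d["word"] for d in top]))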
This is where some more libraries start to be needed. NLTK is fine for some things, but can be very slow for stuff like making vectors of terms across large numbers of documents.
Each document is a collection of weighted words, which we'll call a vector. Vectors can be compared to each other, to compute similarity. A common metric is "cosine similarity." Image from Manning, Raghavan and Schuetze:
Reminder: vectors pointing in nearly the same direction have a cosine near 1, and vectors far apart (orthogonal) have a cosine near 0. So cosine itself is a similarity measure; if you want a distance, where a higher score means further apart, subtract it from 1. Think of it as cosine = similarity, 1 - cos = distance. Some libraries do this conversion for you (scipy's pdist with metric='cosine', used below, returns the 1 - cos distance form; check which form Pattern or any other library gives you).
Another metric, perhaps simpler to understand, is Euclidean distance (image from this article):
This is essentially the straight-line distance between the tips of the two vectors, computed with the Pythagorean theorem (think hypotenuse). Larger numbers = vectors that are further apart!
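Here is a tiny sketch (not from the original notebook) comparing the two measures on toy count vectors; note that scipy's cosine() returns the distance form, 1 - cos:
In [ ]:
import numpy as np
from scipy.spatial.distance import cosine, euclidean

doc1 = np.array([2.0, 1.0, 0.0])   # toy counts over a 3-word vocabulary
doc2 = np.array([4.0, 2.0, 0.0])   # same direction as doc1, just "longer"
doc3 = np.array([0.0, 0.0, 3.0])   # shares no words with doc1

print("cosine distance doc1 vs doc2: %.3f" % cosine(doc1, doc2))  # ~0.0, same direction
print("cosine distance doc1 vs doc3: %.3f" % cosine(doc1, doc3))  # 1.0, nothing in common
print("euclidean doc1 vs doc2: %.3f" % euclidean(doc1, doc2))     # nonzero, the lengths differ
print("euclidean doc1 vs doc3: %.3f" % euclidean(doc1, doc3))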
In [ ]:
# Load in the docs... again. We're going to make TF-IDF vectors with sklearn (scikit-learn) because it's faster.
def load_texts(filenames, dirpath):
    """ filenames are the leaves, dirpath is the path to them with the / """
    loaded_text = {}
    for filen in filenames:
        with open(dirpath + filen) as handle:
            loaded_text[filen] = handle.read()
    return loaded_text
In [ ]:
texts = load_texts(filelist, "")
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer().fit_transform([text.decode('ascii', 'ignore') for text in texts.values()])
In [ ]:
vectors = tfidf.toarray()
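If you also want to know which column of that matrix corresponds to which word, keep a reference to the vectorizer itself. A hedged sketch (the stop_words='english' option is my addition here; older scikit-learn versions expose get_feature_names(), newer ones get_feature_names_out()):
In [ ]:
vec = TfidfVectorizer(stop_words='english')   # optionally strip English stopwords
matrix = vec.fit_transform([text.decode('ascii', 'ignore') for text in texts.values()])
feature_names = vec.get_feature_names()       # or vec.get_feature_names_out() on newer sklearn
print("%d terms; first few: %s" % (len(feature_names), feature_names[:10]))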
This gets us the input we need for clustering and making dendrograms.
In [ ]:
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram
In [ ]:
vectors
Scipy's pdist computes pairwise distances - see http://docs.scipy.org/doc/scipy/reference/spatial.distance.html. You can use cosine here as well, or a host of other options...
In [ ]:
dist = pdist(vectors, metric='cosine') # look at the manpage and pick a different measure to try
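A small follow-up sketch (added here): squareform() expands the condensed distance vector into a full square matrix, handy for eyeballing individual pairs, and the same pdist call accepts other metrics:
In [ ]:
dist_matrix = squareform(dist)                       # (n_docs, n_docs) symmetric matrix
print(dist_matrix.shape)
print(dist_matrix[0, :5])                            # cosine distances from the first review to a few others
dist_euclidean = pdist(vectors, metric='euclidean')  # same call, different metric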
In [ ]:
linkage(dist)
In [ ]:
# this is a base diagram, using defaults...
dendrogram(linkage(dist)) # this plotting function has a ton of things you can manipulate if you look at the docs.
In [ ]:
texts[texts.keys()[14]]
In [ ]:
def make_dend(data, labels=None, height=6):
    from pylab import rcParams
    dist = pdist(data, metric='cosine')
    link = linkage(dist, method='complete')
    rcParams['figure.figsize'] = 6, height
    rcParams['axes.labelsize'] = 5
    if not labels:
        dend = dendrogram(link, orientation='right')  #labels=names)
    else:
        dend = dendrogram(link, orientation='right', labels=[str(i) + label for i, label in enumerate(labels)])
    return dist
In [ ]:
dist = make_dend(vectors, height=15, labels=texts.keys())
In [ ]:
texts.keys()[23]
In [ ]:
texts[texts.keys()[23]]
In [ ]:
texts[texts.keys()[4]]
What do you notice about them both?
In [ ]:
# Code borrowed from: http://nbviewer.ipython.org/github/OxanaSachenkova/hclust-python/blob/master/hclust.ipynb
def make_heatmap_matrix(dist, method='complete'):
    """ Pass in the distance matrix; method options are complete or single """
    # Compute and plot first dendrogram.
    fig = plt.figure(figsize=(10,10))
    # x ywidth height
    ax1 = fig.add_axes([0.05,0.1,0.2,0.6])
    Y = linkage(dist, method=method)
    Z1 = dendrogram(Y, orientation='right') # adding/removing the axes
    ax1.set_xticks([])
    # Compute and plot second dendrogram.
    ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
    Z2 = dendrogram(Y)
    ax2.set_xticks([])
    ax2.set_yticks([])
    # Compute and plot the heatmap
    axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
    idx1 = Z1['leaves']
    idx2 = Z2['leaves']
    D = squareform(dist)
    D = D[idx1,:]
    D = D[:,idx2]
    im = axmatrix.matshow(D, aspect='auto', origin='lower', cmap=plt.cm.YlGnBu)
    axmatrix.set_xticks([])
    axmatrix.set_yticks([])
    # Plot colorbar.
    axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
    plt.colorbar(im, cax=axcolor)
In [ ]:
make_heatmap_matrix(dist, method='complete')
In [ ]:
## clustering in NLTK:
import numpy
from nltk.cluster import KMeansClusterer, GAAClusterer, euclidean_distance
import nltk.corpus
import nltk.stem
stemmer_func = nltk.stem.snowball.SnowballStemmer("english").stem
stopwords = set(nltk.corpus.stopwords.words('english'))
cluster = KMeansClusterer(4, euclidean_distance)  # ask for 4 clusters
cluster.cluster(vectors)  # learn the cluster means from our tf-idf vectors
classified_examples = [cluster.classify(vec) for vec in vectors]  # assign each doc to its nearest cluster
In [ ]:
for i, val in enumerate(classified_examples):
    print val, texts.keys()[i]
In [ ]:
texts['../data/movie_reviews/positive/cv677_tok-11867.txt']
In [ ]:
texts['../data/movie_reviews/positive/cv684_tok-10367.txt']
In [ ]:
texts['../data/movie_reviews/positive/cv683_tok-12295.txt']
For another, more advanced tour of clustering, including plotting these, see the notebook cluster_analysis_by_brandon_rose.ipynb.
In [ ]: