In [7]:
import pandas as pd
import nltk
import re
In [8]:
# English stopword list. NOTE(review): not referenced anywhere else in this
# notebook chunk -- the vectorizer below uses its own stop_words='english';
# kept here presumably for interactive exploration.
stopwords = nltk.corpus.stopwords.words('english')
In [9]:
from nltk.stem.snowball import EnglishStemmer
# Snowball English stemmer, used by tokenize_and_stem below
stemmer = EnglishStemmer()
In [10]:
from nltk.tokenize import TreebankWordTokenizer, SpaceTokenizer
# Penn Treebank word tokenizer (SpaceTokenizer is imported but never used here)
tokenizer = TreebankWordTokenizer()
In [11]:
# quick sanity check of how the Treebank tokenizer splits punctuation
tokenizer.tokenize("Hello, My Name i`s Guy (Really!)")
Out[11]:
In [12]:
def tokenize_only(text):
    """Tokenize `text` with the Treebank tokenizer, keeping only tokens
    that contain at least one letter (drops numbers and raw punctuation)."""
    tokens = tokenizer.tokenize(text)
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    return filtered_tokens

def tokenize_and_stem(text):
    """Tokenize `text` like tokenize_only, then stem each surviving token.

    Returns a list. The original `map(stemmer.stem, tokens)` yields a lazy,
    one-shot iterator under Python 3 (it displays as `<map object>` and is
    exhausted after a single pass), so materialize the stems eagerly.
    """
    tokens = tokenize_only(text)
    stems = [stemmer.stem(token) for token in tokens]
    return stems
In [13]:
# sanity check: stemmed tokens for a short sentence
tokenize_and_stem("Hello, My Name is Guy (Really!)")
Out[13]:
Below I use the stemming/tokenizing and tokenizing-only functions to iterate over the document texts and build two parallel vocabularies: one stemmed and one only tokenized.
In [14]:
# Load the Wikipedia article dump -- assumes one row per article with at
# least 'name' and 'text' columns (both are used below); TODO confirm schema.
df = pd.read_csv('../data/wiki/wiki.csv.gz', encoding='utf8', index_col=None)
In [15]:
# Truncate each article to its first 3000 characters to bound the cost of
# tokenizing/stemming. NOTE: destructive -- overwrites the 'text' column.
df['text'] = df.text.str[:3000]
In [16]:
# Build two parallel vocabularies over every document -- one stemmed, one
# merely tokenized -- so stems can later be mapped back to readable words.
totalvocab_stemmed = []
totalvocab_tokenized = []
for doc_text in df.text:
    totalvocab_stemmed.extend(tokenize_and_stem(doc_text))
    totalvocab_tokenized.extend(tokenize_only(doc_text))
In [19]:
# Stem -> surface-word lookup table: the (deliberately non-unique) index is
# the stemmed form; 'words' holds the corresponding unstemmed tokens. Relies
# on tokenize_and_stem and tokenize_only emitting tokens in the same order.
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
In [20]:
vocab_frame.head(10)
Out[20]:
In [21]:
len(vocab_frame)
Out[21]:
In [ ]:
In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# max_df=0.01 keeps only rare terms (appearing in <1% of documents);
# min_df=3 drops terms seen in fewer than 3 documents.
# NOTE(review): combining stop_words='english' with a stemming tokenizer is
# inconsistent -- the stop list is matched against *stemmed* tokens, so some
# stopword variants slip through (sklearn warns about this); verify intent.
tfidf_vectorizer = TfidfVectorizer(max_df=0.01, min_df=3,
stop_words='english',
tokenizer=tokenize_and_stem, ngram_range=(1,3))
%time tfidf_matrix = tfidf_vectorizer.fit_transform(df.text)
print(tfidf_matrix.shape)
In [25]:
# Feature-index -> term-string mapping. NOTE(review): get_feature_names() is
# removed in scikit-learn >=1.2; switch to get_feature_names_out() when the
# environment is upgraded.
terms = tfidf_vectorizer.get_feature_names()
In [ ]:
In [ ]:
In [26]:
len(vocab_frame)
Out[26]:
In [27]:
# Redundant: `terms` was already computed identically above; harmless, kept
# only so this cell can be re-run in isolation.
terms = tfidf_vectorizer.get_feature_names()
In [28]:
# spot-check an arbitrary feature index (1000 is arbitrary, no special meaning)
idx = 1000
terms[idx]
Out[28]:
In [29]:
terms[2001]
Out[29]:
In [30]:
# .loc replaces the long-removed DataFrame.ix indexer (same label-based
# lookup: all surface words recorded for this stem).
vocab_frame.loc[terms[idx]].head(5)
Out[30]:
In [ ]:
In [ ]:
In [31]:
from sklearn.cluster import KMeans
num_clusters =30
km = KMeans(n_clusters=num_clusters)
%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()
In [32]:
len(clusters)
Out[32]:
In [ ]:
In [33]:
clustered = df.join(pd.DataFrame({'cluster': clusters}))
In [34]:
clustered.head()
Out[34]:
In [ ]:
In [35]:
km.cluster_centers_
Out[35]:
In [ ]:
In [36]:
# argsort() sorts each centroid's term weights in *ascending* order, so the
# last few columns of each row ([-6:], used below) are the indices of that
# cluster's highest-weight terms.
order_centroids = km.cluster_centers_.argsort()
In [37]:
order_centroids
Out[37]:
In [ ]:
In [ ]:
In [ ]:
In [38]:
# .loc replaces the removed .ix indexer; terms[idx] is a label lookup,
# returning every surface word recorded for this stem.
term_words = vocab_frame.loc[terms[idx]]
In [39]:
# Most frequent surface word for this stem. Series.keys() is an alias for
# Series.index, so index[0] of value_counts() is the modal word.
term_words['words'].value_counts().index[0]
Out[39]:
In [ ]:
In [40]:
# Name each cluster by the most-common surface words of its six
# highest-weight terms.
cluster_names = []
for centroid_term_order in order_centroids:
    words = []
    # [-6:] = indices of the six highest-weight terms for this cluster
    for ind in centroid_term_order[-6:]:
        # An n-gram term is split into unigrams so each stem can be looked up
        # in the stem->word table; .loc replaces the removed .ix indexer.
        term_words = vocab_frame.loc[terms[ind].split(' ')]
        # modal (most frequent) original word for the stem(s)
        best_word = term_words['words'].value_counts().index[0]
        words.append(best_word)
    cluster_names.append(', '.join(words))
In [41]:
cluster_names
Out[41]:
In [44]:
# Translate each numeric cluster label into its human-readable name
# (bound __getitem__ is equivalent to the lambda, including raising
# IndexError on an out-of-range label).
clustered['cluster_name'] = clustered.cluster.map(cluster_names.__getitem__)
In [45]:
clustered.head(60)
Out[45]:
In [ ]:
In [ ]:
In [ ]:
In [46]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise document distance = 1 - cosine similarity of tf-idf rows
# (a dense n_docs x n_docs matrix).
dist = 1 - cosine_similarity(tfidf_matrix)
In [47]:
import os # for os.path.basename
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
# (a stray bare `MDS()` call used to sit here -- it constructed and
# immediately discarded an estimator, so it has been removed)
# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist) # shape (n_samples, n_components)
xs, ys = pos[:, 0], pos[:, 1]
In [48]:
import seaborn as sns
In [49]:
# set up one color per *cluster*, indexed by cluster id 0..num_clusters-1.
# (The original asked for n_colors=len(clusters) -- one per document -- which
# makes adjacent cluster ids nearly indistinguishable in the legend.)
cluster_colors = sns.color_palette(n_colors=num_clusters)
In [50]:
%matplotlib inline
In [51]:
# create data frame that has the result of the MDS plus the cluster numbers
# and titles (a new name is used instead of clobbering the source `df`)
points = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=df['name']))
# group by cluster
groups = points.groupby('label')

# set up plot
fig, ax = plt.subplots(figsize=(30, 30)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# iterate through groups to layer the plot; the cluster id doubles as the
# lookup key into cluster_names / cluster_colors
for label, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[label], color=cluster_colors[label], mec='none')

ax.set_aspect('auto')
# hide ticks and tick labels on both axes -- tick_params takes booleans
# ('off'/'on' strings were removed in modern matplotlib); the y-axis call
# previously passed top= (a copy-paste from the x-axis block) where right=
# was intended
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False, labelleft=False)

ax.legend(numpoints=1) # show legend with only 1 point

# add label in x,y position with the label as the article title
# (.iloc replaces the removed .ix; the index is a RangeIndex so positional
# lookup matches the original label lookup)
for i in range(len(points)):
    ax.text(points.iloc[i]['x'], points.iloc[i]['y'], points.iloc[i]['title'], size=8)

# plt.show() #show the plot
# uncomment the below to save the plot if need be
plt.savefig('clusters_small_noaxes1.png', dpi=300)
In [ ]: