In [4]:
import pandas as pd
import nltk
import re

In [5]:
stopwords = nltk.corpus.stopwords.words('english')

In [42]:
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()

In [34]:
from nltk.tokenize import TreebankWordTokenizer, SpaceTokenizer
tokenizer = TreebankWordTokenizer()

In [43]:
tokenizer.tokenize("Hello, My Name is Guy (Really!)")


Out[43]:
['Hello', ',', 'My', 'Name', 'is', 'Guy', '(', 'Really', '!', ')']

In [233]:
def tokenize_only(text):
    tokens = tokenizer.tokenize(text)
    
    # keep only tokens that contain at least one letter, are longer than one
    # character, and contain no backtick (drops numbers, bare punctuation,
    # and Treebank-style quote tokens)
    filtered_tokens = [token for token in tokens if (re.search('[a-zA-Z]', token) 
                                                     and len(token) > 1 
                                                     and token.find('`') == -1)]
    
    return filtered_tokens

def tokenize_and_stem(text):
    tokens = tokenize_only(text)
    stems = [stemmer.stem(token) for token in tokens]  # a list, not a lazy map, so it can be iterated more than once
    return stems

In [234]:
tokenize_only("Hello, My Name is Guy (Really!)")


Out[234]:
['Hello', 'My', 'Name', 'is', 'Guy', 'Really']

Below I use the stemming/tokenizing and plain tokenizing functions to iterate over the documents and build two vocabularies: one stemmed and one only tokenized. Because both lists are built in the same order, they line up position for position, which lets us map each stem back to a readable word later.


In [269]:
df = pd.read_csv('../data/wiki/wiki.csv.gz', encoding='utf8', index_col=None)

In [270]:
df['text'] = df.text.str[:3000]  # truncate each article to its first 3,000 characters

In [238]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for doc_text in df.text:
    allwords_stemmed = tokenize_and_stem(doc_text)  # tokenize and stem each document
    totalvocab_stemmed.extend(allwords_stemmed)
    
    allwords_tokenized = tokenize_only(doc_text)    # tokenize only, keeping the surface forms
    totalvocab_tokenized.extend(allwords_tokenized)

In [239]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [240]:
vocab_frame.head(10)


Out[240]:
           words
may          may
refer      refer
to            to
number    number
the          the
integ    integer
between  between
and          and
year        year
year        year
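
Each row of vocab_frame pairs a stemmed token (the index) with the surface form it came from, so the index is deliberately not unique: a stem appears once per occurrence in the corpus. It acts as a lookup table for translating stemmed feature names back into readable words, which we will rely on when naming clusters below.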

In [241]:
len(vocab_frame)


Out[241]:
336092

Tf-idf and document similarity
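
Tf-idf weights a term by how often it appears in a document, discounted by how many documents in the corpus contain it: roughly tf(t, d) × log(N / df(t)) (scikit-learn uses a smoothed variant of the idf and L2-normalizes each row). A toy example on a hypothetical three-document corpus, just to show the shape of the output:

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

toy_docs = ["the cat sat", "the cat ran", "dogs bark"]  # hypothetical mini-corpus
toy_matrix = TfidfVectorizer().fit_transform(toy_docs)
toy_matrix.shape  # (3 documents, 6 distinct terms), stored as a sparse matrix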


In [528]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=12, min_df=3, 
                                   stop_words='english',
                                   tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(df.text)

print(tfidf_matrix.shape)


Wall time: 13.6 s
(835, 10124)
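
Note that because max_df and min_df are passed as integers, they are absolute document counts: a term is kept only if it appears in at least 3 and at most 12 of the 835 documents, which biases the vocabulary toward rare, topical terms. Also, stop_words='english' is matched against unstemmed stop words while our tokenizer stems, so a few stemmed stop-word variants can slip through.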

In [529]:
terms = tfidf_vectorizer.get_feature_names()

In [530]:
len(terms)


Out[530]:
10124

In [531]:
len(vocab_frame)


Out[531]:
336092

In [536]:
idx = 2742
terms[idx]


Out[536]:
u'diagnos'

In [537]:
terms[800]


Out[537]:
u'area border'

In [538]:
vocab_frame.loc[terms[idx]].head(5)


Out[538]:
             words
diagnos  diagnosed
diagnos  diagnosed
diagnos  diagnosed
diagnos  diagnoses
diagnos  diagnosed

Let's cluster!
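
K-means treats each tf-idf row as a point in 10,124-dimensional space and searches for centroids that minimize the within-cluster sum of squared distances. The choice of 10 clusters below is arbitrary; a quick sanity check (a sketch, not part of the original run) is to fit a few values of k and look for an elbow in the inertia curve:

In [ ]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# inertia_ = within-cluster sum of squared distances; lower means tighter clusters
inertias = [KMeans(n_clusters=k).fit(tfidf_matrix).inertia_ for k in range(2, 21)]
plt.plot(range(2, 21), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')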


In [539]:
from sklearn.cluster import KMeans

num_clusters = 10

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()


Wall time: 620 ms

In [540]:
len(clusters)


Out[540]:
835

But what did we get?


In [541]:
clustered = df.join(pd.DataFrame({'cluster': clusters}))

In [542]:
clustered.head()


Out[542]:
name text cluster
0 0 0 may refer to: 0 (number) , the integer b... 0
1 11.22.63 11.22.63 is an American science fiction ... 5
2 2016 2016 has been designated as: Inte... 4
3 4chan 4chan is an English-language image... 8
4 8chan 8chan, also called Infinitechan, is a... 7

In [543]:
len(km.cluster_centers_[1])


Out[543]:
10124

In [544]:
order_centroids = km.cluster_centers_.argsort()  # ascending, so the last entries in each row are the cluster's top terms

In [545]:
order_centroids


Out[545]:
array([[10123,  5672,  5671, ...,  3310,  4481,  7596],
       [    0,  6177,  6176, ...,  4035,  2446,   727],
       [    0,  5817,  5816, ...,  3367,  5681,  6988],
       ..., 
       [    0,  5902,  5901, ...,  9132,  6410,  6533],
       [    0,  5936,  5935, ...,  7037,  6250,  3860],
       [    0,  6355,  6354, ...,  1918,  9647,  6630]], dtype=int64)
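
argsort sorts each centroid's weights in ascending order, so the rightmost entries of each row are the indices of that cluster's highest-weighted terms; the naming loop below takes [-6:] for exactly this reason.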

In [546]:
term_words = vocab_frame.loc[terms[idx]]

In [547]:
term_words['words'].value_counts().index[0]  # most frequent surface form of the stem


Out[547]:
u'diagnosed'
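
value_counts sorts descending, so its first index is the surface form that most often produced this stem; that is the trick used below to turn stemmed feature names back into readable words.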

In [548]:
cluster_names = []

for centroid_order in order_centroids:
    words = []
    for ind in centroid_order[-6:]:  # the six highest-weighted terms for this cluster
        # an n-gram feature maps to several stems; look each one up in vocab_frame
        term_words = vocab_frame.loc[terms[ind].split(' ')]
        
        # pick the most common surface form of the stem(s)
        best_word = term_words['words'].value_counts().index[0]
        
        words.append(best_word)
        
    cluster_names.append(', '.join(words))

In [549]:
cluster_names


Out[549]:
[u'ontology, metaphysics, Prussia, epistemology, Hulk, REDIRECT',
 u'dinosaurs, shark, cat, gorilla, crocodile, apes',
 u'crew, Muhammad, Sudan, Ethiopia, Mauritius, Plato',
 u'cable, WWE, Championship, automobile, Wrestling, Volkswagen',
 u'procedure, managemen, antibiotics, amphetamine, vaccine, benzodiazepines',
 u'hop, No., certified, guitar, guitarist, DJ',
 u'Wikipedia, HTML, iPhone, Google, email, browser',
 u'Aboriginal, Finland, Ontario, Texas, Norway, Ohio',
 u'Ukraine, Hungary, Colombia, Poland, Netherlands, fruit',
 u'Pornhub, porn, penis, clitoris, vaginal, orgasm']
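
These names are just each centroid's six heaviest terms mapped back to full words; treat them as rough descriptive labels rather than ground-truth topics.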

In [550]:
clustered['cluster_name'] = clustered.cluster.map(lambda cluster: cluster_names[cluster])

In [551]:
clustered.head(10)


Out[551]:
name text cluster cluster_name
0 0 0 may refer to: 0 (number) , the integer b... 0 ontology, metaphysics, Prussia, epistemology, ...
1 11.22.63 11.22.63 is an American science fiction ... 5 hop, No., certified, guitar, guitarist, DJ
2 2016 2016 has been designated as: Inte... 4 procedure, managemen, antibiotics, amphetamine...
3 4chan 4chan is an English-language image... 8 Ukraine, Hungary, Colombia, Poland, Netherland...
4 8chan 8chan, also called Infinitechan, is a... 7 Aboriginal, Finland, Ontario, Texas, Norway, Ohio
5 Aaliyah Aaliyah Dana Haughton ( ; January 16,... 5 hop, No., certified, guitar, guitarist, DJ
6 ABBA ABBA (stylised ᗅᗺᗷᗅ ; ) were a Swed... 5 hop, No., certified, guitar, guitarist, DJ
7 Abraham Abraham ( ; , ), originally Abra... 9 Pornhub, porn, penis, clitoris, vaginal, orgasm
8 Achilles Achilles and the Nereid Cymothoe: Attic ... 6 Wikipedia, HTML, iPhone, Google, email, browser
9 Adderall Adderall is a co mbination drug con... 4 procedure, managemen, antibiotics, amphetamine...

In [502]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
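
Cosine similarity compares two documents by the angle between their tf-idf vectors, cos(x, y) = x·y / (‖x‖ ‖y‖); since tf-idf entries are non-negative, dist = 1 − cos ranges from 0 (same direction) to 1 (no shared terms). The result is a symmetric 835×835 distance matrix.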

In [503]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we also fix `random_state` so the plot is reproducible
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

xs, ys = pos[:, 0], pos[:, 1]
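
Multidimensional scaling searches for a 2-D embedding whose pairwise Euclidean distances approximate the precomputed cosine distances, which is what lets us scatter-plot the whole corpus below.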

In [307]:
import seaborn as sns

In [308]:
# set up one color per cluster (cluster_colors is a list indexed by cluster label)
cluster_colors = sns.color_palette(n_colors=num_clusters)

In [309]:
%matplotlib inline

In [310]:
# create a data frame with the MDS coordinates plus the cluster labels and article titles
df_print = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=df['name'])) 

#group by cluster
groups = df_print.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(30, 30)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

# iterate through groups to layer the plot
# note: cluster_names and cluster_colors are lists indexed by the cluster label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name], mec='none')
    ax.set_aspect('auto')
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # no tick labels along the bottom
    ax.tick_params(
        axis='y',           # same for the y-axis
        which='both',
        left=False,
        right=False,
        labelleft=False)
    
ax.legend(numpoints=1)  #show legend with only 1 point

# add a text label at each (x, y) position with the article title
for i in range(len(df)):
    ax.text(df_print.loc[i, 'x'], df_print.loc[i, 'y'], df_print.loc[i, 'title'], size=8)  

    
    
#plt.show() # show the plot inline

# save the plot to disk; comment out if not needed
plt.savefig('clusters_small_noaxes.png', dpi=300)


