In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [4]:
# Toy corpus: six short movie-related sentences used to demonstrate TF-IDF.
my_documents = [
    'The movie was about a spaceship and aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]

In [5]:
# TF-IDF vectorizer with default settings: lowercases, tokenizes on word
# boundaries, and weights term counts by inverse document frequency.
tfidf = TfidfVectorizer() 

# Apply fit_transform to document: csr_mat
# Learns the vocabulary from my_documents and returns a sparse
# (n_documents x n_terms) matrix in CSR format.
csr_mat = tfidf.fit_transform(my_documents)

In [6]:
print(csr_mat.toarray())


[[ 0.43127749  0.          0.          0.43127749  0.43127749  0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.25585931  0.          0.          0.          0.
   0.43127749  0.25585931  0.35365371]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.53194592  0.          0.38484912  0.          0.64870321  0.          0.
   0.          0.38484912  0.        ]
 [ 0.          0.40824829  0.          0.          0.          0.40824829
   0.          0.40824829  0.40824829  0.40824829  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.40824829  0.          0.          0.          0.        ]
 [ 0.          0.          0.44504881  0.          0.          0.
   0.44504881  0.          0.          0.          0.          0.36494639
   0.44504881  0.          0.          0.          0.26402928  0.          0.
   0.          0.          0.          0.26402928  0.36494639]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.4969801   0.          0.
   0.4969801   0.40753079  0.          0.29483799  0.          0.          0.
   0.40753079  0.          0.29483799  0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.44836665  0.          0.
   0.          0.54677906  0.          0.54677906  0.          0.
   0.44836665  0.          0.          0.        ]]

In [8]:
# Map column indices of csr_mat back to vocabulary terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and REMOVED in 1.2;
# get_feature_names_out() is the replacement. Fall back for very old versions.
try:
    words = tfidf.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0
    words = tfidf.get_feature_names()
words


Out[8]:
['about',
 'action',
 'alien',
 'aliens',
 'and',
 'awesome',
 'awful',
 'boring',
 'but',
 'characters',
 'cool',
 'films',
 'hate',
 'is',
 'liked',
 'more',
 'movie',
 'please',
 'really',
 'scenes',
 'space',
 'spaceship',
 'the',
 'was']

In [11]:
# Reduce the TF-IDF matrix to its top 5 latent components (a.k.a. LSA).
svd = TruncatedSVD(n_components=5)

# fit_transform fits the decomposition and projects the data in a single
# pass, replacing the redundant separate fit() + transform() calls.
svd.fit_transform(csr_mat)


Out[11]:
array([[  5.35010090e-01,  -3.33183344e-01,  -6.68408892e-17,
          5.55290628e-01,  -5.41864996e-01],
       [  7.17807689e-01,  -3.04653070e-01,   7.34799999e-15,
         -3.79199917e-01,   1.68303659e-01],
       [  5.66059640e-16,   1.47993232e-14,   1.00000000e+00,
          9.07456653e-15,   6.23152389e-16],
       [  5.90897413e-01,   1.48528691e-01,  -8.11228430e-15,
          5.47150194e-01,   5.58405856e-01],
       [  7.19645035e-01,   5.37964050e-02,   3.40645939e-15,
         -4.90642699e-01,  -1.28178931e-01],
       [  3.00665035e-01,   8.99536695e-01,  -1.32944813e-14,
          1.62488982e-02,  -2.28239594e-01]])

In [ ]:
# Perform the necessary imports
# Data from https://blog.lateral.io/2015/06/the-unknown-perils-of-mining-wikipedia/
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Create a TruncatedSVD instance: svd
# 50 components compresses the sparse article/word matrix before clustering.
svd = TruncatedSVD(n_components=50)

# Create a KMeans instance: kmeans
# random_state is pinned so cluster assignments are reproducible on
# Restart & Run All — KMeans initialization is stochastic otherwise.
kmeans = KMeans(n_clusters=6, random_state=42)

# Create a pipeline: pipeline
pipeline = make_pipeline(svd, kmeans)

In [ ]:
# Import pandas
import pandas as pd

# Fit the pipeline to articles
pipeline.fit(articles)

# Calculate the cluster labels: labels
labels = pipeline.predict(articles)

# Create a DataFrame aligning labels and titles: df
df = pd.DataFrame({'label': labels, 'article': titles})

# Display df sorted by cluster label
print(df.sort_values('label'))