notebook.community

Edit and run



In [0]:

    
import pandas as pd
import numpy as np
from google.colab import files



In [2]:

    
uploaded = files.upload()









    





     
     
      Upload widget is only available when the cell has been executed in the
      current browser session. Please rerun this cell to enable.
      
       






    



Saving movie_overview.csv to movie_overview.csv



In [3]:

    
from sklearn.feature_extraction.text import TfidVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt









    



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-3-c3e1be46d542> in <module>()
----> 1 from sklearn.feature_extraction.text import TfidVectorizer
      2 from sklearn.cluster import KMeans
      3 from wordcloud import WordCloud
      4 import matplotlib.pyplot as plt

ImportError: cannot import name 'TfidVectorizer'

---------------------------------------------------------------------------
NOTE: If your import is failing due to a missing package, you can
manually install dependencies using either !pip or !apt.

To view examples of installing some common dependencies, click the
"Open Examples" button below.
---------------------------------------------------------------------------



In [0]:

    
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt



In [0]:

    
df = pd.read_csv('./movie_overview.csv')



In [6]:

    
df.head()









    Out[6]:







  
    
      
      id
      title
      overview
    
  
  
    
      0
      0
      Toy Story
      Led by Woody, Andy's toys live happily in his ...
    
    
      1
      1
      Jumanji
      When siblings Judy and Peter discover an encha...
    
    
      2
      2
      Grumpier Old Men
      A family wedding reignites the ancient feud be...
    
    
      3
      3
      Waiting to Exhale
      Cheated on, mistreated and stepped on, the wom...
    
    
      4
      4
      Father of the Bride Part II
      Just when George Banks has recovered from his ...



In [0]:

    
documents = df['overview'].values.astype("U") #to set as binary to transform,
#binary coz if 1 2 3 then machine will weight it differently, 4 higher than 1

vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)



In [8]:

    
k = 20 #means 20 group/sectors/segments
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
model.fit(features)









    Out[8]:





KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=20, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)



In [9]:

    
df['cluster'] = model.labels_
df.head(10)









    Out[9]:







  
    
      
      id
      title
      overview
      cluster
    
  
  
    
      0
      0
      Toy Story
      Led by Woody, Andy's toys live happily in his ...
      3
    
    
      1
      1
      Jumanji
      When siblings Judy and Peter discover an encha...
      3
    
    
      2
      2
      Grumpier Old Men
      A family wedding reignites the ancient feud be...
      0
    
    
      3
      3
      Waiting to Exhale
      Cheated on, mistreated and stepped on, the wom...
      13
    
    
      4
      4
      Father of the Bride Part II
      Just when George Banks has recovered from his ...
      3
    
    
      5
      5
      Heat
      Obsessive master thief, Neil McCauley leads a ...
      13
    
    
      6
      6
      Sabrina
      An ugly duckling having undergone a remarkable...
      3
    
    
      7
      7
      Tom and Huck
      A mischievous young boy, Tom Sawyer, witnesses...
      17
    
    
      8
      8
      Sudden Death
      International action superstar Jean Claude Van...
      3
    
    
      9
      9
      GoldenEye
      James Bond must unmask the mysterious head of ...
      3



In [0]:

    
clusters = df.groupby('cluster')    

for cluster in clusters.groups:
    f = open('cluster'+str(cluster)+ '.csv', 'w')
    data = clusters.get_group(cluster)[['title','overview']]
    f.write(data.to_csv(index_label='id'))
    f.close()



In [11]:

    
%config InlineBackend.figure_format = 'retina'

from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt



fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(15,15))


text = open('cluster0.csv').read() #number cluster can change to see category
wordcloud = WordCloud(width=800, height=800,background_color="black").generate(text)

ax1.imshow(wordcloud)



text = open('cluster1.csv').read()
wordcloud = WordCloud(width=800, height=800, background_color="black").generate(text)

ax2.imshow(wordcloud)


text = open('cluster2.csv').read()
wordcloud = WordCloud(width=800, height=800,background_color="white").generate(text)

ax3.imshow(wordcloud)


text = open('cluster3.csv').read()
wordcloud = WordCloud(width=800, height=800,background_color="white").generate(text)

ax4.imshow(wordcloud)









    Out[11]:





<matplotlib.image.AxesImage at 0x7fb071d1ac50>



In [0]:

	id	title	overview
0	0	Toy Story	Led by Woody, Andy's toys live happily in his ...
1	1	Jumanji	When siblings Judy and Peter discover an encha...
2	2	Grumpier Old Men	A family wedding reignites the ancient feud be...
3	3	Waiting to Exhale	Cheated on, mistreated and stepped on, the wom...
4	4	Father of the Bride Part II	Just when George Banks has recovered from his ...