In [0]:
import pandas as pd
import numpy as np
from google.colab import files

In [2]:
uploaded = files.upload()


Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving movie_overview.csv to movie_overview.csv

In [3]:
from sklearn.feature_extraction.text import TfidVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-3-c3e1be46d542> in <module>()
----> 1 from sklearn.feature_extraction.text import TfidVectorizer
      2 from sklearn.cluster import KMeans
      3 from wordcloud import WordCloud
      4 import matplotlib.pyplot as plt

ImportError: cannot import name 'TfidVectorizer'

---------------------------------------------------------------------------
NOTE: If your import is failing due to a missing package, you can
manually install dependencies using either !pip or !apt.

To view examples of installing some common dependencies, click the
"Open Examples" button below.
---------------------------------------------------------------------------

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [0]:
df = pd.read_csv('./movie_overview.csv')

In [6]:
df.head()


Out[6]:
id title overview
0 0 Toy Story Led by Woody, Andy's toys live happily in his ...
1 1 Jumanji When siblings Judy and Peter discover an encha...
2 2 Grumpier Old Men A family wedding reignites the ancient feud be...
3 3 Waiting to Exhale Cheated on, mistreated and stepped on, the wom...
4 4 Father of the Bride Part II Just when George Banks has recovered from his ...

In [0]:
documents = df['overview'].values.astype("U") #to set as binary to transform,
#binary coz if 1 2 3 then machine will weight it differently, 4 higher than 1

vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)

In [8]:
k = 20 #means 20 group/sectors/segments
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
model.fit(features)


Out[8]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=20, n_init=1, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [9]:
df['cluster'] = model.labels_
df.head(10)


Out[9]:
id title overview cluster
0 0 Toy Story Led by Woody, Andy's toys live happily in his ... 3
1 1 Jumanji When siblings Judy and Peter discover an encha... 3
2 2 Grumpier Old Men A family wedding reignites the ancient feud be... 0
3 3 Waiting to Exhale Cheated on, mistreated and stepped on, the wom... 13
4 4 Father of the Bride Part II Just when George Banks has recovered from his ... 3
5 5 Heat Obsessive master thief, Neil McCauley leads a ... 13
6 6 Sabrina An ugly duckling having undergone a remarkable... 3
7 7 Tom and Huck A mischievous young boy, Tom Sawyer, witnesses... 17
8 8 Sudden Death International action superstar Jean Claude Van... 3
9 9 GoldenEye James Bond must unmask the mysterious head of ... 3

In [0]:
clusters = df.groupby('cluster')    

for cluster in clusters.groups:
    f = open('cluster'+str(cluster)+ '.csv', 'w')
    data = clusters.get_group(cluster)[['title','overview']]
    f.write(data.to_csv(index_label='id'))
    f.close()

In [11]:
%config InlineBackend.figure_format = 'retina'

from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt



fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(15,15))


text = open('cluster0.csv').read() #number cluster can change to see category
wordcloud = WordCloud(width=800, height=800,background_color="black").generate(text)

ax1.imshow(wordcloud)



text = open('cluster1.csv').read()
wordcloud = WordCloud(width=800, height=800, background_color="black").generate(text)

ax2.imshow(wordcloud)


text = open('cluster2.csv').read()
wordcloud = WordCloud(width=800, height=800,background_color="white").generate(text)

ax3.imshow(wordcloud)


text = open('cluster3.csv').read()
wordcloud = WordCloud(width=800, height=800,background_color="white").generate(text)

ax4.imshow(wordcloud)


Out[11]:
<matplotlib.image.AxesImage at 0x7fb071d1ac50>

In [0]: