In [0]:
import pandas as pd
import numpy as np
from google.colab import files
In [2]:
# Upload file(s) through the google.colab `files` helper imported above.
# NOTE(review): presumably 'movie_overview.csv' (read by pd.read_csv below)
# is expected to be among the uploaded files - confirm.
uploaded = files.upload()
In [3]:
# Third-party imports for text vectorization, clustering and plotting.
# Note the spelling: the scikit-learn class is `TfidfVectorizer` (TF-IDF).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt
In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from wordcloud import WordCloud
import matplotlib.pyplot as plt
In [0]:
# Load the movie table from a CSV in the working directory.
# The 'title' and 'overview' columns are the ones used further down.
df = pd.read_csv('./movie_overview.csv')
In [6]:
# Preview the first rows to sanity-check the load.
df.head()
Out[6]:
In [0]:
# Build the text corpus: one overview string per movie.
# Missing overviews are replaced with '' first; otherwise NaN would be cast to
# the literal string "nan" and be indexed by TF-IDF as if it were a real word.
# astype("U") converts the values to a NumPy array of unicode strings (a text
# cast, not a binary/one-hot encoding), which is the input the vectorizer takes.
documents = df['overview'].fillna('').values.astype("U")
# TF-IDF maps each document to a sparse vector of term weights, ignoring
# common English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)
In [8]:
k = 20  # number of clusters (groups/segments) to split the movies into
# random_state pins the centroid initialisation. With n_init=1 there is only a
# single random start, so without a seed the cluster assignments can differ on
# every run and the per-cluster outputs below would not be reproducible.
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, random_state=42)
model.fit(features)
Out[8]:
In [9]:
# Attach each row's assigned cluster id (labels_ of the fitted model) as a
# 'cluster' column on the movie table.
df['cluster'] = model.labels_
# Show the first 10 rows, now including the 'cluster' column.
df.head(10)
Out[9]:
In [0]:
# Write one CSV per cluster (cluster<N>.csv) containing the title and overview
# of every movie assigned to that cluster; the row index is saved as 'id'.
clusters = df.groupby('cluster')
for cluster, members in clusters:
    data = members[['title', 'overview']]
    # `with` guarantees the file is closed even if the write fails part-way.
    with open('cluster' + str(cluster) + '.csv', 'w') as f:
        f.write(data.to_csv(index_label='id'))
In [11]:
%config InlineBackend.figure_format = 'retina'
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2,2, figsize=(15,15))
text = open('cluster0.csv').read() #number cluster can change to see category
wordcloud = WordCloud(width=800, height=800,background_color="black").generate(text)
ax1.imshow(wordcloud)
text = open('cluster1.csv').read()
wordcloud = WordCloud(width=800, height=800, background_color="black").generate(text)
ax2.imshow(wordcloud)
text = open('cluster2.csv').read()
wordcloud = WordCloud(width=800, height=800,background_color="white").generate(text)
ax3.imshow(wordcloud)
text = open('cluster3.csv').read()
wordcloud = WordCloud(width=800, height=800,background_color="white").generate(text)
ax4.imshow(wordcloud)
Out[11]:
In [0]: