Word Cloud

Building word clouds to characterize the vocabulary of a collection of documents.
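The cells below repeat the same basic pattern: read a text file, generate a word cloud, and display it with matplotlib. As a rough, self-contained sketch of that pattern (assuming only that the wordcloud and matplotlib packages are installed; the helper name show_wordcloud is hypothetical):

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(text, **kwargs):
    # Build a word cloud from raw text and display it without axes.
    wc = WordCloud(**kwargs).generate(text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()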


In [8]:
from os import path
from wordcloud import WordCloud

Finding Hoax Characteristics


In [9]:
# Read the raw hoax and fact corpora.
with open('all_hoax.txt', 'r') as file:
    text = file.read()

with open('all_facts.txt', 'r') as file_fact:
    text_fact = file_fact.read()

In [10]:
# Generate a word cloud image from the raw hoax text
wc = WordCloud().generate(text)

In [11]:
import matplotlib.pyplot as plt
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')


Out[11]:
(-0.5, 399.5, 199.5, -0.5)

Without Pre-processing Steps


In [12]:
print('before preprocessing')
wc = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')
plt.show()


before preprocessing

With Stopwords Removal


In [13]:
print('after stopwords removal')
# Load the Indonesian stopword list (one or more words per line).
stoplist = set()
with open('/home/adhanindita/tugas-akhir/fnc-id/django_project/hoaxdetector/hoax/static/stopwords_id.txt', 'r') as f_stoplist:
    for line in f_stoplist:
        stoplist.update(line.split())
wc = WordCloud(max_words=100, stopwords=stoplist, max_font_size=70).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


after stopwords removal

In [14]:
# Word cloud of the stemmed hoax corpus
hoax_stemmed = open('all_hoax_stemmed.txt').read()
wc = WordCloud().generate(hoax_stemmed)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


Word Cloud for Facts


In [15]:
# Same stopword list as above, applied to the facts text.
stoplist = set()
with open('/home/adhanindita/tugas-akhir/fnc-id/django_project/hoaxdetector/hoax/static/stopwords_id.txt', 'r') as f_stoplist:
    for line in f_stoplist:
        stoplist.update(line.split())
wc = WordCloud(max_words=100, stopwords=stoplist, max_font_size=70).generate(text_fact)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()



In [16]:
# Word cloud of the stemmed facts corpus
fact_stemmed = open('all_facts_stemmed.txt').read()
wc = WordCloud().generate(fact_stemmed)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()



In [24]:
import nltk
from nltk import FreqDist

token = nltk.word_tokenize(fact_stemmed)
freq = FreqDist(token).most_common(10)
freq[1][0]  # token of the second most frequent word in the facts corpus


Out[24]:
'indonesia'

In [18]:
# Ten most frequent tokens in the stemmed hoax corpus
token = nltk.word_tokenize(hoax_stemmed)
FreqDist(token).most_common(10)


Out[18]:
[('pesan', 59),
 ('hati', 42),
 ('orang', 38),
 ('kirim', 28),
 ('makan', 23),
 ('malam', 23),
 ('jalan', 21),
 ('anak', 21),
 ('berita', 20),
 ('jakarta', 20)]
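As a possible follow-up sketch (not part of the original notebook; it reuses the hoax_stemmed and fact_stemmed texts loaded above), the two frequency distributions could be compared to surface words that are frequent in the hoax corpus but rare in the facts corpus:

from nltk import FreqDist, word_tokenize

hoax_freq = FreqDist(word_tokenize(hoax_stemmed))
fact_freq = FreqDist(word_tokenize(fact_stemmed))

# Rank tokens by how much more often they appear in hoaxes than in facts;
# FreqDist returns 0 for unseen words, so missing keys are not an issue.
distinctive = sorted(hoax_freq, key=lambda w: hoax_freq[w] - fact_freq[w], reverse=True)
print(distinctive[:10])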
