Word Cloud

Building word clouds to characterize the vocabulary of a collection of documents.
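The cells below repeat the same basic pattern: read a text file, generate a word cloud, and display it with matplotlib. As a rough, self-contained sketch of that pattern (assuming only that the wordcloud and matplotlib packages are installed; the helper name show_wordcloud is hypothetical):

from wordcloud import WordCloud
import matplotlib.pyplot as plt

def show_wordcloud(text, **kwargs):
    # Build a word cloud from raw text and display it without axes.
    wc = WordCloud(**kwargs).generate(text)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()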


In [8]:
from os import path
from wordcloud import WordCloud

Finding Hoax Characteristics


In [9]:
# Read the raw hoax and fact corpora.
with open('all_hoax.txt', 'r') as file:
    text = file.read()

with open('all_facts.txt', 'r') as file_fact:
    text_fact = file_fact.read()

In [10]:
# Generate a word cloud image from the raw hoax text
wc = WordCloud().generate(text)

In [11]:
import matplotlib.pyplot as plt
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')


Out[11]:
(-0.5, 399.5, 199.5, -0.5)

Without Pre-processing Steps


In [12]:
print('before preprocessing')
wc = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')
plt.show()


before preprocessing

With Stopwords Removal


In [13]:
print('after stopwords removal')
# Load the Indonesian stopword list (one or more words per line).
stoplist = set()
with open('/home/adhanindita/tugas-akhir/fnc-id/django_project/hoaxdetector/hoax/static/stopwords_id.txt', 'r') as f_stoplist:
    for line in f_stoplist:
        stoplist.update(line.split())
wc = WordCloud(max_words=100, stopwords=stoplist, max_font_size=70).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


after stopwords removal

In [14]:
# Word cloud of the stemmed hoax corpus
hoax_stemmed = open('all_hoax_stemmed.txt').read()
wc = WordCloud().generate(hoax_stemmed)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()


Word Cloud for Facts


In [15]:
# Same stopword list as above, applied to the facts text.
stoplist = set()
with open('/home/adhanindita/tugas-akhir/fnc-id/django_project/hoaxdetector/hoax/static/stopwords_id.txt', 'r') as f_stoplist:
    for line in f_stoplist:
        stoplist.update(line.split())
wc = WordCloud(max_words=100, stopwords=stoplist, max_font_size=70).generate(text_fact)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()



In [16]:
# Word cloud of the stemmed facts corpus
fact_stemmed = open('all_facts_stemmed.txt').read()
wc = WordCloud().generate(fact_stemmed)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()



In [24]:
import nltk
from nltk import FreqDist

token = nltk.word_tokenize(fact_stemmed)
freq = FreqDist(token).most_common(10)
freq[1][0]  # token of the second most frequent word in the facts corpus


Out[24]:
'indonesia'

In [18]:
# Ten most frequent tokens in the stemmed hoax corpus
token = nltk.word_tokenize(hoax_stemmed)
FreqDist(token).most_common(10)


Out[18]:
[('pesan', 59),
 ('hati', 42),
 ('orang', 38),
 ('kirim', 28),
 ('makan', 23),
 ('malam', 23),
 ('jalan', 21),
 ('anak', 21),
 ('berita', 20),
 ('jakarta', 20)]
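As a possible follow-up sketch (not part of the original notebook; it reuses the hoax_stemmed and fact_stemmed texts loaded above), the two frequency distributions could be compared to surface words that are frequent in the hoax corpus but rare in the facts corpus:

from nltk import FreqDist, word_tokenize

hoax_freq = FreqDist(word_tokenize(hoax_stemmed))
fact_freq = FreqDist(word_tokenize(fact_stemmed))

# Rank tokens by how much more often they appear in hoaxes than in facts;
# FreqDist returns 0 for unseen words, so missing keys are not an issue.
distinctive = sorted(hoax_freq, key=lambda w: hoax_freq[w] - fact_freq[w], reverse=True)
print(distinctive[:10])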
