Building word clouds to characterize the vocabulary of a collection of documents
In [8]:
from os import path
from wordcloud import WordCloud
In [9]:
# read the raw hoax and fact corpora into memory
file = open('all_hoax.txt', 'r')
text = file.read()
file_fact = open('all_facts.txt', 'r')
text_fact = file_fact.read()
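The handles above are never closed; an equivalent sketch using context managers (assuming the same file names and UTF-8 encoding):
with open('all_hoax.txt', 'r', encoding='utf-8') as f:
    text = f.read()          # full hoax corpus as one string
with open('all_facts.txt', 'r', encoding='utf-8') as f:
    text_fact = f.read()     # full fact corpus as one string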
In [10]:
# generate a word cloud image from the raw hoax text
wc = WordCloud().generate(text)
In [11]:
import matplotlib.pyplot as plt
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
Out[11]:
In [12]:
print('before preprocessing')
wc = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
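The cloud can also be written straight to disk with WordCloud.to_file; a minimal sketch, where the output file name is only illustrative:
wc = WordCloud(max_font_size=40).generate(text)
wc.to_file('hoax_wordcloud_raw.png')  # saves the rendered cloud as a PNG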
In [13]:
print('after stopword removal')
# build the Indonesian stopword list, then regenerate the cloud with it
stoplist = []
f_stoplist = open('/home/adhanindita/tugas-akhir/fnc-id/django_project/hoaxdetector/hoax/static/stopwords_id.txt', 'r')
for line in f_stoplist:
    for word in line.split():
        stoplist.append(word)
wc = WordCloud(max_words=100, stopwords=stoplist, max_font_size=70).generate(text)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
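The wordcloud documentation describes the stopwords argument as a set of strings, so the same step could build a set directly; a sketch assuming the same stopwords_id.txt file:
with open('/home/adhanindita/tugas-akhir/fnc-id/django_project/hoaxdetector/hoax/static/stopwords_id.txt', 'r') as f:
    stopset = set(f.read().split())   # unique Indonesian stopwords
wc = WordCloud(max_words=100, stopwords=stopset, max_font_size=70).generate(text)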
In [14]:
# word cloud of the stemmed hoax corpus (default stopwords only)
hoax_stemmed = open('all_hoax_stemmed.txt').read()
wc = WordCloud().generate(hoax_stemmed)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
In [15]:
# rebuild the stopword list and generate the cloud for the fact corpus
stoplist = []
f_stoplist = open('/home/adhanindita/tugas-akhir/fnc-id/django_project/hoaxdetector/hoax/static/stopwords_id.txt', 'r')
for line in f_stoplist:
    for word in line.split():
        stoplist.append(word)
wc = WordCloud(max_words=100, stopwords=stoplist, max_font_size=70).generate(text_fact)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
In [16]:
# word cloud of the stemmed fact corpus (default stopwords only)
fact_stemmed = open('all_facts_stemmed.txt').read()
wc = WordCloud().generate(fact_stemmed)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
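The stemmed clouds above still rely on the default (English) stopword set; the two preprocessing steps could be combined by reusing the Indonesian stoplist on the stemmed text, e.g.:
wc = WordCloud(max_words=100, stopwords=stoplist, max_font_size=70).generate(fact_stemmed)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()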
In [24]:
import nltk
from nltk import FreqDist

token = nltk.word_tokenize(fact_stemmed)
freq = FreqDist(token).most_common(10)
freq[1][0]  # second most frequent token in the stemmed fact corpus
Out[24]:
In [18]:
token = nltk.word_tokenize(hoax_stemmed)
FreqDist(token).most_common(10)  # ten most frequent tokens in the stemmed hoax corpus
Out[18]:
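The two top-ten lists can also be compared directly, for instance by keeping only the tokens that are frequent in the hoax corpus but not in the fact corpus; a small sketch built on the variables above (the cutoff of ten is an arbitrary choice):
fact_top = {w for w, _ in FreqDist(nltk.word_tokenize(fact_stemmed)).most_common(10)}
hoax_top = {w for w, _ in FreqDist(nltk.word_tokenize(hoax_stemmed)).most_common(10)}
print(hoax_top - fact_top)  # tokens characteristic of hoaxes at this cutoff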