Content similarity using document vectors is a simple way to measure how alike two articles are. For this example I use data scraped from Google News Indonesia, a corpus of just 77 documents.
The library used is gensim, in particular its Doc2Vec implementation.
In [1]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from pprint import pprint
import multiprocessing
import os
import re
In [3]:
def get_stoplist():
    # stopwords.txt holds comma/whitespace-separated Indonesian stopwords
    stop = []
    for line in open('stopwords.txt', 'rU'):
        stop += line.replace(',', '').split()
    return stop
In [4]:
stopwords = get_stoplist()
print(len(stopwords))
print(stopwords[:20])
In [5]:
dirname = 'google_news'
documents_file = os.listdir(dirname)
documents = []
for fname in documents_file:
    f = open(os.path.join(dirname, fname), 'rU')
    content = f.read().decode('utf-8').lower()
    words_split = re.findall(r"[\w']+|[.,!?;]", content)  # split the text into word and punctuation tokens
    words_split = [word for word in words_split if word not in stopwords]
    # detect frequent collocations; Phrases expects an iterable of token lists,
    # so the single document is wrapped in a list
    phrases = gensim.models.phrases.Phrases([words_split], min_count=20, threshold=100)
    bigram = gensim.models.phrases.Phraser(phrases)
    bigram_doc = bigram[words_split]
    trigram = gensim.models.phrases.Phrases([bigram_doc], min_count=20, threshold=100)
    for token in bigram_doc:
        if '_' in token:
            # token is a detected bigram, add it to the document
            words_split.append(token)
    for token in trigram[bigram_doc]:
        if '_' in token:
            # token is a detected trigram, add it to the document
            words_split.append(token)
    filtered_tokens = []
    for token in words_split:
        if re.search('[a-zA-Z]', token):  # keep only tokens that contain letters
            filtered_tokens.append(token)
    title = fname.replace('.txt', '')
    documents.append(TaggedDocument(filtered_tokens, [title]))
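With such a small corpus (77 documents, per the introduction), it is worth a quick sanity check before training:
In [ ]:
# the scraped corpus should contain 77 tagged documents
print(len(documents))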
In [6]:
pprint(documents[0].tags)
# pprint(documents[4].words)
In [7]:
print(documents[0].words[:100])
In [8]:
cores = multiprocessing.cpu_count()
# dm=0 selects the DBOW architecture; dbow_words=1 trains word vectors alongside the document vectors
model = Doc2Vec(dm=0, dbow_words=1, size=100, window=8, min_count=20, iter=100, workers=cores, sample=1e-4, negative=2)
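The parameters above use the old-style gensim API (size, iter). For comparison, a hedged sketch of the distributed-memory variant (PV-DM), which is not used in this notebook:
In [ ]:
# alternative configuration for illustration only: PV-DM (dm=1) predicts words
# from a combination of the document vector and a context window
model_dm = Doc2Vec(dm=1, size=100, window=8, min_count=20, iter=100,
                   workers=cores, sample=1e-4, negative=2)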
In [9]:
# scan_vocab is an internal pre-pass that counts words and document tags;
# build_vocab in the next cell performs the full vocabulary setup
model.scan_vocab(documents, update=False)
print(str(model))
In [10]:
model.build_vocab(documents,update=False)
print(str(model))
In [11]:
%time model.train(documents, total_examples=model.corpus_count, epochs=model.iter)
Out[11]:
In [12]:
print(str(model))
pprint(model.docvecs.most_similar(positive=["5 Kesamaan Xiaomi Mi 6 dan iPhone 7 Plus"], topn=10))
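Two stored document vectors can also be compared pairwise. A minimal sketch; the second title is a hypothetical placeholder, not a real tag from this corpus:
In [ ]:
# 'some other article title' is a placeholder tag for illustration
model.docvecs.similarity('5 Kesamaan Xiaomi Mi 6 dan iPhone 7 Plus',
                         'some other article title')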
In [25]:
text_search = '''busa sabun jalan sudirman'''
# infer a vector for the unseen query, lowercased like the training data
inferred_vector = model.infer_vector(text_search.lower().split())
model.docvecs.most_similar([inferred_vector], topn=10)
Out[25]:
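Once the results look reasonable, the trained model can be saved and reloaded later. A sketch; the filename is an assumption, not from the original notebook:
In [ ]:
# persist the trained model and load it back
model.save('doc2vec_googlenews.model')
model = Doc2Vec.load('doc2vec_googlenews.model')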
In [26]:
model.docvecs['5 Kesamaan Xiaomi Mi 6 dan iPhone 7 Plus']
Out[26]:
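Under the hood, most_similar ranks documents by cosine similarity between vectors like this one. A minimal sketch that recomputes such a score by hand for the earlier query (the numpy import is an addition for illustration; note that infer_vector is stochastic, so the score varies slightly between runs):
In [ ]:
import numpy as np

doc_vec = model.docvecs['5 Kesamaan Xiaomi Mi 6 dan iPhone 7 Plus']
query_vec = model.infer_vector('busa sabun jalan sudirman'.split())
# cosine similarity: dot product of the two vectors divided by their norms
print(np.dot(doc_vec, query_vec) / (np.linalg.norm(doc_vec) * np.linalg.norm(query_vec)))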
In [ ]: