Gensim - Doc2Vec for Content Similarity

Content similarity based on document vectors is a simple way to measure how alike two articles are. In this example I use just 77 documents scraped from Google News Indonesia.

The module used for this is gensim.

Requirement

  • Gensim 2.0
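
If gensim is not installed yet, it can be added with pip (the exact version pin is an assumption to match this notebook; the environment used here is Python 2.7):

pip install gensim==2.0.0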

Simple Code


In [1]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
from pprint import pprint
import multiprocessing
import os
import re

In [3]:
def get_stoplist():
    # read the stopword list; stopwords.txt holds comma- and/or
    # whitespace-separated words, possibly several per line
    stop = []
    for line in open('stopwords.txt','rU'):
        stop += line.replace(',','').split()
    return stop

In [4]:
stopwords = get_stoplist()
print(len(stopwords))
print(stopwords[:20])


782
['ada', 'adalah', 'adanya', 'adapun', 'agak', 'agaknya', 'agar', 'akan', 'akankah', 'akhir', 'akhiri', 'akhirnya', 'aku', 'akulah', 'amat', 'amatlah', 'anda', 'andalah', 'antar', 'antara']
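
For reference, stopwords.txt is assumed to be a plain-text file of Indonesian stopwords, with words separated by commas and/or whitespace (the actual file is not included here). A minimal stand-in could be created like this:

# hypothetical stand-in for stopwords.txt, in the format get_stoplist() parses
with open('stopwords.txt', 'w') as f:
    f.write('ada, adalah, adanya, adapun\n')
    f.write('agak, agaknya, agar, akan\n')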

In [5]:
dirname = 'google_news'
documents_file = os.listdir(dirname)

documents = []

for fname in documents_file:
    f = open(os.path.join(dirname,fname),'rU')
    content = f.read().decode('utf-8').lower()
    f.close()
    words_split = re.findall(r"[\w']+|[.,!?;]",content) # split into word and punctuation tokens
    words_split = [word for word in words_split if word not in stopwords]

    # Phrases expects an iterable of token lists, so wrap the single document in a list
    phrases = gensim.models.phrases.Phrases([words_split],min_count=20, threshold=100)
    bigram = gensim.models.phrases.Phraser(phrases)
    bigram_tokens = list(bigram[words_split])
    trigram = gensim.models.phrases.Phrases([bigram_tokens],min_count=20, threshold=100)
    trigram_tokens = list(trigram[bigram_tokens])

    for token in bigram_tokens:
        if '_' in token:
            # token is a bigram, add it to the document
            words_split.append(token)

    for token in trigram_tokens:
        if token.count('_') == 2:
            # token is a trigram, add it to the document
            words_split.append(token)

    filtered_tokens = []
    for token in words_split:
        if re.search('[a-zA-Z]', token): # keep only tokens that contain letters
            filtered_tokens.append(token)

    title = fname.replace('.txt','')
    documents.append(TaggedDocument(filtered_tokens,[title]))


/usr/local/lib/python2.7/site-packages/gensim/models/phrases.py:274: UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class
  warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
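
Note that training a phrase model per document with min_count=20 will rarely detect anything in a short news article. A more common setup (shown here only as a hypothetical variant, not what the cell above does) trains the phrase models once over the whole corpus:

# hypothetical corpus-level variant: train the phrase model on all documents at once
all_tokens = [doc.words for doc in documents]
corpus_phrases = gensim.models.phrases.Phrases(all_tokens, min_count=20, threshold=100)
corpus_bigram = gensim.models.phrases.Phraser(corpus_phrases)
docs_with_phrases = [list(corpus_bigram[tokens]) for tokens in all_tokens]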

In [6]:
pprint(documents[0].tags)


["'Formasi Tiga Bek Arsenal Belum Sempurna'"]

In [7]:
print(documents[0].words[:100])


[u'arsene', u'wenger', u'menerapkan', u'formasi', u'bek', u'memetik', u'kemenangan', u'manchester', u'united', u'the', u'professor', u'bersikeras', u'mengubah', u'skema', u'bermainnya', u'pasca', u'kekalahan', u'crystal', u'palace', u'arsenal', u'mengubah', u'gaya', u'bermainnya', u'formasi', u'bek', u'ditinggalkan', u'arsene', u'wenger', u'pertandingan', u'wenger', u'menerapkan', u'formasi', u'bek', u'membawa', u'arsenal', u'menang', u'kali', u'menelan', u'kekalahan', u'liga', u'inggris', u'kemenangan', u'terbaru', u'melakoni', u'pertandingan', u'melawan', u'the', u'red', u'devils', u'skor', u'arsenal', u'menang', u'berkat', u'gol', u'dibukukan', u'granit', u'xhaka', u'danny', u'welbeck', u'wenger', u'menilai', u'tim', u'besutannya', u'bertahan', u'stabil', u'bermain', u'bek', u'laga', u'melawan', u'mu', u'babak', u'laga', u'emirates', u'stadium', u'kestabilan', u'bertahab', u'kebobolan', u'wenger', u'sky', u'sports', u'sempurna', u'fokus', u'menggalang', u'pertahanan', u'dibandingkan', u'melawan', u'mu', u'wenger', u'kesempatan', u'bermain', u'lauren', u'koscielny', u'rob', u'holding', u'nacho', u'monreal', u'tampil', u'satunya', u'cela', u'kesalahan']

In [8]:
cores = multiprocessing.cpu_count()
model = Doc2Vec(dm=0, dbow_words=1, size=100, window=8, min_count=20, iter=100, workers=cores, sample=1e-4, negative=2)
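
Here dm=0 selects the PV-DBOW algorithm and dbow_words=1 makes the model train word vectors alongside the document vectors; size, window, min_count, iter, sample, and negative are the usual word2vec-style hyperparameters. For comparison, the distributed-memory variant (PV-DM) would be configured like this (a sketch only, not used in the rest of the notebook):

# hypothetical PV-DM setup: combines the doc vector with context word vectors
model_dm = Doc2Vec(dm=1, size=100, window=8, min_count=20, iter=100,
                   workers=cores, sample=1e-4, negative=2)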

In [9]:
# scan_vocab only gathers corpus statistics; build_vocab in the next cell
# re-runs the scan itself, so this step is effectively redundant
model.scan_vocab(documents,update=False)
print(str(model))


Doc2Vec(dbow+w,d100,n2,w8,mc20,s0.0001,t4)

In [10]:
model.build_vocab(documents,update=False)
print(str(model))


Doc2Vec(dbow+w,d100,n2,w8,mc20,s0.0001,t4)

In [11]:
%time model.train(documents, total_examples=model.corpus_count, epochs=model.iter)


CPU times: user 4.34 s, sys: 1.38 s, total: 5.72 s
Wall time: 3.3 s
Out[11]:
410344
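
Once trained, the model can be persisted and reloaded with the standard gensim save/load API (the file name below is just an example):

model.save('doc2vec_google_news.model')
model = Doc2Vec.load('doc2vec_google_news.model')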

In [12]:
print(str(model))
pprint(model.docvecs.most_similar(positive=["5 Kesamaan Xiaomi Mi 6 dan iPhone 7 Plus"], topn=10))


Doc2Vec(dbow+w,d100,n2,w8,mc20,s0.0001,t4)
[('Ini Alasan Xiaomi Mi 6 Tiru iPhone 7', 0.9499949216842651),
 ('Mengapa Xiaomi Mi 6 Harus Mengikuti Langkah iPhone 7? - Kompas.com',
  0.906842827796936),
 ('Smartphone Baru ASUS X00ID Muncul di GFXBench dengan Dual-camera',
  0.865568995475769),
 ('Qualcomm Sudah Daftarkan Snapdragon 845 di Website Resminya',
  0.7996097803115845),
 ('Resmi Masuk Negara Turki, iPhone 7 Tembus Harga Rp 16 Jutaan Dan Menjadi Rekor Harga Tertinggi Hingga Saat Ini',
  0.7913281917572021),
 ('ASUS Zenfone X015D Muncul di GFXBench Usung Chipset MediaTek dan RAM 3GB',
  0.7558867931365967),
 ('Samsung Galaxy J7 (2017) Muncul Lagi di GFXBench dengan Spesifikasi Berbeda',
  0.7387588024139404),
 ('Nokia 3310 Akan Dipasarkan Secara Global', 0.6635607481002808),
 ('Nokia 3310 Terbaru Mulai Dikirim', 0.6366978883743286),
 ('Negara Mana yang Jual iPhone 7 Termahal dan Termurah?', 0.6267486810684204)]
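
The scores returned by most_similar are cosine similarities between document vectors, so the top result can be verified by hand (a quick check with numpy):

import numpy as np
v1 = model.docvecs['5 Kesamaan Xiaomi Mi 6 dan iPhone 7 Plus']
v2 = model.docvecs['Ini Alasan Xiaomi Mi 6 Tiru iPhone 7']
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))  # should print ~0.9500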

In [25]:
text_search = '''busa sabun jalan sudirman'''
inferred_vector = model.infer_vector(text_search.lower().split())
model.docvecs.most_similar([inferred_vector], topn=10)


Out[25]:
[('Begini cerita busa sabun yang dikira hujan salju di Jalan Sudirman',
  0.7201998233795166),
 ('Benda Mirip Salju di Jalan Jenderal Sudirman Sabun untuk Melunakkan Tanah',
  0.6918919086456299),
 ('Penjelasan MRT Jakarta Soal Buih Mirip Salju di Jalan Sudirman',
  0.6918407678604126),
 ('Dari Mana Asal Busa yang Dikira Salju di Jalan Sudirman? - Kompas.com',
  0.6818339824676514),
 ('Memastikan Bukan Salju di Jalan Jendral Sudirman, Ini Penjelasan PT MRT Jakarta - Wartakota',
  0.6708693504333496),
 ('Akibat Kecerobohan Karyawan, Busa di Sepanjang Jalan di Jakarta Dikira Salju - eKoran.Net',
  0.6666598320007324),
 ('Pemprov DKI Selidiki Keamanan Busa Proyek MRT yang Menghebohkan',
  0.6551097631454468),
 ('Gempar Hujan Salju di Jakarta, PT MRT Minta Maaf', 0.647350013256073),
 ('Petugas PT MRT Bersihkan Buih Mirip Salju di Jalan Sudirman',
  0.6446686387062073),
 ('Heboh Turun Salju di Jakarta Ternyata Cuma Cairan Sabun - Tribun Jabar',
  0.6381543874740601)]
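
Note that infer_vector is stochastic, so repeated calls return slightly different vectors and rankings. Raising the number of inference passes (the steps parameter, which defaults to 5 in this gensim version) tends to stabilise the result; the values below are illustrative:

inferred_vector = model.infer_vector(text_search.lower().split(), alpha=0.1, steps=50)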

In [26]:
model.docvecs['5 Kesamaan Xiaomi Mi 6 dan iPhone 7 Plus']


Out[26]:
array([ -2.25954324e-01,   1.23186693e-01,  -8.79537761e-02,
         5.32568574e-01,  -5.72701871e-01,   4.46958281e-02,
        -4.84704703e-01,  -1.41727492e-01,  -3.09160829e-01,
        -4.09467816e-01,  -9.00486633e-02,   1.73302397e-01,
        -1.49284795e-01,   1.00194395e-01,   4.47514504e-01,
         1.63158298e-01,   3.00338954e-01,  -5.69132119e-02,
        -4.92458679e-02,   1.90102588e-02,   6.97194831e-04,
        -2.40246728e-01,  -1.85915813e-01,   5.26626185e-02,
         2.70957738e-01,   1.77305005e-02,   1.61580682e-01,
         1.84968039e-01,  -7.90281072e-02,   5.15848219e-01,
         8.77135277e-01,   5.90857714e-02,  -3.30129974e-02,
         4.39862847e-01,  -4.59410280e-01,   3.97212803e-01,
        -1.82566181e-01,  -3.90071571e-02,   8.50754321e-01,
         6.19083226e-01,  -2.20257878e-01,   4.85509992e-01,
        -4.70316187e-02,   3.62264849e-02,  -5.28140925e-02,
         1.85851693e-01,   3.31997097e-01,   3.08416873e-01,
         2.81796247e-01,  -2.29108453e-01,   2.90523082e-01,
        -1.10582218e-01,   8.34555551e-02,   5.78473434e-02,
        -5.44825256e-01,   2.52919674e-01,  -2.93720532e-02,
         4.86641526e-01,   4.98908997e-01,  -2.83786714e-01,
        -1.07791714e-01,  -2.18909398e-01,   4.54612672e-01,
         2.85787493e-01,   6.09603405e-01,   9.43249613e-02,
        -4.28612716e-02,  -3.47521335e-01,   3.11856925e-01,
        -1.28419340e-01,  -9.75396574e-01,   2.30593786e-01,
        -8.06702748e-02,  -5.59487760e-01,   3.24030042e-01,
        -5.35586655e-01,   4.53751713e-01,   2.61457950e-01,
        -1.12564512e-01,  -2.81359881e-01,  -2.31419317e-02,
        -4.42185968e-01,   7.94908702e-02,  -1.51606083e-01,
         1.00206710e-01,  -5.68046212e-01,   1.28284603e-01,
        -7.04542339e-01,  -2.00651074e-03,  -1.44865111e-01,
        -4.26971763e-01,   3.05639327e-01,  -2.99131244e-01,
         3.35587673e-02,   3.14426810e-01,   4.60964739e-01,
        -5.07430255e-01,   1.56259283e-01,  -3.53760362e-01,
        -2.15146452e-01], dtype=float32)
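
Because dbow_words=1 also trained word vectors, word-level similarity queries work on the same model, assuming the query word survived the min_count=20 cutoff (output not shown):

pprint(model.wv.most_similar('xiaomi', topn=5))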
