Doc2Vec — train document vectors on a Korean ("Hallyu" / Yonhap News) article corpus


In [ ]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import multiprocessing
import pandas as pd
import pickle
from tqdm import tqdm #gang-chu
from konlpy.tag import Kkma
import pandas as pd

In [ ]:
# Load the article corpus; cp949 is the Windows Korean encoding the CSV was saved with.
kor_corpus = pd.read_csv("한류_연합뉴스.csv", encoding = 'cp949')

In [ ]:
# Peek at the first rows of the corpus.
kor_corpus.head()

In [ ]:
# Number of article bodies available.
len(kor_corpus.contents)

In [ ]:
# Extract nouns from the first 100 articles with the Kkma POS tagger.
# Each document becomes a list of noun tokens (POS tags starting with 'N').
result = []
import time

# Instantiate the tagger ONCE: the original built Kkma() inside the loop,
# re-initializing the JVM-backed analyzer for every single document.
kkma = Kkma()

for i, doc in enumerate(kor_corpus.contents[:100]):
    try:
        start_time = time.time()
        result.append([t[0] for t in kkma.pos(doc) if t[1][0] == 'N'])
        print(i, " processed time ", time.time() - start_time)
    except Exception:  # narrowed from bare `except:`; tagging stays best-effort per doc
        print(i, "th doc has error!")

In [ ]:
# Collect the tokenized documents into a single-column DataFrame.
data = pd.DataFrame({'split': result})

In [ ]:
# Inspect the tokenized column.
data.split

In [ ]:
# Number of tokenized documents (docs that raised during tagging were skipped).
len(data)

In [ ]:
# Wrap each token list in a gensim TaggedDocument, e.g.
# TaggedDocument(words=['founded', 'incorporated', ...], tags=['doc1']),
# tagging the document at index i as 'doc<i>'.
documents = [
    TaggedDocument(words=tokens, tags=['doc' + str(idx)])
    for idx, tokens in tqdm(data['split'].items())
]

In [ ]:
# Train the doc2vec model.

mc = 10        # drop words occurring fewer than `mc` times
w = 5          # window (context) size
h = 100        # hidden layer / vector size
sample = 1e-5  # sub-sampling rate for frequent words

model = Doc2Vec(dm=1,              # 1 = PV-DM, 0 = PV-DBOW
                dm_mean=1,         # average (not sum) the context vectors
                min_count=mc,
                sample=sample,
                window=w, size=h,  # NOTE(review): gensim>=4 renamed `size` -> `vector_size`; confirm installed version
                workers=multiprocessing.cpu_count(),  # number of cores
                alpha=0.025, min_alpha=0.025)
model.build_vocab(documents)

epochs = 10  # 10 passes is a common default
for epoch in range(epochs):
    # random.shuffle(documents)  # shuffling per epoch can improve results slightly
    # gensim >= 1.0 requires total_examples and epochs on every train() call;
    # the original bare `model.train(documents)` raises ValueError there.
    model.train(documents, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002           # manual learning-rate decay across epochs
    model.min_alpha = model.alpha  # fix the rate within this pass, no internal decay

In [ ]:
# Return the learned vector for a specific word ('서울' = "Seoul").
# NOTE(review): gensim >= 4 moved word lookup to model.wv['서울'] — confirm version.
model['서울']

In [ ]:
# Similarity between learned words: the 15 nearest neighbours of '서울'.
model.most_similar('서울', topn=15)

In [ ]:
# Return the learned vector for a specific document tag.
model.docvecs['doc0']

In [ ]:
# Similarity between learned document vectors: 10 nearest documents to 'doc1'.
model.docvecs.most_similar('doc1', topn=10)

In [ ]:
# Read back the raw article texts to eyeball the similarity results above.
kor_corpus.contents.iloc[1]

In [ ]:
kor_corpus.contents.iloc[3]

In [ ]:
kor_corpus.contents.iloc[12]