In [ ]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
import multiprocessing
import pickle
import time
import pandas as pd
from tqdm import tqdm  # progress bar (highly recommended)
from konlpy.tag import Kkma
In [ ]:
# Yonhap News articles on the Korean Wave (Hallyu); cp949 is the usual encoding for Korean Windows CSVs
kor_corpus = pd.read_csv("한류_연합뉴스.csv", encoding='cp949')
In [ ]:
kor_corpus.head()
In [ ]:
len(kor_corpus.contents)
In [ ]:
result = []
kkma = Kkma()  # instantiate the tagger once; creating a new Kkma per document is very slow
for i, doc in enumerate(kor_corpus.contents[:100]):
    try:
        start_time = time.time()
        # keep only nouns (POS tags starting with 'N')
        result.append([t[0] for t in kkma.pos(doc) if t[1][0] == 'N'])
        print(i, " processed time ", time.time() - start_time)
    except Exception:
        print(i, "th doc has error!")
In [ ]:
# Collect the noun-token lists in a DataFrame column
data = pd.DataFrame()
data['split'] = result
In [ ]:
data.split
In [ ]:
len(data)
In [ ]:
documents = []
for index, row in tqdm(data.iterrows()):
    # Build one TaggedDocument per article, of the form
    # TaggedDocument(words=['founded', 'incorporated', 'subsidiaries', 'engaged', 'manufacture'], tags=['doc1']),
    # and collect them all in documents
    words = row['split']
    document = TaggedDocument(words=words, tags=['doc' + str(index)])
    documents.append(document)
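As a quick sanity check, the first entry can be printed to confirm it has the TaggedDocument form described above.
In [ ]:
documents[0]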
In [ ]:
# Train the doc2vec model
mc = 10        # min_count: drop words occurring fewer than this many times
w = 5          # window (context) size
h = 100        # hidden layer (vector) size
sample = 1e-5  # sub-sampling rate for frequent words
model = Doc2Vec(dm=1,       # 1 = PV-DM, 0 = PV-DBOW
                dm_mean=1,  # average the context vectors
                min_count=mc,
                sample=sample,
                window=w, vector_size=h,
                workers=multiprocessing.cpu_count(),  # number of cores
                alpha=0.025, min_alpha=0.025)
model.build_vocab(documents)
epochs = 10  # 10 passes is a common default
for epoch in range(epochs):
    # random.shuffle(documents)  # shuffling the documents each epoch may improve results very slightly
    model.train(documents, total_examples=model.corpus_count, epochs=1)
    model.alpha -= 0.002           # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
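Training takes a while, so it is convenient to persist the model with gensim's built-in save/load. A minimal sketch; the file name doc2vec_hallyu.model is an arbitrary choice.
In [ ]:
# Save the trained model; Doc2Vec.load() restores it in a later session
model.save('doc2vec_hallyu.model')
# model = Doc2Vec.load('doc2vec_hallyu.model')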
In [ ]:
# Return the vector of a specific word
model.wv['서울']
In [ ]:
# Compute similarities between learned words
model.wv.most_similar('서울', topn=15)
In [ ]:
# Return the vector of a specific document
model.dv['doc0']
In [ ]:
# Compute similarities between learned document vectors
model.dv.most_similar('doc1', topn=10)
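For documents that were not in the training set, gensim's infer_vector fits a vector to a new token list while keeping the trained model fixed. A minimal sketch; the sample sentence is invented for illustration.
In [ ]:
# Tokenize an unseen document the same way as the training data, infer its vector,
# then look up its nearest training documents
new_tokens = [t[0] for t in kkma.pos('서울에서 한류 콘서트가 열렸다') if t[1][0] == 'N']
new_vec = model.infer_vector(new_tokens)
model.dv.most_similar([new_vec], topn=5)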
In [ ]:
# Read the original articles to sanity-check the similarity results above
kor_corpus.contents.iloc[1]
In [ ]:
kor_corpus.contents.iloc[3]
In [ ]:
kor_corpus.contents.iloc[12]