In [1]:
import itertools
import operator
import gensim
import konlpy
import pickle
import pandas as pd
import scipy as sp
import scipy.spatial  # needed so sp.spatial.distance.cosine resolves below
import numpy as np
from konlpy.tag import Kkma
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
In [2]:
cf_df = pd.read_excel('cf_description.xlsx')
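A quick sanity check of the loaded data is useful here; the 'id' and 'description' column names are assumptions inferred from how they are used in the cells below.
In [ ]:
# sanity check (sketch): the notebook relies on an 'id' and a 'description' column
cf_df[['id', 'description']].head()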
In [3]:
# Select morphemes: proper/common/bound nouns, pronouns, verbs, adjectives, and general adverbs
feature_pos = ('NNP', 'NNG', 'NNB', 'NP', 'VV', 'VA', 'MAG')
In [ ]:
# KoNLPy wrapper for the Kkma morphological analyzer
kkma = Kkma()
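To illustrate what the tagger and the feature_pos filter do, a minimal sketch on a made-up sentence (the example text, and the exact tags shown in the comment, are illustrative):
In [ ]:
# sketch: kkma.pos returns (morpheme, POS-tag) pairs,
# e.g. roughly [('아빠', 'NNG'), ('가', 'JKS'), ('고양이', 'NNG'), ...]
sample = kkma.pos('아빠가 고양이를 좋아한다')
# keep only the content morphemes selected by feature_pos
[word for word, pos in sample if pos in feature_pos]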
In [ ]:
kon_words = []
# tokenization: POS-tag every project description
for desc in cf_df['description']:
    kon_words.append(kkma.pos(desc))
# keep only the morphemes whose POS tag is in feature_pos
des_token = []
for words in kon_words:
    tokens = [word for word, pos in words if pos in feature_pos]
    des_token.append(tokens)
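A quick look at the result confirms that only content morphemes survive the filter:
In [ ]:
# spot-check the filtered morphemes of the first description
des_token[0][:10]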
In [ ]:
# transform to the (words=[], tags=[]) input format doc2vec requires
kon_token_tags = []
for i in np.arange(len(kon_words)):
    kon_token_tag = gensim.models.doc2vec.TaggedDocument(words=des_token[i], tags=[cf_df['id'][i]])
    kon_token_tags.append(kon_token_tag)
In [ ]:
# token data save
with open('des_token.pkl', 'wb') as output:
    pickle.dump(des_token, output)
In [4]:
# token data load
with open('des_token.pkl', 'rb') as pkl_file:
    des_token = pickle.load(pkl_file)
In [ ]:
# token_tags data save
with open('kon_token_tags.pkl', 'wb') as output:
    pickle.dump(kon_token_tags, output)
In [5]:
# token_tags data load
with open('kon_token_tags.pkl', 'rb') as pkl_file:
    kon_token_tags = pickle.load(pkl_file)
In [ ]:
# doc2vec modeling with the tokens (vector_size was called size in gensim < 4)
cf_doc2vec_model = gensim.models.Doc2Vec(kon_token_tags, vector_size=200)
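Only the vector size is set above; everything else uses gensim's defaults. A sketch of a more explicit configuration (the window, min_count, and epochs values are illustrative assumptions, not the settings of the original run):
In [ ]:
# illustrative alternative with common hyperparameters spelled out
# (these particular values are assumptions, not what was used above)
cf_doc2vec_model = gensim.models.Doc2Vec(
    kon_token_tags,
    vector_size=200,  # dimensionality of the document vectors
    window=5,         # context window of the underlying word model
    min_count=2,      # drop morphemes that appear only once
    epochs=20,        # extra passes tend to stabilise small corpora
)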
In [ ]:
# doc2vec model save (based on konlpy tokenizer)
cf_doc2vec_model.save('cf_doc2vec_model.model')
In [6]:
# doc2vec model load (based on konlpy tokenizer)
cf_doc2vec_model = gensim.models.Doc2Vec.load('cf_doc2vec_model.model')
In [7]:
# Find the top-10 words most similar to '아빠' ("dad")
cf_doc2vec_model.wv.most_similar('아빠')
Out[7]:
In [8]:
# Find the top-10 words most similar to '고양이' ("cat")
cf_doc2vec_model.wv.most_similar('고양이')
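The model can also score unseen text by inferring a vector for it first; a sketch (the query tokens are illustrative):
In [ ]:
# sketch: infer a vector for new tokens, then look up nearby words in the shared space
new_vec = cf_doc2vec_model.infer_vector(['아빠', '고양이'])
cf_doc2vec_model.wv.most_similar([new_vec])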
Out[8]:
In [9]:
""" cosine distance """
def cos_dist_cal(doc_model):
import scipy as sp
# make vector bag
doc_vecs = []
dists = []
id_list = []
eu_dist = sp.spatial.distance.cosine
for i in np.arange(len(kon_token_tags)):
vec = doc_model.infer_vector(kon_token_tags[i][0])
ids = kon_token_tags[i][1][0]
id_list.append(ids)
doc_vecs.append(vec)
# caculate distance
for a, b in itertools.combinations(np.arange(len(doc_vecs)), 2):
dist = eu_dist(doc_vecs[a], doc_vecs[b])
dists.append((id_list[a], id_list[b], dist))
return dists
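Since pairwise_distances is already imported above, the same all-pairs computation can be vectorised into a single call; a sketch:
In [ ]:
# vectorised alternative: one distance matrix instead of a Python loop over pairs
doc_vecs = np.array([cf_doc2vec_model.infer_vector(doc.words) for doc in kon_token_tags])
dist_matrix = pairwise_distances(doc_vecs, metric='cosine')  # shape (n_docs, n_docs)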
In [10]:
cos_token_df = pd.DataFrame(cos_dist_cal(cf_doc2vec_model))
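In the resulting table, columns 0 and 1 hold the two project ids of each pair and column 2 the cosine distance:
In [ ]:
# peek at the pair/distance table
cos_token_df.head()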
In [11]:
"""recommend project list"""
def project_recommend(dist_df, number):
import pandas as pd
recommend_list = pd.DataFrame()
if number <= 2000:
recommend = dist_df.loc[dist_df[0] == number]
sorting = recommend.sort_values(by=[2], ascending = False)[:11]
sorting.index = np.arange(len(sorting))
for i in sorting[1]:
pj = cf_df.loc[cf_df['id'] == i]
recommend_list = recommend_list.append(pj)
recommend_list.index = np.arange(len(recommend_list))
return recommend_list
else:
recommend = dist_df.loc[dist_df[1] == number]
sorting = recommend.sort_values(by=[2], ascending = False)[:11]
sorting.index = np.arange(len(sorting))
for i in sorting[0]:
pj = cf_df.loc[cf_df['id'] == i]
recommend_list = recommend_list.append(pj)
recommend_list.index = np.arange(len(recommend_list))
return recommend_list
In [12]:
cf_df[cf_df['id'] == 1152]
Out[12]:
In [13]:
project_recommend(cos_token_df, 1152)
Out[13]: