In [76]:
%load_ext autoreload
In [85]:
%autoreload
In [1]:
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import csv
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from gensim.models import Phrases
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"
import nltk, re
import datetime
import random
import multiprocessing
from ast import literal_eval
import pickle
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
from scipy.cluster.hierarchy import dendrogram, linkage
#%matplotlib inline
In [2]:
from parameter_search import util
from modeling import functions
In [4]:
data_path = "E:/dataset/Amazon/"
result_path = "E:/dataset/MasterThesis/FINAL/"
save_path = "E:/dataset/MasterThesis/FINAL/preprocess_data/"
model_path = "E:/dataset/MasterThesis/FINAL/doc2vec/"
category_list = ["Electronics"]
In [6]:
for category in category_list:
    data = pd.read_csv(save_path + "preprocess_complete_" + category + ".csv")
    data['preprocessed'] = data.preprocessed.apply(lambda row: literal_eval(row))
In [12]:
data.head()
Out[12]:
In [17]:
sentence_list = []
brand_list = []
for index, row in data.iterrows():
    review = [word for sentence in row['preprocessed'] for word in sentence]
    brand_list.append(row['brand'])
    sentence_list.append(review)
documents, bigram = util.make_documents(sentence_list, brand_list, tagby = False)
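util.make_documents is a project-specific helper. Based on how it is called and what it returns (brand-tagged TaggedDocuments plus a phrase model), a minimal sketch of the assumed behavior looks like this; the exact Phrases thresholds, the meaning of the tagby flag, and the function body are assumptions, not the real implementation:
# Hypothetical sketch of util.make_documents (not the actual project code):
# train a bigram Phrases model on the tokenized reviews and wrap each review
# in a TaggedDocument tagged with its brand.
def make_documents_sketch(sentence_list, brand_list, tagby=False):
    bigram = Phrases(sentence_list, min_count=5, threshold=10.0)  # assumed settings
    documents = [TaggedDocument(words=bigram[words], tags=[brand])
                 for words, brand in zip(sentence_list, brand_list)]
    return documents, bigram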
In [24]:
with open(result_path + category + '_documents.pkl', 'wb') as f:
    pickle.dump(documents, f)
with open(result_path + category + '_bigrams.pkl', 'wb') as f:
    pickle.dump(bigram, f)
In [22]:
# average number of tokens per review
tokens_per_review = []
review_length = []
for doc in tqdm(documents):
    tokens_per_review.append(doc.words)
    review_length.append(len(doc.words))
total = [word for sent in tokens_per_review for word in sent]
In [23]:
# total number of tokens
print("total tokens:", len(total))
t_corpus = nltk.Text(total)
t_freq = nltk.FreqDist(t_corpus)
# number of unique tokens
print("unique tokens:", len(t_freq.keys()))
# total word frequency
df = DataFrame(columns=['word', 'freq'])
for word_tup in t_freq.most_common(100):
    df.loc[len(df)] = word_tup[0], t_freq[word_tup[0]]
df.to_csv(result_path + "total_word_freq.csv", index=False)
# save the frequency distribution
with open(result_path + category + '_total_freq_dist.pkl', 'wb') as f:
    pickle.dump(t_freq, f)
# POS-tag each review and keep only adjectives, verbs, and adverbs
selected = []
re_adj = re.compile('JJ.*')
re_verb = re.compile('VB.*')
re_adverb = re.compile('RB.*')
st = PorterStemmer()
for doc in tqdm(documents):
    pos = nltk.pos_tag(doc.words)
    pos = [tup[0] for tup in pos if re_adj.match(tup[1]) or re_verb.match(tup[1]) or re_adverb.match(tup[1])]
    selected.append(pos)
with open(result_path + category + '_adv_adj_verb.pkl', 'wb') as f:
    pickle.dump(selected, f)
In [30]:
# avg number of tokens in review:
length = [len(review) for review in tokens_per_review]
print("평균 token 개수는 : ", np.mean(np.array(length))
In [7]:
# load the saved files
with open(result_path + category + '_adv_adj_verb.pkl', 'rb') as f:
    selected = pickle.load(f)
# documents for doc2vec
with open(result_path + category + '_documents.pkl', 'rb') as f:
    documents = pickle.load(f)
# bigrams
with open(result_path + category + '_bigrams.pkl', 'rb') as f:
    bigram = pickle.load(f)
# total_freq_dist
with open(result_path + category + '_total_freq_dist.pkl', 'rb') as f:
    t_freq = pickle.load(f)
In [33]:
sizes = 500
window = 2
alpha, min_alpha, passes = (0.02, 0.001, 10)
alpha_delta = (alpha-min_alpha)/passes
model = Doc2Vec(dm=1, dm_mean=1, min_count=50, sample=1e-5, seed=1, workers=multiprocessing.cpu_count(),
                window=window, size=sizes, alpha=alpha, min_alpha=alpha)
model.build_vocab(documents)
for epoch in range(passes):
    start = time.time()
    random.shuffle(documents)
    model.train(documents)
    model.alpha -= alpha_delta  # decrease the learning rate for the next pass
    model.min_alpha = model.alpha  # fix the learning rate within a pass, no internal decay
    end = time.time()
    t = end - start
    print("size : %s, window : %s, epoch : %s, time : %i" % (sizes, window, epoch, t))
model.save('E:/dataset/MasterThesis/FINAL/final_model/' + 'final_model')
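The cell above uses the pre-2.0 gensim API (size=, train() without explicit epochs, manual alpha scheduling). For reference, a roughly equivalent sketch under gensim >= 4, where the alpha decay is handled internally; the saved filename is only illustrative:
# gensim >= 4 sketch (size -> vector_size, explicit total_examples/epochs in train):
model = Doc2Vec(dm=1, dm_mean=1, min_count=50, sample=1e-5, seed=1,
                workers=multiprocessing.cpu_count(),
                window=window, vector_size=sizes, alpha=alpha, min_alpha=min_alpha)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=passes)
model.save(model_path + 'final_model_gensim4')  # illustrative path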
In [8]:
model = Doc2Vec.load('E:/dataset/MasterThesis/FINAL/final_model/final_model')
In [9]:
model.docvecs.most_similar('Samsung')
Out[9]:
In [10]:
model.docvecs.most_similar('Canon')
Out[10]:
In [11]:
model.docvecs.most_similar('Apple')
Out[11]:
In [12]:
model.most_similar([model.docvecs['Samsung']])
Out[12]:
In [13]:
model.most_similar([model.docvecs['Canon']])
Out[13]:
In [14]:
model.most_similar([model.docvecs['Apple']])
Out[14]:
In [52]:
functions.extract_sim_words(model, "Canon", result_path, t_freq, 100, save=True, topn=100)
functions.extract_sim_words(model, "Apple", result_path, t_freq, 100, save=True, topn=100)
functions.extract_sim_words(model, "Samsung", result_path, t_freq, 100, save=True, topn=100)
functions.extract_sim_brand(model, "Canon", result_path, save=True, topn=30)
functions.extract_sim_brand(model, "Apple", result_path, save=True, topn=30)
functions.extract_sim_brand(model, "Samsung", result_path, save=True, topn=30)
In [22]:
brand_list, doc_arr = functions.clustering(model)
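functions.clustering is also an external helper; it evidently returns the list of brand tags together with a matrix of their document vectors. A minimal sketch under that assumption (docvecs.doctags is the tag registry in pre-4.0 gensim):
# Hypothetical sketch of functions.clustering:
def clustering_sketch(model):
    brand_list = list(model.docvecs.doctags.keys())              # all brand tags
    doc_arr = np.vstack([model.docvecs[b] for b in brand_list])  # one row per brand vector
    return brand_list, doc_arr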
In [33]:
# hierarchical agglomerative clustering of the brand vectors with Ward's minimum variance method
# (Ward linkage requires Euclidean distances)
Z = linkage(doc_arr, method='ward', metric='euclidean')
plt.figure(figsize=(10, 13))
dendrogram(Z,
           orientation='right',
           leaf_rotation=0.,    # rotates the leaf labels
           leaf_font_size=14.,  # font size for the leaf labels
           labels=brand_list,
           color_threshold=11.5,
           )
plt.tight_layout()
plt.xlim([4,18])
plt.savefig(result_path + "Visualization/dendrogram_vertical.png", dpi=300)
# plt.ylim([4,18])
plt.show()
In [182]:
# hierarchical agglomerative clustering with Ward's minimum variance method (horizontal dendrogram)
Z = linkage(doc_arr, method='ward', metric='euclidean')
plt.figure(figsize=(14, 8))
dendrogram(Z,
           orientation='top',
           leaf_rotation=90.,   # rotates the x axis labels
           leaf_font_size=14.,  # font size for the x axis labels
           labels=brand_list,
           color_threshold=11.5,
           )
plt.ylim([4, 18])
plt.savefig(result_path + "Visualization/dendrogram.png", dpi=300)
plt.show()
In [64]:
last = Z[-12:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)
acceleration = np.diff(last, 2) # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()
k = acceleration_rev.argmax() + 2 # if idx 0 is the max of this we want 2 clusters
print("clusters:", k)
In [63]:
# assign a cluster label to each brand by cutting the dendrogram
from scipy.cluster.hierarchy import fcluster
clusters_s = fcluster(Z, 11.5, criterion='distance')
print(len(set(clusters_s)))
In [66]:
df_result = pd.DataFrame()
df_result['Name'] = brand_list
df_result['Cluster'] = clusters_s
In [186]:
# merge clusters 7, 8, and 9 into cluster 7
df_result.loc[df_result.Cluster >= 8, 'Cluster'] = 7
In [188]:
cluster_idx = [1,2,3,4,5,6,7]
words_df_lst=[]
for idx in cluster_idx:
    mean_vec = functions.cal_mean_cluster(df_result, idx, model)
    top_words_df = functions.print_result(mean_vec, model, t_freq, 100, topn=100)
    words_df_lst.append(top_words_df)
    top_words_df.to_csv(result_path + 'simWithCluster/cluster_{0:02d}.csv'.format(idx))
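functions.cal_mean_cluster presumably averages the document vectors of all brands assigned to a cluster, so the cluster can then be queried like a single pseudo-document; a sketch under that assumption:
# Hypothetical sketch of functions.cal_mean_cluster:
def cal_mean_cluster_sketch(df_result, cluster_idx, model):
    brands = df_result.loc[df_result.Cluster == cluster_idx, 'Name']
    return np.mean([model.docvecs[b] for b in brands], axis=0)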
In [189]:
result = pd.concat(words_df_lst, axis=1)
result.to_csv(result_path + 'simWithCluster/sim_words_total.csv', index=False)
In [72]:
camera = (model['camera'] + model['cameras'])/2
model.docvecs.most_similar([camera])
Out[72]:
In [73]:
earphone = (model['earphone'] + model['headphone'])/2
model.docvecs.most_similar([earphone])
Out[73]:
In [18]:
computer = (model['pc'] + model['laptop'])/2
model.docvecs.most_similar([computer])
Out[18]:
In [19]:
functions.save_brand_sim(model, earphone, 'earphone', result_path, 10)
functions.save_brand_sim(model, camera, 'camera', result_path, 10)
functions.save_brand_sim(model, computer, 'computer', result_path, 10)
In [195]:
dic_doc = {}
for word in model.index2word:
    dic_doc[word] = 1
total_word_list = []
for doc in documents:
    word_list = [word for word in doc.words if word in dic_doc]
    total_word_list.append(word_list)
In [198]:
# most frequent words for each brand
brands = ["Samsung", "Microsoft", "Canon"]
for brand in brands:
    review, freq = functions.brand_freq(documents, total_word_list, brand)
    top_freq_words = freq.most_common(100)
    df = DataFrame(columns=["word", "count"])
    for top_words in top_freq_words:
        df.loc[len(df)] = top_words[0], freq[top_words[0]]
    df = df.sort_values("count", ascending=False)
    df.to_csv(result_path + 'tf-idf/' + brand + '_adj_adv_verb.csv', index=False)
In [196]:
df_tfidf = functions.tf_idf(documents, total_word_list, brand_list, max_feature = 5000)
df_tfidf.to_csv(result_path + '/tf-idf/tf-idf-score_total.csv')
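functions.tf_idf is not shown here either; a plausible sketch is to concatenate each brand's filtered tokens into one pseudo-document and score terms with scikit-learn's TfidfVectorizer. The per-brand grouping and the returned column layout are assumptions:
# Hypothetical sketch of functions.tf_idf using scikit-learn:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_sketch(documents, total_word_list, brand_list, max_feature=5000):
    per_brand = defaultdict(list)
    for doc, words in zip(documents, total_word_list):
        per_brand[doc.tags[0]].extend(words)          # doc.tags[0] is the brand tag
    brands = sorted(per_brand)
    corpus = [' '.join(per_brand[b]) for b in brands]
    vec = TfidfVectorizer(max_features=max_feature)
    scores = vec.fit_transform(corpus)
    return pd.DataFrame(scores.toarray(), index=brands, columns=vec.get_feature_names_out())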
In [115]:
# keyword extraction over the full vocabulary
df_score = functions.scoring(model, brand_list, total_word_list, topn=5000)
brands = ["Samsung", "Canon", "Apple", "Microsoft"]
for brand in brands:
    df_keywords = functions.extract_words_by_score(df_score, brand, documents, total_word_list, min_count=100)
    df_keywords.to_csv(result_path + 'scoring_keywords/' + brand + '_KeywordsByScore_total.csv', index=False)
In [116]:
# keyword extraction using only the adjective/verb/adverb tokens
df_score = functions.scoring(model, brand_list, selected, topn=5000)
brands = ["Samsung", "Canon", "Apple", "Microsoft"]
for brand in brands:
    df_keywords = functions.extract_words_by_score(df_score, brand, documents, selected, min_count=100)
    df_keywords.to_csv(result_path + 'scoring_keywords/' + brand + '_KeywordsByScore_AdjVerbAdv.csv', index=False)
In [192]:
import tsne
import random
import numpy
numpy.random.seed(1)
X = np.zeros((len(brand_list), model.vector_size))
for i, brand in enumerate(brand_list):
    X[i] = model.docvecs[brand].flatten()
numpy.random.seed(1)
t_sne = tsne.tsne(X, initial_dims=10, perplexity=5, iteration=1000)
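The tsne module imported above is a local implementation; the same 2-D projection of the brand vectors can also be obtained with scikit-learn's TSNE if that module is unavailable (a sketch, not the code used for the reported figures):
# Alternative projection with scikit-learn's TSNE:
from sklearn.manifold import TSNE

t_sne_alt = TSNE(n_components=2, perplexity=5, n_iter=1000, random_state=1).fit_transform(X)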
In [193]:
fig = plt.figure(figsize=(10,10))
plt.scatter(t_sne[:,0], t_sne[:,1])
for i, brand in enumerate(brand_list):
    plt.annotate(brand, (t_sne[i, 0], t_sne[i, 1]))
plt.show()
In [194]:
# save the t-SNE coordinates
df = DataFrame(columns=('brand', 'dim_1', 'dim_2'))
for i, brand in enumerate(brand_list):
    df.loc[i] = [brand, t_sne[i, 0], t_sne[i, 1]]
df.to_csv(result_path + 'Visualization/t-sne_result.csv', index=False)
In [ ]: