In [1]:
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import csv
import nltk, re
import datetime
import random
import multiprocessing
from ast import literal_eval
import pickle
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
from scipy.cluster.hierarchy import dendrogram, linkage
#%matplotlib inline
In [2]:
# ---------------------------------------------------------------------------
# Paths and data loading.
# NOTE(review): absolute Windows paths — consider a configurable DATA_DIR.
# ---------------------------------------------------------------------------
data_path = "E:/dataset/Amazon/"
result_path = "E:/dataset/MasterThesis/FINAL/"
save_path = "E:/dataset/MasterThesis/FINAL/preprocess_data/"
model_path = "E:/dataset/MasterThesis/FINAL/doc2vec/"

category_list = ["Electronics"]
# NOTE(review): with more than one category, `data` would hold only the
# LAST category after this loop — fine for the single-element list here.
for category in category_list:
    data = pd.read_csv(save_path + "preprocess_complete_" + category + ".csv")
    # 'preprocessed' is stored as a stringified Python list; parse it back.
    # literal_eval can be passed directly — no lambda wrapper needed.
    data['preprocessed'] = data.preprocessed.apply(literal_eval)
In [46]:
data.head()
Out[46]:
In [3]:
# documents for doc2vec
# Load the doc2vec training documents (objects exposing .tags / .words,
# used by brands_text below).
# NOTE(review): `category` leaks out of the loading loop above — it is
# "Electronics" for the current single-element category_list; verify if
# more categories are added.
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
with open(result_path + category + '_documents.pkl', 'rb') as f:
    documents = pickle.load(f)
In [77]:
def brands_text(documents, data, brand):
    """Collect everything needed to analyse one brand.

    Parameters
    ----------
    documents : iterable of tagged documents (objects with .tags / .words);
        a document belongs to `brand` when its first tag equals the brand
        name — presumably set during doc2vec preparation, TODO confirm.
    data : pandas.DataFrame of reviews with a 'brand' column.
    brand : str, brand name to filter on.

    Returns
    -------
    (brand_df, token_lists, corpus) where brand_df is the review rows for
    the brand (selected columns only), token_lists is one token list per
    matching document, and corpus is the flattened token stream wrapped
    as an nltk.Text for collocation analysis.
    """
    keep_cols = ['asin', 'reviewerID', 'overall', 'helpful', 'reviewText', 'title', 'brand']
    brand_df = data[data['brand'] == brand].reset_index(drop=True)[keep_cols]
    # Tokenised reviews for this brand only.
    token_lists = [doc.words for doc in documents if doc.tags[0] == brand]
    # Flatten into a single token stream for nltk.Text.
    flat_tokens = [token for sentence in token_lists for token in sentence]
    return brand_df, token_lists, nltk.Text(flat_tokens)
In [109]:
def find_word(nltk_finder_result, target_word, limit=10):
    """Return the first `limit` scored bigrams containing `target_word`.

    Parameters
    ----------
    nltk_finder_result : list of ((w1, w2), score) tuples, as produced by
        BigramCollocationFinder.score_ngrams (already sorted by score).
    target_word : str, the word that must appear in the bigram.
    limit : int, maximum number of matches to return (default 10).

    Returns
    -------
    list of ((w1, w2), score) tuples, at most `limit` long.

    Note: the original version broke only once the list already held 11
    entries (``len(...) > 10`` checked before appending), so it returned up
    to 11 matches; it also shadowed the function name with a local variable.
    """
    matches = []
    for ngram_score in nltk_finder_result:
        if target_word in ngram_score[0]:
            matches.append(ngram_score)
            if len(matches) >= limit:
                break
    return matches
In [64]:
# Inspect bigram collocations (PMI) around selected words
import nltk
from nltk.collocations import *
In [78]:
samsung_df, samsung_lst, samsung_txt = brands_text(documents, data, 'Samsung')
In [127]:
# Score all bigrams in the Samsung corpus by PMI.
window = 4
min_count = 10
samsung_bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(samsung_txt, window_size = window)
s_finder.apply_freq_filter(min_count) # ignore bigrams seen fewer than min_count times
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(samsung_bigram_measures.pmi)
In [128]:
#pmi
# Show the top PMI bigrams containing each Samsung-related target word.
target_word = ['picture_quality', 'basic', 'lag', 'compared']
for word in target_word:
    print("-------------")
    print(word)
    print('\n')
    print(find_word(s_result, word))
    print('\n')
    print('\n')
In [129]:
# Export the Samsung reviews mentioning each target word, so the bigram
# hits can be read in context.
# NOTE(review): `interprete_path` is not defined in any visible cell —
# presumably set in an earlier (deleted?) cell; define it before a full
# re-run or this raises NameError.
for word in target_word:
    hit_rows = [idx for idx, review in enumerate(samsung_lst) if word in review]
    samsung_df.loc[hit_rows].to_csv(interprete_path + 'samsung_' + word + '.csv', index=False)
In [88]:
ms_df, ms_lst, ms_txt = brands_text(documents, data, 'Microsoft')
In [119]:
# Score all bigrams in the Microsoft corpus by PMI.
# NOTE(review): reuses the names s_finder / s_result, overwriting the
# Samsung results — the cells depend on top-to-bottom execution order.
window = 4
min_count = 10
bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(ms_txt, window_size = window)
s_finder.apply_freq_filter(min_count) # ignore bigrams seen fewer than min_count times
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(bigram_measures.pmi)
In [120]:
#pmi
# Show the top PMI bigrams containing each Microsoft-related target word.
target_word = ['ergonomic', 'natural', 'easily']
for word in target_word:
    print("-------------")
    print(word)
    print('\n')
    print(find_word(s_result, word))
    print('\n')
    print('\n')
In [93]:
# Export the Microsoft reviews mentioning each target word for manual reading.
# NOTE(review): `interprete_path` is not defined in any visible cell —
# presumably set in an earlier (deleted?) cell; define it before a full re-run.
for word in target_word:
    hit_rows = [idx for idx, review in enumerate(ms_lst) if word in review]
    ms_df.loc[hit_rows].to_csv(interprete_path + 'ms_' + word + '.csv', index=False)
In [94]:
ap_df, ap_lst, ap_txt = brands_text(documents, data, 'Apple')
In [123]:
# Score all bigrams in the Apple corpus by PMI.
window = 3
min_count = 10
bigram_measures = nltk.collocations.BigramAssocMeasures()
# BUG FIX: the original built the finder from ms_txt (the Microsoft
# corpus) — a copy-paste slip, since this section analyses the Apple
# slice (ap_lst / ap_df are used in the cells that follow).
s_finder = BigramCollocationFinder.from_words(ap_txt, window_size = window)
s_finder.apply_freq_filter(min_count) # ignore bigrams seen fewer than min_count times
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(bigram_measures.pmi)
In [125]:
#pmi
# Show the top PMI bigrams containing each Apple-related target word.
target_word = ['her', 'loved', 'ever']
for word in target_word:
    print("-------------")
    print(word)
    print('\n')
    print(find_word(s_result, word))
    print('\n')
    print('\n')
In [126]:
# Export the Apple reviews mentioning each target word for manual reading.
# NOTE(review): `interprete_path` is not defined in any visible cell —
# presumably set in an earlier (deleted?) cell; define it before a full re-run.
for word in target_word:
    hit_rows = [idx for idx, review in enumerate(ap_lst) if word in review]
    ap_df.loc[hit_rows].to_csv(interprete_path + 'apple_' + word + '.csv', index=False)
In [ ]: