In [1]:
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import csv
import nltk, re
import datetime
import random
import multiprocessing
from ast import literal_eval
import pickle
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
from scipy.cluster.hierarchy import dendrogram, linkage
#%matplotlib inline

In [2]:
data_path = "E:/dataset/Amazon/"
result_path = "E:/dataset/MasterThesis/FINAL/"
save_path = "E:/dataset/MasterThesis/FINAL/preprocess_data/"
model_path = "E:/dataset/MasterThesis/FINAL/doc2vec/"
category_list = ["Electronics"]
for category in category_list:
    data = pd.read_csv(save_path + "preprocess_complete_" + category + ".csv")
    data['preprocessed'] = data.preprocessed.apply(lambda row: literal_eval(row))

In [46]:
data.head()


Out[46]:
reviewTime asin reviewerID overall helpful reviewText title brand reviewSentence sent_length reviewSentence_tagged preprocessed
0 2013-07-21 B00CM0XHNS A372YX80GGM7DR 5.0 576 Ok, so I didn't buy this on Amazon, as I didn'... Ultimate Ears BOOM Wireless Bluetooth Speaker ... Logitech ["Ok, so I didn't buy this on Amazon, as I did... 58 [[('Ok', 'NNP'), (',', ','), ('so', 'IN'), ('I... [[ok, so, i, did, n't, buy, this, on, amazon, ...
1 2013-05-19 B00BQ5RY1G A1BG2Z071TYO7P 2.0 522 I received a Harmony Ultimate from Logitech be... Logitech Harmony Ultimate Remote with Customiz... Logitech ['I received a Harmony Ultimate from Logitech ... 27 [[('I', 'PRP'), ('received', 'VBD'), ('a', 'DT... [[i, received, a, harmony, ultimate, from, log...
2 2013-12-16 B00EZ9XG62 AELAESM03451 1.0 290 This review is for the iPad Air keyboard. I ha... Logitech Ultrathin Keyboard Cover for iPad Air... Logitech ['This review is for the iPad Air keyboard.', ... 23 [[('This', 'DT'), ('review', 'NN'), ('is', 'VB... [[this, review, is, for, the, ipad, air, keybo...
3 2013-01-21 B0099SMFVQ A36CMGR5ELUM34 5.0 283 Design: Very well put together. Elegant and th... Logitech Bluetooth Illuminated Keyboard K810 f... Logitech ['Design: Very well put together.', 'Elegant a... 28 [[('Design', 'NN'), (':', ':'), ('Very', 'RB')... [[design, very, well, put, together], [elegant...
4 2013-07-29 B00CM0XHNS A9TETE58A7JR3 3.0 260 So, I've been testing a few bluetooth speakers... Ultimate Ears BOOM Wireless Bluetooth Speaker ... Logitech ["So, I've been testing a few bluetooth speake... 57 [[('So', 'RB'), (',', ','), ('I', 'PRP'), ("'v... [[so, i, been, testing, a, few, bluetooth, spe...

In [3]:
# documents for doc2vec
with open(result_path  + category + '_documents.pkl', 'rb') as f:
    documents = pickle.load(f)

In [77]:
def brands_text(documents, data, brand):
    df = data[data['brand']==brand]
    df.reset_index(drop=True, inplace=True)
    df = df[['asin', 'reviewerID', 'overall', 'helpful', 'reviewText', 'title', 'brand']]
    reviews = []
    for doc in documents:
        if doc.tags[0] == brand:
            reviews.append(doc.words)
    corpus = [word for sent in reviews for word in sent]
    corpus = nltk.Text(corpus)
    return df, reviews, corpus

In [109]:
def find_word(nltk_finder_result, target_word):
    find_word = target_word
    n_gram_list = []
    for i in nltk_finder_result:
        if len(n_gram_list) > 10:
            break
        elif i[0][0] == find_word or i[0][1] == find_word:
            n_gram_list.append(i)
    return n_gram_list

In [64]:
#단어에 대해서 bigram(pmi) 확인
import nltk
from nltk.collocations import *

In [78]:
samsung_df, samsung_lst, samsung_txt = brands_text(documents, data, 'Samsung')

In [127]:
window = 4
min_count = 10

samsung_bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(samsung_txt, window_size = window)

s_finder.apply_freq_filter(min_count) #13번 미만으로 나온 것들 무시
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(samsung_bigram_measures.pmi)

In [128]:
#pmi
target_word = ['picture_quality', 'basic', 'lag','compared']
for i in target_word:
    print( "-------------")
    print(i)
    print('\n')
    print(find_word(s_result, i))
    print('\n')
    print('\n')


-------------
picture_quality


[(('picture_quality', 'amazing'), 4.781451833888468), (('great', 'picture_quality'), 3.2950439151871223), (('picture_quality', 'good'), 2.936099648589078), (('picture_quality', 'sound'), 2.7408098493911197), (('picture_quality', 'great'), 2.710081414465968), (('better', 'picture_quality'), 2.671461802700385), (('picture_quality', 'is'), 2.4319344330527315), (('good', 'picture_quality'), 2.043014852505589), (('tv', 'picture_quality'), 1.7883875263668827), (('the', 'picture_quality'), 1.753679335107524), (('picture_quality', 'was'), 1.6898069459570202)]




-------------
basic


[(('some', 'basic'), 3.2492125612020075), (('very', 'basic'), 2.6305726013640793), (('for', 'basic'), 1.7960063002534419), (('a', 'basic'), 1.3938724462796976), (('with', 'basic'), 1.215878138302628), (('basic', 'for'), 0.9765785458952649), (('basic', 'and'), 0.9601669243632003), (('is', 'basic'), 0.5079986316275757), (('the', 'basic'), 0.4726165305761043), (('basic', 'that'), 0.38073339344517265), (('basic', 'of'), 0.3510292852891794)]




-------------
lag


[(('no', 'lag'), 4.515752402182041), (('little', 'lag'), 4.113788040907643), (('any', 'lag'), 3.842606761864804), (('there', 'lag'), 3.7029958075143448), (('lag', 'time'), 3.1476508589010272), (('some', 'lag'), 2.6932937195649416), (('lag', 'when'), 2.609703887461805), (('lag', 'or'), 2.1711413710600382), (('is', 'lag'), 1.6192570539998528), (('lag', 'but'), 1.2225442761337284), (('with', 'lag'), 1.1302792314455878)]




-------------
compared


[(('compared', 'lines'), 7.347740130447223), (('compared', 'ipad'), 4.284507505669318), (('features', 'compared'), 3.873915911421662), (('compared', 'other'), 3.7211703089324892), (('best', 'compared'), 3.649667933506734), (('compared', 'to'), 3.3788548128824054), (('when', 'compared'), 3.1467839825057418), (('compared', 'what'), 2.204126406120757), (('compared', 'my'), 2.107214077394744), (('as', 'compared'), 1.7814651355821027), (('very', 'compared'), 1.7537528596433773)]





In [129]:
for target in target_word:
    target_index = []
    for index, review in enumerate(samsung_lst):
        if target in review:
            target_index.append(index)
    samsung_df.loc[target_index].to_csv(interprete_path + 'samsung_' + target + '.csv', index=False)

Microsoft


In [88]:
ms_df, ms_lst, ms_txt = brands_text(documents, data, 'Microsoft')

In [119]:
window = 4
min_count = 10

bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(ms_txt, window_size = window)

s_finder.apply_freq_filter(min_count) #13번 미만으로 나온 것들 무시
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(bigram_measures.pmi)

In [120]:
#pmi
target_word = ['ergonomic', 'natural','easily']
for i in target_word:
    print( "-------------")
    print(i)
    print('\n')
    print(find_word(s_result, i))
    print('\n')
    print('\n')


-------------
ergonomic


[(('microsoft_sculpt', 'ergonomic'), 8.070524720096817), (('an', 'ergonomic'), 2.767492607212777), (('ergonomic', 'mouse'), 2.7551627994931422), (('very', 'ergonomic'), 2.2368880337645116), (('is', 'ergonomic'), 0.7642172428184502), (('ergonomic', 'is'), 0.6862147308171735), (('the', 'ergonomic'), 0.6082068378240315), (('ergonomic', 'and'), 0.5106219634075728), (('ergonomic', 'this'), 0.43942520988136025), (('for', 'ergonomic'), 0.4064592786489172), (('ergonomic', 'i'), 0.06289539289234725)]




-------------
natural


[(('natural', 'position'), 6.837483132987684), (('feels', 'natural'), 5.546348191645926), (('natural', 'keyboards'), 4.6556595482988), (('feel', 'natural'), 4.2553746293216825), (('more', 'natural'), 4.198393322514562), (('natural', 'keyboard'), 3.0426435054241097), (('in', 'natural'), 1.9026384758081676), (('a', 'natural'), 1.142568342770275), (('natural', 'for'), 1.1326811009870674), (('the', 'natural'), 1.1114211481999625), (('natural', 'in'), 1.0401419995581023)]




-------------
easily


[(('quickly', 'easily'), 4.930303507018408), (('can', 'easily'), 4.032293939220757), (('easily', 'into'), 3.9010162800501647), (('could', 'easily'), 3.4812634942277114), (('easily', 'by'), 3.3369123842266717), (('small', 'easily'), 2.9231487879364657), (('very', 'easily'), 2.331037468267759), (('so', 'easily'), 1.7243745320422548), (('easily', 'in'), 1.7219922439081756), (('be', 'easily'), 1.4878207153220515), (('easily', 'be'), 1.3882850417711374)]





In [93]:
for target in target_word:
    target_index = []
    for index, review in enumerate(ms_lst):
        if target in review:
            target_index.append(index)
    ms_df.loc[target_index].to_csv(interprete_path + 'ms_' + target + '.csv', index=False)

Apple


In [94]:
ap_df, ap_lst, ap_txt = brands_text(documents, data, 'Apple')

In [123]:
window = 3
min_count = 10

bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(ms_txt, window_size = window)

s_finder.apply_freq_filter(min_count) #13번 미만으로 나온 것들 무시
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(bigram_measures.pmi)

In [125]:
#pmi
target_word = ['her', 'loved','ever']
for i in target_word:
    print( "-------------")
    print(i)
    print('\n')
    print(find_word(s_result, i))
    print('\n')
    print('\n')


-------------
her


[(('her', 'laptop'), 4.561330754502553), (('for', 'her'), 2.0297882191671235), (('on', 'her'), 1.8035128078929787), (('with', 'her'), 1.7630697977596306), (('her', 'and'), 0.9964473801758444), (('to', 'her'), 0.9059325517441152), (('her', 'to'), 0.58400445685675)]




-------------
loved


[(('i', 'loved'), 3.043955855858883), (('loved', 'it'), 2.6805223138840084), (('loved', 'this'), 2.2880353768242436), (('and', 'loved'), 2.044850844516443), (('loved', 'the'), 1.55474342545617)]




-------------
ever


[(('ever', 'used'), 5.481768259682067), (('ever', 'since'), 5.074487096004162), (('best', 'ever'), 4.703689590231281), (('ever', 'made'), 4.661552957705688), (('have', 'ever'), 3.529506156312241), (('ever', 'had'), 3.40712252760396), (('than', 'ever'), 3.239271786100222), (('ever', 'need'), 3.210537353749981), (('what', 'ever'), 3.2094752903616204), (('i', 'ever'), 2.575680335683046), (('ever', 'get'), 2.479130829132348)]





In [126]:
for target in target_word:
    target_index = []
    for index, review in enumerate(ap_lst):
        if target in review:
            target_index.append(index)
    ap_df.loc[target_index].to_csv(interprete_path + 'apple_' + target + '.csv', index=False)

In [ ]: