notebook.community

Edit and run



In [1]:

    
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import csv
import nltk, re
import datetime
import random
import multiprocessing
from ast import literal_eval
import pickle
from tqdm import tqdm
from nltk.stem.porter import PorterStemmer
from scipy.cluster.hierarchy import dendrogram, linkage
#%matplotlib inline



In [2]:

    
data_path = "E:/dataset/Amazon/"
result_path = "E:/dataset/MasterThesis/FINAL/"
save_path = "E:/dataset/MasterThesis/FINAL/preprocess_data/"
model_path = "E:/dataset/MasterThesis/FINAL/doc2vec/"
category_list = ["Electronics"]
for category in category_list:
    data = pd.read_csv(save_path + "preprocess_complete_" + category + ".csv")
    data['preprocessed'] = data.preprocessed.apply(lambda row: literal_eval(row))



In [46]:

    
data.head()









    Out[46]:






  
    
      
      reviewTime
      asin
      reviewerID
      overall
      helpful
      reviewText
      title
      brand
      reviewSentence
      sent_length
      reviewSentence_tagged
      preprocessed
    
  
  
    
      0
      2013-07-21
      B00CM0XHNS
      A372YX80GGM7DR
      5.0
      576
      Ok, so I didn't buy this on Amazon, as I didn'...
      Ultimate Ears BOOM Wireless Bluetooth Speaker ...
      Logitech
      ["Ok, so I didn't buy this on Amazon, as I did...
      58
      [[('Ok', 'NNP'), (',', ','), ('so', 'IN'), ('I...
      [[ok, so, i, did, n't, buy, this, on, amazon, ...
    
    
      1
      2013-05-19
      B00BQ5RY1G
      A1BG2Z071TYO7P
      2.0
      522
      I received a Harmony Ultimate from Logitech be...
      Logitech Harmony Ultimate Remote with Customiz...
      Logitech
      ['I received a Harmony Ultimate from Logitech ...
      27
      [[('I', 'PRP'), ('received', 'VBD'), ('a', 'DT...
      [[i, received, a, harmony, ultimate, from, log...
    
    
      2
      2013-12-16
      B00EZ9XG62
      AELAESM03451
      1.0
      290
      This review is for the iPad Air keyboard. I ha...
      Logitech Ultrathin Keyboard Cover for iPad Air...
      Logitech
      ['This review is for the iPad Air keyboard.', ...
      23
      [[('This', 'DT'), ('review', 'NN'), ('is', 'VB...
      [[this, review, is, for, the, ipad, air, keybo...
    
    
      3
      2013-01-21
      B0099SMFVQ
      A36CMGR5ELUM34
      5.0
      283
      Design: Very well put together. Elegant and th...
      Logitech Bluetooth Illuminated Keyboard K810 f...
      Logitech
      ['Design: Very well put together.', 'Elegant a...
      28
      [[('Design', 'NN'), (':', ':'), ('Very', 'RB')...
      [[design, very, well, put, together], [elegant...
    
    
      4
      2013-07-29
      B00CM0XHNS
      A9TETE58A7JR3
      3.0
      260
      So, I've been testing a few bluetooth speakers...
      Ultimate Ears BOOM Wireless Bluetooth Speaker ...
      Logitech
      ["So, I've been testing a few bluetooth speake...
      57
      [[('So', 'RB'), (',', ','), ('I', 'PRP'), ("'v...
      [[so, i, been, testing, a, few, bluetooth, spe...



In [3]:

    
# documents for doc2vec
with open(result_path  + category + '_documents.pkl', 'rb') as f:
    documents = pickle.load(f)



In [77]:

    
def brands_text(documents, data, brand):
    df = data[data['brand']==brand]
    df.reset_index(drop=True, inplace=True)
    df = df[['asin', 'reviewerID', 'overall', 'helpful', 'reviewText', 'title', 'brand']]
    reviews = []
    for doc in documents:
        if doc.tags[0] == brand:
            reviews.append(doc.words)
    corpus = [word for sent in reviews for word in sent]
    corpus = nltk.Text(corpus)
    return df, reviews, corpus



In [109]:

    
def find_word(nltk_finder_result, target_word):
    find_word = target_word
    n_gram_list = []
    for i in nltk_finder_result:
        if len(n_gram_list) > 10:
            break
        elif i[0][0] == find_word or i[0][1] == find_word:
            n_gram_list.append(i)
    return n_gram_list



In [64]:

    
#단어에 대해서 bigram(pmi) 확인
import nltk
from nltk.collocations import *



In [78]:

    
samsung_df, samsung_lst, samsung_txt = brands_text(documents, data, 'Samsung')



In [127]:

    
window = 4
min_count = 10

samsung_bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(samsung_txt, window_size = window)

s_finder.apply_freq_filter(min_count) #13번 미만으로 나온 것들 무시
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(samsung_bigram_measures.pmi)



In [128]:

    
#pmi
target_word = ['picture_quality', 'basic', 'lag','compared']
for i in target_word:
    print( "-------------")
    print(i)
    print('\n')
    print(find_word(s_result, i))
    print('\n')
    print('\n')









    



-------------
picture_quality


[(('picture_quality', 'amazing'), 4.781451833888468), (('great', 'picture_quality'), 3.2950439151871223), (('picture_quality', 'good'), 2.936099648589078), (('picture_quality', 'sound'), 2.7408098493911197), (('picture_quality', 'great'), 2.710081414465968), (('better', 'picture_quality'), 2.671461802700385), (('picture_quality', 'is'), 2.4319344330527315), (('good', 'picture_quality'), 2.043014852505589), (('tv', 'picture_quality'), 1.7883875263668827), (('the', 'picture_quality'), 1.753679335107524), (('picture_quality', 'was'), 1.6898069459570202)]




-------------
basic


[(('some', 'basic'), 3.2492125612020075), (('very', 'basic'), 2.6305726013640793), (('for', 'basic'), 1.7960063002534419), (('a', 'basic'), 1.3938724462796976), (('with', 'basic'), 1.215878138302628), (('basic', 'for'), 0.9765785458952649), (('basic', 'and'), 0.9601669243632003), (('is', 'basic'), 0.5079986316275757), (('the', 'basic'), 0.4726165305761043), (('basic', 'that'), 0.38073339344517265), (('basic', 'of'), 0.3510292852891794)]




-------------
lag


[(('no', 'lag'), 4.515752402182041), (('little', 'lag'), 4.113788040907643), (('any', 'lag'), 3.842606761864804), (('there', 'lag'), 3.7029958075143448), (('lag', 'time'), 3.1476508589010272), (('some', 'lag'), 2.6932937195649416), (('lag', 'when'), 2.609703887461805), (('lag', 'or'), 2.1711413710600382), (('is', 'lag'), 1.6192570539998528), (('lag', 'but'), 1.2225442761337284), (('with', 'lag'), 1.1302792314455878)]




-------------
compared


[(('compared', 'lines'), 7.347740130447223), (('compared', 'ipad'), 4.284507505669318), (('features', 'compared'), 3.873915911421662), (('compared', 'other'), 3.7211703089324892), (('best', 'compared'), 3.649667933506734), (('compared', 'to'), 3.3788548128824054), (('when', 'compared'), 3.1467839825057418), (('compared', 'what'), 2.204126406120757), (('compared', 'my'), 2.107214077394744), (('as', 'compared'), 1.7814651355821027), (('very', 'compared'), 1.7537528596433773)]



In [129]:

    
for target in target_word:
    target_index = []
    for index, review in enumerate(samsung_lst):
        if target in review:
            target_index.append(index)
    samsung_df.loc[target_index].to_csv(interprete_path + 'samsung_' + target + '.csv', index=False)

Microsoft



In [88]:

    
ms_df, ms_lst, ms_txt = brands_text(documents, data, 'Microsoft')



In [119]:

    
window = 4
min_count = 10

bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(ms_txt, window_size = window)

s_finder.apply_freq_filter(min_count) #13번 미만으로 나온 것들 무시
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(bigram_measures.pmi)



In [120]:

    
#pmi
target_word = ['ergonomic', 'natural','easily']
for i in target_word:
    print( "-------------")
    print(i)
    print('\n')
    print(find_word(s_result, i))
    print('\n')
    print('\n')









    



-------------
ergonomic


[(('microsoft_sculpt', 'ergonomic'), 8.070524720096817), (('an', 'ergonomic'), 2.767492607212777), (('ergonomic', 'mouse'), 2.7551627994931422), (('very', 'ergonomic'), 2.2368880337645116), (('is', 'ergonomic'), 0.7642172428184502), (('ergonomic', 'is'), 0.6862147308171735), (('the', 'ergonomic'), 0.6082068378240315), (('ergonomic', 'and'), 0.5106219634075728), (('ergonomic', 'this'), 0.43942520988136025), (('for', 'ergonomic'), 0.4064592786489172), (('ergonomic', 'i'), 0.06289539289234725)]




-------------
natural


[(('natural', 'position'), 6.837483132987684), (('feels', 'natural'), 5.546348191645926), (('natural', 'keyboards'), 4.6556595482988), (('feel', 'natural'), 4.2553746293216825), (('more', 'natural'), 4.198393322514562), (('natural', 'keyboard'), 3.0426435054241097), (('in', 'natural'), 1.9026384758081676), (('a', 'natural'), 1.142568342770275), (('natural', 'for'), 1.1326811009870674), (('the', 'natural'), 1.1114211481999625), (('natural', 'in'), 1.0401419995581023)]




-------------
easily


[(('quickly', 'easily'), 4.930303507018408), (('can', 'easily'), 4.032293939220757), (('easily', 'into'), 3.9010162800501647), (('could', 'easily'), 3.4812634942277114), (('easily', 'by'), 3.3369123842266717), (('small', 'easily'), 2.9231487879364657), (('very', 'easily'), 2.331037468267759), (('so', 'easily'), 1.7243745320422548), (('easily', 'in'), 1.7219922439081756), (('be', 'easily'), 1.4878207153220515), (('easily', 'be'), 1.3882850417711374)]



In [93]:

    
for target in target_word:
    target_index = []
    for index, review in enumerate(ms_lst):
        if target in review:
            target_index.append(index)
    ms_df.loc[target_index].to_csv(interprete_path + 'ms_' + target + '.csv', index=False)

Apple



In [94]:

    
ap_df, ap_lst, ap_txt = brands_text(documents, data, 'Apple')



In [123]:

    
window = 3
min_count = 10

bigram_measures = nltk.collocations.BigramAssocMeasures()
s_finder = BigramCollocationFinder.from_words(ms_txt, window_size = window)

s_finder.apply_freq_filter(min_count) #13번 미만으로 나온 것들 무시
#finder.nbest(bigram_measures.pmi, 20)
s_result = s_finder.score_ngrams(bigram_measures.pmi)



In [125]:

    
#pmi
target_word = ['her', 'loved','ever']
for i in target_word:
    print( "-------------")
    print(i)
    print('\n')
    print(find_word(s_result, i))
    print('\n')
    print('\n')









    



-------------
her


[(('her', 'laptop'), 4.561330754502553), (('for', 'her'), 2.0297882191671235), (('on', 'her'), 1.8035128078929787), (('with', 'her'), 1.7630697977596306), (('her', 'and'), 0.9964473801758444), (('to', 'her'), 0.9059325517441152), (('her', 'to'), 0.58400445685675)]




-------------
loved


[(('i', 'loved'), 3.043955855858883), (('loved', 'it'), 2.6805223138840084), (('loved', 'this'), 2.2880353768242436), (('and', 'loved'), 2.044850844516443), (('loved', 'the'), 1.55474342545617)]




-------------
ever


[(('ever', 'used'), 5.481768259682067), (('ever', 'since'), 5.074487096004162), (('best', 'ever'), 4.703689590231281), (('ever', 'made'), 4.661552957705688), (('have', 'ever'), 3.529506156312241), (('ever', 'had'), 3.40712252760396), (('than', 'ever'), 3.239271786100222), (('ever', 'need'), 3.210537353749981), (('what', 'ever'), 3.2094752903616204), (('i', 'ever'), 2.575680335683046), (('ever', 'get'), 2.479130829132348)]



In [126]:

    
for target in target_word:
    target_index = []
    for index, review in enumerate(ap_lst):
        if target in review:
            target_index.append(index)
    ap_df.loc[target_index].to_csv(interprete_path + 'apple_' + target + '.csv', index=False)



In [ ]:

	reviewTime	asin	reviewerID	overall	helpful	reviewText	title	brand	reviewSentence	sent_length	reviewSentence_tagged	preprocessed
0	2013-07-21	B00CM0XHNS	A372YX80GGM7DR	5.0	576	Ok, so I didn't buy this on Amazon, as I didn'...	Ultimate Ears BOOM Wireless Bluetooth Speaker ...	Logitech	["Ok, so I didn't buy this on Amazon, as I did...	58	[[('Ok', 'NNP'), (',', ','), ('so', 'IN'), ('I...	[[ok, so, i, did, n't, buy, this, on, amazon, ...
1	2013-05-19	B00BQ5RY1G	A1BG2Z071TYO7P	2.0	522	I received a Harmony Ultimate from Logitech be...	Logitech Harmony Ultimate Remote with Customiz...	Logitech	['I received a Harmony Ultimate from Logitech ...	27	[[('I', 'PRP'), ('received', 'VBD'), ('a', 'DT...	[[i, received, a, harmony, ultimate, from, log...
2	2013-12-16	B00EZ9XG62	AELAESM03451	1.0	290	This review is for the iPad Air keyboard. I ha...	Logitech Ultrathin Keyboard Cover for iPad Air...	Logitech	['This review is for the iPad Air keyboard.', ...	23	[[('This', 'DT'), ('review', 'NN'), ('is', 'VB...	[[this, review, is, for, the, ipad, air, keybo...
3	2013-01-21	B0099SMFVQ	A36CMGR5ELUM34	5.0	283	Design: Very well put together. Elegant and th...	Logitech Bluetooth Illuminated Keyboard K810 f...	Logitech	['Design: Very well put together.', 'Elegant a...	28	[[('Design', 'NN'), (':', ':'), ('Very', 'RB')...	[[design, very, well, put, together], [elegant...
4	2013-07-29	B00CM0XHNS	A9TETE58A7JR3	3.0	260	So, I've been testing a few bluetooth speakers...	Ultimate Ears BOOM Wireless Bluetooth Speaker ...	Logitech	["So, I've been testing a few bluetooth speake...	57	[[('So', 'RB'), (',', ','), ('I', 'PRP'), ("'v...	[[so, i, been, testing, a, few, bluetooth, spe...