Dynamic Aspect Extraction for Camera Reviews, Part B

Han, Kehang (hkh12@mit.edu)

As a follow-up demonstration, this notebook extracts aspects from the AmazonReviews dataset, which contains many more camera reviews.

Setup


In [1]:
import json
import nltk
import string

from srs.utilities import Product, AspectPattern

s1: load raw data from the AmazonReviews dataset


In [2]:
# load all reviews for one camera product (identified by its Amazon product ID)
product_name = 'B00000JFIF'
reviewJsonFile = product_name + '.json'
product = Product(name=product_name)
product.loadReviewsFromJsonFile('../data/trainingFiles/AmazonReviews/cameras/' + reviewJsonFile)

s2: define aspect patterns


In [3]:
aspectPatterns = []
# pattern 1: an adjective followed by a noun, e.g. "great zoom"
pattern_name = 'adj_nn'
pattern_structure = """
adj_nn:{<JJ><NN.?>}
"""
aspectTagIndices = [1]  # keep only the noun as the aspect
aspectPattern = AspectPattern(name=pattern_name, structure=pattern_structure, aspectTagIndices=aspectTagIndices)
aspectPatterns.append(aspectPattern)
# pattern 2: a noun-noun compound, e.g. "battery life"
pattern_name = 'nn_nn'
pattern_structure = """
nn_nn:{<NN.?><NN.?>}
"""
aspectTagIndices = [0, 1]  # keep both nouns as the aspect
aspectPattern = AspectPattern(name=pattern_name, structure=pattern_structure, aspectTagIndices=aspectTagIndices)
aspectPatterns.append(aspectPattern)
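
The two pattern structures above are ordinary NLTK chunk grammars. As a rough illustration of what they capture, the sketch below runs the same grammars through nltk.RegexpParser directly on a hand-tagged sample sentence (the sample is made up for illustration; the srs machinery in s3 does the real matching):


In [ ]:
# illustrate the chunk grammars with nltk.RegexpParser on a hand-tagged sample
import nltk

grammar = """
adj_nn:{<JJ><NN.?>}
nn_nn:{<NN.?><NN.?>}
"""
parser = nltk.RegexpParser(grammar)
tagged = [('great', 'JJ'), ('zoom', 'NN'), ('and', 'CC'),
          ('battery', 'NN'), ('life', 'NN')]
tree = parser.parse(tagged)
for subtree in tree.subtrees():
    if subtree.label() in ('adj_nn', 'nn_nn'):
        print('{0}: {1}'.format(subtree.label(), [w for w, t in subtree]))
# adj_nn: ['great', 'zoom']
# nn_nn: ['battery', 'life']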

s3: match sentences against the patterns to extract aspects


In [ ]:
# POS-tag each sentence, then match the aspect patterns against the tagged tokens
for review in product.reviews:
    for sentence in review.sentences:
        sentence.pos_tag()
        sentence.matchDaynamicAspectPatterns(aspectPatterns)
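
Each sentence now carries its matched aspects in the dynamic_aspects attribute (the same attribute s4 aggregates). A quick spot-check on the first review:


In [ ]:
# spot-check the aspects extracted from the first review's sentences
for sentence in product.reviews[0].sentences[:5]:
    print(sentence.dynamic_aspects)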

s4: statistical analysis of the aspects extracted across all reviews


In [6]:
# count how often each extracted aspect occurs across all reviews
word_dict = {}
for review in product.reviews:
    for sentence in review.sentences:
        for aspect in sentence.dynamic_aspects:
            if aspect in word_dict:
                word_dict[aspect] += 1
            else:
                word_dict[aspect] = 1
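
The hand-rolled counting dict above is equivalent to the standard library's collections.Counter; a minimal sketch of the same aggregation:


In [ ]:
# same aggregation with collections.Counter
from collections import Counter
word_counter = Counter(aspect
                       for review in product.reviews
                       for sentence in review.sentences
                       for aspect in sentence.dynamic_aspects)
# word_counter.most_common(15) gives the same ranking computed below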

In [7]:
# rank aspects by frequency, most frequent first
word_sorted = sorted(word_dict.items(), key=lambda tup: tup[1], reverse=True)
word_sorted[:15]


Out[7]:
[(u'camera', 47),
 (u'batteries', 22),
 (u'pictures', 18),
 (u'cameras', 13),
 (u'battery life', 12),
 (u'ac adapter', 11),
 (u'quality', 11),
 (u'shots', 10),
 (u'zoom', 9),
 (u'memory card', 9),
 (u'time', 8),
 (u'media', 7),
 (u'price', 7),
 (u'software', 7),
 (u'resolution', 7)]

s5: save most frequent dynamic aspects


In [8]:
# note: json was already imported in the setup cell
with open('../data/word_list/{0}_wordlist.txt'.format(product_name), 'w') as word_output:
    json.dump(word_sorted[:15], word_output)
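
Since json.dump serializes the (aspect, count) tuples as JSON arrays, a quick read-back sanity check looks like this (tuples come back as lists):


In [ ]:
# sanity check: reload the saved word list
with open('../data/word_list/{0}_wordlist.txt'.format(product_name)) as word_input:
    print(json.load(word_input)[:3])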

s6: stemming analysis


In [9]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
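
A quick preview of what the Snowball stemmer does to a few of the frequent aspects from s4 (the outputs match the stemmed forms in Out[11] below):


In [ ]:
# preview the stemmer on a few frequent aspects
for word in ['batteries', 'pictures', 'quality', 'battery life']:
    print('{0} -> {1}'.format(word, stemmer.stem(word)))
# batteries -> batteri
# pictures -> pictur
# quality -> qualiti
# battery life -> battery lif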

In [10]:
# merge the counts of aspects that share a stem, e.g. 'camera' and 'cameras'
stemmedWord_dict = {}
for word in word_dict:
    stemmedWord = stemmer.stem(word)
    if stemmedWord in stemmedWord_dict:
        stemmedWord_dict[stemmedWord] += word_dict[word]
    else:
        stemmedWord_dict[stemmedWord] = word_dict[word]
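
Note that stem() treats its whole argument as one token, so a multi-word aspect such as 'battery life' is only stemmed at its tail ('battery lif' in Out[11] below). If per-token stemming were wanted instead, a minimal sketch (not used in the analysis below):


In [ ]:
# alternative (not used below): stem each token of a multi-word aspect
def stem_aspect(aspect):
    return ' '.join(stemmer.stem(token) for token in aspect.split())

# stem_aspect('battery life') -> 'batteri life'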

In [11]:
# rank stemmed aspects by frequency, most frequent first
stemmedWord_sorted = sorted(stemmedWord_dict.items(), key=lambda tup: tup[1], reverse=True)
stemmedWord_sorted[:15]


Out[11]:
[(u'camera', 61),
 (u'batteri', 26),
 (u'pictur', 23),
 (u'shot', 13),
 (u'battery lif', 12),
 (u'qualiti', 11),
 (u'ac adapt', 11),
 (u'memory card', 10),
 (u'resolut', 9),
 (u'zoom', 9),
 (u'featur', 9),
 (u'problem', 8),
 (u'time', 8),
 (u'card', 7),
 (u'softwar', 7)]

In [12]:
# save most frequent stemmed words
with open('../data/word_list/{0}_stemmedwordlist.txt'.format(product_name), 'w') as stemmedWord_output:
    json.dump(stemmedWord_sorted[:15], stemmedWord_output)
