camera
Reviews Part A — Han, Kehang (hkh12@mit.edu)
This notebook demonstrates how we process the raw review data and extract aspects dynamically for each sentence. The dynamic aspects collected are the detailed features customers care about, such as battery life, product appearance, etc. As the very first step of our aspect-based sentiment analysis, dynamic aspect extraction is the foundation of the subsequent static aspect classification and aspect aggregation.
In [1]:
from srs.utilities import Product, AspectPattern
# Training data: one text file per product under ../data/trainingFiles/Liu/.
product_name = 'Canon'
reviewTrainingFile = product_name + '.txt'
training_file_path = '../data/trainingFiles/Liu/' + reviewTrainingFile

# Create the product and load its reviews from the training file.
product = Product(name=product_name)
product.loadReviewsFromTrainingFile(training_file_path)
In [2]:
# One entry per aspect pattern: (name, structure, aspectTagIndices).
# NOTE(review): the structure strings look like NLTK-style chunk grammars over
# POS tags, and the indices presumably select which matched tags form the
# aspect -- confirm against AspectPattern.
aspect_pattern_specs = [
    # pattern 1: adjective followed by a noun
    ('adj_nn', """
adj_nn:{<JJ><NN.?>}
""", [1]),
    # pattern 2: two consecutive nouns
    ('nn_nn', """
nn_nn:{<NN.?><NN.?>}
""", [0,1]),
]

aspectPatterns = []
for pattern_name, pattern_structure, aspectTagIndices in aspect_pattern_specs:
    aspectPattern = AspectPattern(name=pattern_name, structure=pattern_structure, aspectTagIndices=aspectTagIndices)
    aspectPatterns.append(aspectPattern)
In [ ]:
# POS-tag each sentence of each review, then match the aspect patterns on it.
# The generator walks reviews and their sentences in the same order as a
# nested loop would.
all_sentences = (sent for rev in product.reviews for sent in rev.sentences)
for sent in all_sentences:
    sent.pos_tag()
    sent.matchDaynamicAspectPatterns(aspectPatterns)
In [47]:
# Count how often each dynamic aspect occurs over all sentences of all reviews.
word_dict = {}
for review in product.reviews:
    for sentence in review.sentences:
        for aspect in sentence.dynamic_aspects:
            # A first occurrence starts from 0, later ones increment the tally.
            word_dict[aspect] = word_dict.get(aspect, 0) + 1
In [48]:
# Rank (aspect, count) pairs by descending count. The sort is stable, so
# aspects with equal counts keep the order in which the dict yields them.
word_sorted = list(word_dict.items())
word_sorted.sort(key=lambda pair: pair[1], reverse=True)
word_sorted[:15]
Out[48]:
In [49]:
import json

# Save the 15 most frequent aspects as JSON.
# The `with` block guarantees the file is flushed and closed even if
# json.dump raises; the previous open()/close() pair leaked the handle
# (and could leave a truncated file open) on any error in between.
word_list_path = '../data/word_list/{0}_wordlist.txt'.format(product_name)
with open(word_list_path, 'w') as word_output:
    json.dump(word_sorted[:15], word_output)
In [50]:
from nltk.stem import SnowballStemmer
# Snowball stemmer configured for English; its stem() method is used in the
# following cell to group aspect words that share a stem.
stemmer = SnowballStemmer('english')
In [51]:
# Merge the counts of aspects that reduce to the same stem.
stemmedWord_dict = {}
for word, count in word_dict.items():
    stem = stemmer.stem(word)
    # Add this aspect's count to whatever its stem has accumulated so far.
    stemmedWord_dict[stem] = stemmedWord_dict.get(stem, 0) + count
In [52]:
# Rank (stem, count) pairs by descending count. The sort is stable, so stems
# with equal counts keep the order in which the dict yields them.
stemmedWord_sorted = list(stemmedWord_dict.items())
stemmedWord_sorted.sort(key=lambda pair: pair[1], reverse=True)
stemmedWord_sorted[:15]
Out[52]:
In [53]:
# Save the 15 most frequent stemmed words as JSON.
# The `with` block guarantees the file is flushed and closed even if
# json.dump raises; the previous open()/close() pair leaked the handle
# on any error in between. Relies on `json` imported in an earlier cell.
stemmed_word_list_path = '../data/word_list/{0}_stemmedwordlist.txt'.format(product_name)
with open(stemmed_word_list_path, 'w') as stemmedWord_output:
    json.dump(stemmedWord_sorted[:15], stemmedWord_output)
In [ ]: