In [6]:
# import GraphLab Create and render Canvas views inline in the notebook
import graphlab as gl
gl.canvas.set_target('ipynb')
In [3]:
# reading the data
data = gl.SFrame("data/amazon_baby.gl/")
data.head(5)
Out[3]:
In [4]:
# Build a word count vector
data['word_count'] = gl.text_analytics.count_words(data['review'])
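For reference, a minimal sketch (not part of the original run) of what count_words produces on a hand-built SArray:

# illustrative only: count_words maps each string to a word -> count dict
sample = gl.SArray(['this book is great great'])
gl.text_analytics.count_words(sample)
# expected: [{'this': 1, 'book': 1, 'is': 1, 'great': 2}]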
In [5]:
data.head(4)
Out[5]:
Defining Positive and Negative Sentiment
Ignore 3-star ratings (neutral)
Ratings of 1 and 2 are treated as Negative
Ratings of 4 and 5 are treated as Positive
In [8]:
# ignoring the 3-star ratings
data2 = data[data['rating'] != 3]
In [9]:
data2['sentiment'] = data2['rating'] > 3
In [11]:
data2.head(5)
Out[11]:
In [12]:
# training the classifier model
# first, splitting the data into train and test datasets
train_data, test_data = data2.random_split(0.8, seed=0)
In [14]:
sentiment_model = gl.logistic_classifier.create(train_data,
                                                target='sentiment',
                                                features=['word_count'],
                                                validation_set=test_data)
In [15]:
# Evaluate the sentiment model
sentiment_model.evaluate(test_data, metric='roc_curve')
Out[15]:
In [16]:
sentiment_model.show(view='Evaluation')
In [17]:
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
In [18]:
def word_count(line):
    # plain-Python word counter: maps each whitespace-separated token to its count
    wc = {}
    for word in line.split():
        wc[word] = wc.get(word, 0) + 1
    return wc
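A quick sanity check of word_count on a toy string (illustrative, not part of the original run):

word_count('wow this is great great')
# expected: {'wow': 1, 'this': 1, 'is': 1, 'great': 2}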
In [60]:
#data3['word_count_dic'] = data2['review'].apply(word_count)
In [20]:
data2.head(5)
Out[20]:
def selected_word_count(line, selected_words=['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']):
    # string-based version: counts only the selected words in a raw review string
    wc = {}
    for word in line.split():
        if word in selected_words:
            wc[word] = wc.get(word, 0) + 1
    return wc
In [37]:
def selected_word_count(line, selected_words=['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']):
    # dict-based version (shadows the one above): picks the selected words
    # out of an existing word -> count dict
    wc = {}
    for key in selected_words:
        if key in line:
            wc[key] = line[key]
    return wc
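An illustrative call to the dict-based version (a toy dict, not a real row):

selected_word_count({'great': 2, 'ok': 1, 'love': 1})
# expected: {'great': 2, 'love': 1}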
In [68]:
#data2['selected_word_count_dic'] = data2['word_count'].apply(selected_word_count)
# dict_trim_by_keys with exclude=False keeps only the listed keys, a built-in,
# vectorized equivalent of the apply() above
data2['selected_word_count_dic'] = data2['word_count'].dict_trim_by_keys(selected_words, exclude=False)
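A tiny sketch of dict_trim_by_keys on a hand-built SArray, to show that exclude=False keeps rather than drops the listed keys:

sa = gl.SArray([{'great': 2, 'ok': 1}])
sa.dict_trim_by_keys(['great'], exclude=False)
# expected: [{'great': 2}]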
In [69]:
def get_count(data, word):
    return data.get(word, 0)
In [70]:
for word in selected_words:
    data2[word + '_count'] = data2['selected_word_count_dic'].apply(lambda line: get_count(line, word))
In [71]:
data2.head(4)
Out[71]:
In [44]:
train_data, test_data = data2.random_split(0.8, seed=0)
In [45]:
selected_words_model = gl.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['selected_word_count_dic'],
                                                     validation_set=test_data)
In [67]:
#gl.SFrame.print_rows(num_rows=12, num_columns=5)
coef = selected_words_model['coefficients'].sort('value')
coef.print_rows(num_rows=12, num_columns=5)
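Since coef is sorted by value ascending, the extremes are easy to read off; a hedged sketch, assuming the usual coefficients layout where 'index' holds the dict key and 'value' the learned weight:

print(coef.head(1))   # the most negative selected word
print(coef.tail(1))   # the most positive selected word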
In [47]:
# accuracy
selected_words_model.evaluate(test_data)
Out[47]:
In [48]:
sentiment_model.evaluate(test_data)
Out[48]:
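To make the comparison explicit, a short sketch pulling just the accuracy figures out of the two evaluations above (evaluate() returns a dict keyed by metric name):

print('selected_words_model accuracy:', selected_words_model.evaluate(test_data)['accuracy'])
print('sentiment_model accuracy:     ', sentiment_model.evaluate(test_data)['accuracy'])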
In [49]:
# Analyzing why sentiment_model works better than selected_words_model
diaper_champ_reviews = data2[data2['name']=='Baby Trend Diaper Champ']
diaper_champ_reviews.head(2)
Out[49]:
In [50]:
diaper_champ_reviews['pred_sentiment'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')
diaper_champ_reviews_sorted = diaper_champ_reviews.sort('pred_sentiment', ascending=False)
diaper_champ_reviews_sorted.head(2)
Out[50]:
In [75]:
diaper_champ_reviews['sel_pred_sentiment'] = selected_words_model.predict(diaper_champ_reviews, output_type='probability')
# note: still sorted by sentiment_model's 'pred_sentiment', so both models'
# probabilities can be compared on the same most-positive review
diaper_champ_reviews_sel_sorted = diaper_champ_reviews.sort('pred_sentiment', ascending=False)
diaper_champ_reviews_sel_sorted.head(2)
Out[75]:
In [77]:
diaper_champ_reviews_sel_sorted[0:1]['review']
Out[77]:
In [74]:
# Out of the 11 words in selected_words, which one is most used in the reviews in the dataset?
for word in reversed(selected_words):
    print(word + '_count', data2[word + '_count'].sum())
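Equivalently, a one-liner sketch that picks the most-used selected word directly:

print(max(selected_words, key=lambda w: data2[w + '_count'].sum()))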
In [59]:
# It is quite common to use the **majority class classifier** as a baseline (or reference) model for
# comparison with your classifier model. The majority class classifier predicts the majority class for all data points.
# At the very least, you should healthily beat the majority class classifier; otherwise, the model is (usually) pointless.
# note: the sentiment column holds 1/0 (from rating > 3), not +1/-1
num_positive = (train_data['sentiment'] == 1).sum()
num_negative = (train_data['sentiment'] == 0).sum()
print(num_positive)
print(num_negative)
print(num_positive * 1.0 / len(train_data))
In [58]:
test_num_positive = (test_data['sentiment'] == 1).sum()
test_num_negative = (test_data['sentiment'] == 0).sum()
print(test_num_positive)
print(test_num_negative)
print(len(test_data))
print(test_num_positive * 1.0 / len(test_data))
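Since positive is the majority class, the fraction printed above is exactly the majority-class baseline's test accuracy; a minimal sketch (reusing the variables above) making that explicit:

# the majority class classifier always predicts the (positive) majority class
baseline_accuracy = test_num_positive * 1.0 / len(test_data)
print('majority class baseline accuracy:', baseline_accuracy)
# both sentiment_model and selected_words_model should beat this number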
In [ ]: