In [1]:
import graphlab
In [2]:
# Set the default number of Python lambda worker processes to 8.
# NOTE(review): the earlier comment said 4 while the code sets 8 - confirm which value is intended.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 8)
In [3]:
# Send GraphLab Canvas output to the 'ipynb' target so .show() views open inline in the notebook.
graphlab.canvas.set_target('ipynb')
In [4]:
# Load the review data into an SFrame from the 'amazon_baby.gl/' path (relative to the working directory).
products = graphlab.SFrame('amazon_baby.gl/')
In [5]:
# Preview the first rows of the freshly loaded data.
products.head()
Out[5]:
In [6]:
# Hand-picked sentiment words; each one becomes its own count column and model feature.
selected_words = [
    'awesome',
    'great',
    'fantastic',
    'amazing',
    'love',
    'horrible',
    'bad',
    'terrible',
    'awful',
    'wow',
    'hate',
]
In [7]:
# Add a 'word_count' column produced by count_words over the 'review' text column.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])
In [8]:
# Preview again to confirm the new 'word_count' column is present.
products.head()
Out[8]:
In [9]:
# Add one column per selected word, assigning 0 as a placeholder value.
# The real counts are written into these same columns by a later cell.
for word in selected_words:
    products[word] = 0
In [10]:
# Preview to confirm the per-word placeholder columns were added.
products.head()
Out[10]:
In [11]:
# List the keys of the first row's 'word_count' entry to see what it contains.
products[0]['word_count'].keys()
Out[11]:
In [12]:
def key_count(dict, key):
    """Return the count stored for `key` in a word-count mapping.

    Parameters
    ----------
    dict : mapping or None
        Word -> count mapping for a single review. The parameter name shadows
        the built-in `dict`; it is kept unchanged for backward compatibility.
    key : hashable
        The word to look up.

    Returns
    -------
    The value stored under `key`, or 0 when `key` is absent or the mapping is
    empty or None.
    """
    # An empty or missing mapping cannot contain the word.
    if not dict:
        return 0
    # A single lookup with a default replaces the explicit membership test.
    return dict.get(key, 0)
In [13]:
# Fill each selected-word column with key_count's result for that word on every
# row's 'word_count' entry.
# The current word is bound as a default argument so the lambda carries its own
# value instead of looking up the loop variable at the time it is called.
for word in selected_words:
    products[word] = products['word_count'].apply(
        lambda counts, word=word: key_count(counts, word))
In [14]:
# Preview the first rows to check the per-word count columns.
products.head()
Out[14]:
In [15]:
# Preview the last rows as a second check of the per-word count columns.
products.tail()
Out[15]:
In [16]:
# Open the 'Categorical' view of the 'awesome' count column.
products['awesome'].show(view='Categorical')
In [18]:
# Print the column total for each selected word, one "word : total" line per word.
# print(...) with a single parenthesised argument produces the same output on
# Python 2 and Python 3, unlike the Python-2-only print statement.
for word in selected_words:
    print(word + ' : ' + str(products[word].sum()))
In [19]:
# Number of selected words (and therefore of per-word feature columns).
len(selected_words)
Out[19]:
In [21]:
# Ignore all 3-star reviews: keep only rows whose 'rating' is not 3.
# Note: this rebinds `products`, so every later cell works on the filtered data.
products = products[products['rating'] != 3]
In [22]:
# Positive sentiment = 4-star or 5-star reviews.
# 'sentiment' holds the result of the comparison rating >= 4 for each row.
products['sentiment'] = products['rating'] >=4
In [23]:
# Recompute the selected-word count columns on the current `products`.
# NOTE(review): an earlier cell runs the same computation before the rating
# filter; this repeat looks redundant - confirm before removing.
# The current word is bound as a default argument so the lambda carries its own
# value instead of looking up the loop variable at the time it is called.
for word in selected_words:
    products[word] = products['word_count'].apply(
        lambda counts, word=word: key_count(counts, word))
In [24]:
# Print the column total for each selected word on the current `products`,
# one "word : total" line per word.
# print(...) with a single parenthesised argument produces the same output on
# Python 2 and Python 3, unlike the Python-2-only print statement.
for word in selected_words:
    print(word + ' : ' + str(products[word].sum()))
In [25]:
# Randomly split products with fraction .8 into train_data and the remainder into test_data.
# The fixed seed (0) keeps the split reproducible across runs.
train_data,test_data = products.random_split(.8, seed=0)
In [26]:
# Train a logistic classifier that predicts 'sentiment' from the selected-word
# count columns only. test_data is supplied as the validation set.
selected_words_model = graphlab.logistic_classifier.create(train_data,
target='sentiment',
features=selected_words,
validation_set=test_data)
Using this approach, sort the learned coefficients according to the ‘value’ column using .sort(). Out of the 11 words in selected_words, which one got the most positive weight? Which one got the most negative weight? Do these values make sense to you? Save these results to answer the quiz at the end.
In [29]:
# Sort the learned coefficients by 'value', largest first, and print them.
# print_rows(12,5): presumably 12 rows covers the 11 selected words plus an
# intercept row, and 5 is the column limit - TODO confirm against the output.
selected_words_model['coefficients'].sort('value', ascending=False).print_rows(12,5)
In [30]:
# Train a second logistic classifier for 'sentiment', this time using the full
# 'word_count' column as the only feature. test_data is supplied as the validation set.
sentiment_model = graphlab.logistic_classifier.create(train_data,
target='sentiment',
features=['word_count'],
validation_set=test_data)
In [31]:
# Evaluate sentiment_model on test_data, requesting the 'roc_curve' metric.
sentiment_model.evaluate(test_data, metric='roc_curve')
Out[31]:
In [32]:
# Open the 'Evaluation' view for sentiment_model.
sentiment_model.show(view='Evaluation')
In [33]:
# Evaluate sentiment_model on test_data with the default metrics.
sentiment_model.evaluate(test_data)
Out[33]:
In [34]:
# Evaluate selected_words_model on the same test_data for comparison with sentiment_model.
selected_words_model.evaluate(test_data)
Out[34]:
In [35]:
# Open the 'Evaluation' view for selected_words_model.
selected_words_model.show(view='Evaluation')
What is the accuracy of the selected_words_model on the test_data? What was the accuracy of the sentiment_model that we learned using all the word counts in the IPython Notebook above from the lectures? What is the accuracy of the majority class classifier on this task? How do the different learned models compare with the baseline approach where we are just predicting the majority class? Save these results to answer the quiz at the end.
In [36]:
# Open the 'Categorical' view of the test-set labels; the share of the larger
# class is what a majority-class baseline would score.
test_data['sentiment'].show(view='Categorical')
In [42]:
# Keep only the rows whose 'name' is exactly 'Baby Trend Diaper Champ'.
diaper_champ_reviews = products[products['name'] == 'Baby Trend Diaper Champ']
In [43]:
# Number of reviews for this product.
len(diaper_champ_reviews)
Out[43]:
In [44]:
# Preview the first rows of the product's reviews.
diaper_champ_reviews.head()
Out[44]:
In [46]:
# Score every Diaper Champ review with sentiment_model and store the result in
# 'predicted_sentiment'; output_type='probability' requests probabilities
# rather than class labels.
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')
In [47]:
# Order the reviews by 'predicted_sentiment', highest first, so row 0 is the
# review sentiment_model scored as most positive.
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)
In [48]:
# Preview the top-scored reviews after sorting.
diaper_champ_reviews.head()
Out[48]:
Now use the selected_words_model you learned using just the selected_words to predict the sentiment of the most positive review you found above. Save this result to answer the quiz at the end.
In [49]:
# Predict with selected_words_model on the first row only; the [0:1] slice
# passes a one-row selection rather than a single row object.
selected_words_model.predict(diaper_champ_reviews[0:1], output_type='probability')
Out[49]:
Why is the predicted_sentiment for the most positive review found using the model with all word counts (sentiment_model) much more positive than the one using only the selected_words (selected_words_model)? Hint: examine the text of this review, the extracted word counts for all words, and the word counts for each of the selected_words, and you will see what each model used to make its prediction. Save this result to answer the quiz at the end.
In [50]:
# Show the 'review' text of the first row.
diaper_champ_reviews[0]['review']
Out[50]:
In [51]:
# Show the 'word_count' entry of the first row (the input sentiment_model used).
diaper_champ_reviews[0]['word_count']
Out[51]:
In [52]:
# Show the whole first row, including the selected-word count columns.
diaper_champ_reviews[0]
Out[52]:
In [ ]: