In [1]:
import graphlab
In [3]:
# Load the reviews SFrame from the 'amazon_baby.gl' path (relative to the
# notebook's working directory) into `products`.
products = graphlab.SFrame('amazon_baby.gl')
In [4]:
# Preview the first rows of the freshly loaded SFrame.
products.head()
Out[4]:
In [5]:
# Derive a 'word_count' column from the text held in the 'review' column.
review_text = products['review']
products['word_count'] = graphlab.text_analytics.count_words(review_text)
In [6]:
# Preview again: the frame now carries the added 'word_count' column.
products.head()
Out[6]:
In [7]:
# Point GraphLab Canvas at the 'ipynb' target; the .show() calls below rely on
# this setting for where their visualisations are rendered.
graphlab.canvas.set_target('ipynb')
In [8]:
# Open the Canvas view of the 'name' column.
products['name'].show()
In [9]:
# Keep only the rows whose product name matches the giraffe teether exactly.
is_giraffe = products['name'] == 'Vulli Sophie the Giraffe Teether'
giraffe_reviews = products[is_giraffe]
In [10]:
# Number of rows selected for the giraffe teether.
len(giraffe_reviews)
Out[10]:
In [12]:
# Show the giraffe teether's ratings using the 'Categorical' view.
giraffe_ratings = giraffe_reviews['rating']
giraffe_ratings.show(view='Categorical')
In [13]:
# Show the ratings of every product using the 'Categorical' view.
all_ratings = products['rating']
all_ratings.show(view='Categorical')
In [14]:
# Drop every row rated exactly 3; `products` is rebound to the filtered frame,
# so all later cells see only the remaining rows.
not_three_star = products['rating'] != 3
products = products[not_three_star]
In [15]:
# Label each remaining row: the 'sentiment' column is truthy when the rating is
# 4 or higher and falsy otherwise.
is_positive = products['rating'] >= 4
products['sentiment'] = is_positive
In [16]:
# Preview the frame after the 3-star filter and the new 'sentiment' column.
products.head()
Out[16]:
In [17]:
# Randomly split into train/test with a 0.8 fraction; the fixed seed keeps the
# split repeatable across runs.
train_data, test_data = products.random_split(0.8, seed=0)
In [18]:
# Fit a logistic classifier that predicts 'sentiment' from the 'word_count'
# column alone. Note that test_data is also passed as the validation set.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
)
In [19]:
# Evaluate the word-count model on test_data, requesting only the 'roc_curve'
# metric.
sentiment_model.evaluate(test_data, metric='roc_curve')
Out[19]:
In [20]:
# Open the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')
In [24]:
# Score every giraffe review with the word-count model and store the
# probability output alongside the review.
# NOTE(review): giraffe_reviews was sliced from `products` before the
# rating != 3 filter was applied, so it may still contain 3-star rows - confirm
# that is intended.
giraffe_probabilities = sentiment_model.predict(giraffe_reviews,
                                                output_type='probability')
giraffe_reviews['predicted_sentiment'] = giraffe_probabilities
In [25]:
# Preview the giraffe reviews with the added 'predicted_sentiment' column.
giraffe_reviews.head()
Out[25]:
In [26]:
# Reorder so the highest predicted sentiment comes first (descending sort);
# the positional lookups in later cells depend on this ordering.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)
In [27]:
# Preview the top of the sorted frame (highest predicted sentiment first).
giraffe_reviews.head()
Out[27]:
In [29]:
# Review text of the first row, i.e. the highest predicted sentiment after the
# descending sort.
giraffe_reviews[0]['review']
Out[29]:
In [30]:
# Review text of the second row (second-highest predicted sentiment).
giraffe_reviews[1]['review']
Out[30]:
In [31]:
# Review text of the last row, i.e. the lowest predicted sentiment after the
# descending sort.
giraffe_reviews[-1]['review']
Out[31]:
In [32]:
# Review text of the second-to-last row (second-lowest predicted sentiment).
giraffe_reviews[-2]['review']
Out[32]:
In [47]:
# Words whose per-review occurrence counts become individual feature columns.
selected_words = ['awesome', 'great', 'fantastic', 'amazing',
                  'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']


def awesome_count(word_count, word=None):
    """Return how many times `word` occurs according to `word_count`.

    Parameters
    ----------
    word_count : dict or None
        Mapping from word to its number of occurrences in one review.
    word : str, optional
        The word to look up. When omitted, the module-level `curr_word` is
        used, which keeps the original one-argument call form working.

    Returns
    -------
    int
        The stored count for `word`, or 0 when the word is absent or
        `word_count` is empty/None.
    """
    if word is None:
        # Legacy fallback: the original implementation always read the global
        # loop variable. Prefer passing `word` explicitly.
        word = curr_word
    if not word_count:
        return 0
    return word_count.get(word, 0)


for curr_word in selected_words:
    print(curr_word)
    # Bind the current word as a lambda default so each column's function
    # carries its own word instead of reading the loop variable whenever the
    # function happens to be evaluated.
    products[curr_word] = products['word_count'].apply(
        lambda counts, word=curr_word: awesome_count(counts, word))
In [48]:
# Preview the frame with one added count column per entry in selected_words.
products.head()
Out[48]:
In [49]:
# Total occurrences of each selected word, summed over its count column.
wc = {}
for word in selected_words:
    wc[word] = products[word].sum()
In [50]:
# Print the per-word totals.
print(wc)
In [51]:
# Split again with the same fraction and seed as before, so that train_data and
# test_data now include the selected-word count columns added to `products`.
train_data, test_data = products.random_split(0.8, seed=0)
In [52]:
# Fit a second logistic classifier that sees only the selected-word count
# columns. As with the first model, test_data doubles as the validation set.
features = selected_words
selected_words_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=features,
    validation_set=test_data,
)
In [53]:
# Look up the fitted model's 'coefficients' entry.
selected_words_model['coefficients']
Out[53]:
In [54]:
# Same coefficients, ordered by their 'value' column (the sort's default
# direction is used).
selected_words_model['coefficients'].sort('value')
Out[54]:
In [55]:
# Evaluate the selected-words model on the held-out test_data.
selected_words_model.evaluate(test_data)
Out[55]:
In [56]:
# Evaluate the word-count model on test_data for comparison.
# NOTE(review): test_data here comes from the second random_split call; it used
# the same fraction and seed as the split the model was trained on, which
# presumably yields the same rows - confirm.
sentiment_model.evaluate(test_data)
Out[56]:
In [57]:
# Evaluate the selected-words model on its own training data.
selected_words_model.evaluate(train_data)
Out[57]:
In [60]:
# Fraction of test_data rows represented by the two hard-coded counts.
# NOTE(review): 27846 and 130 look like counts copied by hand from an earlier
# cell's output - confirm their origin and derive them from the data so the
# result stays correct if the dataset or split changes.
hardcoded_count = 27846 + 130.0
hardcoded_count / len(test_data)
Out[60]:
In [62]:
# Keep only the rows whose product name matches the Diaper Champ exactly.
is_diaper_champ = products['name'] == 'Baby Trend Diaper Champ'
diaper_champ_reviews = products[is_diaper_champ]
In [63]:
# Score every Diaper Champ review with the word-count model and store the
# probability output alongside the review.
diaper_probabilities = sentiment_model.predict(diaper_champ_reviews,
                                               output_type='probability')
diaper_champ_reviews['predicted_sentiment'] = diaper_probabilities
In [70]:
# Reorder so the highest predicted sentiment comes first (descending sort);
# the positional lookups in later cells depend on this ordering.
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)
In [71]:
# Preview the top of the sorted Diaper Champ reviews.
diaper_champ_reviews.head()
Out[71]:
In [72]:
# Ask the selected-words model for a probability on the single top-ranked
# Diaper Champ row (a one-row slice, so the input is still a frame).
top_diaper_review = diaper_champ_reviews[:1]
selected_words_model.predict(top_diaper_review, output_type='probability')
Out[72]:
In [73]:
# Review text of the first row, i.e. the highest predicted sentiment after the
# descending sort.
diaper_champ_reviews[0]['review']
Out[73]:
In [ ]: