In [1]:
import graphlab
In [20]:
# Load the product-review SFrame from the local 'amazon_baby.gl/' directory
# (path is relative to the notebook's working directory).
products = graphlab.SFrame('amazon_baby.gl/')
In [21]:
products.head()
Out[21]:
In [22]:
# Add a 'word_count' column computed from each row's 'review' text.
# Later cells look words up in these values (`'awesome' in word_count`,
# `word_count['awesome']`), i.e. they are used as word -> count mappings.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])
In [23]:
products.head()
Out[23]:
In [24]:
# Set the GraphLab Canvas target to 'ipynb' before any .show() call
# (presumably so the visualizations render inside the notebook -- confirm).
graphlab.canvas.set_target('ipynb')
In [25]:
# Visualize the 'name' column of the full review set.
products['name'].show()
In [26]:
# Keep only the rows whose 'name' is exactly 'Vulli Sophie the Giraffe Teether'.
# NOTE: this subset is taken before the rating filter and the 'sentiment'
# column are applied to `products` further down, so it has neither.
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']
In [27]:
# Number of reviews for that product.
len(giraffe_reviews)
Out[27]:
In [28]:
giraffe_reviews.head()
Out[28]:
In [29]:
# Categorical view of the ratings for this single product ...
giraffe_reviews['rating'].show(view='Categorical')
In [30]:
# ... and the same view over all products, for comparison.
products['rating'].show(view='Categorical')
In [31]:
# Drop reviews rated exactly 3: they are treated as neither positive nor
# negative by the label defined in the next cell.
# NOTE: this rebinds `products`; subsets taken earlier (e.g. giraffe_reviews)
# are not affected.
products = products[products['rating']!=3]
In [32]:
# Label column: 'sentiment' is true when rating >= 4. With rating-3 rows
# already removed, every remaining row below 4 counts as negative.
products['sentiment'] = products['rating'] >= 4
In [33]:
products.head()
Out[33]:
In [34]:
# Random train/test split (fraction 0.8, fixed seed 0 for repeatability).
# NOTE: this split is taken before the selected-word count columns are added
# to `products`; a later cell splits again with the same arguments.
train_data, test_data = products.random_split(0.8, seed=0)
In [35]:
# Fit a logistic classifier that predicts 'sentiment' from the single
# 'word_count' feature.
# NOTE(review): test_data is passed as validation_set and is also the set the
# model is evaluated on below -- confirm that is intended.
# NOTE(review): `sentiment_mode` looks like a typo for "sentiment_model"; it is
# referenced under this name by later cells, so it is left unchanged here.
sentiment_mode = graphlab.logistic_classifier.create(train_data,
target='sentiment',
features=['word_count'],
validation_set=test_data)
In [36]:
# Evaluate on test_data, requesting only the ROC-curve metric.
sentiment_mode.evaluate(test_data, metric='roc_curve')
Out[36]:
In [37]:
# Canvas 'Evaluation' view of the fitted model.
sentiment_mode.show(view='Evaluation')
In [40]:
# Score every giraffe review with the word-count model and store the result
# as a new 'predicted_sentiment' column (output_type='probability';
# presumably the probability of the positive class -- confirm).
giraffe_reviews['predicted_sentiment'] = sentiment_mode.predict(giraffe_reviews, output_type='probability')
In [41]:
giraffe_reviews.head()
Out[41]:
In [42]:
# Sort descending by score, so index 0 is the highest-scored review and
# index -1 the lowest-scored one.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)
In [43]:
giraffe_reviews.head()
Out[43]:
In [44]:
# Text of the two highest-scored reviews.
giraffe_reviews[0]['review']
Out[44]:
In [45]:
giraffe_reviews[1]['review']
Out[45]:
In [46]:
# Text of the two lowest-scored reviews.
giraffe_reviews[-1]['review']
Out[46]:
In [47]:
giraffe_reviews[-2]['review']
Out[47]:
In [48]:
# Hand-picked words. Later cells read one `products` column per word
# (products[wrd]) and pass this list as `features` to selected_word_model.
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
In [49]:
selected_words
Out[49]:
In [52]:
products.head(n=2)
Out[52]:
In [53]:
def awesome_count(word_count, word='awesome'):
    """Return how many times `word` occurs according to a word-count mapping.

    Args:
        word_count: mapping of word -> number of occurrences for one review
            (one element of the 'word_count' column). May be empty or None.
        word: the word to look up. Defaults to 'awesome', so existing
            single-argument calls such as `.apply(awesome_count)` keep
            working unchanged.

    Returns:
        The stored count for `word`, or 0 when the word is absent or the
        mapping is empty/None.
    """
    # Empty or missing mapping: nothing was counted for this review.
    if not word_count:
        return 0
    return word_count.get(word, 0)
In [57]:
# Apply awesome_count to every review's word-count mapping, giving one
# count per review.
myprodawesome = products['word_count'].apply(awesome_count)
In [71]:
# Sum of those per-review counts over all reviews.
myprodawesome.sum()
Out[71]:
In [72]:
# Store the per-review counts as the 'awesome' column.
# NOTE(review): this repeats the computation already held in `myprodawesome`.
products['awesome'] = products['word_count'].apply(awesome_count)
In [74]:
selected_words
Out[74]:
In [92]:
def awesome_count(word_count, word='hate'):
    """Return how many times `word` occurs according to a word-count mapping.

    This redefinition replaces the earlier `awesome_count`. The original
    version of this cell was edited by hand and re-run once per word, with
    all branches except 'hate' commented out; the `word` parameter removes
    the need for that. The default stays 'hate' so the existing
    single-argument call that fills the 'hate' column behaves exactly as
    before, despite the function's name.

    Args:
        word_count: mapping of word -> number of occurrences for one review.
            May be empty or None.
        word: the word to look up (default 'hate').

    Returns:
        The stored count for `word`, or 0 when the word is absent or the
        mapping is empty/None.
    """
    # Empty or missing mapping: nothing was counted for this review.
    if not word_count:
        return 0
    return word_count.get(word, 0)
In [93]:
# Create one count column per selected word (this includes 'hate').
# Later cells read products[wrd] for every word in selected_words and use the
# whole list as model features, so all of these columns must exist when the
# notebook is run top to bottom on a fresh kernel.
# `w=word` binds the current word when the lambda is created, so each column
# counts its own word rather than whatever `word` is when the lambda runs.
for word in selected_words:
    products[word] = products['word_count'].apply(lambda counts, w=word: counts.get(w, 0))
In [94]:
# Show a single row to inspect the columns currently on `products`.
products.head(n=1)
Out[94]:
In [95]:
selected_words
Out[95]:
In [96]:
# Print the total number of occurrences of each selected word over all reviews.
# Passing one pre-formatted string to print(...) produces the same text as the
# original `print wrd, " : ", total` and is valid on both Python 2 and Python 3.
for wrd in selected_words:
    print("%s  :  %s" % (wrd, products[wrd].sum()))
In [99]:
selected_word_model = graphlab.logistic_classifier.create(train_data,
target='sentiment',
features=selected_words,
validation_set=test_data)
In [98]:
train_data,test_data = products.random_split(.8, seed=0)
In [104]:
# List the fitted coefficients sorted by 'value', largest first (up to 15 rows).
selected_word_model['coefficients'].sort('value',ascending=False).print_rows(num_rows=15)
In [105]:
# Evaluate both models on the same test_data so their metrics can be compared.
selected_word_model.evaluate(test_data)
Out[105]:
In [106]:
sentiment_mode.evaluate(test_data)
Out[106]:
In [107]:
# Keep only the rows whose 'name' is exactly 'Baby Trend Diaper Champ'.
diaper_champ_reviews = products[products['name'] == 'Baby Trend Diaper Champ']
In [108]:
# Score with the word-count model, then sort descending so row 0 is the
# review that model scores highest.
diaper_champ_reviews['predicted_sentiment'] = sentiment_mode.predict(diaper_champ_reviews, output_type='probability')
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)
In [109]:
diaper_champ_reviews.head()
Out[109]:
In [111]:
# Score that same top row ([0:1] keeps it as a one-row frame) with the
# selected-words model, for comparison with the score stored above.
selected_word_model.predict(diaper_champ_reviews[0:1], output_type='probability')
Out[111]:
In [113]:
# Rows of test_data labelled positive. The original wrapped the boolean mask
# in a list literal ([mask]), which only displays the mask inside a Python
# list; indexing with the mask is the filter idiom used elsewhere in this
# notebook.
test_data[test_data['sentiment'] == 1]
Out[113]:
In [114]:
# Number of rows in the test split.
len(test_data)
Out[114]:
In [115]:
# Sum of the 'sentiment' label over the test split; since the label is the
# result of `rating >= 4`, this is the number of positive rows.
test_data['sentiment'].sum()
Out[115]:
In [117]:
# Fraction of test rows with positive sentiment, i.e. the accuracy obtained by
# always predicting the positive class. Computed from test_data instead of the
# hand-copied totals 27976 / 33304 (presumably the outputs of the two cells
# above), so it stays correct if the data or the split changes.
# float(...) keeps this a true division under Python 2.
float(test_data['sentiment'].sum()) / len(test_data)
Out[117]:
In [ ]: