In [1]:
import graphlab
In [2]:
# Load the product data from the local 'amazon_baby.gl/' SFrame directory.
# Later cells read its 'name', 'review' and 'rating' columns.
products = graphlab.SFrame('amazon_baby.gl/')
In [3]:
products.head()
Out[3]:
In [4]:
# Add a 'word_count' column produced by count_words over each review's text.
# Later cells treat each entry as a dict keyed by word (they use `in` and .get on it).
products['word_count'] = graphlab.text_analytics.count_words(products['review'])
In [5]:
products.head()
Out[5]:
In [6]:
# Point GraphLab Canvas at the 'ipynb' target; presumably this makes .show()
# render inside the notebook instead of a separate window -- confirm in the Canvas docs.
graphlab.canvas.set_target('ipynb')
In [7]:
products['name'].show()
In [8]:
# Keep only the reviews of one product, matched by its exact name.
# NOTE(review): this subset is taken before `products` is filtered to drop
# 3-star reviews and before the 'sentiment' column is added, so it has neither.
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']
In [9]:
len(giraffe_reviews)
Out[9]:
In [12]:
giraffe_reviews['rating'].show(view='Categorical')
In [13]:
products['rating'].show(view='Categorical')
In [14]:
# Ignore all 3-star reviews: keep only the rows whose rating is not exactly 3.
# NOTE: this overwrites `products`, so the unfiltered frame is no longer available afterwards.
products = products[products['rating'] != 3]
In [15]:
# Positive sentiment = 4-star or 5-star reviews.
# 'sentiment' stores the element-wise result of `rating >= 4`; it is the
# target column of both classifiers trained later in the notebook.
products['sentiment'] = products['rating'] >=4
In [16]:
products.head()
Out[16]:
In [73]:
len(products)
Out[73]:
In [74]:
# Total of the 'sentiment' column (compare with len(products) from the previous cell).
print(products['sentiment'].sum())
In [17]:
# Randomly split `products` into train/test using a 0.8 fraction and a fixed
# seed (0) so the split can be reproduced on re-run.
train_data,test_data = products.random_split(.8, seed=0)
In [18]:
# Fit a logistic classifier that predicts 'sentiment' from the per-review
# 'word_count' dictionary; test_data is passed as the validation set.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    features=['word_count'],
    target='sentiment',
    validation_set=test_data,
)
In [19]:
# Evaluate the word-count model on test_data, requesting only the ROC-curve metric.
sentiment_model.evaluate(test_data, metric='roc_curve')
Out[19]:
In [23]:
# NOTE(review): this switches the Canvas target from 'ipynb' (set near the top
# of the notebook) to 'browser' and never switches it back, so any .show()
# executed after this cell presumably goes to the browser -- confirm this is intended.
graphlab.canvas.set_target('browser')
# Open the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')
In [20]:
# Score every giraffe review with the word-count model. output_type='probability'
# asks for probabilities instead of class labels (presumably the probability of
# positive sentiment -- confirm against the classifier docs).
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(giraffe_reviews, output_type='probability')
In [21]:
giraffe_reviews.head()
Out[21]:
In [22]:
# Order the giraffe reviews from highest to lowest predicted_sentiment.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)
In [24]:
giraffe_reviews.head()
Out[24]:
In [25]:
# First row after the descending sort: the review with the highest predicted_sentiment.
giraffe_reviews[0]['review']
Out[25]:
In [26]:
giraffe_reviews[1]['review']
Out[26]:
In [27]:
# Last row after the descending sort: the review with the lowest predicted_sentiment.
giraffe_reviews[-1]['review']
Out[27]:
In [28]:
giraffe_reviews[-2]['review']
Out[28]:
In [25]:
# Hand-picked words whose per-review counts become the features of the
# selected-words classifier trained later in the notebook.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate',
]
In [27]:
def awesome_count(word_count_dict):
    """Return how many times 'awesome' occurs in one review.

    Parameters
    ----------
    word_count_dict : dict
        Word -> occurrence-count mapping for a single review (one entry of
        the 'word_count' column).

    Returns
    -------
    The count stored under 'awesome', or 0 when the word is absent.
    """
    # dict.get with a default handles both the present and the absent case
    # and always returns a value (never an implicit None).
    return word_count_dict.get('awesome', 0)
In [28]:
# Build an 'awesome' column by running awesome_count on each review's word-count dict.
products['awesome'] = products['word_count'].apply(awesome_count)
In [32]:
def selected_word_count(word_count_dict, word):
    """Return how many times `word` occurs in one review.

    Parameters
    ----------
    word_count_dict : dict
        Word -> occurrence-count mapping for a single review.
    word : str
        The word to look up.

    Returns
    -------
    The count stored under `word`, or 0 when the word is absent.
    """
    # A single lookup with a default replaces the membership test + get pair.
    return word_count_dict.get(word, 0)
In [33]:
# Add one column per selected word holding that word's count in each review.
for word in selected_words:
    # Bind the current word as a default argument so the lambda keeps this
    # iteration's value regardless of when apply() actually invokes it
    # (a plain closure would look up `word` at call time instead).
    products[word] = products['word_count'].apply(
        lambda counts, word=word: selected_word_count(counts, word))
In [34]:
products.head(4)
Out[34]:
In [41]:
# Print each selected word followed by the total of its count column.
for word in selected_words:
    print('%s %s' % (word, products[word].sum()))
In [43]:
# Re-split with the same fraction (0.8) and seed (0) as the earlier split, so
# train_data/test_data now carry the per-word count columns added to `products`.
# NOTE(review): presumably yields the same row partition as before -- confirm.
train_data,test_data = products.random_split(.8, seed=0)
In [44]:
# Fit a second logistic classifier for 'sentiment' that only sees the
# per-word count columns named in selected_words; test_data is the validation set.
selected_words__model = graphlab.logistic_classifier.create(
    train_data,
    features=selected_words,
    target='sentiment',
    validation_set=test_data,
)
In [46]:
# Show the fitted coefficients ordered by their 'value' column
# (presumably ascending, the sort default -- confirm).
selected_words__model['coefficients'].sort('value')
Out[46]:
In [48]:
# Evaluate the selected-words model on test_data, requesting only the ROC-curve metric.
selected_words__model.evaluate(test_data,metric='roc_curve')
Out[48]:
In [61]:
# Select the reviews of 'Baby Trend Diaper Champ' by exact name. This is taken
# from the current `products`, so the rows already exclude 3-star reviews and
# include the 'sentiment' and per-word count columns.
diaper_champ_reviews = products[products['name']=='Baby Trend Diaper Champ']
In [62]:
diaper_champ_reviews
Out[62]:
In [63]:
# Score these reviews with sentiment_model (the classifier trained on the
# 'word_count' feature), not the selected-words model.
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')
In [64]:
# Order the Diaper Champ reviews from highest to lowest predicted_sentiment.
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)
In [65]:
diaper_champ_reviews
Out[65]:
In [66]:
diaper_champ_reviews[0]
Out[66]:
In [68]:
# Score the same, already sorted, reviews with the selected-words model.
# Assuming predict preserves row order, top_pred[0] lines up with
# diaper_champ_reviews[0], the review sentiment_model ranked highest.
top_pred= selected_words__model.predict(diaper_champ_reviews, output_type='probability')
In [70]:
top_pred[0]
Out[70]:
In [72]:
diaper_champ_reviews[0]['review']
Out[72]:
In [ ]: