In [3]:
    
import graphlab
    
In [4]:
    
# Load the product-review data from the relative SFrame directory 'amazon_baby.gl/'.
products = graphlab.SFrame(data='amazon_baby.gl/')
    
In [5]:
    
# Preview the first five rows of the freshly loaded data.
products.head(n=5)
    
    Out[5]:
In [6]:
    
# Add a 'word_count' column holding GraphLab's count_words result for each review text.
review_text = products['review']
products['word_count'] = graphlab.text_analytics.count_words(review_text)
    
In [7]:
    
# Preview again to confirm the new 'word_count' column is present.
products.head(n=5)
    
    Out[7]:
In [8]:
    
# Point GraphLab Canvas at the 'ipynb' target before any .show() call is made.
graphlab.canvas.set_target(target='ipynb')
    
In [9]:
    
# Visualise the product-name column.
product_names = products['name']
product_names.show()
    
    
In [10]:
    
# Keep only the rows whose product name is exactly the Sophie giraffe teether.
is_giraffe = products['name'] == 'Vulli Sophie the Giraffe Teether'
giraffe_reviews = products[is_giraffe]
    
In [11]:
    
# Number of reviews for the giraffe teether.
giraffe_reviews.num_rows()
    
    Out[11]:
There are 785 reviews in total for the Vulli Sophie the Giraffe Teether in the Amazon dataset.
In [12]:
    
# Show the giraffe teether's ratings using the 'Categorical' view.
giraffe_ratings = giraffe_reviews['rating']
giraffe_ratings.show(view='Categorical')
    
    
In [13]:
    
# Show the ratings of all products using the 'Categorical' view.
all_ratings = products['rating']
all_ratings.show(view='Categorical')
    
    
In [14]:
    
# Drop the 3-star reviews: only rows whose rating differs from 3 are kept.
not_three_star = products['rating'] != 3
products = products[not_three_star]
    
In [15]:
    
# Label the remaining reviews: 'sentiment' is truthy for ratings of 4 or more.
star_rating = products['rating']
products['sentiment'] = star_rating >= 4
    
In [16]:
    
# Preview six rows to check the new 'sentiment' column.
products.head(n=6)
    
    Out[16]:
In [17]:
    
# Random split with fraction 0.8 and a fixed seed so the partition is repeatable.
train_data, test_data = products.random_split(fraction=0.8, seed=0)
    
In [18]:
    
# Train a logistic classifier that predicts 'sentiment' from the 'word_count' column alone.
# NOTE(review): test_data is also passed as the validation set here.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
)
    
    
In [19]:
    
# Evaluate the word-count model on test_data using the 'roc_curve' metric.
sentiment_model.evaluate(dataset=test_data, metric='roc_curve')
    
    Out[19]:
In [20]:
    
# Open the word-count model's 'Evaluation' view.
evaluation_view = 'Evaluation'
sentiment_model.show(view=evaluation_view)
    
    
In [21]:
    
# Score every giraffe review with the word-count model and store the probability output.
giraffe_probabilities = sentiment_model.predict(giraffe_reviews, output_type='probability')
giraffe_reviews['predicted_sentiment'] = giraffe_probabilities
    
In [22]:
    
# Preview four rows to check the new 'predicted_sentiment' column.
giraffe_reviews.head(n=4)
    
    Out[22]:
In [23]:
    
# Order the giraffe reviews by predicted sentiment, highest first.
giraffe_reviews = giraffe_reviews.sort(sort_columns='predicted_sentiment', ascending=False)
    
In [24]:
    
# Preview the five highest-scoring reviews after sorting.
giraffe_reviews.head(n=5)
    
    Out[24]:
In [25]:
    
# Text of the first row, i.e. the top review in the current (descending) order.
most_positive_row = giraffe_reviews[0]
most_positive_row['review']
    
In [26]:
    
# Text of the second row in the current order.
second_positive_row = giraffe_reviews[1]
second_positive_row['review']
    
In [28]:
    
# Text of the last row, i.e. the bottom review in the current (descending) order.
most_negative_row = giraffe_reviews[-1]
most_negative_row['review']
    
    Out[28]:
In [29]:
    
# Text of the second-to-last row in the current order.
second_negative_row = giraffe_reviews[-2]
second_negative_row['review']
    
    Out[29]:
In [30]:
    
# Hand-picked words that the reduced ("clean") feature set is restricted to.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate',
]
    
In [31]:
    
# Wrap the selected words in an SArray; it is used later as the key filter for the word counts.
Selected_Frame = graphlab.SArray(data=selected_words)
    
In [32]:
    
# Display the SArray of selected words as the cell output.
Selected_Frame
    
    Out[32]:
In [33]:
    
# Count the words of every review still in `products` (same call that built 'word_count').
remaining_reviews = products['review']
bow = graphlab.text_analytics.count_words(remaining_reviews)
    
In [34]:
    
# Trim each review's word-count dictionary down to the keys listed in Selected_Frame
# (exclude=False keeps the listed keys) and store the result as a new 'words_clean' column.
trimmed_counts = bow.dict_trim_by_keys(Selected_Frame, exclude=False)
products['words_clean'] = trimmed_counts
    
In [35]:
    
# Keep only the listed columns; this drops the full 'word_count' column.
kept_columns = ('name', 'review', 'rating', 'sentiment', 'words_clean')
products = products[kept_columns]
    
In [36]:
    
# Preview five rows of the reduced frame.
products.head(n=5)
    
    Out[36]:
In [37]:
    
# Split the reduced frame with the same fraction (0.8) and seed (0) as the earlier split.
train_data_clean, test_data_clean = products.random_split(fraction=0.8, seed=0)
    
In [38]:
    
# Train a second logistic classifier that sees only the trimmed 'words_clean' counts.
# NOTE(review): test_data_clean is also passed as the validation set here.
sentiment_model_clean = graphlab.logistic_classifier.create(
    train_data_clean,
    target='sentiment',
    features=['words_clean'],
    validation_set=test_data_clean,
)
    
    
In [39]:
    
# Evaluate the selected-words model on test_data_clean using the 'roc_curve' metric.
sentiment_model_clean.evaluate(dataset=test_data_clean, metric='roc_curve')
    
    Out[39]:
In [40]:
    
# Open the selected-words model's 'Evaluation' view.
evaluation_view = 'Evaluation'
sentiment_model_clean.show(view=evaluation_view)
    
    
In [41]:
    
# sentiment_model_clean was trained on the 'words_clean' feature only, but
# giraffe_reviews was sliced from `products` before that column was created,
# so it does not carry the feature. Build it here with the same steps used for
# `products` (count words, then keep only the selected keys) before predicting.
giraffe_word_counts = graphlab.text_analytics.count_words(giraffe_reviews['review'])
giraffe_reviews['words_clean'] = giraffe_word_counts.dict_trim_by_keys(Selected_Frame, exclude=False)

# Overwrite 'predicted_sentiment' with the selected-words model's probability output.
giraffe_reviews['predicted_sentiment'] = sentiment_model_clean.predict(giraffe_reviews, output_type='probability')
    
In [42]:
    
# Preview four rows with the selected-words model's predictions.
giraffe_reviews.head(n=4)
    
    Out[42]:
In [43]:
    
# Re-order the giraffe reviews by the updated predicted sentiment, highest first.
giraffe_reviews = giraffe_reviews.sort(sort_columns='predicted_sentiment', ascending=False)
    
In [44]:
    
# Preview the four highest-scoring reviews after re-sorting.
giraffe_reviews.head(n=4)
    
    Out[44]:
In [45]:
    
# Show the 'coefficients' field of the word-count model.
sentiment_model.get('coefficients')
    
    Out[45]:
In [46]:
    
# Show the 'coefficients' field of the selected-words model.
sentiment_model_clean.get('coefficients')
    
    Out[46]: