In [3]:
%matplotlib inline
import graphlab
In [4]:
# Load the dataset stored at 'amazon_baby.gl/' into an SFrame named `products`.
products = graphlab.SFrame('amazon_baby.gl/')
In [5]:
# Preview the first rows of the freshly loaded data.
products.head()
Out[5]:
In [6]:
# Add a 'word_count' column built by count_words() from each row's 'review' text.
# Presumably each entry is a dict of word -> occurrences (later cells call
# iteritems() on it) -- confirm with the type check further down.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])
In [7]:
# Preview again to see the newly added 'word_count' column.
products.head()
Out[7]:
In [8]:
# Point GraphLab Canvas at the 'ipynb' target; presumably this makes the
# .show() views below render inside the notebook -- TODO confirm.
graphlab.canvas.set_target('ipynb')
In [9]:
# Visualise the 'name' column.
products['name'].show()
In [10]:
# Keep only the rows whose 'name' is exactly 'Vulli Sophie the Giraffe Teether'.
giraff_review = products[products['name'] == 'Vulli Sophie the Giraffe Teether']
In [11]:
# Number of rows selected for the giraffe teether.
print(len(giraff_review))
In [12]:
# Show the giraffe reviews' 'rating' column using the 'Categorical' view.
giraff_review['rating'].show(view = 'Categorical')
In [13]:
# Drop every row whose rating is exactly 3 (presumably treated as neutral).
# NOTE: this rebinds `products`; `giraff_review` was derived before this filter,
# so it still contains any 3-star rows for that product.
products = products[products['rating'] != 3]
In [14]:
# Label column: the result of `rating >= 4` for each row. Ratings of 3 were
# removed in the previous cell, so the remaining rows fall on either side of 4.
products['sentiment'] = products['rating']>=4
In [15]:
# Inspect the last rows to check the new 'sentiment' column.
products.tail()
Out[15]:
In [16]:
# Randomly split the rows with fraction .8 into train_data / test_data;
# the fixed seed keeps the split repeatable across runs.
train_data, test_data = products.random_split(.8, seed=0)
In [17]:
# Fit a logistic classifier that predicts 'sentiment' from the 'word_count'
# column alone, using test_data as the validation set during training.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data)
In [18]:
# Evaluate the model on test_data, requesting the 'roc_curve' metric.
sentiment_model.evaluate(test_data, metric = 'roc_curve')
Out[18]:
In [19]:
# Open the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')
In [20]:
# Score every giraffe review with the model and store the result in a new
# 'predicted_sentiment' column. output_type='probability' asks for a probability
# rather than a class label (presumably P(sentiment is positive) -- confirm).
giraff_review['predicted_sentiment'] = sentiment_model.predict(giraff_review, output_type='probability')
In [21]:
# Preview the giraffe reviews with their new 'predicted_sentiment' values.
giraff_review.head()
Out[21]:
In [22]:
# Reorder the giraffe reviews by 'predicted_sentiment', highest value first.
giraff_review = giraff_review.sort('predicted_sentiment', ascending=False)
In [23]:
# Preview the top of the sorted frame (highest predicted_sentiment first).
giraff_review.head()
Out[23]:
In [24]:
# Review text of the first row after the descending sort (highest predicted_sentiment).
giraff_review[0]['review']
Out[24]:
In [25]:
# Review text of the second row after the descending sort.
giraff_review[1]['review']
Out[25]:
In [26]:
# Review text of the last row after the descending sort (lowest predicted_sentiment).
giraff_review[-1]['review']
Out[26]:
In [27]:
# Review text of the second-to-last row after the descending sort.
giraff_review[-2]['review']
Out[27]:
In [28]:
# Hand-picked vocabulary. Each word becomes its own count column on `products`
# below, and the list is later passed as the feature list of the second classifier.
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
In [29]:
# Check the Python type of a single 'word_count' entry (first row).
type(products['word_count'][0])
Out[29]:
In [30]:
def select_word_count(word_count, word):
    """Return how many times `word` occurs according to `word_count`.

    Args:
        word_count: mapping of word -> number of occurrences for one review.
            May be None or empty, in which case the count is 0.
        word: the word to look up.

    Returns:
        The stored count for `word`, or 0 when the word is absent or when
        there is no count mapping at all.
    """
    # A missing or empty mapping means the word cannot have been counted.
    if not word_count:
        return 0
    # Single lookup with a default instead of a membership test plus an index.
    return word_count.get(word, 0)
In [31]:
# Spot-check the helper on the second row's word counts for the word 'awesome'.
print(select_word_count(products['word_count'][1], 'awesome'))
In [34]:
# Create one column per selected word, holding that word's count in each review.
for word in selected_words:
    # Bind the current word as a default argument. A plain closure would look
    # `word` up when the lambda is *called*, so if the apply were ever evaluated
    # after the loop has moved on, every column would be computed for the last word.
    products[word] = products['word_count'].apply(
        lambda counts, word=word: select_word_count(counts, word))
In [35]:
# Show the rows with the largest 'awesome' counts first.
products.sort('awesome', ascending=False).head()
Out[35]:
In [38]:
# Corpus-wide totals: for every word seen in any review's 'word_count' entry,
# accumulate the sum of its per-review counts.
words = {}
for review_counts in products['word_count']:
    for token, count in review_counts.iteritems():
        # Start from 0 the first time a token is seen, then keep adding.
        words[token] = words.get(token, 0) + count
In [43]:
for w in selected_words:
print w, '=', words[w]
In [38]:
# Train a second classifier that only sees the hand-picked word-count columns.
# Same split parameters (fraction .8, seed 0) as used for the first model.
train_data_one, test_data_one = products.random_split(.8, seed=0)
# `target` must name the label column ('sentiment'), not the name of the
# variable the model is assigned to. The held-out split is passed as the
# validation set, mirroring how sentiment_model was created.
selected_words_model = graphlab.logistic_classifier.create(
    train_data_one,
    target='sentiment',
    features=selected_words,
    validation_set=test_data_one)
In [ ]: