In [6]:
import graphlab
import os

Read some product review data


In [21]:
# Directory that holds amazon_baby.csv; later cells open the file by its
# bare name, so the working directory is switched here.
# The literal is a raw string so the backslashes of the Windows path are
# kept verbatim: without the r-prefix, "\U" is parsed as a unicode escape
# and the cell fails with a SyntaxError on Python 3.
# NOTE(review): this is a machine-specific absolute path; adjust it (or make
# it relative to the repository) before running elsewhere.
path = r'C:\Users\Matheus\Documents\GitHub\UW-Machine-Learning-Specialization\Week 3'
os.chdir(path)

In [24]:
# Load the review data into an SFrame. The filename is relative, so it is
# resolved against the current working directory set in the previous cell.
products = graphlab.SFrame("amazon_baby.csv")


Finished parsing file C:\Users\Matheus\Documents\GitHub\UW-Machine-Learning-Specialization\Week 3\amazon_baby.csv
Parsing completed. Parsed 100 lines in 0.963059 secs.
------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Read 105270 lines. Lines per second: 85706.3
Finished parsing file C:\Users\Matheus\Documents\GitHub\UW-Machine-Learning-Specialization\Week 3\amazon_baby.csv
Parsing completed. Parsed 183531 lines in 1.59374 secs.

In [25]:
# Preview the first rows of the freshly loaded data (name, review, rating).
products.head()


Out[25]:
name review rating
Planetwise Flannel Wipes These flannel wipes are
OK, but in my opinion ...
3
Planetwise Wipe Pouch it came early and was not
disappointed. i love ...
5
Annas Dream Full Quilt
with 2 Shams ...
Very soft and comfortable
and warmer than it ...
5
Stop Pacifier Sucking
without tears with ...
This is a product well
worth the purchase. I ...
5
Stop Pacifier Sucking
without tears with ...
All of my kids have cried
non-stop when I tried to ...
5
Stop Pacifier Sucking
without tears with ...
When the Binky Fairy came
to our house, we didn't ...
5
A Tale of Baby's Days
with Peter Rabbit ...
Lovely book, it's bound
tightly so you may no ...
4
Baby Tracker® - Daily
Childcare Journal, ...
Perfect for new parents.
We were able to keep ...
5
Baby Tracker® - Daily
Childcare Journal, ...
A friend of mine pinned
this product on Pinte ...
5
Baby Tracker® - Daily
Childcare Journal, ...
This has been an easy way
for my nanny to record ...
4
[10 rows x 3 columns]

Exploring this data


In [27]:
# Build a word-count dictionary (word -> number of occurrences) for each
# review and store it in a new 'word_count' column.
# NOTE(review): the displayed counts include keys such as 'parents!!' and
# 'feeding,', so punctuation appears to stay attached to the words.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])

In [28]:
# Preview again to check that the 'word_count' column is now present.
products.head()


Out[28]:
name review rating word_count
Planetwise Flannel Wipes These flannel wipes are
OK, but in my opinion ...
3 {'and': 5L, 'stink': 1L,
'because': 1L, 'order ...
Planetwise Wipe Pouch it came early and was not
disappointed. i love ...
5 {'and': 3L, 'love': 1L,
'it': 2L, 'highly': 1L, ...
Annas Dream Full Quilt
with 2 Shams ...
Very soft and comfortable
and warmer than it ...
5 {'and': 2L, 'quilt': 1L,
'it': 1L, 'comfortable': ...
Stop Pacifier Sucking
without tears with ...
This is a product well
worth the purchase. I ...
5 {'ingenious': 1L, 'and':
3L, 'love': 2L, ...
Stop Pacifier Sucking
without tears with ...
All of my kids have cried
non-stop when I tried to ...
5 {'and': 2L, 'parents!!':
1L, 'all': 2L, 'puppe ...
Stop Pacifier Sucking
without tears with ...
When the Binky Fairy came
to our house, we didn't ...
5 {'and': 2L, 'cute': 1L,
'help': 2L, 'doll': 1L, ...
A Tale of Baby's Days
with Peter Rabbit ...
Lovely book, it's bound
tightly so you may no ...
4 {'shop': 1L, 'be': 1L,
'is': 1L, 'it': 1L, ' ...
Baby Tracker® - Daily
Childcare Journal, ...
Perfect for new parents.
We were able to keep ...
5 {'feeding,': 1L, 'and':
2L, 'all': 1L, 'right': ...
Baby Tracker® - Daily
Childcare Journal, ...
A friend of mine pinned
this product on Pinte ...
5 {'and': 1L, 'help': 1L,
'give': 1L, 'is': 1L, ...
Baby Tracker® - Daily
Childcare Journal, ...
This has been an easy way
for my nanny to record ...
4 {'journal.': 1L, 'all':
1L, 'standarad': 1L, ...
[10 rows x 4 columns]


In [29]:
# Select 'ipynb' as the GraphLab Canvas target.
# NOTE(review): presumably this makes the .show() calls below render inside
# the notebook instead of a separate window; confirm against GraphLab docs.
graphlab.canvas.set_target('ipynb')

In [30]:
# Visualise the 'name' column (one product name per review row).
products['name'].show()


Explore the reviews of the Vulli Sophie the Giraffe Teether


In [31]:
# Keep only the rows whose 'name' is exactly this product title.
giraffe_reviews = products[products['name']=='Vulli Sophie the Giraffe Teether']

In [32]:
# Number of reviews selected for this product.
len(giraffe_reviews)


Out[32]:
785

In [33]:
# Show this product's ratings using the 'Categorical' view.
giraffe_reviews['rating'].show(view='Categorical')


Build a sentiment classifier


In [34]:
# Show the ratings of all products using the 'Categorical' view.
products['rating'].show(view='Categorical')


Define what's positive and negative sentiment


In [36]:
# Ignore all 3-star reviews: they are left out of the sentiment labelling.
# Note that this rebinds `products`, so the previews shown earlier in the
# notebook reflect the unfiltered data.
products = products[products['rating']!=3]

In [37]:
# Positive sentiment = 4- or 5-star reviews: the new 'sentiment' column holds
# the result of the comparison rating >= 4 for every remaining row.
products['sentiment'] = products['rating']>=4

In [38]:
# Preview the data with the new 'sentiment' column.
products.head()


Out[38]:
name review rating word_count sentiment
Planetwise Wipe Pouch it came early and was not
disappointed. i love ...
5 {'and': 3L, 'love': 1L,
'it': 2L, 'highly': 1L, ...
1
Annas Dream Full Quilt
with 2 Shams ...
Very soft and comfortable
and warmer than it ...
5 {'and': 2L, 'quilt': 1L,
'it': 1L, 'comfortable': ...
1
Stop Pacifier Sucking
without tears with ...
This is a product well
worth the purchase. I ...
5 {'ingenious': 1L, 'and':
3L, 'love': 2L, ...
1
Stop Pacifier Sucking
without tears with ...
All of my kids have cried
non-stop when I tried to ...
5 {'and': 2L, 'parents!!':
1L, 'all': 2L, 'puppe ...
1
Stop Pacifier Sucking
without tears with ...
When the Binky Fairy came
to our house, we didn't ...
5 {'and': 2L, 'cute': 1L,
'help': 2L, 'doll': 1L, ...
1
A Tale of Baby's Days
with Peter Rabbit ...
Lovely book, it's bound
tightly so you may no ...
4 {'shop': 1L, 'be': 1L,
'is': 1L, 'it': 1L, ' ...
1
Baby Tracker® - Daily
Childcare Journal, ...
Perfect for new parents.
We were able to keep ...
5 {'feeding,': 1L, 'and':
2L, 'all': 1L, 'right': ...
1
Baby Tracker® - Daily
Childcare Journal, ...
A friend of mine pinned
this product on Pinte ...
5 {'and': 1L, 'help': 1L,
'give': 1L, 'is': 1L, ...
1
Baby Tracker® - Daily
Childcare Journal, ...
This has been an easy way
for my nanny to record ...
4 {'journal.': 1L, 'all':
1L, 'standarad': 1L, ...
1
Baby Tracker® - Daily
Childcare Journal, ...
I love this journal and
our nanny uses it ...
4 {'all': 1L, 'forget': 1L,
'just': 1L, "daughter ...
1
[10 rows x 5 columns]

Train the sentiment classifier


In [39]:
# Random split with fraction 0.8: the first part is used for training, the
# remainder for testing. The seed is fixed (seed=0).
# NOTE(review): the fixed seed is presumably there to make the split
# reproducible across runs; confirm against the SFrame.random_split docs.
train_data, test_data = products.random_split(.8,seed=0)

In [40]:
# Fit a logistic-regression classifier that predicts the 'sentiment' label
# from the per-review word counts. test_data is supplied as the validation
# set, so the "Validation-accuracy" column of the training log is measured
# on the test split.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
)


WARNING: The number of feature dimensions in this problem is very large in comparison with the number of examples. Unless an appropriate regularization value is set, this model may not provide accurate predictions for a validation/test set.
Logistic regression:
--------------------------------------------------------
Number of examples          : 133448
Number of classes           : 2
Number of feature columns   : 1
Number of unpacked features : 219217
Number of coefficients    : 219218
Starting L-BFGS
--------------------------------------------------------
+-----------+----------+-----------+--------------+-------------------+---------------------+
| Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
+-----------+----------+-----------+--------------+-------------------+---------------------+
| 1         | 5        | 0.000002  | 5.320136     | 0.841481          | 0.839989            |
| 2         | 9        | 3.000000  | 8.945772     | 0.947425          | 0.894877            |
| 3         | 10       | 3.000000  | 10.277811    | 0.923768          | 0.866232            |
| 4         | 11       | 3.000000  | 11.553200    | 0.971779          | 0.912743            |
| 5         | 12       | 3.000000  | 12.806531    | 0.975511          | 0.908900            |
| 6         | 13       | 3.000000  | 14.243852    | 0.899991          | 0.825967            |
| 7         | 15       | 1.000000  | 16.176989    | 0.984548          | 0.921451            |
| 8         | 16       | 1.000000  | 17.665946    | 0.985118          | 0.921871            |
| 9         | 17       | 1.000000  | 19.077196    | 0.987066          | 0.919709            |
| 10        | 18       | 1.000000  | 20.333534    | 0.988715          | 0.916256            |
+-----------+----------+-----------+--------------+-------------------+---------------------+
TERMINATED: Iteration limit reached.
This model may not be optimal. To improve it, consider increasing `max_iterations`.

Evaluate the sentiment model


In [41]:
# Evaluate the model on the test split, requesting the ROC curve; the result
# is a dict whose 'roc_curve' entry lists threshold, fpr, tpr, p and n.
sentiment_model.evaluate(test_data, metric='roc_curve')


Out[41]:
{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+-------+------+
 | threshold |      fpr       |      tpr       |   p   |  n   |
 +-----------+----------------+----------------+-------+------+
 |    0.0    |      1.0       |      1.0       | 27976 | 5328 |
 |   1e-05   | 0.909346846847 | 0.998856162425 | 27976 | 5328 |
 |   2e-05   | 0.896021021021 | 0.998748927652 | 27976 | 5328 |
 |   3e-05   | 0.886448948949 | 0.998462968259 | 27976 | 5328 |
 |   4e-05   | 0.879692192192 | 0.998284243637 | 27976 | 5328 |
 |   5e-05   | 0.875187687688 | 0.998212753789 | 27976 | 5328 |
 |   6e-05   | 0.872184684685 | 0.998177008865 | 27976 | 5328 |
 |   7e-05   | 0.868618618619 | 0.998034029168 | 27976 | 5328 |
 |   8e-05   | 0.864677177177 | 0.997998284244 | 27976 | 5328 |
 |   9e-05   | 0.860735735736 | 0.997962539319 | 27976 | 5328 |
 +-----------+----------------+----------------+-------+------+
 [100001 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}

In [43]:
# Open the model's 'Evaluation' view.
sentiment_model.show(view='Evaluation')


Applying the model to understand the sentiment for the Giraffe


In [44]:
# Score every giraffe review with the trained model and keep the result in a
# new 'predicted_sentiment' column. output_type='probability' requests a
# probability rather than a class label.
# NOTE(review): in the preview below the values are near 1 for 5-star
# reviews and near 0 for the 1-star review, so this is presumably the
# probability of positive sentiment; confirm against the predict() docs.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(
    giraffe_reviews,
    output_type='probability',
)

In [45]:
# Preview the giraffe reviews together with their predicted sentiment.
giraffe_reviews.head()


Out[45]:
name review rating word_count predicted_sentiment
Vulli Sophie the Giraffe
Teether ...
He likes chewing on all
the parts especially the ...
5 {'and': 1L, 'all': 1L,
'because': 1L, 'it': 1L, ...
0.999513023521
Vulli Sophie the Giraffe
Teether ...
My son loves this toy and
fits great in the diaper ...
5 {'and': 1L, 'right': 1L,
'help': 1L, 'just': 1L, ...
0.999320678306
Vulli Sophie the Giraffe
Teether ...
There really should be a
large warning on the ...
1 {'and': 2L, 'all': 1L,
'latex.': 1L, 'being': ...
0.013558811687
Vulli Sophie the Giraffe
Teether ...
All the moms in my moms'
group got Sophie for ...
5 {'and': 2L, 'one!': 1L,
'all': 1L, 'love': 1L, ...
0.995769474148
Vulli Sophie the Giraffe
Teether ...
I was a little skeptical
on whether Sophie was ...
5 {'and': 3L, 'all': 1L,
'old': 1L, 'her.': 1L, ...
0.662374415673
Vulli Sophie the Giraffe
Teether ...
I have been reading about
Sophie and was going ...
5 {'and': 6L, 'seven': 1L,
'already': 1L, 'love': ...
0.999997148186
Vulli Sophie the Giraffe
Teether ...
My neice loves her sophie
and has spent hours ...
5 {'and': 4L, 'drooling,':
1L, 'love': 1L, 'her.': ...
0.989190989536
Vulli Sophie the Giraffe
Teether ...
What a friendly face!
And those mesmerizing ...
5 {'and': 3L, 'chew': 1L,
"don't": 1L, 'is': 1L, ...
0.999563518413
Vulli Sophie the Giraffe
Teether ...
We got this just for my
son to chew on instea ...
5 {'chew': 2L, 'because':
1L, 'just': 2L, 'what': ...
0.970160542725
Vulli Sophie the Giraffe
Teether ...
My baby seems to like
this toy, but I could ...
3 {'and': 2L, 'already':
1L, 'in': 1L, 'some': ...
0.195367644588
[10 rows x 5 columns]


In [ ]: