In [1]:
import numpy as np
import pandas as pd
import json
import string
import math
from math import sqrt

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products.head()


Out[3]:
name review rating sentiment
0 Stop Pacifier Sucking without tears with Thumb... All of my kids have cried non-stop when I trie... 5 1
1 Nature's Lullabies Second Year Sticker Calendar We wanted to get something to keep track of ou... 5 1
2 Nature's Lullabies Second Year Sticker Calendar My daughter had her 1st baby over a year ago. ... 5 1
3 Lamaze Peekaboo, I Love You One of baby's first and favorite books, and it... 4 1
4 SoftPlay Peek-A-Boo Where's Elmo A Children's ... Very cute interactive book! My son loves this ... 5 1

In [4]:
important_words = json.load(open('important_words.json'))

In [5]:
products['review'] = products['review'].fillna('')  # fill missing reviews with empty strings

In [6]:
# translation table mapping every punctuation character to None (i.e. delete it)
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
def remove_punctuation(text):
    return text.translate(remove_punct_map)

In [7]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [8]:
# add one column per important word, holding its occurrence count in each cleaned review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda text: text.split().count(word))

In [9]:
products['contains_perfect'] = products['perfect'].apply(lambda perfect: 1 if perfect>0 else 0)

Quiz Question: How many reviews contain the word perfect?


In [10]:
len(products[products['contains_perfect']==1])


Out[10]:
2955
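
As a quick cross-check, contains_perfect is a 0/1 indicator, so summing the column should give the same count (a sketch, not an executed cell):

products['contains_perfect'].sum()   # should also evaluate to 2955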

In [11]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1  # add an intercept column
    features = ['constant'] + features
    features_frame = dataframe[features]
    features_matrix = features_frame.to_numpy()
    label_series = dataframe[label]
    label_array = label_series.to_numpy().reshape((len(label_series), 1))
    return (features_matrix, label_array)

In [12]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
print(feature_matrix.shape)
print(sentiment.shape)


(53072, 194)
(53072, 1)

Quiz Question: How many features are there in the feature_matrix?


In [13]:
feature_matrix.shape


Out[13]:
(53072, 194)
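
The 194 columns are the constant (intercept) column added by get_numpy_data plus one count column per important word. As a sanity check (assuming important_words holds the 193 words implied by the shape above):

len(important_words) + 1   # 193 word-count features + 1 intercept column = 194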

In [14]:
def predict_probability(feature_matrix, coefficients):
    score = feature_matrix.dot(coefficients)
    # vectorized sigmoid (link function), applied element-wise to the scores
    predictions = 1 / (1 + np.exp(-score))
    return predictions.reshape((-1, 1))  # always return an (N, 1) column
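
For reference (notation mine), the cell above implements the logistic link function, mapping each score $\mathbf{w}^\top h(\mathbf{x}_i)$ to a probability:

$$P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w}) = \frac{1}{1 + \exp(-\mathbf{w}^\top h(\mathbf{x}_i))}$$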

In [15]:
w = np.ones((194,1))

In [16]:
predict_probability(feature_matrix, w)


Out[16]:
array([[ 0.99999774],
       [ 0.99966465],
       [ 0.99999386],
       ..., 
       [ 1.        ],
       [ 1.        ],
       [ 0.99999917]])

Compute derivative of log likelihood with respect to a single coefficient
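
For reference (notation mine), the next two cells implement the per-coefficient derivative of the log likelihood and the (simplified) log likelihood itself, where $h_j(\mathbf{x}_i)$ is the value of feature $j$ for review $i$:

$$\frac{\partial \ell}{\partial w_j} = \sum_{i=1}^{N} h_j(\mathbf{x}_i)\,\big(\mathbf{1}[y_i = +1] - P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w})\big)$$

$$\ell\ell(\mathbf{w}) = \sum_{i=1}^{N} \Big(\big(\mathbf{1}[y_i = +1] - 1\big)\,\mathbf{w}^\top h(\mathbf{x}_i) - \ln\big(1 + \exp(-\mathbf{w}^\top h(\mathbf{x}_i))\big)\Big)$$

Gradient ascent then updates each coefficient as $w_j \leftarrow w_j + \eta\,\partial\ell/\partial w_j$ with step size $\eta$, which is what the training loop below does.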


In [29]:
def feature_derivative(errors, feature):
    # sum over data points of (indicator - prediction) * feature value
    derivative = errors.transpose().dot(feature)
    return derivative

In [30]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment == +1)
    scores = feature_matrix.dot(coefficients)
    # simplified log likelihood: sum of (indicator - 1)*score - ln(1 + exp(-score))
    lp = np.sum((indicator - 1)*scores - np.log(1 + np.exp(-scores)))
    return lp

In [61]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in range(max_iter):
        # P(y = +1 | x, w) for every data point under the current coefficients
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = sentiment == +1
        errors = indicator - predictions
        for j in range(len(coefficients)):
            # gradient ascent step on coefficient j
            derivative = feature_derivative(errors, feature_matrix[:, j].transpose())
            coefficients[j] += step_size*derivative
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [62]:
initial_coefficients = np.zeros((feature_matrix.shape[1], 1))
step_size = 1e-7
max_iter = 301

In [63]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)


iteration   0: log likelihood of observed labels = -36780.91693404
iteration   1: log likelihood of observed labels = -36775.13294402
iteration   2: log likelihood of observed labels = -36769.35517716
iteration   3: log likelihood of observed labels = -36763.58361418
iteration   4: log likelihood of observed labels = -36757.81823598
iteration   5: log likelihood of observed labels = -36752.05902356
iteration   6: log likelihood of observed labels = -36746.30595812
iteration   7: log likelihood of observed labels = -36740.55902099
iteration   8: log likelihood of observed labels = -36734.81819363
iteration   9: log likelihood of observed labels = -36729.08345768
iteration  10: log likelihood of observed labels = -36723.35479491
iteration  11: log likelihood of observed labels = -36717.63218723
iteration  12: log likelihood of observed labels = -36711.91561669
iteration  13: log likelihood of observed labels = -36706.20506549
iteration  14: log likelihood of observed labels = -36700.50051597
iteration  15: log likelihood of observed labels = -36694.80195059
iteration  20: log likelihood of observed labels = -36666.39828192
iteration  30: log likelihood of observed labels = -36610.03005331
iteration  40: log likelihood of observed labels = -36554.23404643
iteration  50: log likelihood of observed labels = -36498.99534891
iteration  60: log likelihood of observed labels = -36444.30008336
iteration  70: log likelihood of observed labels = -36390.13530419
iteration  80: log likelihood of observed labels = -36336.48890532
iteration  90: log likelihood of observed labels = -36283.34953775
iteration 100: log likelihood of observed labels = -36230.70653581
iteration 200: log likelihood of observed labels = -35729.60561673
iteration 300: log likelihood of observed labels = -35269.64174913

In [64]:
scores = feature_matrix.dot(coefficients)

In [69]:
predictions = np.where(scores >= 0, 1, -1).flatten()  # class +1 if score >= 0, else -1

In [70]:
predictions


Out[70]:
array([ 1, -1,  1, ..., -1,  1, -1])

In [71]:
len([x for x in predictions if x == 1])


Out[71]:
24312

In [72]:
len([x for x in predictions if x == -1])


Out[72]:
28760

In [74]:
products['predictions'] = predictions

Quiz Question: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)


In [76]:
accuracy = len(products[products['sentiment']==products['predictions']])/len(products)

In [77]:
print('Accuracy: %s' % accuracy)


Accuracy: 0.7489448296653602
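
Rounded to two decimal places, as the quiz asks (a quick sketch), this should come out to 0.75:

round(accuracy, 2)   # 0.75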

Which words contribute most to positive & negative sentiments


In [78]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

In [79]:
word_coefficient_tuples[:10]


Out[79]:
[('great', array([ 0.06666349])),
 ('love', array([ 0.06596698])),
 ('easy', array([ 0.06486981])),
 ('little', array([ 0.0455281])),
 ('loves', array([ 0.04502302])),
 ('well', array([ 0.03021992])),
 ('perfect', array([ 0.02977405])),
 ('old', array([ 0.02017383])),
 ('nice', array([ 0.01844861])),
 ('daughter', array([ 0.01776812]))]

In [80]:
sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=False)[:10]


Out[80]:
[('would', array([-0.05369864])),
 ('product', array([-0.0413929])),
 ('money', array([-0.03893462])),
 ('work', array([-0.03300892])),
 ('even', array([-0.02997942])),
 ('disappointed', array([-0.02895293])),
 ('get', array([-0.02859737])),
 ('back', array([-0.02767238])),
 ('return', array([-0.02657062])),
 ('monitor', array([-0.02444431]))]

In [ ]: