In [1]:
import numpy as np
import pandas as pd
import json
import string
import math
from math import sqrt

In [2]:
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products.head()


Out[3]:
name review rating sentiment
0 Stop Pacifier Sucking without tears with Thumb... All of my kids have cried non-stop when I trie... 5 1
1 Nature's Lullabies Second Year Sticker Calendar We wanted to get something to keep track of ou... 5 1
2 Nature's Lullabies Second Year Sticker Calendar My daughter had her 1st baby over a year ago. ... 5 1
3 Lamaze Peekaboo, I Love You One of baby's first and favorite books, and it... 4 1
4 SoftPlay Peek-A-Boo Where's Elmo A Children's ... Very cute interactive book! My son loves this ... 5 1

In [4]:
important_words = json.load(open('important_words.json'))

In [5]:
products['review'] = products['review'].fillna('')  # fill missing reviews with empty strings

In [6]:
# translation table mapping every punctuation character to None (i.e. delete it)
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))
def remove_punctuation(text):
    return text.translate(remove_punct_map)

In [7]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [8]:
# add one column per important word, holding its occurrence count in each cleaned review
for word in important_words:
    products[word] = products['review_clean'].apply(lambda text: text.split().count(word))

In [9]:
products['contains_perfect'] = products['perfect'].apply(lambda perfect: 1 if perfect>0 else 0)

Quiz Question: How many reviews contain the word perfect?


In [10]:
len(products[products['contains_perfect']==1])


Out[10]:
2955
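
As a quick cross-check, contains_perfect is a 0/1 indicator, so summing the column should give the same count (a sketch, not an executed cell):

products['contains_perfect'].sum()   # should also evaluate to 2955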

In [11]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1  # add an intercept column
    features = ['constant'] + features
    features_frame = dataframe[features]
    features_matrix = features_frame.to_numpy()
    label_series = dataframe[label]
    label_array = label_series.to_numpy().reshape((len(label_series), 1))
    return (features_matrix, label_array)

In [12]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
print(feature_matrix.shape)
print(sentiment.shape)


(53072, 194)
(53072, 1)

Quiz Question: How many features are there in the feature_matrix?


In [13]:
feature_matrix.shape


Out[13]:
(53072, 194)
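
The 194 columns are the constant (intercept) column added by get_numpy_data plus one count column per important word. As a sanity check (assuming important_words holds the 193 words implied by the shape above):

len(important_words) + 1   # 193 word-count features + 1 intercept column = 194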

In [14]:
def predict_probability(feature_matrix, coefficients):
    score = feature_matrix.dot(coefficients)
    # vectorized sigmoid (link function), applied element-wise to the scores
    predictions = 1 / (1 + np.exp(-score))
    return predictions.reshape((-1, 1))  # always return an (N, 1) column
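
For reference (notation mine), the cell above implements the logistic link function, mapping each score $\mathbf{w}^\top h(\mathbf{x}_i)$ to a probability:

$$P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w}) = \frac{1}{1 + \exp(-\mathbf{w}^\top h(\mathbf{x}_i))}$$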

In [15]:
w = np.ones((194,1))

In [16]:
predict_probability(feature_matrix, w)


Out[16]:
array([[ 0.99999774],
       [ 0.99966465],
       [ 0.99999386],
       ..., 
       [ 1.        ],
       [ 1.        ],
       [ 0.99999917]])

Compute derivative of log likelihood with respect to a single coefficient
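
For reference (notation mine), the next two cells implement the per-coefficient derivative of the log likelihood and the (simplified) log likelihood itself, where $h_j(\mathbf{x}_i)$ is the value of feature $j$ for review $i$:

$$\frac{\partial \ell}{\partial w_j} = \sum_{i=1}^{N} h_j(\mathbf{x}_i)\,\big(\mathbf{1}[y_i = +1] - P(y_i = +1 \mid \mathbf{x}_i, \mathbf{w})\big)$$

$$\ell\ell(\mathbf{w}) = \sum_{i=1}^{N} \Big(\big(\mathbf{1}[y_i = +1] - 1\big)\,\mathbf{w}^\top h(\mathbf{x}_i) - \ln\big(1 + \exp(-\mathbf{w}^\top h(\mathbf{x}_i))\big)\Big)$$

Gradient ascent then updates each coefficient as $w_j \leftarrow w_j + \eta\,\partial\ell/\partial w_j$ with step size $\eta$, which is what the training loop below does.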


In [29]:
def feature_derivative(errors, feature):
    # sum over data points of (indicator - prediction) * feature value
    derivative = errors.transpose().dot(feature)
    return derivative

In [30]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment == +1)
    scores = feature_matrix.dot(coefficients)
    # simplified log likelihood: sum of (indicator - 1)*score - ln(1 + exp(-score))
    lp = np.sum((indicator - 1)*scores - np.log(1 + np.exp(-scores)))
    return lp

In [61]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in range(max_iter):
        # P(y = +1 | x, w) for every data point under the current coefficients
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = sentiment == +1
        errors = indicator - predictions
        for j in range(len(coefficients)):
            # gradient ascent step on coefficient j
            derivative = feature_derivative(errors, feature_matrix[:, j].transpose())
            coefficients[j] += step_size*derivative
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [62]:
initial_coefficients = np.zeros((feature_matrix.shape[1], 1))
step_size = 1e-7
max_iter = 301

In [63]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)


iteration   0: log likelihood of observed labels = -36780.91693404
iteration   1: log likelihood of observed labels = -36775.13294402
iteration   2: log likelihood of observed labels = -36769.35517716
iteration   3: log likelihood of observed labels = -36763.58361418
iteration   4: log likelihood of observed labels = -36757.81823598
iteration   5: log likelihood of observed labels = -36752.05902356
iteration   6: log likelihood of observed labels = -36746.30595812
iteration   7: log likelihood of observed labels = -36740.55902099
iteration   8: log likelihood of observed labels = -36734.81819363
iteration   9: log likelihood of observed labels = -36729.08345768
iteration  10: log likelihood of observed labels = -36723.35479491
iteration  11: log likelihood of observed labels = -36717.63218723
iteration  12: log likelihood of observed labels = -36711.91561669
iteration  13: log likelihood of observed labels = -36706.20506549
iteration  14: log likelihood of observed labels = -36700.50051597
iteration  15: log likelihood of observed labels = -36694.80195059
iteration  20: log likelihood of observed labels = -36666.39828192
iteration  30: log likelihood of observed labels = -36610.03005331
iteration  40: log likelihood of observed labels = -36554.23404643
iteration  50: log likelihood of observed labels = -36498.99534891
iteration  60: log likelihood of observed labels = -36444.30008336
iteration  70: log likelihood of observed labels = -36390.13530419
iteration  80: log likelihood of observed labels = -36336.48890532
iteration  90: log likelihood of observed labels = -36283.34953775
iteration 100: log likelihood of observed labels = -36230.70653581
iteration 200: log likelihood of observed labels = -35729.60561673
iteration 300: log likelihood of observed labels = -35269.64174913

In [64]:
scores = feature_matrix.dot(coefficients)

In [69]:
predictions = np.where(scores >= 0, 1, -1).flatten()  # class +1 if score >= 0, else -1

In [70]:
predictions


Out[70]:
array([ 1, -1,  1, ..., -1,  1, -1])

In [71]:
len([x for x in predictions if x == 1])


Out[71]:
24312

In [72]:
len([x for x in predictions if x == -1])


Out[72]:
28760

In [74]:
products['predictions'] = predictions

Quiz Question: What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)


In [76]:
accuracy = len(products[products['sentiment']==products['predictions']])/len(products)

In [77]:
print('Accuracy: %s' % accuracy)


Accuracy: 0.7489448296653602
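
Rounded to two decimal places, as the quiz asks (a quick sketch), this should come out to 0.75:

round(accuracy, 2)   # 0.75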

Which words contribute most to positive & negative sentiments


In [78]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

In [79]:
word_coefficient_tuples[:10]


Out[79]:
[('great', array([ 0.06666349])),
 ('love', array([ 0.06596698])),
 ('easy', array([ 0.06486981])),
 ('little', array([ 0.0455281])),
 ('loves', array([ 0.04502302])),
 ('well', array([ 0.03021992])),
 ('perfect', array([ 0.02977405])),
 ('old', array([ 0.02017383])),
 ('nice', array([ 0.01844861])),
 ('daughter', array([ 0.01776812]))]

In [80]:
sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=False)[:10]


Out[80]:
[('would', array([-0.05369864])),
 ('product', array([-0.0413929])),
 ('money', array([-0.03893462])),
 ('work', array([-0.03300892])),
 ('even', array([-0.02997942])),
 ('disappointed', array([-0.02895293])),
 ('get', array([-0.02859737])),
 ('back', array([-0.02767238])),
 ('return', array([-0.02657062])),
 ('monitor', array([-0.02444431]))]

In [ ]: