In [1]:
import numpy as np
import pandas as pd
import json
import string
import math
from math import sqrt
In [2]:
products = pd.read_csv('amazon_baby_subset.csv')
In [3]:
products.head()
Out[3]:
In [4]:
important_words = json.load(open('important_words.json'))
In [5]:
products['review'] = products['review'].fillna('')  # fill missing reviews with empty strings; assigning '' to whole rows would blank out every column, including 'sentiment'
In [6]:
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))  # map every punctuation codepoint to None
def remove_punctuation(text):
    return text.translate(remove_punct_map)
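A quick check that the translation map strips punctuation as intended (a made-up example string, not from the dataset):
In [ ]:
remove_punctuation('Great product, we love it!')  # -> 'Great product we love it'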
In [7]:
products['review_clean'] = products['review'].apply(remove_punctuation)
In [8]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda text: text.split().count(word))
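Note that list.count matches whole tokens only, so a word like 'perfectly' does not inflate the count for 'perfect':
In [ ]:
'perfectly perfect'.split().count('perfect')  # -> 1: substrings are not counted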
In [9]:
products['contains_perfect'] = products['perfect'].apply(lambda perfect: 1 if perfect>0 else 0)
In [10]:
len(products[products['contains_perfect']==1])
Out[10]:
In [11]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1  # add an intercept column
    features = ['constant'] + features
    features_frame = dataframe[features]
    features_matrix = features_frame.to_numpy()  # .as_matrix() was removed in pandas 1.0
    label_sarray = dataframe[label]
    label_array = label_sarray.to_numpy().reshape((len(label_sarray), 1))
    return (features_matrix, label_array)
In [12]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
print(feature_matrix.shape)
print(sentiment.shape)
In [13]:
feature_matrix.shape
Out[13]:
In [14]:
def predict_probability(feature_matrix, coefficients):
    score = feature_matrix.dot(coefficients)
    # vectorized sigmoid: np.exp broadcasts over the whole score column,
    # unlike math.exp, which only accepts scalars
    predictions = 1 / (1 + np.exp(-score))
    return predictions
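As a sanity check on the sigmoid, all-zero coefficients give a score of 0 for every review, so every predicted probability should be exactly 0.5:
In [ ]:
p = predict_probability(feature_matrix, np.zeros((feature_matrix.shape[1], 1)))
print(p.min(), p.max())  # both should print 0.5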
In [15]:
w = np.ones((feature_matrix.shape[1], 1))  # one coefficient per feature column (the important words plus the intercept)
In [16]:
predict_probability(feature_matrix, w)
Out[16]:
In [29]:
def feature_derivative(errors, feature):
    # partial derivative of the log likelihood with respect to one coefficient:
    # the sum over data points of error_i * feature_ij
    derivative = errors.transpose().dot(feature)
    return derivative
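A toy example with made-up numbers: errors of shape (N, 1) dotted with a feature column of shape (N,) sums error_i * feature_i over the data points:
In [ ]:
toy_errors = np.array([[0.5], [-0.25], [0.1]])
toy_feature = np.array([1., 2., 0.])
feature_derivative(toy_errors, toy_feature)  # -> array([0.]): 0.5*1 - 0.25*2 + 0.1*0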
In [30]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment == +1)
    scores = feature_matrix.dot(coefficients)
    # ll(w) = sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
    lp = np.sum((indicator - 1) * scores - np.log(1 + np.exp(-scores)))
    return lp
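With all-zero coefficients every score is 0 and the formula reduces to -N*log(2), which gives a quick consistency check:
In [ ]:
ll0 = compute_log_likelihood(feature_matrix, sentiment, np.zeros((feature_matrix.shape[1], 1)))
print(ll0, -len(sentiment) * np.log(2))  # the two values should agree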
In [61]:
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    for itr in range(max_iter):
        predictions = predict_probability(feature_matrix, coefficients)
        indicator = (sentiment == +1)
        errors = indicator - predictions
        for j in range(len(coefficients)):
            # gradient ascent step on coefficient j
            # (feature_matrix[:, j] is already 1-D, so no transpose is needed)
            derivative = feature_derivative(errors, feature_matrix[:, j])
            coefficients[j] += step_size * derivative
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            print('iteration %*d: log likelihood of observed labels = %.8f'
                  % (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients
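Before running on the full data, the routine can be exercised on a tiny hand-made dataset (two separable points; all numbers below are made up):
In [ ]:
toy_X = np.array([[1., 2.], [1., -1.]])  # intercept column plus one feature
toy_y = np.array([[1], [-1]])
logistic_regression(toy_X, toy_y, np.zeros((2, 1)), step_size=0.1, max_iter=10)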
In [62]:
initial_coefficients = np.zeros((feature_matrix.shape[1], 1))
step_size = 1e-7
max_iter = 301
In [63]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)
In [64]:
scores = feature_matrix.dot(coefficients)
In [69]:
predictions = np.where(scores.flatten() >= 0, +1, -1)  # classify by the sign of the score
In [70]:
predictions
Out[70]:
In [71]:
len([x for x in predictions if x == 1])
Out[71]:
In [72]:
len([x for x in predictions if x == -1])
Out[72]:
In [74]:
products['predictions'] = predictions
In [76]:
accuracy = (products['sentiment'] == products['predictions']).mean()
In [77]:
print('Accuracy: %s' % accuracy)
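For context, the accuracy can be compared against the majority-class baseline (a hypothetical extra check, not part of the original output):
In [ ]:
majority = products['sentiment'].value_counts().idxmax()
print('Majority-class baseline: %s' % (products['sentiment'] == majority).mean())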
In [78]:
coefficients = list(coefficients[1:].flatten())  # exclude the intercept; flatten so each coefficient is a scalar, not a 1-element array
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x: x[1], reverse=True)
In [79]:
word_coefficient_tuples[:10]
Out[79]:
In [80]:
sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=False)[:10]
Out[80]:
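Since word_coefficient_tuples is already sorted in descending order, the ten most negative words can equivalently be read off the tail of the list:
In [ ]:
word_coefficient_tuples[-10:][::-1]  # same ten words, most negative first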