In [3]:
    
import pandas as pd
import numpy as np
    
In [4]:
    
# Load the review data from a CSV file located in the current working directory.
products = pd.read_csv('amazon_baby_subset.csv')
    
In [3]:
    
# Preview the first ten product names.
products['name'].head(10)
    
    Out[3]:
In [5]:
    
# Class balance: number of +1 labels, number of -1 labels, and number of
# non-null sentiment values overall.
# The whole expression is wrapped in print(...) so the aggregate (not the
# return value of print) is what gets evaluated and shown on Python 2 and 3.
print((products['sentiment'] == 1).sum())
print((products['sentiment'] == -1).sum())
print(products['sentiment'].count())
    
    
In [5]:
    
import json

# Read the list of words that will be used as features.
with open('important_words.json') as important_words_file:
    important_words = json.load(important_words_file)

# Show the first three entries as a quick sanity check.
print(important_words[:3])
    
    
In [6]:
    
# Replace missing values in the 'review' column with empty strings.
products = products.fillna(value={'review': ''})
def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    Characters are filtered one by one instead of using the Python 2 only
    `str.translate(None, deletechars)` form, which raises TypeError for
    Python 2 `unicode` objects and for Python 3 `str`.

    Parameters:
        text: the string to clean.

    Returns:
        A string of the same type as `text` without any character listed in
        `string.punctuation`.
    """
    import string
    punctuation = frozenset(string.punctuation)
    return ''.join(ch for ch in text if ch not in punctuation)
# Keep a punctuation-free version of every review in a new 'review_clean' column.
products['review_clean'] = products['review'].map(remove_punctuation)
products.head(3)
    
    Out[6]:
In [7]:
    
# Split every cleaned review into tokens once, up front, instead of re-splitting
# the same text for each important word.
review_tokens = products['review_clean'].apply(lambda review: review.split())

# One new column per important word, holding how often it occurs in each review.
for word in important_words:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))
    
In [8]:
    
# Show the first row, now including the per-word count columns.
products.iloc[:1]
    
    Out[8]:
In [9]:
    
# Flag the reviews in which the word 'perfect' occurs at least once, then
# report how many reviews carry the flag.
products['contains_perfect'] = products['perfect'] >= 1
print(products['contains_perfect'].sum())
    
    
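The function `get_numpy_data` should accept three parameters: `dataframe` (the data table), `features` (a list of feature column names), and `label` (the name of the label column).
The function should return two values: a 2D NumPy array of features, whose first column is a constant 1, and a 1D NumPy array of labels.
The function should do the following: add a `constant` column equal to 1 to the dataframe, prepend `'constant'` to the list of features, extract those columns as a NumPy matrix, and extract the label column as a NumPy array.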
In [10]:
    
def get_numpy_data(dataframe, features, label):
    """Convert selected dataframe columns into NumPy arrays for model fitting.

    Parameters:
        dataframe: pandas DataFrame holding the feature and label columns.
        features: sequence of feature column names.
        label: name of the label column.

    Returns:
        (feature_matrix, label_array): `feature_matrix` is a 2D array whose
        first column is the constant 1 (intercept term) followed by the
        requested features in order; `label_array` is the label column as a
        1D array.

    Side effect:
        A 'constant' column set to 1 is added to (or overwritten on) the
        caller's `dataframe`.
    """
    dataframe['constant'] = 1
    # list(...) lets callers pass any sequence of names, not only a list.
    feature_columns = ['constant'] + list(features)
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    feature_matrix = dataframe[feature_columns].values
    label_array = dataframe[label].values
    return (feature_matrix, label_array)
    
In [11]:
    
# Build the feature matrix (constant column plus one column per important word)
# and the matching array of sentiment labels.
feature_matrix, sentiment = get_numpy_data(
    dataframe=products,
    features=important_words,
    label='sentiment',
)
    
In [12]:
    
# (number of rows, number of feature columns including the constant column)
print(feature_matrix.shape)
    
    
In [19]:
    
def predict_probability(feature_matrix, coefficients):
    """Produce the probabilistic estimate P(y_i = +1 | x_i, w) for every row.

    Shapes as used in this notebook:
        feature_matrix: N * D
        coefficients:   D * 1
        returned value: N * 1

    Each estimate is the logistic link 1 / (1 + exp(-score)) applied to the
    row's linear score, so it ranges between 0 and 1.
    """
    # Linear score of every row: feature_matrix . coefficients  (N * 1)
    linear_scores = np.dot(feature_matrix, coefficients)

    # Map the scores to probabilities with the link function.
    return 1.0 / (1 + np.exp(-linear_scores))
    
In [14]:
    
def feature_derivative(errors, feature):
    """Return the dot product of the errors with a single feature column.

    Shapes as used in this notebook:
        errors:  N * 1
        feature: N
        result:  1
    """
    errors_as_row = np.transpose(errors)
    return np.dot(errors_as_row, feature)
    
In [15]:
    
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Return the log likelihood of the observed labels under the model.

    Computes  sum_i [ (1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)) ]
    where score_i is the dot product of row i of `feature_matrix` with
    `coefficients`.

    Parameters:
        feature_matrix: N * D array of features.
        sentiment: N labels; entries equal to +1 count as positive.
        coefficients: D coefficients, given either as shape (D,) or (D, 1).

    Returns:
        The log likelihood as a NumPy floating-point scalar.
    """
    # Flatten both operands so they line up element by element. Mixing an
    # (N, 1) column with an (N,) vector would silently broadcast to N * N.
    indicator = np.ravel(np.asarray(sentiment) == +1)
    scores = np.ravel(np.dot(feature_matrix, coefficients))

    # log(1 + exp(-s)) evaluated as logaddexp(0, -s): exp(-s) alone overflows
    # to inf for very negative scores and would turn the result into -inf.
    log_terms = np.logaddexp(0, -scores)
    return np.sum((indicator - 1) * scores - log_terms)
    
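The function `logistic_regression` accepts the following parameters: `feature_matrix` (2D array of features), `sentiment` (1D array of class labels), `initial_coefficients` (the starting coefficient values), `step_size` (the multiplier applied to each derivative), and `max_iter` (the number of iterations to run).
The function carries out the following steps: copy `initial_coefficients` into a NumPy array; on every iteration, predict P(y_i = +1 | x_i, w), compute the errors as indicator minus predictions, and add `step_size` times the derivative to each coefficient; periodically print the log likelihood; finally return the coefficients.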
In [35]:
    
# coefficients: D * 1
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent.

    Parameters:
        feature_matrix: N * D NumPy array of features.
        sentiment: N labels; entries equal to +1 count as positive.
        initial_coefficients: starting coefficients, shape (D, 1) or (D,).
            The input is copied and never modified.
        step_size: multiplier applied to each derivative before it is added
            to the corresponding coefficient.
        max_iter: number of passes over the coefficients.

    Returns:
        NumPy float array of fitted coefficients with the same shape as
        `initial_coefficients`.

    Side effect:
        Prints the log likelihood on a thinning schedule (every iteration up
        to 15, then every 10th, 100th, 1000th and 10000th).
    """
    # Copy as float: an integer array would truncate (or reject) the
    # fractional in-place updates below.
    coefficients = np.array(initial_coefficients, dtype=float)

    # Indicator of (y_i = +1); it does not change between iterations.
    indicator = np.asarray(sentiment) == +1

    for itr in range(max_iter):
        # Predict P(y_i = +1 | x_i, w) with the current coefficients.
        predictions = predict_probability(feature_matrix, coefficients)

        # errors = indicator - predictions. The indicator is given the same
        # shape as the predictions so the subtraction is element-wise rather
        # than an accidental N * N broadcast.
        errors = indicator.reshape(predictions.shape) - predictions

        for j in range(len(coefficients)):
            # feature_matrix[:, j] is the feature column associated with coefficients[j].
            derivative = feature_derivative(errors, feature_matrix[:, j])
            coefficients[j] += step_size * derivative

        # Check that the log likelihood is increasing, reporting less often
        # as the iteration count grows.
        should_report = (
            itr <= 15
            or (itr <= 100 and itr % 10 == 0)
            or (itr <= 1000 and itr % 100 == 0)
            or (itr <= 10000 and itr % 1000 == 0)
            or itr % 10000 == 0
        )
        if should_report:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # Pad the iteration number to the digit count of max_iter.
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))

    return coefficients
    
In [17]:
    
# One zero-initialised coefficient per feature-matrix column, stored as a
# column vector. Taking the size from the matrix keeps the two in agreement
# if the number of features changes.
initial_coefficients = np.zeros((feature_matrix.shape[1], 1))
step_size = 1e-7
max_iter = 301
    
In [20]:
    
# Fit the model with the hyper-parameters chosen above.
coefficients = logistic_regression(
    feature_matrix=feature_matrix,
    sentiment=sentiment,
    initial_coefficients=initial_coefficients,
    step_size=step_size,
    max_iter=max_iter,
)
    
    
In [34]:
    
# NOTE(review): this repeats the previous training call with the same
# arguments and appears to be a leftover re-run; confirm before removing.
coefficients = logistic_regression(
    feature_matrix=feature_matrix,
    sentiment=sentiment,
    initial_coefficients=initial_coefficients,
    step_size=step_size,
    max_iter=max_iter,
)
    
    
In [36]:
    
# Shapes: feature_matrix is N * D, coefficients is D * 1, predictions is N * 1.
predictions = predict_probability(feature_matrix, coefficients)

# Number of reviews whose predicted probability of being positive exceeds 0.5.
NumPositive = (predictions > 0.5).sum()
print(NumPositive)

# The same count taken from the raw scores: the probability is above 0.5
# exactly when the score is above 0.
score = np.dot(feature_matrix, coefficients) # N * 1
print((score > 0).sum())
    
    
In [22]:
    
# Is 0 present among the sentiment VALUES?  `x in series` tests the index
# labels of a pandas Series, so the membership test is made on `.values`.
print(0 in products['sentiment'].values)
    
    
In [23]:
    
# Is -1 present among the sentiment VALUES?  `x in series` tests the index
# labels of a pandas Series, so the membership test is made on `.values`.
print(-1 in products['sentiment'].values)
    
    
In [24]:
    
# Confirm that the flattened predictions and the label column have the same
# length. (Transposing a 1-D array leaves it unchanged, so flatten() suffices.)
print(predictions.flatten().shape)
print(products['sentiment'].shape)
    
    
In [25]:
    
# First five predicted probabilities. The slice is taken before printing so
# it applies to the array rather than to the result of print.
print(predictions.flatten()[:5])
    
    
In [46]:
    
# Accuracy when a review is classified as positive if its predicted
# probability exceeds 0.5.
predicted_positive = predictions.flatten() > 0.5
actual_positive = np.array(products['sentiment'] > 0)

correct_num = np.sum(predicted_positive == actual_positive)
total_num = len(products['sentiment'])
print("correct_num: {}, total_num: {}".format(correct_num, total_num))

# Multiply by 1. to force floating-point division on Python 2.
accuracy = correct_num * 1. / total_num
print(accuracy)
    
    
In [39]:
    
# Boolean mask: predicted probability above 0.5.
predictions.flatten() > 0.5
    
    Out[39]:
In [45]:
    
# Boolean mask: sentiment label greater than 0.
np.array(products['sentiment'].gt(0))
    
    Out[45]:
In [48]:
    
# Accuracy when a review is classified as positive if its raw score exceeds 0.
predicted_positive = score.flatten() > 0
actual_positive = np.array(products['sentiment'] > 0)

correct_num = np.sum(predicted_positive == actual_positive)
total_num = len(products['sentiment'])
print("correct_num: {}, total_num: {}".format(correct_num, total_num))

# Multiply by 1. to force floating-point division on Python 2.
accuracy = correct_num * 1. / total_num
print(accuracy)
    
    
In [28]:
    
# Pair every important word with its learned coefficient and rank the pairs
# from most positive to most negative.
# Index 0 is the intercept and is skipped. The result goes into a new name
# instead of rebinding `coefficients`, so re-running this cell cannot drop a
# further element and misalign words with coefficients. ravel() turns each
# coefficient into a scalar rather than a 1-element array.
word_coefficients = np.ravel(coefficients)[1:]
word_coefficient_tuples = sorted(
    zip(important_words, word_coefficients),
    key=lambda pair: pair[1],
    reverse=True,
)
    
In [29]:
    
# First ten entries of the list, which is sorted by coefficient in descending
# order: the words with the largest coefficients.
word_coefficient_tuples[:10]
    
    Out[29]:
In [30]:
    
# Last ten entries of the list, which is sorted by coefficient in descending
# order: the words with the smallest coefficients.
word_coefficient_tuples[-10:]
    
    Out[30]:
In [31]:
    
# Scratch check: `==` between two NumPy arrays compares element by element.
print(np.array([1, 2, 3]) == np.array([1, 3, 2]))
    
    
In [ ]: