In [117]:
import pandas as pd
import numpy as np
import string
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
In [2]:
products = pd.read_csv('amazon_baby.csv', dtype={'name': str, 'review': str, 'rating': np.float64})
In [3]:
len(products)
Out[3]:
In [15]:
# Translation table mapping every punctuation character to None
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

def remove_punctuation(text):
    # Missing reviews (NaN) and empty strings are mapped to ''
    if type(text) == str and len(text.strip()) != 0:
        return text.translate(remove_punct_map)
    else:
        return ''
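A quick illustration (not part of the original assignment) of what the helper returns for a typical review string and for a missing value:
In [ ]:
# Illustrative check: punctuation is stripped, and non-string input maps to ''
print(remove_punctuation('Great product, works well!'))
print(repr(remove_punctuation(np.nan)))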
In [16]:
#products['review_clean'] = products[pd.notnull(products['review'])]['review'].apply(remove_punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
In [17]:
len(products)
Out[17]:
In [18]:
# Drop 3-star reviews: they are treated as neutral and excluded from the task
products = products[products['rating'] != 3]
In [19]:
len(products)
Out[19]:
In [20]:
# Ratings of 4 or 5 are labeled positive (+1); ratings of 1 or 2 negative (-1)
products['sentiment'] = products['rating'].apply(lambda rating: 1 if rating > 3 else -1)
In [21]:
len(products[pd.isnull(products['review'])])
Out[21]:
In [22]:
train_idx, test_idx = json.load(open('module-2-assignment-train-idx.json')), json.load(open('module-2-assignment-test-idx.json'))
In [23]:
train_data, test_data = products.iloc[train_idx].copy(), products.iloc[test_idx].copy()
print(len(train_data))
#train_data, test_data = train_data[pd.notnull(train_data['review_clean'])], test_data[pd.notnull(test_data['review_clean'])]
print(len(train_data))
print(len(train_data[pd.isnull(train_data['review_clean'])]))
In [24]:
# \b\w+\b keeps single-character tokens that the default pattern would drop
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
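For intuition, here is a small self-contained sketch (illustrative only) of what CountVectorizer with this token pattern produces: each document becomes a row of word counts over the learned vocabulary.
In [ ]:
demo_vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
demo_matrix = demo_vectorizer.fit_transform(['a great great toy', 'it broke in a day'])
print(demo_vectorizer.get_feature_names())
print(demo_matrix.toarray())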
In [25]:
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])
Out[25]:
In [26]:
print(len(train_data['sentiment']))
print(sum(train_data['sentiment'].values))
In [27]:
print('Total number of coefficients: %s' % len(sentiment_model.coef_[0]))
print('Number of positive coefficients: %s' % np.sum(sentiment_model.coef_ >= 0))
In [28]:
sample_test_data = test_data[10:13].copy()
In [29]:
sample_test_data
Out[29]:
In [30]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)
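As a sanity check (a sketch, assuming the standard logistic link used by LogisticRegression), the class +1 probabilities are the sigmoid of these margin scores, so the next cell should reproduce predict_proba[:, 1]:
In [ ]:
# P(y = +1 | x) = 1 / (1 + exp(-score))
print(1.0 / (1.0 + np.exp(-scores)))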
In [32]:
sample_test_data['predictions_proba'] = sentiment_model.predict_proba(sample_test_matrix)[:,1]
In [34]:
sample_test_data
Out[34]:
In [36]:
sample_test_data['predictions'] = sample_test_data['predictions_proba'].apply(lambda proba: 1 if proba >= 0.5 else -1)
sample_test_data
Out[36]:
In [37]:
test_predictions = sentiment_model.predict_proba(test_matrix)[:,1]
test_data['predictions_proba'] = test_predictions
test_data['predictions'] = test_data['predictions_proba'].apply(lambda proba: 1 if proba >= 0.5 else -1)
In [150]:
test_data = test_data.sort_values('predictions_proba', ascending=False)
In [151]:
test_data[:20]
Out[151]:
In [154]:
test_data.sort_values('predictions_proba', ascending=True)[:20]
Out[154]:
In [153]:
print('Accuracy for test data: %s' % accuracy_score(test_data['sentiment'], test_data['predictions']))
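The same number can be obtained by hand as the fraction of test reviews whose predicted label matches the true label (sanity-check sketch):
In [ ]:
manual_accuracy = (test_data['sentiment'] == test_data['predictions']).mean()
print('Manually computed accuracy: %s' % manual_accuracy)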
In [157]:
test_data.loc[94560, 'name']
Out[157]:
In [54]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed',
'work', 'product', 'money', 'would', 'return']
In [55]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words)
In [142]:
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])
In [59]:
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])
Out[59]:
In [63]:
simple_model.coef_.flatten()
Out[63]:
In [104]:
simple_model_coef_table = pd.DataFrame(columns=['word','coef'], index=significant_words)
simple_model_coef_table['word'] = significant_words
In [105]:
simple_model_coef_table['coef'] = simple_model.coef_.flatten()
In [120]:
len(simple_model_coef_table[simple_model_coef_table['coef']>=0])
Out[120]:
In [155]:
simple_model_coef_table
Out[155]:
In [95]:
# Feature names are ordered like the columns of the count matrix, so they
# line up with the entries of sentiment_model.coef_
words = vectorizer.get_feature_names()
full_model_coef_table = pd.DataFrame({'word': words, 'coef': sentiment_model.coef_.flatten()}, index=words)
full_model_coef_table = full_model_coef_table.sort_values('coef', ascending=False)
In [122]:
combined_table = simple_model_coef_table.join(full_model_coef_table, on='word', how='left', lsuffix="_simple")
In [123]:
combined_table.sort_values('coef_simple', ascending=False)
Out[123]:
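One question this table answers is whether words with a positive weight in the simple model also carry a positive weight in the full model; a short check (column names follow the join above, where the simple model's columns get the _simple suffix):
In [ ]:
positive_in_simple = combined_table[combined_table['coef_simple'] > 0]
print((positive_in_simple['coef'] > 0).value_counts())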
In [129]:
train_data['predictions'] = sentiment_model.predict(train_matrix)
In [141]:
sm_train_accuracy = accuracy_score(train_data['sentiment'], sentiment_model.predict(train_matrix))
print('Sentiment model train accuracy: %s' % sm_train_accuracy)
simple_train_accuracy = accuracy_score(train_data['sentiment'], simple_model.predict(train_matrix_word_subset))
print('Simple model train accuracy: %s' % simple_train_accuracy)
In [143]:
sm_test_accuracy = accuracy_score(test_data['sentiment'], sentiment_model.predict(test_matrix))
print('Sentiment model test accuracy: %s' % sm_test_accuracy)
simple_test_accuracy = accuracy_score(test_data['sentiment'], simple_model.predict(test_matrix_word_subset))
print('Simple model test accuracy: %s' % simple_test_accuracy)
In [148]:
# Majority-class baseline: predict +1 (positive) for every test review
test_data['majority'] = 1
In [149]:
accuracy_score(test_data['sentiment'], test_data['majority'])
Out[149]:
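Since this baseline predicts +1 for everything, its accuracy is just the share of positive labels in the test set; computing that directly makes it explicit (illustrative sketch):
In [ ]:
print('Fraction of positive test labels: %s' % (test_data['sentiment'] == 1).mean())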
In [ ]: