This notebook replicates the experiments in the ICWSM'17 paper entitled "Identifying Leading Indicators of Product Recalls from Online Reviews using Positive Unlabeled Learning and Domain Adaptation," by Shreesh Kumara Bhat and Aron Culotta. A full version of the paper is here: https://arxiv.org/abs/1703.00518
This notebook first downloads all the required data files from Dropbox into the local folder data (~194MB).
In [1]:
from collections import Counter, defaultdict
from datetime import datetime
import gzip
from IPython.display import display
from itertools import groupby, cycle
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from sklearn.feature_selection import chi2, f_classif
from scipy.sparse import hstack as sp_hstack
from scipy.sparse import vstack as sp_vstack
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, f1_score, precision_recall_curve, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import scale
from tabulate import tabulate
import urllib.request
%matplotlib inline
PATH = 'data'
complaints_file = PATH + os.path.sep + 'complaints.csv'
reviews_file = PATH + os.path.sep + 'reviews.json.gz'
test_file = PATH + os.path.sep + 'test.csv'
recalls_file = PATH + os.path.sep + 'recalls.csv'
recalled_asins_file = PATH + os.path.sep + 'recalled_asins.txt'
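# The plotting cells below call plt.savefig('paper/figs/...'); make sure that
# output directory exists before any figures are saved.
os.makedirs('paper' + os.path.sep + 'figs', exist_ok=True)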
# Formatting for matplotlib
plt.rcParams["xtick.labelsize"] = "16"
plt.rcParams["ytick.labelsize"] = "16"
In [2]:
def download_data(path):
"""
Download any required files if not already present.
"""
files = [('https://www.dropbox.com/s/k18rpimaif014b0/complaints.csv?dl=1', 'complaints.csv'),
('https://www.dropbox.com/s/jwr77xpsa7d1w4b/recalls.csv?dl=1', 'recalls.csv'),
('https://www.dropbox.com/s/ww0bmhf4iw84a33/reviews.json.gz?dl=1', 'reviews.json.gz'),
('https://www.dropbox.com/s/mtppphs0bml727a/test.csv?dl=1', 'test.csv'),
('https://www.dropbox.com/s/53g3hqfodeb52xi/recalled_asins.txt?dl=1', 'recalled_asins.txt')]
if not os.path.exists(path):
os.makedirs(path)
for url, name in files:
if not os.path.exists(path + os.path.sep + name):
print('fetching %s' % name)
urllib.request.urlretrieve(url, path + os.path.sep + name)
download_data(PATH)
In [3]:
def parse_all_reviews(filename):
"""
Parse all reviews into a sparse document x term csr_matrix.
"""
def iter_reviews(filename):
i = 0
for line in gzip.open(filename, 'rt'):
js = json.loads(line)
# Yield the review text twice: the first copy is fed to the vectorizer below,
# and the last copy is kept (with asin, time, score) as row metadata.
yield js['reviewText'], js['asin'], js['reviewTime'], js['overall'], js['reviewText']
i += 1
if i % 100000 == 0:
print('read %d reviews' % i)
records = []
vec = CountVectorizer(min_df=50, ngram_range=(1,2), max_df=.95, binary=True)
# list.append returns None, so `not records.append(...)` is always True;
# this collects the metadata for every review while streaming the review
# text into the vectorizer in a single pass.
X = vec.fit_transform(r[0] for r in iter_reviews(filename) if
not records.append(r[1:]))
return X, vec, pd.DataFrame(records, columns=['ASIN', 'review_time', 'review_score', 'reviewText'])
X_reviews, vec, reviews_df = parse_all_reviews(reviews_file)
print('X_reviews has shape %s' % str(X_reviews.shape))
reviews_df.head()
Out[3]:
In [4]:
# Reformat review date strings from the raw 'MM DD, YYYY' format to 'YYYY-MM-DD'.
def format_dates(reviews_df):
new_dates = []
for x in reviews_df['review_time']:
parts = [p.replace(',', '') for p in x.split()]
new_dates.append('%s-%s-%s' % (parts[2], parts[0].zfill(2), parts[1].zfill(2)))
print(new_dates[:10])
reviews_df['review_time'] = new_dates
format_dates(reviews_df)
In [5]:
# Number of reviews by score.
reviews_df['review_score'].value_counts()
Out[5]:
In [6]:
# Exploring length distribution of reviews.
lengths = [len(x.split()) for x in reviews_df['reviewText']]
lengths = [l for l in lengths if l != 0]
print(np.median(lengths))
pd.DataFrame(lengths).describe()
Out[6]:
In [7]:
def parse_complaints(complaints_file, vec):
"""
Parse all the CPSC complaints, using the same vectorizer fit on the Amazon review data.
"""
complaints_df = pd.read_csv(complaints_file)
X = vec.transform(complaints_df['Incident Description'])
vec.features = np.array(vec.get_feature_names())
return X, complaints_df
X_complaints, complaints_df = parse_complaints(complaints_file, vec)
print('complaints feature matrix has shape %s' % str(X_complaints.shape))
complaints_df.head()
Out[7]:
In [8]:
# Frequency of each victim severity.
complaints_df['(Primary) Victim Severity'].value_counts()
Out[8]:
In [9]:
# Frequency of product types in complaint data.
complaints_df['Product Type'].value_counts()
Out[9]:
In [10]:
# complaints by year
sorted(Counter([d[-4:] for d in complaints_df['Report Date']]).items())
Out[10]:
In [11]:
# Distribution of number of words per complaint.
lengths = [len(x.split()) for x in complaints_df['Incident Description']]
print(np.median(lengths))
display(pd.DataFrame(lengths).describe())
In [12]:
def parse_test_data(filename, vec):
"""
Parse labeled Amazon reviews using the same
vectorizer fit to the unlabeled Amazon reviews.
"""
df = pd.read_csv(filename)
df.dropna(inplace=True)
df.rename(columns={'Review Text': 'text'}, inplace=True)
X = vec.transform(t for t in df['text'])
return X, df
X_test, test_df = parse_test_data(test_file, vec)
print('X_test has shape %s' % str(X_test.shape))
test_df.head()
Out[12]:
In [13]:
# Label distribution
display(test_df['label'].value_counts())
In [14]:
# This file contains manually labeled instances
# of recall / product pairs. The label indicates whether the match is valid.
recalls_df = pd.read_csv(recalls_file, sep='\t')
recalls_df.head()
Out[14]:
In [15]:
# Number of unique recalls.
len(set(recalls_df[recalls_df.label==1]['RecallNumber']))
Out[15]:
In [16]:
# Number of unique recalled ASINs
recalled_asins = set(l.strip() for l in open(recalled_asins_file))
print('%d recalled ASINs' % len(recalled_asins))
In [17]:
class Data:
"""
Container for all the data.
"""
def __init__(self, X_complaints, complaints_df,
X_test, test_df,
X_reviews, reviews_df,
recalls_df, recalled_asins, vec):
self.X_complaints = X_complaints
self.complaints_df = complaints_df
self.X_test = X_test
self.test_df = test_df
self.X_reviews = X_reviews
self.reviews_df = reviews_df
self.recalls_df = recalls_df
self.recalled_asins = recalled_asins
self.vec = vec
data = Data(X_complaints, complaints_df,
X_test, test_df,
X_reviews, reviews_df,
recalls_df, recalled_asins, vec)
In [18]:
class Evaluator(object):
"""
Evaluation metrics.
"""
def __init__(self, data):
self.data = data
def evaluate(self, model):
"""
Evaluate on test data.
"""
model.fit(self.data)
preds = model.predict(self.data)
probas = model.predict_proba(self.data)
truths = np.array(self.data.test_df['label'])
f1 = f1_score(truths, preds)
recall = recall_score(truths, preds)
precision = precision_score(truths, preds)
roc_auc = roc_auc_score(truths, probas, average=None)
precisions, recalls, pr_auc, pr_at_k = self.evaluate_recalls(model)
return {'f1': f1, 'roc_auc': roc_auc, 'pr_auc': pr_auc, 'recall': recall, 'precision': precision, 'pr_at_k': pr_at_k}
def evaluate_recalls(self, model):
"""
Evaluate against recalled products.
"""
asin2recall_score = model.score_asin_recalls(self.data)
asins = set(self.data.reviews_df.ASIN)
probas = np.array([asin2recall_score[x] for x in asins])
truths = np.array([1 if x in self.data.recalled_asins else 0 for x in asins])
roc_auc = roc_auc_score(truths, probas)
prec, recall, thresholds = precision_recall_curve(truths, probas)
prec = self._interpolate(prec)
pr_auc = auc(recall, prec)
# Evaluate precision at k, where k is the number of truly recalled products.
rank = sum(truths)
pred = np.argsort(probas)[::-1][:rank]
correct = len(set(pred) & set(np.where(truths==1)[0]))
pr_at_k = correct / rank
return prec, recall, pr_auc, pr_at_k
def plot_prec_recalls(self, results):
plt.figure()
for r in results:
plt.plot(r['prcurve'][1], r['prcurve'][0], '.-', label=r['model'])
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc="best")
plt.xlim(-.01, .2)
plt.show()
def _interpolate(self, prec):
"""Interpolate precision so it never decreases along the array
(i.e., is non-increasing as recall increases),
e.g. [.5, .4, .6, .55] -> [.5, .5, .6, .6]."""
p_temp = prec[0]
n = len(prec)
for i in range(n):
if prec[i] < p_temp:
prec[i] = p_temp
else:
p_temp = prec[i]
return prec
def confusion(self, truths, preds, labels):
m = confusion_matrix(truths, preds)
m = np.vstack((labels, m))
m = np.hstack((np.matrix([''] + list(labels)).T, m))
return tabulate(m.tolist(), headers='firstrow')
def top_terms(self, model, n=10):
"""
Print top terms per class.
"""
coef = model.get_coef()
print('\n\nTOP FEATURES:')
coefs = [-coef[0], coef[0]]
for li, label in enumerate(model.clf.classes_):
print('\nCLASS %s' % label)
coef = coefs[li]
top_coef_ind = np.argsort(coef)[::-1][:n]
top_coef_terms = self.data.vec.features[top_coef_ind]
top_coef = coef[top_coef_ind]
print('\n'.join(['%s\t%.3f' % (term, weight)
for term, weight in zip(top_coef_terms, top_coef)]))
def top_error_terms(self, model):
"""
Print top terms appearing in incorrectly labeled documents.
"""
truths = np.array(self.data.test_df['label'])
preds = model.predict(self.data)
X = self.data.X_test
data = np.array(self.data.test_df['text'])
print('\n\nERROR ANALYSIS:\n')
for label in model.clf.classes_:
print('\nincorrectly labeled %s' % label)
iserror = np.zeros(len(truths))
ind = [i for i, (t, p) in enumerate(zip(truths, preds)) if t != p and p == label]
iserror[ind] = 1
corrs, _ = f_classif(X, iserror)
corrs = np.nan_to_num(corrs)
pos_mask, pos_counts, neg_counts = self.get_pos_mask(X, iserror)
corrs *= pos_mask
# Zero out features appearing in exactly one incorrect instance
# (features appearing in none become negative and fall to the bottom of the ranking).
corrs *= np.sign(X.sign()[np.where(iserror == 1)].sum(axis=0).A1 - 1)
for fidx in np.argsort(corrs)[::-1][:5]:
print('\n\t%s (%d incorrect, %d correct) corr=%.4f' %
(self.data.vec.features[fidx], pos_counts[fidx], neg_counts[fidx], corrs[fidx]))
matches = []
for midx in range(X.shape[0]):
if X[midx, fidx] > 0 and iserror[midx] == 1:
matches.append(midx)
for m in matches[:3]:
print('\t\t' + data[m])
def get_pos_mask(self, X, y, reg=1):
"""Get mask for indices that are more associated with class 1 than class 0."""
pos_counts = X.sign()[np.where(y == 1)].sum(axis=0).A1
neg_counts = X.sign()[np.where(y == 0)].sum(axis=0).A1
posp = (1. + pos_counts) / pos_counts.sum()
negp = (1. + neg_counts) / neg_counts.sum()
diffs = posp - negp
diffs = np.array([1 if v > 0 else -1 for v in diffs])
return np.array(diffs), pos_counts, neg_counts
def average_results(results):
avg = {}
for k in results[0].keys():
vals = [r[k] for r in results]
avg[k] = np.mean(vals)
avg[k+'_se'] = np.std(vals) / math.sqrt(len(vals))
return avg
def evaluate_models(models, data, seeds=[42, 11111, 12345678, 919191, 5555]):
"""
Evaluate all models using multiple seeds and average the results.
"""
evaluator = Evaluator(data)
all_results = []
for m in models:
results = []
for seed in seeds:
m.seed = seed
name = str(m)
print('Evaluating %s' % name)
results.append(evaluator.evaluate(m))
r = average_results(results)
r['model'] = m
all_results.append(r)
df = pd.DataFrame(all_results).sort_values('f1', ascending=False)
mdl = df['model']
df.drop(labels=['model'], axis=1, inplace=True)
df.insert(0, 'model', mdl)
return df
In [19]:
# Here we implement the different classification models.
# Helper aggregator functions: collapse the per-review hazard probabilities
# for a product into a single product-level score.
def _count(probs):
return len(np.where(np.array(probs) >= .5)[0])
def _count_mean(probs):
return len(np.where(np.array(probs) >= .5)[0]) / len(probs)
def _mean(probs):
return np.mean(probs)
def _max(probs):
return max(probs)
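# Illustrative example: for per-review hazard probabilities [0.2, 0.7, 0.9],
# _count -> 2, _count_mean -> 2/3, _mean -> 0.6, _max -> 0.9.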
class Model(object):
"""
Abstract base class.
"""
def __init__(self, C=1):
self.C = C
self.make_clf()
def fit(self, data):
pass
def predict(self, data):
return self.clf.predict(data.X_test)
def predict_proba(self, data):
"""
Predict the probability of recall on each test example.
"""
return self.clf.predict_proba(data.X_test)[:,1]
def predict_reviews(self, data):
return self.clf.predict(data.X_reviews)
def predict_proba_reviews(self, data):
return self.clf.predict_proba(data.X_reviews)[:,1]
def score_asin_recalls(self, data, aggregator_fn=_count):
"""
Compute a score indicating the likelihood that each product
should be recalled, by aggregating the classification of each review
(by default, the count of reviews with predicted probability >= 0.5).
"""
probas = self.predict_proba_reviews(data)
preds = {}
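# Note: itertools.groupby only groups *consecutive* identical keys, so this
# assumes reviews_df (and therefore probas) is ordered by ASIN.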
for asin, group in groupby([x for x in zip(data.reviews_df['ASIN'], probas)],
key=lambda x: x[0]):
preds[asin] = aggregator_fn([x[1] for x in group])
return preds
def make_clf(self):
self.clf = LogisticRegression(class_weight='balanced', C=self.C)
def get_coef(self):
return self.clf.coef_
class RandomNegativeSamples(Model):
"""
Sample n_neg random examples from the unlabeled data
and pretend they are negative.
"""
def __init__(self, n_neg=-1, seed=42, C=1):
super().__init__(C=C)
self.seed = seed
self.n_neg = n_neg
def fit(self, data):
random.seed(self.seed)
if self.n_neg == -1:
neg = data.X_complaints.shape[0]
else:
neg = self.n_neg
samplei = random.sample(range(data.X_reviews.shape[0]), neg)
self.neg_sample_idx = samplei
X = sp_vstack((data.X_complaints, data.X_reviews[samplei]))
y = np.concatenate(([1] * data.X_complaints.shape[0],
[0] * neg))
self.clf.fit(X, y)
def __str__(self):
return "RandomNegSamples(C=%g,n=%d)" % (self.C, self.n_neg)
class RandomNegativeSamplesThreshold(Model):
"""
Sample n_neg random examples from the unlabeled data
with review >= threshold and pretend they are negative.
"""
def __init__(self, threshold=4.5, n_neg=-1, seed=42, C=1):
super().__init__(C=C)
self.seed = seed
self.threshold = threshold
self.n_neg = n_neg
def fit(self, data):
if self.n_neg == -1:
neg = data.X_complaints.shape[0]
else:
neg = self.n_neg
random.seed(self.seed)
pos_reviews = list(np.where(data.reviews_df['review_score'] >= self.threshold)[0])
samplei = random.sample(pos_reviews, neg)
self.neg_sample_idx = samplei
X = sp_vstack((data.X_complaints, data.X_reviews[samplei]))
y = np.concatenate(([1] * data.X_complaints.shape[0],
[0] * neg))
self.clf.fit(X, y)
def __str__(self):
return "RandomNegSamplesThresh(C=%d, nneg=%d, t=%.1f)" % (self.C, self.n_neg, self.threshold)
class RandomNegativeSamplesThresholdInformedPrior(Model):
"""
Sample n_neg random examples from the unlabeled data with
review score >= threshold and pretend they are negative.
Also implements the informed prior feature re-weighting
described in the paper.
"""
def __init__(self, threshold=4.5, n_neg=-1, seed=42, C=1000):
super().__init__(C=C)
self.seed = seed
self.threshold = threshold
self.n_neg = n_neg
def fit(self, data):
if self.n_neg == -1:
neg = data.X_complaints.shape[0]
else:
neg = self.n_neg
random.seed(self.seed)
pos_reviews = list(np.where(data.reviews_df['review_score'] >= self.threshold)[0])
samplei = random.sample(pos_reviews, neg)
self.neg_sample_idx = samplei
X = sp_vstack((data.X_complaints, data.X_reviews[samplei]))
y = np.concatenate(([1] * data.X_complaints.shape[0],
[0] * neg))
self.clf.fit(X, y)
# Now predict on the unlabeled reviews, compute the informed priors, and refit.
predictions = self.clf.predict(data.X_reviews)
nneg = len(np.where(predictions==0)[0])
npos = len(np.where(predictions==1)[0])
# Indices of features with positive / negative coefficients.
pos_coef_i = np.where(self.clf.coef_[0] > 0)
neg_coef_i = np.where(self.clf.coef_[0] < 0)
pos_pct = npos / (nneg + npos)
ppos_counts = data.X_reviews[np.where(predictions==1)].sum(axis=0).A1
pneg_counts = data.X_reviews[np.where(predictions==0)].sum(axis=0).A1
pos_probs = ((1 + ppos_counts) / (2 + ppos_counts + pneg_counts))[pos_coef_i]
print('pos probs1', pos_probs[:20])
neg_probs = ((1 + pneg_counts) / (2 + ppos_counts + pneg_counts))[neg_coef_i]
# Normalize so that pos and neg coef are in similar range.
pos_probs = pos_probs / pos_probs.sum()
neg_probs = neg_probs / neg_probs.sum()
print('pos probs2', pos_probs[:20])
transform = np.zeros(X.shape[1])
transform[pos_coef_i] = pos_probs
transform[neg_coef_i] = neg_probs
transform *= len(transform) / transform.sum()
print('mean=', np.mean(transform))
print('mean2=', np.mean(transform[pos_coef_i]))
print('pos probs3', transform[pos_coef_i][:20])
print('transform:', sorted(transform)[:10], sorted(transform)[::-1][:10])
tops = []
for i in sorted(pos_coef_i[0], key=lambda x: -transform[x])[:20]:
tops.append({'term': data.vec.features[i], 'coef': self.clf.coef_[0][i], 'transform': transform[i]})
display(pd.DataFrame(tops).sort_values('transform', ascending=False))
# Some debug statements.
print('hazard transform=', transform[data.vec.vocabulary_['hazard']])
print('crib transform=', transform[data.vec.vocabulary_['crib']])
print('pampers transform=', transform[data.vec.vocabulary_['pampers']])
print('very dangerous transform=', transform[data.vec.vocabulary_['very dangerous']])
tops = []
for i in sorted(neg_coef_i[0], key=lambda x: -transform[x])[:20]:
tops.append({'term': data.vec.features[i], 'coef': self.clf.coef_[0][i], 'transform': transform[i]})
display(pd.DataFrame(tops).sort_values('transform', ascending=False))
self.transform = csr_matrix(transform)
self.clf.fit(X.multiply(self.transform), y)
def predict(self, data):
return self.clf.predict(data.X_test.multiply(self.transform))
def predict_proba(self, data):
"""
Predict the probability of recall on each test example.
"""
return self.clf.predict_proba(data.X_test.multiply(self.transform))[:,1]
def predict_reviews(self, data):
return self.clf.predict(data.X_reviews.multiply(self.transform))
def predict_proba_reviews(self, data):
return self.clf.predict_proba(data.X_reviews.multiply(self.transform))[:,1]
def __str__(self):
return "RandNegSampThreshInfoPrior(C=%g, nneg=%d, t=%.1f)" % (self.C, self.n_neg, self.threshold)
In [20]:
# Collect all models for comparison.
models = [
RandomNegativeSamples(n_neg=20000, C=1),
RandomNegativeSamplesThreshold(threshold=3.0, n_neg=20000, C=1),
RandomNegativeSamplesThreshold(threshold=4.0, n_neg=20000, C=1),
RandomNegativeSamplesThreshold(threshold=5.0, n_neg=20000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=3.0, n_neg=20000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=4.0, n_neg=20000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=5.0, n_neg=20000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=5.0, n_neg=1000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=5.0, n_neg=5000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=5.0, n_neg=10000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=5.0, n_neg=30000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=5.0, n_neg=40000, C=1),
RandomNegativeSamplesThresholdInformedPrior(threshold=5.0, n_neg=50000, C=1),
]
results = evaluate_models(models, data, seeds=[123456, 42, 987987])
In [21]:
models = results['model'].tolist()
results
Out[21]:
In [22]:
def print_main_results_table(results):
"""
Print Table 3.
"""
res = []
for ii, r in results.sort_values('roc_auc', ascending=False).iterrows():
if r['model'].n_neg == 20000:
rr = []
rr.append('informed prior' if type(r['model']) == RandomNegativeSamplesThresholdInformedPrior else 'baseline')
try:
rr.append('%.1f' % r['model'].threshold)
except:
rr.append('none')
rr.append('%.1f $\pm$ %.2f' % ((r['roc_auc'] * 100), (r['roc_auc_se'] * 100)))
rr.append('%.1f $\pm$ %.2f' % ((r['f1'] * 100), (r['f1_se'] * 100)))
rr.append('%.1f $\pm$ %.2f' % ((r['precision'] * 100), (r['precision_se'] * 100)))
rr.append('%.1f $\pm$ %.2f' % ((r['recall'] * 100), (r['recall_se'] * 100)))
res.append(rr)
df = pd.DataFrame(res, columns=['Model', 'Review Threshold', 'ROC AUC', 'F1', 'Precision', 'Recall'])
display(df)
print(df.to_latex(index=False, escape=False))
print_main_results_table(results)
In [23]:
def plot_f1_v_nneg(results):
""" Plot Figure 3"""
nnegs = []
f1s = []
ses = []
for i, r in results.iterrows():
if type(r['model']) == RandomNegativeSamplesThresholdInformedPrior and r['model'].threshold == 5.0:
nnegs.append(r['model'].n_neg)
f1s.append(r['f1'])
ses.append(r['f1_se'])
plt.figure(figsize=(8,6))
vals = sorted(zip(nnegs, f1s, ses), key=lambda x: x[0])
plt.plot([v[0] for v in vals], [v[1] for v in vals], 'bo-')
plt.errorbar([v[0] for v in vals], [v[1] for v in vals], yerr=[v[2] for v in vals])
plt.xlabel('Number of negative training examples', size=16)
plt.ylabel('F1', size=16)
plt.tight_layout()
plt.savefig('paper/figs/nneg.pdf')
plt.show()
plot_f1_v_nneg(results)
In [24]:
def plot_rocs(models, data, names):
"""
Plot Figure 2.
"""
truths = np.array(data.test_df['label'])
plt.figure(figsize=(8,6))
formats = ['bo-', 'g^--', 'rs:']
for model, name, fmt in zip(models, names, formats):
probas = model.predict_proba(data)
fpr, tpr, thresh = roc_curve(truths, probas)
auc = roc_auc_score(truths, probas)
plt.plot(fpr, tpr, fmt, ms=4, label='%s' % (name))
plt.legend(loc='lower right', prop={'size':16})
plt.xlabel('False Positive Rate', size=16)
plt.ylabel('True Positive Rate', size=16)
plt.xlim((0, .5))
plt.ylim((0.39, 1.005))
plt.tight_layout()
plt.savefig('paper/figs/roc.pdf')
plt.show()
def get_models(models, names):
r = []
for n in names:
for m in models:
if str(m) == n:
r.append(m)
return r
submodels = get_models(models, ['RandNegSampThreshInfoPrior(C=1, nneg=20000, t=5.0)',
'RandomNegSamplesThresh(C=1, nneg=20000, t=5.0)'])
plot_rocs(submodels, data, [r'informed prior, $\tau=5$', r'baseline, $\tau=5$'])
In [25]:
best_model = get_models(models, ['RandNegSampThreshInfoPrior(C=1, nneg=20000, t=5.0)'])[0]
baseline_model = get_models(models, ['RandomNegSamplesThresh(C=1, nneg=20000, t=5.0)'])[0]
In [26]:
def error_analysis_recalled_products(model, data, n):
"""
Look at the worst reviews for the top products predicted
to be recalled by this model.
"""
probas = model.predict_proba_reviews(data)
scores = model.score_asin_recalls(data, _max)
asins = np.array(list(scores.keys()))
found = 0
for asin, score in sorted(scores.items(), key=lambda x: -x[1]):
label = 1 if asin in data.recalled_asins else 0
#if label == 0:
print('\n\n------------\n', asin, label, score)
idx = np.array(data.reviews_df[data.reviews_df.ASIN==asin].index.tolist())
topi = probas[idx].argsort()[::-1][:1]
for i in topi:
ii = idx[i]
print(probas[ii], data.reviews_df.iloc[ii]['reviewText'])
found += 1
if found >= n:
break
error_analysis_recalled_products(best_model, data, 20)
In [27]:
# Plot Figures 4 and 5.
def get_reviews_before_date(data, asin, date):
# Note: despite the name, this returns all reviews for the ASIN sorted by
# review_time; the `date` argument is currently unused.
reviews = data.reviews_df[data.reviews_df.ASIN==asin].sort_values('review_time')
return reviews
def time_diff(time1, time2):
"""
Return the number of days from time1 to time2 (time2 - time1),
e.g. time_diff('2011-11-03', '2012-01-02T00:00:00') -> 60.
time2 is an ISO timestamp (e.g. 2012-01-02T00:00:00); time1 is a plain date (e.g. 2011-11-03).
"""
return (datetime.strptime(time2[:10], '%Y-%m-%d') - datetime.strptime(time1, '%Y-%m-%d')).days
def get_colors():
cmap = plt.get_cmap('Dark2')
colors = [cmap(i) for i in np.linspace(0, 1, 10)]
return cycle(colors)
def predict_by_time(model, data):
recalls_df = data.recalls_df[data.recalls_df.label==1]
probas = model.predict_proba_reviews(data)
found = 0
correct = 0
plt.figure(figsize=(8,6))
colors = get_colors()
all_diffs = []
n_pos = []
total_reviews = 0
total_pos = 0
total_time_diffs = 0
for asin in data.recalled_asins:
recall = recalls_df[recalls_df.AmazonAsin==asin].iloc[0]
reviews = get_reviews_before_date(data, asin, recall['RecallDate'])
if (len(reviews) > 9): # only consider products with at least 10 reviews.
found += 1
total_reviews += len(reviews)
idx = np.array(reviews.index.tolist())
vals = probas[idx]
pos_idx = np.where(vals >= 0.5)[0]
total_pos += len(pos_idx)
if asin == 'XXXX': # For manual analysis of recall reviews
print(recall['RecallName'])
print(recall['RecallDescription'])
print(recall['RecallDate'])
print(recall['RecallTitle'])
print('\n'.join('%s %s' % (x,y) for x,y in
zip(reviews.iloc[pos_idx]['review_time'],
reviews.iloc[pos_idx]['reviewText'])))
if len(pos_idx) > 0:
n_pos.append(len(pos_idx))
color = next(colors)
times = reviews.iloc[pos_idx]['review_time']
time_diffs = [-time_diff(t, recall['RecallDate']) for t in times]
if len(time_diffs) > 0: # at least one hazardous review was found for this product
all_diffs.extend(time_diffs)
if time_diffs[0] < 0: # found before recall
correct += 1
counts = np.arange(len(time_diffs)) + 1
plt.plot(time_diffs, counts, '.-', color=color)
plt.plot(time_diffs[0], 1, 'x', ms=6, color=color)
total_time_diffs += len(time_diffs)
print(asin, len(pos_idx), time_diffs[0], time_diffs[-1])
plt.xticks(rotation=90)
plt.axvline(x=0, color='k')
plt.ylabel('Total number of hazardous reviews found', size=16)
plt.xlabel(r'before recall $\leftarrow$ Days from recall $\rightarrow$ after recall', size=16)
plt.tight_layout()
plt.savefig('paper/figs/leadtime.pdf')
plt.show()
print('found early warning for %d/%d (%.2f) product recalls' % (correct, found, correct/found))
print('earliest day: mean=%.2f, median=%.2f' % (np.mean(all_diffs), np.median(all_diffs)))
print('%d / %d reviews classified as positive' % (total_pos, total_reviews))
print('%d total time diffs' % total_time_diffs)
plt.figure(figsize=(8,6))
plt.hist(all_diffs, bins=50)
plt.xticks(rotation=90)
plt.ylabel('Count', size=16)
plt.xlabel(r'before recall $\leftarrow$ Days from recall $\rightarrow$ after recall', size=16)
plt.tight_layout()
plt.savefig('paper/figs/dayshist.pdf')
plt.show()
plt.figure(figsize=(8,6))
plt.hist(n_pos, bins=20)
plt.ylabel('Count', size=16)
plt.xlabel('Number of hazardous reviews found', size=16)
plt.tight_layout()
plt.savefig('paper/figs/counthist.pdf')
plt.show()
predict_by_time(best_model, data)
In [28]:
# Plot review distribution for recalled vs non-recalled (Figure 1).
def plot_review_dist(data):
recalled = []
nonrecalled = []
for r in data.reviews_df.iterrows():
if r[1].ASIN in data.recalled_asins:
recalled.append(r[1].review_score)
else:
nonrecalled.append(r[1].review_score)
print('recalled mean=%g' % np.mean(recalled))
print('not recalled mean=%g' % np.mean(nonrecalled))
plt.figure(figsize=(8,6))
recalled_ct = Counter(recalled)
nonrecalled_ct = Counter(nonrecalled)
ratings = np.arange(5) + 1
bar_width = .3
plt.bar(ratings, [recalled_ct[r] / len(recalled) for r in ratings],
bar_width, alpha=.5, color='grey', label='recalled')
plt.bar(ratings + bar_width, [nonrecalled_ct[r] / len(nonrecalled) for r in ratings],
bar_width, alpha=.5, color='w', label='non-recalled')
plt.xticks(ratings + bar_width, ratings)
#plt.hist(recalled, alpha=.5, normed=True, label='recalled')
#plt.hist(nonrecalled, alpha=.5, normed=True, label='not recalled')
plt.legend(loc='best', prop={'size':16})
plt.ylabel('Percent of ratings', size=16)
plt.xlabel('Rating', size=16)
#plt.title('Rating distribution for recalled vs. non-recalled products', size=14)
plt.tight_layout()
plt.savefig('paper/figs/ratings.pdf')
plt.show()
plot_review_dist(data)
In [29]:
# Plot distribution of recall reviews for recalled vs non-recalled
def plot_recall_dist(data, model):
recalled = []
nonrecalled = []
preds = model.predict_reviews(data)
for r in data.reviews_df.iterrows():
if r[1].ASIN in data.recalled_asins:
recalled.append(preds[r[0]])
else:
nonrecalled.append(preds[r[0]])
plt.figure(figsize=(8,6))
recalled_ct = Counter(recalled)
nonrecalled_ct = Counter(nonrecalled)
print(recalled_ct)
print(nonrecalled_ct)
print('recalled pct pos=%g' % (recalled_ct[1] / len(recalled)))
print('nonrecalled pct pos=%g' % (nonrecalled_ct[1] / len(nonrecalled)))
ratings = np.arange(2)
bar_width = .3
plt.bar(ratings, [recalled_ct[r] / len(recalled) for r in ratings],
bar_width, alpha=.5, color='r', label='recalled', hatch="//")
plt.bar(ratings + bar_width, [nonrecalled_ct[r] / len(nonrecalled) for r in ratings],
bar_width, alpha=.5, color='g', label='non-recalled')
plt.xticks(ratings + bar_width, ratings)
plt.legend(loc='best')
plt.ylabel('Percent of reviews', size=16)
plt.xlabel('Predicted class', size=16)
plt.show()
plot_recall_dist(data, best_model)
plot_recall_dist(data, baseline_model)
In [30]:
# Plot Figure 6.
asin2recall_score = best_model.score_asin_recalls(data)
plt.figure()
plt.hist(sorted(asin2recall_score.values()), bins=50, bottom=1)
plt.yscale('log')
plt.xlabel('Number of hazardous reviews', size=16)
plt.ylabel('Number of products', size=16)
plt.tight_layout()
plt.savefig('paper/figs/pred_ratings.pdf')
plt.show()
In [31]:
Counter(asin2recall_score.values()).most_common(10)
Out[31]:
In [32]:
# Print top coef for inclusion in Table 4.
def print_coef_table(models, data):
submodels = get_models(models, ['RandNegSampThreshInfoPrior(C=1, nneg=20000, t=5.0)',
'RandomNegSamplesThresh(C=1, nneg=20000, t=5.0)'])
coef = submodels[0].clf.coef_[0] * submodels[0].transform[0,:].toarray()[0]
coef2 = submodels[1].clf.coef_[0]
terms1 = data.vec.features[np.argsort(coef)[::-1][:20]]
terms2 = data.vec.features[np.argsort(coef2)[::-1][:20]]
print('in informed prior, but not baseline:', set(terms1) - set(terms2))
print('in baseline prior, but not informed prior:', set(terms2) - set(terms1))
print('informed prior:')
print(', '.join(terms1))
print('baseline:')
print(', '.join(terms2))
scaled = scale(coef)
scaled2 = scale(coef2)
diff = scaled2 - scaled
print('\n\n')
for i in np.argsort(diff)[::-1][:20]:
print(data.vec.features[i], diff[i], coef2[i], coef[i], scaled2[i], scaled[i])
print('\n\n')
for i in np.argsort(diff)[:20]:
print(data.vec.features[i], diff[i], coef2[i], coef[i], scaled2[i], scaled[i])
print_coef_table(models, data)
In [33]:
def print_chi2_predicted(model, data, n_feats=100):
preds = model.predict_reviews(data) # model.clf.predict(data.X_reviews)
nneg = len(np.where(preds==0)[0])
npos = len(np.where(preds==1)[0])
print(Counter(preds))
chi, _ = chi2(data.X_reviews, preds)
chi = np.nan_to_num(chi)
# restrict to positive features
ppos_counts = data.X_reviews[np.where(preds==1)].sum(axis=0).A1
pneg_counts = data.X_reviews[np.where(preds==0)].sum(axis=0).A1
chi_pos = chi * np.array([1 if c > 0 else 0 for c in model.clf.coef_[0]])
print('RECALL TERMS')
terms = []
for i in np.argsort(chi_pos)[::-1][:n_feats]:
terms.append({'feature': data.vec.features[i],
'chi2': '%.1f' % chi_pos[i],
'coef': '%.2f' % model.clf.coef_[0][i],
'pos_count': ppos_counts[i],
'pos_frac': '%.3f' % (ppos_counts[i]/npos),
'neg_count': pneg_counts[i],
'neg_frac': '%.3f' % (pneg_counts[i]/nneg)})
display(pd.DataFrame(terms))
print('\n\nNON-RECALL TERMS')
chi_neg = chi * np.array([1 if c < 0 else 0 for c in model.clf.coef_[0]])
terms = []
for i in np.argsort(chi_neg)[::-1][:n_feats]:
terms.append({'feature': data.vec.features[i],
'chi2': '%.1f' % chi_neg[i],
'coef': '%.2f' % model.clf.coef_[0][i],
'pos_count': ppos_counts[i],
'pos_frac': '%.3f' % (ppos_counts[i]/npos),
'neg_count': pneg_counts[i],
'neg_frac': '%.3f' % (pneg_counts[i]/nneg)})
display(pd.DataFrame(terms))
Evaluator(data).top_terms(best_model, n=50)
print('\n\n')
print_chi2_predicted(best_model, data, n_feats=50)
In [34]:
def get_class_discrepancy(model, data, n):
"""
For each of the top n features in the positive class, get the class distribution
in the training data and the predicted class distribution on the unlabeled review data.
"""
coef = model.get_coef()[0]
top_coef_ind = np.argsort(coef)[::-1] # [:n]
preds = model.predict_reviews(data) # model.clf.predict(data.X_reviews)
nneg = len(np.where(preds==0)[0])
npos = len(np.where(preds==1)[0])
# restrict to positive features
ppos_counts = data.X_reviews[np.where(preds==1)].sum(axis=0).A1
pneg_counts = data.X_reviews[np.where(preds==0)].sum(axis=0).A1
train_pos = data.X_complaints.sum(axis=0).A1
train_neg = data.X_reviews[model.neg_sample_idx].sum(axis=0).A1
results = []
count = 0
for i in top_coef_ind:
if train_pos[i] > 2:
train_pr = train_pos[i] / (train_pos[i] + train_neg[i])
test_pr = ppos_counts[i] / (ppos_counts[i] + pneg_counts[i])
results.append(
{
'term': data.vec.features[i],
'coef': coef[i],
'pr_pos_train': train_pr,
'pr_pos_test': test_pr,
'n_pos_train': train_pos[i],
'n_pos_test': ppos_counts[i],
'diff': train_pr - test_pr
}
)
count += 1
if count >= n:
break
pd.set_option('display.max_rows', 1000)
#return pd.DataFrame(results).sort_values('diff', ascending=False)
#return pd.DataFrame(results).sort_values('coef', ascending=False)
return pd.DataFrame(results).sort_values('pr_pos_test', ascending=False)
get_class_discrepancy(baseline_model, data, 100)
Out[34]:
In [35]:
get_class_discrepancy(best_model, data, 100)
Out[35]:
In [36]:
def plot_reviews_by_year(model, data):
"""
Print, per year, the counts of negative/positive predictions and the fraction of reviews predicted hazardous.
"""
preds = model.predict_reviews(data)
years = [d[:4] for d in data.reviews_df.review_time]
year2counts = defaultdict(lambda: Counter())
for p, y in zip(preds, years):
year2counts[y].update([p])
print('\n'.join('%s %f' % (str(t), t[1][1] / (t[1][0] + t[1][1])) for t in sorted(year2counts.items())))
plot_reviews_by_year(best_model, data)
In [37]:
# Plot Figure 7.
plt.figure()
plt.plot([1377, 1807, 2840], 'go-', label='detected complaints, Amazon')
plt.plot([502, 447, 432], 'bo-', label='submitted complaints, CPSC')
plt.xticks([0, 1, 2], [2011, 2012, 2013])
plt.xlim(-.1, 2.1)
plt.legend(loc='best')
plt.xlabel('year', size=16)
plt.ylabel('count', size=16)
plt.tight_layout()
plt.savefig('paper/figs/years.pdf')
plt.show()