This notebook shows how data can be pivoted with Python pandas to reveal insights into the behaviour of reviewers. The use case and data are from Mark Harwood's talk on entity-centric indexing.
An alternative version of this notebook uses Elastic data frames to produce the same results.
In [1]:
import bz2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
In [2]:
# Load the bz2-compressed CSV of reviews into a DataFrame.
# The `with` block closes the decompression handle as soon as pandas has
# finished reading from it, instead of leaving it open for the kernel's lifetime.
with bz2.open('./anonreviews.csv.bz2') as csv_handle:
    reviews = pd.read_csv(csv_handle)

# Convert the 'date' column to pandas datetimes.
reviews['date'] = pd.to_datetime(reviews['date'])
In [3]:
# Preview the first rows of the loaded reviews (head() returns five rows by default).
reviews.head()
Out[3]:
In [4]:
# Summary statistics for the columns of the raw reviews frame.
reviews.describe()
Out[4]:
In [5]:
# Print the frame's structure: column names, dtypes and non-null counts.
reviews.info()
In [6]:
# Distribution of individual review ratings.
#
# Use one unit-wide bin centred on each rating value, derived from the data.
# The previous `bins=5` cut the observed range into five equal-width bins; this
# notebook later filters on ratings of both 0 and 5, and a 0-5 whole-star scale
# has six values, so the two highest values ended up sharing one bar.
# NOTE(review): assumes ratings are whole numbers — confirm against the data.
rating_edges = np.arange(
    reviews['rating'].min() - 0.5,
    reviews['rating'].max() + 1.5,
    1,
)
rating_axes = reviews.hist(column="rating", bins=rating_edges)

# DataFrame.hist returns an array of Axes; label the single panel it drew.
np.ravel(rating_axes)[0].set(xlabel='rating', ylabel='number of reviews')
plt.show()
In [7]:
#### Typically, how many vendors does a reviewer review? (Mainly one or two.)
In [8]:
# Count the distinct vendors each reviewer has reviewed, then draw one dot per
# reviewer to show how that count is spread across the reviewer population.
vendors_per_reviewer = reviews.groupby('reviewerId')['vendorId'].nunique()

plt.plot(vendors_per_reviewer, '.')
plt.xlabel('reviewerId')
plt.ylabel('dc(vendorId)')
plt.show()
In [9]:
# Pivot the review-level data into one row per reviewer:
#   rating     -> mean rating the reviewer gave
#   vendorId   -> number of distinct vendors the reviewer reviewed
#   reviewerId -> number of reviews the reviewer wrote
aggregations = {
    'rating': 'mean',
    'vendorId': 'nunique',
    'reviewerId': 'count',
}

# Rename by source column rather than by position, so each label stays attached
# to the right aggregate even if `aggregations` is reordered or extended.
grouped = (
    reviews
    .groupby('reviewerId')
    .agg(aggregations)
    .rename(columns={
        'rating': 'avg_rating',
        'vendorId': 'dc_vendorId',
        'reviewerId': 'count',
    })
)
In [10]:
# Preview the first rows of the per-reviewer aggregates (five rows by default).
grouped.head()
Out[10]:
In [11]:
# Summary statistics for avg_rating, dc_vendorId and count across all reviewers.
grouped.describe()
Out[11]:
In [12]:
# Set the default figure size to 10x10 so the grid of pairwise scatter plots
# between the per-reviewer aggregates is large enough to read.
plt.rc('figure', figsize=(10, 10))
scatter_matrix(grouped)
plt.show()
In [13]:
# Reviewers who reviewed exactly one vendor, wrote more than five reviews, and
# averaged a rating of 0 — listed with the most prolific reviewers first.
grouped.loc[
    lambda g: g['dc_vendorId'].eq(1) & g['count'].gt(5) & g['avg_rating'].eq(0)
].sort_values(by='count', ascending=False)
Out[13]:
For example, reviewer 10392 gives 94 zero-star reviews to vendor 122.
In [14]:
# Look at a sample of the individual reviews written by reviewer 10392.
reviews.loc[reviews['reviewerId'].eq(10392)].head()
Out[14]:
In [15]:
# Reviewers who reviewed exactly one vendor, wrote more than five reviews, and
# averaged a rating of 5 — listed with the most prolific reviewers first.
grouped.loc[
    lambda g: g['dc_vendorId'].eq(1) & g['count'].gt(5) & g['avg_rating'].eq(5)
].sort_values(by='count', ascending=False)
Out[15]:
Reviewer 183751 gives 73 five-star reviews to vendor 190.
In [16]:
# Look at a sample of the individual reviews written by reviewer 183751.
reviews.loc[reviews['reviewerId'].eq(183751)].head()
Out[16]: