In [47]:
import ast
import os
import warnings

import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')
In [48]:
import plotly
import plotly.plotly as py
In [49]:
# Plotly credentials are read from the environment so that no secret is stored in the notebook.
# Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel (a missing variable raises KeyError).
# NOTE(review): an API key was previously committed in this cell; treat it as leaked and rotate it.
plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)
In [50]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [51]:
import cufflinks as cf
print(cf.__version__)
import pandas as pd
import numpy as np
import gzip
# Configure cufflinks: online (non-offline) mode, world-readable plots, 'pearl' theme
cf.set_config_file(
    offline=False,
    world_readable=True,
    theme='pearl',
)
The functions below are taken directly from the Amazon Review Data page, where they are provided by the dataset's author, and are used to load the 5-core book reviews into a pandas DataFrame.
In [52]:
def parse(path):
    """Lazily yield one record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python literal (for this dataset,
    a dict describing one review).

    Parameters
    ----------
    path : str
        Location of the gzip-compressed file.

    Yields
    ------
    object
        The literal parsed from each line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a valid Python literal.
    """
    # The context manager guarantees the file handle is released once the
    # generator is exhausted or closed.
    with gzip.open(path, 'rt', encoding='utf-8') as g:
        for l in g:
            l = l.strip()
            if not l:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # SECURITY: eval() would execute arbitrary code embedded in the data
            # file; literal_eval only accepts literals (dict, list, str, numbers...).
            yield ast.literal_eval(l)
In [53]:
def getDF(path):
    """Load every record yielded by ``parse(path)`` into a pandas DataFrame.

    Records are keyed by their running position (0, 1, 2, ...) in the order
    they are yielded, and each record becomes one row of the result.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')
In [54]:
# Location of the gzip-compressed book reviews. Set the REVIEWS_BOOKS_PATH environment
# variable to point at your own copy; the default is the original local download path.
reviews_path = os.environ.get(
    'REVIEWS_BOOKS_PATH',
    '/Users/falehalrashidi/Downloads/reviews_Books_5.json.gz',
)
df = getDF(reviews_path)
I used the snippet below to monitor the memory required to load the data.
In [55]:
# Load the memory_profiler IPython extension, which provides the %memit magic.
%load_ext memory_profiler
# NOTE(review): %memit is invoked without a statement, so it does not wrap the load itself;
# presumably it is used to read the process memory after `df` has been loaded -- confirm.
%memit
Below you can see the fields that were loaded and the number of values per field:
In [56]:
# Number of non-missing values in each column of the loaded reviews
df.count()
Out[56]:
A sample of the overall data appears next:
In [57]:
# Preview the first ten reviews
df.iloc[:10]
Out[57]:
In general, the loaded dataframe includes 9 fields:
reviewerID: A string (probably a hash) that uniquely identifies the user who submitted the review. asin: ASIN stands for Amazon Standard Identification Number. Almost every product on Amazon has its own ASIN, a unique code used to identify it. For books, the ASIN is the same as the book's ISBN. reviewerName: The name of the reviewer. helpful: Amazon has implemented an interface that allows customers to vote on whether a particular review
has been helpful or unhelpful. This is captured by this field, which represents a rating of the review, e.g. [2,3] --> 2/3. reviewText: The actual review provided by the reviewer. overall: The product's rating attributed by the same reviewer. summary: A summary of the review. unixReviewTime: Time of the review (unix time). reviewTime: Time of the review (raw). Of these fields, for the purposes of this project we keep reviewerID, asin, reviewText, overall and helpful. Specifically, we keep reviewerID only to merge it with asin and create a unique identifier (key) per review, e.g.:
key = reviewerID:"A10000012B7CGYKOMPQ4L" + asin:"000100039X"
asin is obviously necessary to identify the distinct books in the dataset, while the rest are necessary for the analysis (overall, reviewText) and for evaluation (helpful) purposes.
In [58]:
# Total number of reviews (one row per review), printed with thousands separators
number_of_reviews = df.shape[0]
my_number_string = format(number_of_reviews, '0,.0f')
print('Number of Reviews: %s.' % my_number_string)
In [59]:
# Number of distinct books, i.e. distinct 'asin' values (a missing value would count as one)
unique_books = df['asin'].nunique(dropna=False)
my_number_string = format(unique_books, '0,.0f')
print('Number of Books: %s.' % my_number_string)
In [60]:
# Histogram of the 'overall' ratings, drawn with matplotlib via pandas
# (too many data points to plot with plotly).
# Note: despite its name, `fig` holds the object returned by pandas' plot.hist.
fig = df['overall'].plot.hist(
    title='Ratings Distribution',
    figsize=(15, 7),
    alpha=0.5,
    grid=True,
)
fig.set_xlabel("Ratings")
fig.set_ylabel("Number of Review")
Out[60]:
In [61]:
# Keep only the rating and the book identifier
df10 = df.loc[:, ['overall', 'asin']]
In [62]:
# Mean rating per book, as a one-column frame indexed by 'asin'
df11 = df10.groupby('asin')['overall'].mean().to_frame()
In [63]:
# Number of rows in the per-book average table (one row per 'asin' group)
df11.shape[0]
Out[63]:
In [64]:
# Turn the 'asin' group index back into an ordinary column, then preview the result.
# Note: this cell rebinds df11, so re-running it alone adds a further index column.
df11 = df11.reset_index(drop=False)
df11.head(5)
Out[64]:
In [94]:
# Histogram of df11 with 100 bins (matplotlib via pandas).
# Interactive cufflinks alternative, kept for reference:
#df11['overall'].iplot(kind='histogram', bins=100, xTitle='Rating (0-5)',yTitle='Number of Books', title='Average Book Ratings')
df11.plot.hist(
    bins=100,
    alpha=0.5,
)
Out[94]:
In [67]:
# Keep only the book identifier and the raw review time
df20 = df.loc[:, ['asin', 'reviewTime']]
In [68]:
def get_year(reviewTime):
    """Extract the year portion of a raw review-time string.

    The value is split on commas; when it splits into exactly two parts, the
    second part is taken to be the year (e.g. ``'05 21, 2013'`` -> ``'2013'``).

    Parameters
    ----------
    reviewTime : str
        Raw review time, presumably formatted like ``'MM DD, YYYY'``.

    Returns
    -------
    str
        The text after the comma with surrounding whitespace removed, or
        ``'0'`` when the value is not a string or does not contain exactly
        one comma.
    """
    # Missing values (None / NaN) have no .split(); map them to the placeholder.
    if not isinstance(reviewTime, str):
        return '0'
    day_month_year_list = reviewTime.split(',')
    if len(day_month_year_list) == 2:
        # Strip the blank that follows the comma so years group as '2013', not ' 2013'.
        return day_month_year_list[1].strip()
    # The previous fallback called an undefined name (`fillna`) and raised NameError;
    # return a placeholder of the same type as the normal result instead.
    return '0'
In [69]:
# Derive the review year for every row. `assign` returns a new frame, so the column is
# not written into a slice of `df` (which is what triggers pandas' SettingWithCopyWarning).
df20 = df20.assign(reviewYear=df20['reviewTime'].apply(get_year))
In [70]:
# Preview the first rows, including the derived 'reviewYear' column
df20.head()
Out[70]:
In [71]:
# Number of rows (i.e. reviews) per review year, as a one-column frame
books_per_year = df20.groupby('reviewYear').size().to_frame()
In [72]:
# Give the single column produced by the group-size computation a descriptive name
books_per_year.columns = ['counts']
In [74]:
# Bar chart of the yearly review counts (rendered through cufflinks' iplot)
books_per_year.iplot(
    kind='bar',
    title='Number of Reviews per Year',
    xTitle='Years',
    yTitle='Number of Reviews',
)
Out[74]:
In [75]:
# Keep the book identifier, the raw review time and the rating
df30 = df.loc[:, ['asin', 'reviewTime', 'overall']]
In [76]:
# Derive the review year for every row. `assign` returns a new frame, so the column is
# not written into a slice of `df` (which is what triggers pandas' SettingWithCopyWarning).
df30 = df30.assign(reviewYear=df30['reviewTime'].apply(get_year))
In [77]:
# Preview the first rows, including the derived 'reviewYear' column
df30.head()
Out[77]:
In [78]:
# Count the rows for every (year, rating) pair and flatten the result into a 'counts' column
books_per_rating_per_year = (
    df30
    .groupby(['reviewYear', 'overall'])
    .size()
    .reset_index(name='counts')
)
In [79]:
# Preview the first ten (year, rating, counts) rows
books_per_rating_per_year.iloc[:10]
Out[79]:
In [80]:
# Reshape to one row per year and one column per rating value, holding the counts
pivot_df = books_per_rating_per_year.pivot(
    index='reviewYear',
    columns='overall',
    values='counts',
)
In [81]:
# Stacked bar chart: one bar per year, one stacked segment per rating value
pivot_df.iplot(
    kind='bar',
    barmode='stack',
    title='Number of Reviews per Rating per Year',
    xTitle='Years',
    yTitle='Number of Reviews',
)
Out[81]:
In [82]:
# Keep the book identifier and the helpfulness votes
df40 = df.loc[:, ['asin', 'helpful']]
In [83]:
# New column 'enum': the numerator, i.e. the first element of each 'helpful' pair
df40 = df40.assign(enum=df40['helpful'].map(lambda votes: votes[0]))
In [84]:
# New column 'denom': the denominator, i.e. the second element of each 'helpful' pair
df40 = df40.assign(denom=df40['helpful'].map(lambda votes: votes[1]))
In [85]:
# Drop the rows whose denominator is zero
has_votes = df40['denom'].ne(0)
df40 = df40[has_votes]
In [86]:
# Preview the first fifteen remaining rows
df40.iloc[:15]
Out[86]:
In [87]:
# Number of rows left after removing zero denominators
df40.shape[0]
Out[87]:
In [88]:
# Histogram of the 'denom' values, using unit-spaced bin edges 0, 1, ..., 99
bin_values = np.arange(100)
df40['denom'].plot.hist(
    bins=bin_values,
    alpha=0.5,
    grid=True,
    figsize=(15, 7),
    title='Distribution of Binary Helpfulness Ratings Counts per Review',
)
Out[88]:
In [89]:
# Keep only the rows with more than 15 and fewer than 100 votes (both bounds exclusive)
vote_counts = df40['denom']
df40 = df40.loc[(vote_counts > 15) & (vote_counts < 100)]
len(df40)
Out[89]:
In [ ]:
# 'percentage' is the ratio enum / denom (a fraction, not multiplied by 100)
helpful_ratio = df40['enum'].div(df40['denom'])
df50 = df40.assign(percentage=helpful_ratio)
df50['percentage'].iplot(
    kind='histogram',
    title='Distribution of Helpfulness Percentage',
)
In [90]:
# Preview the first rows, including the new 'percentage' column
df50.head()
Out[90]:
In [91]:
# Retain the rows whose 'percentage' is strictly above the cut-off
threshold = 0.7
df60 = df50[df50['percentage'].gt(threshold)]
In [92]:
# Number of rows above the threshold
df60.shape[0]
Out[92]:
In [95]:
# END OF FILE