In [1]:
# For monitoring duration of pandas processes
from tqdm import tqdm
# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0
# Register `pandas.DataFrame.progress_apply` and `pandas.Series.progress_map` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")
# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)
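To see what the registration above gives us, a minimal sketch (on a hypothetical toy DataFrame) of `progress_apply` as it is used throughout this notebook:
In [ ]:
# progress_apply behaves exactly like apply, but renders a tqdm progress bar
import pandas as pd
toy_df = pd.DataFrame({'n': range(1000)})  # hypothetical toy data
toy_df['n_squared'] = toy_df['n'].progress_apply(lambda n: n ** 2)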
In [2]:
import pandas as pd
import numpy as np
import nltk
In [3]:
import plotly
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflinks
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
In [4]:
reviews_and_ratings_df = pd.read_pickle('../data/interim/001_pre_processed_reviews+and_ratings.p')
reviews_and_ratings_df.head()
Out[4]:
In [5]:
reviews_vs_feature_opinion_pairs = pd.read_pickle("../data/interim/006_pairs_per_review.p")
In [6]:
reviews_vs_feature_opinion_pairs.head()
Out[6]:
In [7]:
df00 = reviews_vs_feature_opinion_pairs[['userId','asin','pairs']]
df00.columns = ['reviewerID','asin','pairs']
df00.head()
Out[7]:
In [8]:
df01 = df00.merge(reviews_and_ratings_df, on=['reviewerID','asin'], how='inner')
df01[0:31]
Out[8]:
In [9]:
from nltk.tokenize import sent_tokenize
# Split each review into a list of sentences
df01['reviewText'] = df01['reviewText'].progress_apply(sent_tokenize)
df01.head()
Out[9]:
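For intuition, a minimal sketch of what `sent_tokenize` produces (the input string is hypothetical, and the exact split depends on the installed punkt model):
In [ ]:
from nltk.tokenize import sent_tokenize
sent_tokenize("I loved this book. The ending, however, felt rushed!")
# expected: ['I loved this book.', 'The ending, however, felt rushed!']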
After splitting the reviews into distinct sentences, we next apply the same normalisation process we employed at the beginning of this project, but this time to each sentence rather than to the review as a whole.
In [10]:
# Word Tokenize
import re
import string
import inflect
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\'\w\-]+", gaps=False)
# Convert to Lowercase
def convert_to_lowercase(sentence):
    for i in range(len(sentence)):
        sentence[i] = sentence[i].lower()
    return sentence
# Eliminate Punctuation
def eliminate_punctuation(sentence, regex):
    new_sentence = []
    for token in sentence:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_sentence.append(new_token)
    return new_sentence
r1 = re.compile("([a-zA-Z]+)([0-9]+)")              # e.g. "abc123"
r2 = re.compile("([0-9]+)([a-zA-Z]+)")              # e.g. "123abc"
r3 = re.compile("([a-zA-Z]+)([0-9]+)([a-zA-Z]+)")   # e.g. "abc123def"
r4 = re.compile("([0-9]+)([a-zA-Z]+)([0-9]+)")      # e.g. "123abc456"
# Split tokens that mix letters and digits into separate word and number tokens
def split_words_and_nums(sentence):
    new_sentence = []
    for token in sentence:
        # Check the three-part patterns first so they are not shadowed
        # by their two-part prefixes (r1/r2)
        third_match = r3.match(token)
        fourth_match = r4.match(token)
        first_match = r1.match(token)
        second_match = r2.match(token)
        if third_match:
            new_sentence.extend(third_match.groups())
        elif fourth_match:
            new_sentence.extend(fourth_match.groups())
        elif first_match:
            new_sentence.extend(first_match.groups())
        elif second_match:
            new_sentence.extend(second_match.groups())
        else:
            new_sentence.append(token)
    return new_sentence
# Convert Numbers to Words
def numStringToWord(sentence, p):
    for i in range(len(sentence)):
        # only spell out digit strings shorter than 10 characters
        if sentence[i].isdigit() and len(sentence[i]) < 10:
            sentence[i] = p.number_to_words(sentence[i])
    return sentence
# Replace negated words ("not X") with the antonym of X
class AntonymReplacer(object):
    def replace(self, token, pos=None):
        antonyms = set()
        for syn in wordnet.synsets(token, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        # Only replace when WordNet yields exactly one antonym
        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None
    def replace_negations(self, sentence):
        i, l = 0, len(sentence)
        tokens = []
        while i < l:
            token = sentence[i]
            if token == 'not' and i + 1 < l:
                ant = self.replace(sentence[i + 1])
                if ant:
                    tokens.append(ant)
                    i += 2
                    continue
            tokens.append(token)
            i += 1
        return tokens
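Before wiring these helpers into a single pipeline, some indicative checks on hypothetical tokens (the negation example depends on the installed WordNet data, which for 'uglify' yields the single antonym 'beautify'):
In [ ]:
split_words_and_nums(['kindle2', '2nd'])            # expected: ['kindle', '2', '2', 'nd']
numStringToWord(['2', 'books'], inflect.engine())   # expected: ['two', 'books']
AntonymReplacer().replace_negations(['not', 'uglify', 'it'])  # expected: ['beautify', 'it']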
In [11]:
replacer = AntonymReplacer()
regex = re.compile('[%s]' % re.escape(string.punctuation))
p = inflect.engine()
def normalise_and_tokenize_sentences(review):
    new_review = []
    for sentence in review:
        step_0 = tokenizer.tokenize(sentence)
        step_1 = convert_to_lowercase(step_0)
        step_2 = eliminate_punctuation(step_1, regex)
        step_3 = split_words_and_nums(step_2)
        step_4 = numStringToWord(step_3, p)
        step_5 = replacer.replace_negations(step_4)
        new_review.append(step_5)
    return new_review
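A quick smoke test of the full pipeline on a hypothetical two-sentence review; each sentence should come back lowercased and punctuation-free, with mixed word/number tokens split, short digit strings spelled out, and unambiguous negations replaced:
In [ ]:
toy_review = ["I did not like the 2nd chapter.", "The other 12 chapters were great!"]
normalise_and_tokenize_sentences(toy_review)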
In [12]:
df2 = df01.assign(norm_sentences = df01['reviewText'].progress_apply(normalise_and_tokenize_sentences))
df2.head()
Out[12]:
In [13]:
df2.to_pickle('../data/interim/007_pre_processed_dataset_for_excerpts_extraction.p')
In [14]:
matrix_m01 = df2.as_matrix()
In [15]:
# Append an empty column that will hold the excerpts extracted for each review
matrix_m02 = np.append(matrix_m01, np.zeros([len(matrix_m01), 1]), 1)
sample = pd.DataFrame(matrix_m02[0:10])
sample
Out[15]:
In [16]:
def identify_excerpt_index_for(review_sentences, pair):
    # Return the index of the first sentence containing both the feature
    # and the opinion word of the pair, or None if no sentence contains both
    index = None
    for i in range(len(review_sentences)):
        sentence = review_sentences[i]
        if pair[0] in sentence and pair[1] in sentence:
            index = i
            break
    return index
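A sketch of the lookup on hypothetical normalised sentences and (feature, opinion) pairs:
In [ ]:
toy_sentences = [['the', 'plot', 'was', 'slow'], ['great', 'characters', 'though']]
identify_excerpt_index_for(toy_sentences, ('characters', 'great'))  # expected: 1
identify_excerpt_index_for(toy_sentences, ('pacing', 'great'))      # expected: None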
In [17]:
from tqdm import tqdm
with tqdm(total=len(matrix_m02)) as pbar:
    for i in range(len(matrix_m02)):
        excerpt_indices = []
        actual_sentences = matrix_m02[i][3]   # original sentences (reviewText)
        review_sentences = matrix_m02[i][5]   # normalised token lists (norm_sentences)
        pairs = matrix_m02[i][2]
        for pair in pairs:
            index_of_sentence_with_pair = identify_excerpt_index_for(review_sentences, pair)
            if index_of_sentence_with_pair is not None and index_of_sentence_with_pair not in excerpt_indices:
                excerpt_indices.append(index_of_sentence_with_pair)
        excerpts = []
        for index in excerpt_indices:
            excerpts.append(actual_sentences[index])
        matrix_m02[i][6] = excerpts
        pbar.update(1)
In [18]:
df20 = pd.DataFrame(matrix_m02)
df20.columns = ['reviewerID','asin','pairs','reviewText','overall','norm_sentences','excerpts']
df20.head()
Out[18]:
In [19]:
df30 = df20[['reviewerID','asin','overall','excerpts']]
df30.head()
Out[19]:
In [20]:
len(df30)
Out[20]:
In [21]:
df31 = df30[df30['excerpts'].map(len) > 0]
len(df31)
Out[21]:
In [25]:
231936/249871
Out[25]:
In [26]:
249871 - 231936
Out[26]:
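So roughly 92.8% of the reviews yield at least one excerpt; the remaining 17,935 reviews, whose feature-opinion pairs never co-occur within a single sentence, are dropped.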
In [22]:
import numpy as np
from textblob import TextBlob
def get_overall_polarity(excerpts):
    # Join with a space so adjacent excerpts do not run into one another
    text = ' '.join(excerpts)
    blob = TextBlob(text)
    polarity = []
    for sentence in blob.sentences:
        polarity.append(sentence.sentiment.polarity)
    return np.mean(polarity)
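An indicative check on hypothetical excerpts (TextBlob's lexicon-based polarity is deterministic for a given version, but the exact value is version-dependent):
In [ ]:
get_overall_polarity(["The characters were wonderful.", "The pacing was terrible."])
# indicatively close to 0.0: one strongly positive and one strongly negative
# sentence, averaged per sentence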
In [23]:
df40 = df31.assign(polarity = df31['excerpts'].progress_apply(get_overall_polarity))
df40.head()
Out[23]:
In [24]:
df40.to_pickle('../data/interim/007_excerpts_with_polarity.p')
In [27]:
def merge_list(summariesList):
    # Flatten a list of excerpt lists into a single flat list
    summary = []
    for excerpt in summariesList:
        summary = summary + excerpt
    return summary
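A quick check of the flattening behaviour on toy lists:
In [ ]:
merge_list([['Sentence one.'], ['Sentence two.', 'Sentence three.']])
# expected: ['Sentence one.', 'Sentence two.', 'Sentence three.']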
In [31]:
df_book_summaries = pd.DataFrame(df40.groupby(['asin'])['excerpts'].progress_apply(list)).reset_index()
df_book_summaries.head()
Out[31]:
In [32]:
df_book_summaries['excerpts'] = df_book_summaries['excerpts'].progress_apply(merge_list)
df_book_summaries.head()
Out[32]:
In [37]:
df40["overall"] = pd.to_numeric(df40["overall"], errors='coerce')
df40["polarity"] = pd.to_numeric(df40["polarity"], errors='coerce')
df40.head()
Out[37]:
In [62]:
mean_rating_vs_polarity_per_book = pd.DataFrame(df40.groupby(['asin'])[["overall","polarity"]].mean()).reset_index()
mean_rating_vs_polarity_per_book.head()
Out[62]:
In [83]:
# Normalise polarity values from the TextBlob range [-1, 1]
# onto the [0, 4] scale of the shifted star ratings
def normalise(polarity):
    positive_polarity = polarity + 1                   # [-1, 1] -> [0, 2]
    normalised_polarity = (4 * positive_polarity) / 2  # [0, 2]  -> [0, 4]
    return normalised_polarity
mean_rating_vs_polarity_per_book = mean_rating_vs_polarity_per_book.assign(norm_polarity = mean_rating_vs_polarity_per_book['polarity'].progress_apply(normalise))
mean_rating_vs_polarity_per_book.head()
Out[83]:
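The mapping can be verified at its endpoints: a polarity of -1 should land at 0, neutral 0 at the midpoint 2, and +1 at 4.
In [ ]:
[normalise(polarity) for polarity in (-1, 0, 1)]  # expected: [0.0, 2.0, 4.0]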
In [84]:
mean_rating_vs_polarity_per_book = mean_rating_vs_polarity_per_book.assign(norm_overall = mean_rating_vs_polarity_per_book['overall'].progress_apply(lambda overall: overall - 1))
mean_rating_vs_polarity_per_book.head()
Out[84]:
In [85]:
import numpy as np
x_ratings = mean_rating_vs_polarity_per_book['norm_overall'].values
y_polarity = mean_rating_vs_polarity_per_book['norm_polarity'].values
In [97]:
import plotly.plotly as py
import plotly.graph_objs as go
trace1 = go.Scatter(x=x_ratings, y=y_polarity,
                    mode='markers',
                    name='mean polarity vs mean rating per book')
# Dashed identity reference line: books on it have equal normalised polarity and rating
trace2 = go.Scatter(x=[0, 4], y=[0, 4],
                    mode='lines',
                    line=dict(color='red', width=2, dash='dash'),
                    showlegend=False)
layout = go.Layout(title='Normalised Polarity vs Normalised Rating per Book',
                   xaxis=dict(title='Normalised Mean Rating'),
                   yaxis=dict(title='Normalised Mean Polarity'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
In [96]:
py.iplot(fig)
Out[96]:
In [99]:
# Create a trace
trace = go.Scatter(
    x=x_ratings,
    y=y_polarity,
    mode='markers'
)
layout = go.Layout(title='Correlation between Polarity and Rating',
                   xaxis=dict(title='Ratings'),
                   yaxis=dict(title='Polarity'))
fig = go.Figure(data=[trace], layout=layout)
In [100]:
# Plot and embed in ipython notebook!
py.iplot(fig)
Out[100]:
In [101]:
mean_rating_vs_polarity_per_book['norm_overall'].corr(mean_rating_vs_polarity_per_book['norm_polarity'])
Out[101]:
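Note that Pearson correlation is invariant under positive affine transformations, so this coefficient is identical to the one computed on the raw `overall` and `polarity` columns; the normalisation above only matters for putting both series on the same scale in the plots.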
In [127]:
df_book_summaries.to_csv("../data/processed/007_book_summaries.csv", sep="\t")
df_book_summaries.to_pickle("../data/processed/007_book_summaries.p")
In [112]:
df_book_summaries['asin'].iloc[0]
Out[112]:
In [111]:
print(df_book_summaries['excerpts'].iloc[0])
In [119]:
df_book_summaries['asin'].iloc[1]
Out[119]:
In [121]:
print(df_book_summaries['excerpts'].iloc[1])
In [122]:
df_book_summaries['asin'].iloc[2]
Out[122]:
In [123]:
df_book_summaries['excerpts'].iloc[2]
Out[123]:
In [124]:
df_book_summaries['asin'].iloc[3]
Out[124]:
In [126]:
df_book_summaries['excerpts'].iloc[3]
Out[126]:
In [132]:
df_book_summaries['asin'].iloc[6]
Out[132]:
In [131]:
df_book_summaries['excerpts'].iloc[6]
Out[131]:
In [120]:
## END_OF_FILE