In [1]:
import pandas as pd
In [2]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas
# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0
# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")
# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)
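As a quick sketch of what the registered helper does (the toy frame and column name below are made up purely for illustration):
toy_df = pd.DataFrame({'n': range(1000)})
# Behaves like `apply`, but renders the "Progress:" bar while iterating
toy_df['n_squared'] = toy_df['n'].progress_apply(lambda n: n ** 2)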
In [3]:
# df0 = pd.read_pickle('../data/interim/004_synonyms_grouped_1k.p')
df0 = pd.read_pickle('../data/interim/002_keyed_nouns.p')
In [4]:
df0.head()
Out[4]:
In [5]:
dictionary_df00 = pd.read_pickle('../data/interim/003_dictionary.p')
In [6]:
len(dictionary_df00)
Out[6]:
In [7]:
dictionary_df00.head()
Out[7]:
Words that appear only once cannot be frequent even in their own context, so they are filtered out. We then calculate the average frequency of the remaining words. Remember that this dictionary is not limited to nouns.
Note: the grouping of noun synonyms done in `004_grouping_domain_synonyms` is repeated here after the noun filtering, since it takes far less time to apply to the whole dataset once it has been filtered (`004_grouping_domain_synonyms` was applied to only 1k reviews).
In [8]:
dictionary_df00.loc[dictionary_df00['frequency'] > 5].describe()
Out[8]:
In [9]:
dictionary_df00['word'].loc[dictionary_df00['frequency'] > 4].count()
Out[9]:
In [10]:
gt4_dictionary_df01 = dictionary_df00.loc[dictionary_df00['frequency'] > 4]
In [11]:
dictionary_df00['frequency'].loc[dictionary_df00['frequency'] > 4].describe()
Out[11]:
In [12]:
# Keep words below the first-quartile frequency threshold
final_dic = gt4_dictionary_df01.loc[gt4_dictionary_df01['frequency'] < 8]
len(final_dic)
Out[12]:
In [13]:
# Normalise frequencies by dividing by the constant 486
final_dic_df01 = final_dic.assign(normalised=final_dic['frequency'].progress_apply(lambda frequency: frequency / 486))
final_dic_df01.head()
Out[13]:
In [14]:
df0.head()
Out[14]:
In [15]:
df1 = pd.DataFrame(df0.uniqueKey.str.split('##', n=1).tolist(), columns=['userId', 'asin'])
df1.head()
Out[15]:
In [16]:
df_reviewText = pd.DataFrame(df0['reviewText'])
df_reviewText.head()
Out[16]:
In [17]:
df_new = pd.concat([df1, df_reviewText], axis=1)
df_new.head()
Out[17]:
In [18]:
df_new_01 = df_new.assign(wordCountBefore = df_new['reviewText'].progress_apply(lambda review:len(review)))
df_new_01.head()
Out[18]:
In [19]:
final_dic_df01['word'] = final_dic_df01['word'].progress_apply(lambda word: word.replace(" ",""))
final_dic_df01 = final_dic_df01.reset_index()
final_dic_df01.head()
Out[19]:
In [20]:
filtered_dict = final_dic_df01['word'].to_dict()
inv_filtered_dict = {v: k for k, v in filtered_dict.items()}
inv_filtered_dict
Out[20]:
In [21]:
def filter_words(review):
    new_review = []
    for word in review:
        word = word.strip()
        if word in inv_filtered_dict:
            new_review.append(word)
    return new_review
In [22]:
df_new_02 = df_new_01.assign(filteredText = df_new_01['reviewText'].progress_apply(lambda review:filter_words(review)))
In [23]:
df_new_03 = df_new_02.assign(wordCountAfter = df_new_02['filteredText'].progress_apply(lambda review:len(review)))
df_new_03[0:20]
Out[23]:
In [24]:
reduction = 1 - df_new_03['wordCountAfter'].sum() / df_new_03['wordCountBefore'].sum()
In [25]:
print("Average noun reduction achieved:" + str(remaining*100) + "%")
In [26]:
df_books_bigReviews = pd.DataFrame(df_new_03[['asin','filteredText']].groupby(['asin'])['filteredText'].progress_apply(list))
df_books_bigReviews = df_books_bigReviews.reset_index()
df_books_bigReviews = df_books_bigReviews.assign(transactions = df_books_bigReviews['filteredText'].progress_apply(lambda reviews_lis:len(reviews_lis)))
df_books_bigReviews.head()
Out[26]:
In [27]:
from apyori import apriori
# Support
# Support is an indication of how frequently the itemset appears in the dataset.
# Confidence
# Confidence is an indication of how often the rule has been found to be true.
# Lift
# The ratio of the observed support to that expected if X and Y were independent.
def apply_arm(transactions):
    return list(apriori(transactions, min_support=1/len(transactions), min_confidence=1, min_lift=len(transactions), max_length=4))
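For intuition: support(X) is the fraction of transactions that contain X, confidence(X -> Y) = support(X ∪ Y) / support(X), and lift(X -> Y) = confidence(X -> Y) / support(Y). A minimal, hedged sketch of `apyori.apriori` on made-up transactions (the word lists and thresholds are purely illustrative, not the settings used above):
toy_transactions = [
    ['plot', 'character', 'ending'],
    ['plot', 'character'],
    ['plot', 'ending'],
]
# Each RelationRecord exposes the itemset, its support, and per-rule statistics
for record in apriori(toy_transactions, min_support=0.5, min_confidence=0.5):
    print(set(record.items), record.support)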
In [28]:
books_with_arm = df_books_bigReviews.assign(arm = df_books_bigReviews['filteredText'].progress_apply(lambda list_of_reviews:apply_arm(list_of_reviews)))
books_with_arm.head()
Out[28]:
In [29]:
def get_important_nouns(arms):
    imp_nns = []
    if "items" in pd.DataFrame(arms).keys():
        results = list(pd.DataFrame(arms)['items'])
        # Prefer itemsets containing more than 4 nouns
        for result in results:
            if len(list(result)) > 4:
                imp_nns = imp_nns + list(result)
        # If none were found, fall back to itemsets with more than 3 nouns
        if len(imp_nns) == 0:
            for result in results:
                if len(list(result)) > 3:
                    imp_nns = imp_nns + list(result)
            return list(set(imp_nns))
    return list(set(imp_nns))
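A quick, hedged sanity check of the helper above (the itemsets are invented for illustration only):
toy_arms = [
    {'items': frozenset(['plot', 'character', 'ending', 'pace', 'style'])},  # five nouns: kept
    {'items': frozenset(['plot', 'character'])},                             # too short: ignored
]
get_important_nouns(toy_arms)  # -> the five nouns above, in arbitrary order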
In [30]:
imp_nns_df = books_with_arm.assign(imp_nns = books_with_arm['arm']
.progress_apply(lambda arms:get_important_nouns(arms)))
imp_nns_df.head()
Out[30]:
In [31]:
imp_nns_df = imp_nns_df[['asin','imp_nns']]
imp_nns_df.head()
Out[31]:
In [32]:
imp_nns_df.to_pickle("../data/interim/005_important_nouns.p")
In [33]:
imp_nns_df = imp_nns_df.assign(num_of_imp_nouns = imp_nns_df['imp_nns'].progress_apply(lambda imp_nouns:len(imp_nouns)))
imp_nns_df.head()
Out[33]:
In [34]:
import plotly
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflinks
cf.set_config_file(offline=False, world_readable=True, theme='pearl')
In [36]:
# Filter out synonyms again
In [38]:
booksWithNoImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] == 0]
len(booksWithNoImportantNouns)
Out[38]:
In [39]:
booksWithImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] != 0]
len(booksWithImportantNouns)
Out[39]:
In [41]:
booksWithImportantNouns[0:20]
Out[41]:
In [42]:
booksWithImportantNouns['num_of_imp_nouns'].iplot(kind='histogram', bins=100, xTitle='Number of Important Nouns', yTitle='Number of Books')
Out[42]:
In [43]:
booksWithImportantNouns.describe()
Out[43]:
In [ ]: