This notebook is used for identifying and grouping domain synonyms in the book reviews of each book. The approach relies on creating an NLTK-based context per book, which is used to compare only nouns (via WordNet synsets and Wu-Palmer similarity). Nouns that appear to have a highly similar context are grouped together under the same name (either the one's or the other's).
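As a quick illustration of the kind of comparison used throughout this notebook (a minimal sketch with invented example words, assuming the WordNet corpus has been downloaded; the 0.9 threshold matches the one applied later):
from nltk.corpus import wordnet as wn
# Wu-Palmer similarity between the first noun senses of two candidate synonyms
novel = wn.synsets("novel", pos=wn.NOUN)[0]
book = wn.synsets("book", pos=wn.NOUN)[0]
print(novel.wup_similarity(book))  # a score above 0.9 would group the two nouns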
In [1]:
# For monitoring duration of pandas processes
from tqdm import tqdm
# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0
# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")
# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)
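As a toy sketch (not part of the pipeline) of what the registration above enables:
import pandas as pd  # imported in the next cell; repeated here so the sketch is self-contained
toy = pd.Series(range(1000))
toy.progress_apply(lambda x: x ** 2)  # same result as .apply, but with a progress bar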
In [2]:
import pandas as pd
df0 = pd.read_csv("../data/interim/002_keyed_nouns.csv", sep="\t", low_memory=False)
df0.head()
Out[2]:
Convert reviewText back from its serialized string form to a list of strings.
In [3]:
def convert_text_to_list(review):
    # Strip list brackets, quotes and tabs, then split on commas
    return review.replace("[", "").replace("]", "").replace("'", "").replace("\t", "").split(",")

# Convert the "reviewText" field back to a list
df0['reviewText'] = df0['reviewText'].astype(str)
df0['reviewText'] = df0['reviewText'].progress_apply(convert_text_to_list)
df0['reviewText'].head()
Out[3]:
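For reference, a minimal example of what the conversion produces (the sample string is invented):
sample = "['story', 'plot', 'character']"
convert_text_to_list(sample)
# -> ['story', ' plot', ' character'] -- the leading spaces survive the split,
# which is why words are cleaned with .replace(" ", "") further down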
Split the unique key into userId and asin.
In [4]:
df1 = pd.DataFrame(df0.uniqueKey.str.split('##', n=1).tolist(), columns=['userId', 'asin'])
df1.head()
Out[4]:
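A toy sketch of the split on invented keys (assuming the userId##asin layout used above):
pd.Series(['AEXAMPLEUSER##0001', 'BEXAMPLEUSER##0002']).str.split('##', n=1).tolist()
# -> [['AEXAMPLEUSER', '0001'], ['BEXAMPLEUSER', '0002']]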
In [5]:
df_reviewText = pd.DataFrame(df0['reviewText'])
df_reviewText.head()
Out[5]:
Create a new dataframe with userId, asin and reviewText.
In [6]:
df_new = pd.concat([df1, df_reviewText], axis=1)
In [7]:
df_new.head()
Out[7]:
Drop userId and group the reviews of the same book by asin.
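The groupby-to-list pattern used in the next cells, sketched on invented data:
toy = pd.DataFrame({'asin': ['a', 'a', 'b'],
                    'reviewText': [['plot'], ['story'], ['hero']]})
toy.groupby('asin')['reviewText'].apply(list)
# a -> [['plot'], ['story']]
# b -> [['hero']]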
In [8]:
df_books = df_new.drop(columns=['userId'])
In [9]:
df_books_bigReviews = df_books.groupby(['asin'])['reviewText'].progress_apply(list)
In [10]:
df_books_bigReviews_df = pd.DataFrame(df_books_bigReviews).reset_index()
df_books_bigReviews_df.head()
Out[10]:
In [11]:
def merge_list(reviewsList):
    # Flatten the per-book list of reviews into a single list of unique words
    new_list = []
    for review in reviewsList:
        new_list = new_list + review
    return list(set(new_list))
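A minimal example of the merge on invented input; note that set() drops duplicates and makes the output order arbitrary:
merge_list([['plot', 'story'], ['story', 'ending']])
# -> e.g. ['ending', 'plot', 'story'], in no guaranteed order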
In [12]:
df_books_bigReviews_single_list_df = df_books_bigReviews.progress_apply(merge_list)
df_books_bigReviews_single_list_df.head()
Out[12]:
In [13]:
df_books_vs_bigreviews = pd.DataFrame(df_books_bigReviews_single_list_df).reset_index()
df_books_vs_bigreviews.head()
Out[13]:
In [14]:
df2 = df_books_vs_bigreviews
In [15]:
len(df2.reviewText[0])
Out[15]:
In [16]:
from nltk.corpus import wordnet as wn
from itertools import product

def get_synonyms_dict(bigReview, theta):
    synonyms = {}
    for i in range(len(bigReview)):
        wordx = bigReview[i]
        for j in range(i, len(bigReview)):
            wordy = bigReview[j]
            # don't compare with the same word
            if wordx == wordy:
                continue
            sem1, sem2 = wn.synsets(wordx), wn.synsets(wordy)
            maxscore = 0.0
            for k, l in product(sem1, sem2):
                score = k.wup_similarity(l)  # Wu-Palmer Similarity
                if score is not None and score > maxscore:
                    maxscore = score
            if maxscore > theta and wordy not in synonyms:
                synonyms[wordx] = wordy
    return synonyms
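A small sketch of the function on invented input. Exact scores depend on the installed WordNet version, but 'car' and 'automobile' share a synset, so their Wu-Palmer score is 1.0:
get_synonyms_dict(['car', 'automobile', 'dog'], 0.9)
# -> {'car': 'automobile'} ('dog' scores well below 0.9 against the others)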
From this point onwards the computational cost increases dramatically, so I reduce the dataset to keep only 1000 of the 59324 books.
In [119]:
# Get Synonym Dicts per Book Reviews
df3 = df2[0:1000].assign(synDict = df2['reviewText'][0:1000].progress_apply(lambda big_review: get_synonyms_dict(big_review, 0.9)))
df3.head()
Out[119]:
In [132]:
df4 = df3.drop(columns=['reviewText'])
df4.head()
Out[132]:
In [133]:
df5 = pd.merge(df_new[0:1000], df4, how='inner', on='asin')
df5.head()
Out[133]:
In [134]:
matrix_m01 = df5.to_numpy()  # DataFrame.as_matrix() was removed in recent pandas versions
In [135]:
# Replace each word with its synonym-group representative, if one exists
for i in range(len(matrix_m01)):
    new_list = []
    for word in matrix_m01[i][2]:
        clean_word = word.replace(" ", "")
        if clean_word in matrix_m01[i][3]:
            new_list.append(matrix_m01[i][3][clean_word])
        else:
            new_list.append(clean_word)
    matrix_m01[i][2] = new_list
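The same replacement can be written more compactly with dict.get, sketched here on invented data:
syn = {'novel': 'book'}
[syn.get(w.replace(" ", ""), w.replace(" ", "")) for w in ['novel', ' plot']]
# -> ['book', 'plot']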
In [137]:
df_final = pd.DataFrame(matrix_m01)
df_final.head()
Out[137]:
In [139]:
df_final.columns = ['userId','asin', 'reviewText', 'synDict']
df_final.head()
Out[139]:
In [140]:
df_final = df_final.drop(columns=['synDict'])
In [141]:
df_final.head()
Out[141]:
In [142]:
df_final.to_csv("../data/interim/004_synonyms_grouped_1k.csv", sep='\t', header=True, index=False)
In [143]:
df_final.to_pickle("../data/interim/004_synonyms_grouped_1k.p")
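To reload the results later, note that read_csv brings reviewText back in its serialized string form, while the pickle preserves the actual lists:
df_check = pd.read_pickle("../data/interim/004_synonyms_grouped_1k.p")
df_check.head()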
In [129]:
# END OF FILE