In [4]:
features = [np.array(['X_length', 'X_length', 'X_readability', 'X_readability', 'X_readability',
                      'X_topic', 'X_topic', 'X_topic', 'X_topic', 'X_topic', 'X_topic', 'X_topic',
                      'X_topic', 'X_topic', 'X_sentiment', 'X_sentiment', 'X_sentiment', 'Y']),
            np.array(['sentence', 'word', 'flesch reading', 'word/sentence', 'syllable/word',
                      'topic1', '...', 'topic14', 'n_topic', 'depth', 'depth/word', 'redundancy',
                      'redundancy/sentence', 'rank', 'pos topic', 'neg topic', 'density',
                      'helpful class'])]
df_ex = pd.DataFrame(columns=features)
df_ex
Out[4]:
In [ ]:
df_tablet = pd.read_csv('./data/review_tablet.csv')
In [ ]:
import re
from textblob import TextBlob
df_length = pd.DataFrame(columns=['sentence', 'word'])
for i in range(0, 1257):
    # put a space after punctuation marks that run into the next word
    df_tablet['review'][i] = re.sub(r'(?<=\?)(?=[a-zA-Z])', ' ', df_tablet['review'][i])
    df_tablet['review'][i] = re.sub(r'(?<=\,)(?=[a-zA-Z])', ' ', df_tablet['review'][i])
    df_tablet['review'][i] = re.sub(r'(?<=\!)(?=[a-zA-Z])', ' ', df_tablet['review'][i])
    df_tablet['review'][i] = re.sub(r'(?<=\.)(?=[a-zA-Z])', ' ', df_tablet['review'][i])
    sentence = len(TextBlob(df_tablet['review'][i]).sentences)  # count the number of sentences
    word = len(TextBlob(df_tablet['review'][i]).words)          # count the number of words
    df_length.loc[len(df_length)] = [sentence, word]
df_tablet = pd.concat([df_tablet, df_length], axis=1)  # merge with the original dataframe
df_tablet = df_tablet.drop(['author', 'n_help', 'n_total', 'helpful class'], axis=1)  # drop unnecessary columns
df_tablet['helpful score'] = df_tablet['helpful score'].round(2)  # round to two decimal places
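As a quick illustration of TextBlob's segmentation (a minimal check, not part of the pipeline):
In [ ]:
from textblob import TextBlob
blob = TextBlob('Great tablet. Battery lasts long!')
len(blob.sentences), len(blob.words)  # (2, 5)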
In [ ]:
from textstat.textstat import textstat
df_readability = pd.DataFrame(columns=['flesch reading', 'word/sentence', 'syllable'])
for i in range(0, 1257):
    n_word = df_tablet['word'][i]  # define variables needed in the following calculations
    n_sentence = df_tablet['sentence'][i]
    n_syllable = textstat.syllable_count(df_tablet['review'][i])
    # Flesch reading-ease score: 206.835 - 1.015*(words/sentences) - 84.6*(syllables/words)
    flesch_reading = (206.835 - 1.015 * (n_word / n_sentence) - 84.6 * (n_syllable / n_word)).round(2)
    syllable = (n_syllable / n_word).round(2)        # average syllables per word
    word_sentence = (n_word / n_sentence).round(2)   # average words per sentence
    df_readability.loc[len(df_readability)] = [flesch_reading, word_sentence, syllable]
df_tablet = pd.concat([df_tablet, df_readability], axis=1)  # merge with the original dataframe
In [ ]:
df_tablet['flesch reading'] = df_tablet['flesch reading'].apply(lambda x: 1 if x < 0 else x)  # clip negative scores to 1
In [ ]:
def extract_candidate_words(text, good_tags=set(['NN', 'NNP', 'NNS', 'NNPS'])):
    import itertools, nltk, string
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words, sentence by sentence
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # keep lowercased words with the chosen POS tags, dropping stopwords and pure punctuation
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]
    return candidates
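A quick sanity check on the extractor (this assumes the NLTK punkt, averaged_perceptron_tagger, and stopwords data have been downloaded); the exact output depends on the tagger, but it should keep only lowercased, non-stopword nouns:
In [ ]:
extract_candidate_words('The battery lasts all day, but the screen is dim.')
# roughly: ['battery', 'day', 'screen']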
In [ ]:
topic_candidates = []
for i in range(0, 1257):
    topic_candidate = extract_candidate_words(df_tablet['review'][i])  # extract unigram nouns from each review
    topic_candidates.append(topic_candidate)
In [ ]:
fileObject = open("./data/topic_candidates",'wb') # save topic keywords as a pickle file for future use
pickle.dump(topic_candidates,fileObject)
In [ ]:
fileObject = open("./data/topic_candidates",'r') # load topic keywords
topic_candidates = pickle.load(fileObject)
fileObject.close()
In [ ]:
from collections import Counter
import itertools
# flatten the per-review lists before counting; Counter can't hash a list of lists
Counter(itertools.chain.from_iterable(topic_candidates)).most_common(50)  # most frequently used unigram nouns
In [ ]:
from pattern.en import singularize
df_tablet['keyword'] = 0  # create a new column to hold the tokenized text (the initial value is just a placeholder)
for i in range(0, 1257):
    texts = [singularize(word) for word in df_tablet['review'][i].split()]
    df_tablet['keyword'][i] = texts
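For reference, pattern's singularize maps plural nouns back to the singular forms used in the topic keyword lists:
In [ ]:
from pattern.en import singularize
[singularize(w) for w in ['batteries', 'apps', 'screens']]
# ['battery', 'app', 'screen']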
In [ ]:
%run ./data/topics # module that defines the topic keyword variables used below (topics_column, topics_list, topic_display, ...)
In [ ]:
# create a column for each topic
for topic_column in topics_column:
    df_tablet[topic_column] = 0
In [ ]:
for i in range(0, 1257):  # flag which topics are mentioned in each review (1 = mentioned at least once)
    if any(word in topic_display for word in df_tablet['keyword'][i]): df_tablet['display'][i] = 1
    if any(word in topic_sound for word in df_tablet['keyword'][i]): df_tablet['sound'][i] = 1
    if any(word in topic_os for word in df_tablet['keyword'][i]): df_tablet['os'][i] = 1
    if any(word in topic_security for word in df_tablet['keyword'][i]): df_tablet['security'][i] = 1
    if any(word in topic_hardware for word in df_tablet['keyword'][i]): df_tablet['hardware'][i] = 1
    if any(word in topic_battery for word in df_tablet['keyword'][i]): df_tablet['battery'][i] = 1
    if any(word in topic_bug for word in df_tablet['keyword'][i]): df_tablet['bug'][i] = 1
    if any(word in topic_price for word in df_tablet['keyword'][i]): df_tablet['price'][i] = 1
    if any(word in topic_cs for word in df_tablet['keyword'][i]): df_tablet['cs'][i] = 1
    if any(word in topic_wifi for word in df_tablet['keyword'][i]): df_tablet['wifi'][i] = 1
    if any(word in topic_accesory for word in df_tablet['keyword'][i]): df_tablet['accesory'][i] = 1
    if any(word in topic_app for word in df_tablet['keyword'][i]): df_tablet['app'][i] = 1
    if any(word in topic_compatible for word in df_tablet['keyword'][i]): df_tablet['compatible'][i] = 1
    if any(word in topic_usable for word in df_tablet['keyword'][i]): df_tablet['usable'][i] = 1
In [ ]:
df_tablet['total topic'] = 0
for topic_column in topics_column:
    df_tablet['total topic'] += df_tablet[topic_column]  # total number of topics mentioned per review
In [ ]:
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
tokenizer = RegexpTokenizer(r'\w+')
df_depth = pd.DataFrame(columns=['depth'])
for i in range(0, 1257):
    sentences = sent_tokenize(df_tablet['review'][i])
    depth_sentences = set()
    for sentence in sentences:
        for topic_list in topics_list:
            if any(word in word_tokenize(sentence) for word in topic_list):
                depth_sentences.add(sentence)
    depth_sentence = [len(tokenizer.tokenize(depth_sentence)) for depth_sentence in depth_sentences]
    df_depth.loc[len(df_depth)] = [sum(depth_sentence)]
    # print('review No', i, 'has', sum(depth_sentence), 'words related to the topics')
df_tablet = pd.concat([df_tablet, df_depth], axis=1)
In [12]:
df_tablet['depth/word'] = (df_tablet['depth'] / df_tablet['word']).round(2) # depth per word
In [ ]:
from nltk.tokenize import sent_tokenize, word_tokenize
df_redundancy = pd.DataFrame(columns=['redundancy'])
for i in range(0, 1257):
    sentences = sent_tokenize(df_tablet['review'][i])
    depth_sentences = set()
    for sentence in sentences:
        for topic_list in topics_list:
            if any(word in word_tokenize(sentence) for word in topic_list):
                depth_sentences.add(sentence)
    redundancy = len(sentences) - len(depth_sentences)  # sentences that mention no topic
    df_redundancy.loc[len(df_redundancy)] = [redundancy]
    # print(i, len(sentences) - len(depth_sentences))  # redundancy
df_tablet = pd.concat([df_tablet, df_redundancy], axis=1)
In [18]:
df_tablet['redundancy/sentence'] = (df_tablet['redundancy'] / df_tablet['sentence']).round(2)
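To make the depth and redundancy logic concrete, here is a toy run with a hypothetical stand-in for topics_list (the real keyword lists live in ./data/topics): depth sums the words in sentences that mention a topic, and redundancy counts the sentences that mention none.
In [ ]:
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize
toy_topics_list = [['battery'], ['screen']]  # hypothetical stand-in for topics_list
review = 'The battery dies fast. I love it. The screen is sharp.'
sentences = sent_tokenize(review)
on_topic = {s for s in sentences
            if any(w in word_tokenize(s) for topic in toy_topics_list for w in topic)}
tokenizer = RegexpTokenizer(r'\w+')
depth = sum(len(tokenizer.tokenize(s)) for s in on_topic)  # 8 words in the two on-topic sentences
redundancy = len(sentences) - len(on_topic)                # 1 off-topic sentence ('I love it.')
depth, redundancy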
In [ ]:
df_rank = pd.DataFrame(columns=['rank'])
for i in range(0, 1257):
    rank = []
    for topic_column in topics_column:
        # weight each mentioned topic by its share of all topic mentions in the corpus
        rank.append(round(df_tablet[topic_column][i] * int(df_tablet[topic_column].sum()) / int(df_tablet['total topic'].sum()), 2))
    df_rank.loc[len(df_rank)] = [sum(rank)]
df_tablet = pd.concat([df_tablet, df_rank], axis=1)
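In other words, 'rank' weights each topic a review mentions by that topic's share of all topic mentions across the corpus, so reviews covering frequently discussed topics score higher. A hypothetical illustration with made-up counts:
In [ ]:
# hypothetical corpus counts: 'battery' flagged 120 times, 'price' 60, out of 1200 total flags
total_flags = 1200
round(1 * 120 / total_flags, 2) + round(1 * 60 / total_flags, 2)  # rank = 0.1 + 0.05 = 0.15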
In [ ]:
from nltk.tokenize import sent_tokenize, word_tokenize
In [ ]:
%run ./data/sentiment_pos_neg.py # script that counts the number of positive/negative topics in each review
In [ ]:
sentiment_pos_neg() # run the function defined in the external module
In [ ]:
df_tablet = pd.concat([df_tablet, df_sentiment], axis=1)
In [ ]:
df_density = pd.DataFrame(columns=['density'])
for i in range(0, 1257):
    if df_tablet['total topic'][i] != 0:
        # share of mentioned topics that carry sentiment
        density = ((df_tablet['pos topic'][i] + df_tablet['neg topic'][i]) / df_tablet['total topic'][i]).round(2)
    else:
        density = 0
    df_density.loc[len(df_density)] = [density]
df_tablet = pd.concat([df_tablet, df_density], axis=1)
In [ ]:
# assign a binary help class (0 or 1) so the task can be framed as classification
df_tablet['help class'] = df_tablet['helpful score'].apply(lambda x: 1 if x >= 0.80 else 0)
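Before modeling, it is worth a quick look at the resulting class balance (the output depends on the data):
In [ ]:
df_tablet['help class'].value_counts()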
In [23]:
cols = ['star', 'review', 'sentence', 'word', 'flesch reading', 'word/sentence', 'syllable', 'display', 'sound',
        'os', 'security', 'hardware', 'battery', 'bug', 'price', 'cs', 'wifi', 'accesory', 'app', 'compatible',
        'usable', 'total topic', 'depth', 'depth/word', 'redundancy', 'redundancy/sentence', 'rank', 'pos topic',
        'neg topic', 'density', 'helpful score', 'help class']
df_tablet = df_tablet[cols]  # reorder the columns
In [ ]:
df_tablet.isnull().any() # check whether there's any null data
In [8]:
pd.set_option('display.max_columns', None)
In [24]:
df_tablet.describe()
Out[24]:
In [25]:
df_tablet.tail(3)
Out[25]:
In [ ]:
# save the preprocessed dataframe as a pickle file
import pickle
fileObject = open("./data/preprocessing", 'wb')
pickle.dump(df_tablet, fileObject)
fileObject.close()