feature/target overview


In [4]:
# Overview of the feature/target layout used throughout this notebook:
# a two-level column header (feature group, feature name) on an empty frame.
feature_groups = np.array(
    ['X_length'] * 2 + ['X_readability'] * 3 + ['X_topic'] * 9
    + ['X_sentiment'] * 3 + ['Y']
)
feature_names = np.array(
    ['sentence', 'word',
     'flesch reading', 'word/sentence', 'syllable/word',
     'topic1', '...', 'topic14', 'n_topic', 'depth', 'depth/word',
     'redundancy', 'redundancy/sentence', 'rank',
     'pos topic', 'neg topic', 'density',
     'helpful class']
)
features = [feature_groups, feature_names]
df_ex = pd.DataFrame(columns=features)
df_ex


Out[4]:
X_length X_readability X_topic X_sentiment Y
sentence word flesch reading word/sentence syllable/word topic1 ... topic14 n_topic depth depth/word redundancy redundancy/sentence rank pos topic neg topic density helpful class


In [ ]:
df_tablet = pd.read_csv('./data/review_tablet.csv')  # raw tablet reviews (1,257 rows per the describe() output below)

1. length


In [ ]:
import re
from textblob import TextBlob

# Insert a space after ? , ! . when a letter follows immediately, so the
# sentence/word tokenizers see proper boundaries (one combined pattern replaces
# the original's four sequential substitutions; behavior is identical).
records = []
for i in range(len(df_tablet)):
    review = re.sub(r'(?<=[?,!.])(?=[a-zA-Z])', ' ', df_tablet['review'][i])
    df_tablet.at[i, 'review'] = review  # .at avoids chained-assignment pitfalls

    blob = TextBlob(review)  # tokenize once, reuse for both counts
    records.append({'sentence': len(blob.sentences), 'word': len(blob.words)})

# Build the frame in one shot instead of growing it row by row.
df_length = pd.DataFrame(records, columns=['sentence', 'word'])

df_tablet = pd.concat([df_tablet, df_length], axis=1)  # merge with the original df
df_tablet = df_tablet.drop(['author', 'n_help', 'n_total', 'helpful class'], axis=1)  # drop unnecessary columns
df_tablet['helpful score'] = df_tablet['helpful score'].round(2)  # round to two decimal places

2. readability


In [ ]:
from textstat.textstat import textstat

df_readability = pd.DataFrame(columns=['flesch reading', 'word/sentence', 'syllable'])

for i in range(len(df_tablet)):
    n_word = df_tablet['word'][i]  # counts computed in the length-feature cell
    n_sentence = df_tablet['sentence'][i]
    n_syllable = textstat.syllable_count(df_tablet['review'][i])

    words_per_sentence = n_word / n_sentence  # average words per sentence
    syllables_per_word = n_syllable / n_word  # average syllables per word
    # Flesch reading-ease formula; builtin round() works for both numpy and
    # plain-Python scalars (the original relied on a .round() method).
    flesch_reading = round(206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word, 2)

    df_readability.loc[len(df_readability)] = [flesch_reading,
                                               round(words_per_sentence, 2),
                                               round(syllables_per_word, 2)]

df_tablet = pd.concat([df_tablet, df_readability], axis=1)  # merge with original dataframe

In [ ]:
# Floor negative Flesch scores at 1 (very unreadable reviews can score below 0).
df_tablet['flesch reading'] = df_tablet['flesch reading'].mask(df_tablet['flesch reading'] < 0, 1)

3. topic



In [ ]:
def extract_candidate_words(text, good_tags=set(['NN','NNP','NNS','NNPS'])):
    """Return lowercased candidate topic words (nouns by default) from `text`,
    excluding English stopwords and tokens made entirely of punctuation."""
    import itertools, nltk, string

    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Split into sentences, tokenize each, then POS-tag sentence by sentence.
    tokenized_sentences = (nltk.word_tokenize(sent)
                           for sent in nltk.sent_tokenize(text))
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(tokenized_sentences))

    candidates = []
    for word, tag in tagged_words:
        lowered = word.lower()
        # Keep only the requested POS tags, drop stopwords and pure punctuation.
        if tag in good_tags and lowered not in stop_words \
                and not all(char in punct for char in word):
            candidates.append(lowered)

    return candidates

In [ ]:
# Extract unigram nouns from every review.
# Fix: the original assigned `topics_candidate` but then appended the undefined
# name `topic_candidate`, which raises NameError on a fresh run.
topic_candidates = []
for i in range(len(df_tablet)):
    topic_candidates.append(extract_candidate_words(df_tablet['review'][i]))

In [ ]:
import pickle  # not yet imported at this point in the notebook

# Save topic keywords for future use; `with` guarantees the handle is
# flushed and closed (the original never closed the write handle).
with open("./data/topic_candidates", 'wb') as fileObject:
    pickle.dump(topic_candidates, fileObject)

In [ ]:
import pickle  # not yet imported at this point in the notebook

# Load topic keywords.
# Fix: pickle files must be opened in binary mode ('rb'); text mode ('r')
# raises under Python 3.
with open("./data/topic_candidates", 'rb') as fileObject:
    topic_candidates = pickle.load(fileObject)

In [ ]:
from collections import Counter
from itertools import chain

# `topic_candidates` is a list of per-review word lists; flatten it before
# counting — Counter over the raw list would receive unhashable list elements
# and raise TypeError.
Counter(chain.from_iterable(topic_candidates)).most_common(50)

In [ ]:
from pattern.en import singularize

# Build a per-review list of singularized tokens. Assigning the whole column at
# once replaces the original's `df_tablet['keyword'][i] = texts` chained
# assignment, which triggers SettingWithCopyWarning and is unreliable.
df_tablet['keyword'] = [
    [singularize(word) for word in review.split()]
    for review in df_tablet['review']
]

  • #### topic_1, topic_2, ..., topic_14

In [ ]:
%run ./data/topics # module that contains variables including topic keywords

In [ ]:
# columns with topic words
for topic_column in topics_column:
    df_tablet[topic_column] = 0

In [ ]:
# Map each topic indicator column to its keyword set (defined by ./data/topics).
topic_keywords = {
    'display': topic_display, 'sound': topic_sound, 'os': topic_os,
    'security': topic_security, 'hardware': topic_hardware,
    'battery': topic_battery, 'bug': topic_bug, 'price': topic_price,
    'cs': topic_cs, 'wifi': topic_wifi, 'accesory': topic_accesory,
    'app': topic_app, 'compatible': topic_compatible, 'usable': topic_usable,
}

# Set a 0/1 flag per topic per review (not an occurrence count: a single match
# is enough). `.at` replaces the original's 14 chained-assignment lines.
for i in range(len(df_tablet)):
    keywords = df_tablet['keyword'][i]
    for column, topic_words in topic_keywords.items():
        if any(word in topic_words for word in keywords):
            df_tablet.at[i, column] = 1
  • #### number of topics per review

In [ ]:
# Total number of distinct topics mentioned per review: row-wise sum of the
# 0/1 topic indicator columns.
df_tablet['total topic'] = df_tablet[topics_column].sum(axis=1)
  • #### depth

In [ ]:
# Fix: sent_tokenize / word_tokenize were only imported in a LATER cell in the
# original, so this cell failed on Restart & Run All. Import them here.
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize

tokenizer = RegexpTokenizer(r'\w+')

df_depth = pd.DataFrame(columns=['depth'])

# 'depth' = total word count of the sentences that mention at least one
# topic keyword.
for i in range(len(df_tablet)):
    sentences = sent_tokenize(df_tablet['review'][i])
    depth_sentences = set()
    for sentence in sentences:
        tokens = word_tokenize(sentence)  # tokenize once, not once per topic list
        if any(word in tokens
               for topic_list in topics_list
               for word in topic_list):
            depth_sentences.add(sentence)

    depth_sentence = [len(tokenizer.tokenize(s)) for s in depth_sentences]
    df_depth.loc[len(df_depth)] = [sum(depth_sentence)]
    #print('review No', i, 'has', sum(depth_sentence), 'words related to the topic')
df_tablet = pd.concat([df_tablet, df_depth], axis=1)

In [12]:
# Depth normalised by review length (words), rounded to two decimals.
df_tablet['depth/word'] = df_tablet['depth'].div(df_tablet['word']).round(2)
  • #### redundancy

In [ ]:
from nltk.tokenize import sent_tokenize, word_tokenize

df_redundancy = pd.DataFrame(columns=['redundancy'])

# 'redundancy' = number of sentences that mention NO topic keyword at all.
for i in range(len(df_tablet)):
    sentences = sent_tokenize(df_tablet['review'][i])
    depth_sentences = set()
    for sentence in sentences:
        tokens = word_tokenize(sentence)  # tokenize once, not once per topic list
        if any(word in tokens
               for topic_list in topics_list
               for word in topic_list):
            depth_sentences.add(sentence)

    redundancy = len(sentences) - len(depth_sentences)
    df_redundancy.loc[len(df_redundancy)] = [redundancy]
df_tablet = pd.concat([df_tablet, df_redundancy], axis=1)

In [18]:
# Redundancy normalised by sentence count, rounded to two decimals.
df_tablet['redundancy/sentence'] = df_tablet['redundancy'].div(df_tablet['sentence']).round(2)
  • #### rank

In [ ]:
df_rank = pd.DataFrame(columns=['rank'])

# Pre-compute per-topic totals and the grand total once — the original
# recomputed both sums inside the inner loop (14 x n_reviews times).
topic_totals = {col: int(df_tablet[col].sum()) for col in topics_column}
grand_total = int(df_tablet['total topic'].sum())

# Each review's rank is the sum of the (flag * topic popularity share) terms.
for i in range(len(df_tablet)):
    rank = [round(df_tablet[col][i] * topic_totals[col] / grand_total, 2)
            for col in topics_column]
    df_rank.loc[len(df_rank)] = [sum(rank)]
df_tablet = pd.concat([df_tablet, df_rank], axis=1)

4. sentiment

  • #### positive/negative topics

In [ ]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [ ]:
%run ./data/sentiment_pos_neg.py # scripts that contain how to count number of postive/negative topics in each review

In [ ]:
sentiment_pos_neg() # run pre-defined function in external module; presumably builds df_sentiment consumed by the next cell — TODO confirm

In [ ]:
# Append the pos/neg topic counts produced by sentiment_pos_neg().
df_tablet = pd.concat([df_tablet, df_sentiment], axis=1)
  • #### density

In [ ]:
df_density = pd.DataFrame(columns=['density'])

# density = share of the mentioned topics that carry sentiment (pos or neg).
for i in range(len(df_tablet)):
    total = df_tablet['total topic'][i]
    if total != 0:
        # builtin round() works for numpy and plain-Python scalars alike
        # (the original relied on a .round() method on the quotient)
        density = round((df_tablet['pos topic'][i] + df_tablet['neg topic'][i]) / total, 2)
    else:
        density = 0  # no topics mentioned -> avoid division by zero
    df_density.loc[len(df_density)] = [density]
df_tablet = pd.concat([df_tablet, df_density], axis=1)

5. Help class


In [ ]:
# Binarise the target for classification: score >= 0.80 -> helpful (1), else 0.
df_tablet['help class'] = (df_tablet['helpful score'] >= 0.80).astype(int)


In [23]:
# Reorder columns: raw fields first, then engineered features, then targets.
# (The original first read df_tablet.columns.tolist() into `cols` only to
# overwrite it on the next line — that dead assignment is removed.)
cols = ['star', 'review', 'sentence', 'word', 'flesch reading', 'word/sentence', 'syllable', 'display', 'sound', \
        'os', 'security', 'hardware', 'battery', 'bug', 'price', 'cs', 'wifi', 'accesory', 'app', 'compatible', 'usable', \
        'total topic', 'depth', 'depth/word', 'redundancy', 'redundancy/sentence', 'rank', 'pos topic', 'neg topic', 'density', \
        'helpful score', 'help class']
df_tablet = df_tablet[cols]

In [ ]:
df_tablet.isnull().any() # check whether there's any null data

In [8]:
pd.set_option('display.max_columns', None)  # show every column when displaying this wide frame

In [24]:
df_tablet.describe()  # summary statistics of all engineered features


Out[24]:
star sentence word flesch reading word/sentence syllable display sound os security hardware battery bug price cs wifi accesory app compatible usable total topic depth depth/word redundancy redundancy/sentence rank pos topic neg topic density helpful score help class
count 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000
mean 2.999204 13.498807 228.099443 77.891710 16.001933 1.334018 0.496420 0.169451 0.167064 0.077963 0.370724 0.256165 0.236277 0.392204 0.238663 0.203660 0.279236 0.368337 0.021480 0.456643 3.734288 148.308671 0.594105 6.466985 0.491074 0.329467 1.173429 0.218775 0.354789 0.709149 0.511535
std 1.684748 17.611027 304.980365 15.362226 8.740415 0.173927 0.500186 0.375299 0.373182 0.268221 0.483191 0.436688 0.424963 0.488436 0.426436 0.402879 0.448802 0.482546 0.145035 0.498315 2.906074 208.780723 0.275841 9.308961 0.257007 0.247442 1.333125 0.479547 0.377165 0.287818 0.500066
min 1.000000 1.000000 1.000000 1.000000 0.670000 0.680000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 4.000000 53.000000 71.090000 11.250000 1.250000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 29.000000 0.440000 2.000000 0.330000 0.120000 0.000000 0.000000 0.000000 0.600000 0.000000
50% 3.000000 8.000000 120.000000 78.310000 15.080000 1.320000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000 76.000000 0.640000 4.000000 0.500000 0.300000 1.000000 0.000000 0.290000 0.800000 1.000000
75% 5.000000 17.000000 284.000000 85.860000 19.000000 1.390000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 1.000000 6.000000 182.000000 0.780000 8.000000 0.640000 0.510000 2.000000 0.000000 0.500000 0.930000 1.000000
max 5.000000 227.000000 3045.000000 145.670000 86.670000 2.920000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 14.000000 1913.000000 1.040000 130.000000 1.000000 0.990000 7.000000 3.000000 2.000000 1.000000 1.000000

In [25]:
df_tablet.tail(3)  # spot-check the last rows of the final frame


Out[25]:
star review sentence word flesch reading word/sentence syllable display sound os security hardware battery bug price cs wifi accesory app compatible usable total topic depth depth/word redundancy redundancy/sentence rank pos topic neg topic density helpful score help class
1254 1 * Micro SD Card adapter doesn't read/write pro... 13 238 79.80 18.31 1.28 1 0 1 0 1 1 1 0 0 1 1 0 0 1 8 210 0.88 3 0.23 0.64 1 1 0.25 0.45 0
1255 2 (The title of this review is trying to describ... 56 960 74.04 17.14 1.36 1 0 0 1 1 1 1 1 1 0 1 1 0 1 10 726 0.76 24 0.43 0.84 1 1 0.20 0.60 0
1256 5 $50.00 tablet, can not complain at all!! It do... 6 89 89.97 14.83 1.20 1 1 0 0 0 0 0 1 0 0 0 0 0 0 3 83 0.93 1 0.17 0.29 2 0 0.67 1.00 1

In [ ]:
# Save the fully-featured frame as a pickle for the modelling notebooks;
# `with` guarantees the handle is flushed and closed even on error
# (the original relied on an explicit close() that a failed dump would skip).
import pickle

with open("./data/preprocessing", 'wb') as fileObject:
    pickle.dump(df_tablet, fileObject)