feature/target overview


In [4]:
# Overview of the feature/target layout used throughout this notebook:
# a two-level column header (feature group, feature name) on an empty frame.
feature_groups = np.array(
    ['X_length'] * 2 + ['X_readability'] * 3 + ['X_topic'] * 9
    + ['X_sentiment'] * 3 + ['Y']
)
feature_names = np.array(
    ['sentence', 'word',
     'flesch reading', 'word/sentence', 'syllable/word',
     'topic1', '...', 'topic14', 'n_topic', 'depth', 'depth/word',
     'redundancy', 'redundancy/sentence', 'rank',
     'pos topic', 'neg topic', 'density',
     'helpful class']
)
features = [feature_groups, feature_names]
df_ex = pd.DataFrame(columns=features)
df_ex


Out[4]:
X_length X_readability X_topic X_sentiment Y
sentence word flesch reading word/sentence syllable/word topic1 ... topic14 n_topic depth depth/word redundancy redundancy/sentence rank pos topic neg topic density helpful class


In [ ]:
df_tablet = pd.read_csv('./data/review_tablet.csv')  # raw tablet reviews (1,257 rows per the describe() output below)

1. length


In [ ]:
import re
from textblob import TextBlob

# Insert a space after ? , ! . when a letter follows immediately, so the
# sentence/word tokenizers see proper boundaries (one combined pattern replaces
# the original's four sequential substitutions; behavior is identical).
records = []
for i in range(len(df_tablet)):
    review = re.sub(r'(?<=[?,!.])(?=[a-zA-Z])', ' ', df_tablet['review'][i])
    df_tablet.at[i, 'review'] = review  # .at avoids chained-assignment pitfalls

    blob = TextBlob(review)  # tokenize once, reuse for both counts
    records.append({'sentence': len(blob.sentences), 'word': len(blob.words)})

# Build the frame in one shot instead of growing it row by row.
df_length = pd.DataFrame(records, columns=['sentence', 'word'])

df_tablet = pd.concat([df_tablet, df_length], axis=1)  # merge with the original df
df_tablet = df_tablet.drop(['author', 'n_help', 'n_total', 'helpful class'], axis=1)  # drop unnecessary columns
df_tablet['helpful score'] = df_tablet['helpful score'].round(2)  # round to two decimal places

2. readability


In [ ]:
from textstat.textstat import textstat

df_readability = pd.DataFrame(columns=['flesch reading', 'word/sentence', 'syllable'])

for i in range(len(df_tablet)):
    n_word = df_tablet['word'][i]  # counts computed in the length-feature cell
    n_sentence = df_tablet['sentence'][i]
    n_syllable = textstat.syllable_count(df_tablet['review'][i])

    words_per_sentence = n_word / n_sentence  # average words per sentence
    syllables_per_word = n_syllable / n_word  # average syllables per word
    # Flesch reading-ease formula; builtin round() works for both numpy and
    # plain-Python scalars (the original relied on a .round() method).
    flesch_reading = round(206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word, 2)

    df_readability.loc[len(df_readability)] = [flesch_reading,
                                               round(words_per_sentence, 2),
                                               round(syllables_per_word, 2)]

df_tablet = pd.concat([df_tablet, df_readability], axis=1)  # merge with original dataframe

In [ ]:
# Floor negative Flesch scores at 1 (very unreadable reviews can score below 0).
df_tablet['flesch reading'] = df_tablet['flesch reading'].mask(df_tablet['flesch reading'] < 0, 1)

3. topic



In [ ]:
def extract_candidate_words(text, good_tags=set(['NN','NNP','NNS','NNPS'])):
    """Return lowercased candidate topic words (nouns by default) from `text`,
    excluding English stopwords and tokens made entirely of punctuation."""
    import itertools, nltk, string

    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Split into sentences, tokenize each, then POS-tag sentence by sentence.
    tokenized_sentences = (nltk.word_tokenize(sent)
                           for sent in nltk.sent_tokenize(text))
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(tokenized_sentences))

    candidates = []
    for word, tag in tagged_words:
        lowered = word.lower()
        # Keep only the requested POS tags, drop stopwords and pure punctuation.
        if tag in good_tags and lowered not in stop_words \
                and not all(char in punct for char in word):
            candidates.append(lowered)

    return candidates

In [ ]:
# Extract unigram nouns from every review.
# Fix: the original assigned `topics_candidate` but then appended the undefined
# name `topic_candidate`, which raises NameError on a fresh run.
topic_candidates = []
for i in range(len(df_tablet)):
    topic_candidates.append(extract_candidate_words(df_tablet['review'][i]))

In [ ]:
import pickle  # not yet imported at this point in the notebook

# Save topic keywords for future use; `with` guarantees the handle is
# flushed and closed (the original never closed the write handle).
with open("./data/topic_candidates", 'wb') as fileObject:
    pickle.dump(topic_candidates, fileObject)

In [ ]:
import pickle  # not yet imported at this point in the notebook

# Load topic keywords.
# Fix: pickle files must be opened in binary mode ('rb'); text mode ('r')
# raises under Python 3.
with open("./data/topic_candidates", 'rb') as fileObject:
    topic_candidates = pickle.load(fileObject)

In [ ]:
from collections import Counter
from itertools import chain

# `topic_candidates` is a list of per-review word lists; flatten it before
# counting — Counter over the raw list would receive unhashable list elements
# and raise TypeError.
Counter(chain.from_iterable(topic_candidates)).most_common(50)

In [ ]:
from pattern.en import singularize

# Build a per-review list of singularized tokens. Assigning the whole column at
# once replaces the original's `df_tablet['keyword'][i] = texts` chained
# assignment, which triggers SettingWithCopyWarning and is unreliable.
df_tablet['keyword'] = [
    [singularize(word) for word in review.split()]
    for review in df_tablet['review']
]

  • #### topic_1, topic_2, ..., topic_14

In [ ]:
%run ./data/topics # module that contains variables including topic keywords

In [ ]:
# columns with topic words
for topic_column in topics_column:
    df_tablet[topic_column] = 0

In [ ]:
# Map each topic indicator column to its keyword set (defined by ./data/topics).
topic_keywords = {
    'display': topic_display, 'sound': topic_sound, 'os': topic_os,
    'security': topic_security, 'hardware': topic_hardware,
    'battery': topic_battery, 'bug': topic_bug, 'price': topic_price,
    'cs': topic_cs, 'wifi': topic_wifi, 'accesory': topic_accesory,
    'app': topic_app, 'compatible': topic_compatible, 'usable': topic_usable,
}

# Set a 0/1 flag per topic per review (not an occurrence count: a single match
# is enough). `.at` replaces the original's 14 chained-assignment lines.
for i in range(len(df_tablet)):
    keywords = df_tablet['keyword'][i]
    for column, topic_words in topic_keywords.items():
        if any(word in topic_words for word in keywords):
            df_tablet.at[i, column] = 1
  • #### number of topics per review

In [ ]:
# Total number of distinct topics mentioned per review: row-wise sum of the
# 0/1 topic indicator columns.
df_tablet['total topic'] = df_tablet[topics_column].sum(axis=1)
  • #### depth

In [ ]:
# Fix: sent_tokenize / word_tokenize were only imported in a LATER cell in the
# original, so this cell failed on Restart & Run All. Import them here.
from nltk.tokenize import RegexpTokenizer, sent_tokenize, word_tokenize

tokenizer = RegexpTokenizer(r'\w+')

df_depth = pd.DataFrame(columns=['depth'])

# 'depth' = total word count of the sentences that mention at least one
# topic keyword.
for i in range(len(df_tablet)):
    sentences = sent_tokenize(df_tablet['review'][i])
    depth_sentences = set()
    for sentence in sentences:
        tokens = word_tokenize(sentence)  # tokenize once, not once per topic list
        if any(word in tokens
               for topic_list in topics_list
               for word in topic_list):
            depth_sentences.add(sentence)

    depth_sentence = [len(tokenizer.tokenize(s)) for s in depth_sentences]
    df_depth.loc[len(df_depth)] = [sum(depth_sentence)]
    #print('review No', i, 'has', sum(depth_sentence), 'words related to the topic')
df_tablet = pd.concat([df_tablet, df_depth], axis=1)

In [12]:
# Depth normalised by review length (words), rounded to two decimals.
df_tablet['depth/word'] = df_tablet['depth'].div(df_tablet['word']).round(2)
  • #### redundancy

In [ ]:
from nltk.tokenize import sent_tokenize, word_tokenize

df_redundancy = pd.DataFrame(columns=['redundancy'])

# 'redundancy' = number of sentences that mention NO topic keyword at all.
for i in range(len(df_tablet)):
    sentences = sent_tokenize(df_tablet['review'][i])
    depth_sentences = set()
    for sentence in sentences:
        tokens = word_tokenize(sentence)  # tokenize once, not once per topic list
        if any(word in tokens
               for topic_list in topics_list
               for word in topic_list):
            depth_sentences.add(sentence)

    redundancy = len(sentences) - len(depth_sentences)
    df_redundancy.loc[len(df_redundancy)] = [redundancy]
df_tablet = pd.concat([df_tablet, df_redundancy], axis=1)

In [18]:
# Redundancy normalised by sentence count, rounded to two decimals.
df_tablet['redundancy/sentence'] = df_tablet['redundancy'].div(df_tablet['sentence']).round(2)
  • #### rank

In [ ]:
df_rank = pd.DataFrame(columns=['rank'])

# Pre-compute per-topic totals and the grand total once — the original
# recomputed both sums inside the inner loop (14 x n_reviews times).
topic_totals = {col: int(df_tablet[col].sum()) for col in topics_column}
grand_total = int(df_tablet['total topic'].sum())

# Each review's rank is the sum of the (flag * topic popularity share) terms.
for i in range(len(df_tablet)):
    rank = [round(df_tablet[col][i] * topic_totals[col] / grand_total, 2)
            for col in topics_column]
    df_rank.loc[len(df_rank)] = [sum(rank)]
df_tablet = pd.concat([df_tablet, df_rank], axis=1)

4. sentiment

  • #### positive/negative topics

In [ ]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [ ]:
%run ./data/sentiment_pos_neg.py # scripts that contain how to count number of postive/negative topics in each review

In [ ]:
sentiment_pos_neg() # run pre-defined function in external module; presumably builds df_sentiment consumed by the next cell — TODO confirm

In [ ]:
# Append the pos/neg topic counts produced by sentiment_pos_neg().
df_tablet = pd.concat([df_tablet, df_sentiment], axis=1)
  • #### density

In [ ]:
df_density = pd.DataFrame(columns=['density'])

# density = share of the mentioned topics that carry sentiment (pos or neg).
for i in range(len(df_tablet)):
    total = df_tablet['total topic'][i]
    if total != 0:
        # builtin round() works for numpy and plain-Python scalars alike
        # (the original relied on a .round() method on the quotient)
        density = round((df_tablet['pos topic'][i] + df_tablet['neg topic'][i]) / total, 2)
    else:
        density = 0  # no topics mentioned -> avoid division by zero
    df_density.loc[len(df_density)] = [density]
df_tablet = pd.concat([df_tablet, df_density], axis=1)

5. Help class


In [ ]:
# Binarise the target for classification: score >= 0.80 -> helpful (1), else 0.
df_tablet['help class'] = (df_tablet['helpful score'] >= 0.80).astype(int)


In [23]:
# Reorder columns: raw fields first, then engineered features, then targets.
# (The original first read df_tablet.columns.tolist() into `cols` only to
# overwrite it on the next line — that dead assignment is removed.)
cols = ['star', 'review', 'sentence', 'word', 'flesch reading', 'word/sentence', 'syllable', 'display', 'sound', \
        'os', 'security', 'hardware', 'battery', 'bug', 'price', 'cs', 'wifi', 'accesory', 'app', 'compatible', 'usable', \
        'total topic', 'depth', 'depth/word', 'redundancy', 'redundancy/sentence', 'rank', 'pos topic', 'neg topic', 'density', \
        'helpful score', 'help class']
df_tablet = df_tablet[cols]

In [ ]:
df_tablet.isnull().any() # check whether there's any null data

In [8]:
pd.set_option('display.max_columns', None)  # show every column when displaying this wide frame

In [24]:
df_tablet.describe()  # summary statistics of all engineered features


Out[24]:
star sentence word flesch reading word/sentence syllable display sound os security hardware battery bug price cs wifi accesory app compatible usable total topic depth depth/word redundancy redundancy/sentence rank pos topic neg topic density helpful score help class
count 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000 1257.000000
mean 2.999204 13.498807 228.099443 77.891710 16.001933 1.334018 0.496420 0.169451 0.167064 0.077963 0.370724 0.256165 0.236277 0.392204 0.238663 0.203660 0.279236 0.368337 0.021480 0.456643 3.734288 148.308671 0.594105 6.466985 0.491074 0.329467 1.173429 0.218775 0.354789 0.709149 0.511535
std 1.684748 17.611027 304.980365 15.362226 8.740415 0.173927 0.500186 0.375299 0.373182 0.268221 0.483191 0.436688 0.424963 0.488436 0.426436 0.402879 0.448802 0.482546 0.145035 0.498315 2.906074 208.780723 0.275841 9.308961 0.257007 0.247442 1.333125 0.479547 0.377165 0.287818 0.500066
min 1.000000 1.000000 1.000000 1.000000 0.670000 0.680000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 4.000000 53.000000 71.090000 11.250000 1.250000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 29.000000 0.440000 2.000000 0.330000 0.120000 0.000000 0.000000 0.000000 0.600000 0.000000
50% 3.000000 8.000000 120.000000 78.310000 15.080000 1.320000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 3.000000 76.000000 0.640000 4.000000 0.500000 0.300000 1.000000 0.000000 0.290000 0.800000 1.000000
75% 5.000000 17.000000 284.000000 85.860000 19.000000 1.390000 1.000000 0.000000 0.000000 0.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 1.000000 6.000000 182.000000 0.780000 8.000000 0.640000 0.510000 2.000000 0.000000 0.500000 0.930000 1.000000
max 5.000000 227.000000 3045.000000 145.670000 86.670000 2.920000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 14.000000 1913.000000 1.040000 130.000000 1.000000 0.990000 7.000000 3.000000 2.000000 1.000000 1.000000

In [25]:
df_tablet.tail(3)  # spot-check the last rows of the final frame


Out[25]:
star review sentence word flesch reading word/sentence syllable display sound os security hardware battery bug price cs wifi accesory app compatible usable total topic depth depth/word redundancy redundancy/sentence rank pos topic neg topic density helpful score help class
1254 1 * Micro SD Card adapter doesn't read/write pro... 13 238 79.80 18.31 1.28 1 0 1 0 1 1 1 0 0 1 1 0 0 1 8 210 0.88 3 0.23 0.64 1 1 0.25 0.45 0
1255 2 (The title of this review is trying to describ... 56 960 74.04 17.14 1.36 1 0 0 1 1 1 1 1 1 0 1 1 0 1 10 726 0.76 24 0.43 0.84 1 1 0.20 0.60 0
1256 5 $50.00 tablet, can not complain at all!! It do... 6 89 89.97 14.83 1.20 1 1 0 0 0 0 0 1 0 0 0 0 0 0 3 83 0.93 1 0.17 0.29 2 0 0.67 1.00 1

In [ ]:
# Save the fully-featured frame as a pickle for the modelling notebooks;
# `with` guarantees the handle is flushed and closed even on error
# (the original relied on an explicit close() that a failed dump would skip).
import pickle

with open("./data/preprocessing", 'wb') as fileObject:
    pickle.dump(df_tablet, fileObject)