In [59]:
import csv
import logging
import pickle
import numpy as np
from __future__ import division

In [60]:
# Logging definition
# Send DEBUG-and-above records of the 'analyzing' logger to a log file.
logger = logging.getLogger('analyzing')
logger.setLevel(logging.DEBUG)
# logging.getLogger() hands back the same logger object on every call, so
# re-running this cell would otherwise attach one more FileHandler each time
# and every message would be written to the file several times.
if not any(isinstance(handler, logging.FileHandler) for handler in logger.handlers):
    # NOTE(review): the 'log/' directory presumably has to exist already;
    # FileHandler opens the file on construction - confirm.
    fh = logging.FileHandler('log/process_ngrams.log')
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter)
    logger.addHandler(fh)

In [61]:
def normalize(value):
    """
    Convert `value` to a float, treating anything above 1.0 as a percentage.

    Values greater than 1.0 are divided by 100; values at or below 1.0 are
    returned unchanged (as floats).
    """
    number = float(value)
    return number / 100 if number > 1.0 else number
    
# Load Sentiwordnet
# Build sentiwordnet[word] = {"pos": ..., "neg": ..., "obj": 1.0} from the
# tab-separated file, using columns 5 (word), 6 (pos) and 7 (neg).
sentiwordnet = {}
with open('data/sentiwordnet/sentiwordnet.tsv', 'rb') as ifile:
    reader = csv.reader(ifile, delimiter='\t')
    # The first row is consumed into `headers` and not used below.
    headers = next(reader)
    for row in reader:
        # Placeholder for a filter (only adjectives, objectivity threshold).
        # It is currently always true, so every row is loaded.
        cond1 = True
        if not cond1:
            continue
        pos_score = normalize(row[6])
        neg_score = normalize(row[7])
        sentiwordnet[row[5]] = {"pos": pos_score, "neg": neg_score, "obj": 1.0}
logger.info(' %s sentiwords loaded' % len(sentiwordnet))


INFO:analyzing: 871 sentiwords loaded

In [62]:
# Process each review
# For every labeled word: add its per-band counts to the global totals and
# write one TSV row holding the word, its pos/neg sentiment scores and the
# share of its occurrences that falls in each band.
band_labels = ["1.0", "2.0", "3.0", "4.0", "5.0"]
total_bands = {"1.0": 0, "2.0": 0, "3.0": 0, "4.0": 0, "5.0": 0}

# Random experiment: when True, the sentiwordnet scores are replaced by a
# random polarity (one side 0.0, the other uniform in [0, 1)).
# The flag does not change between words, so it is set once, outside the loop.
random_experiment = False

with open('data/output/words_labeled.p', 'rb') as ifile:
    # NOTE(review): unpickling can execute arbitrary code; only load a
    # words_labeled.p that was produced by a trusted step of this pipeline.
    words = pickle.loads(ifile.read())
    logger.info(' %s labeled words loaded' % (len(words)))
    with open('data/output/results.tsv', 'wb') as ofile:
        writer = csv.writer(ofile, delimiter='\t')
        writer.writerow(["word", "pos", "neg", "1.0", "2.0", "3.0", "4.0", "5.0"])

        for word in words:
            counts = words[word]
            for band in counts:
                total_bands[band] += counts[band]
            n = sum(counts[band] for band in counts)

            if not random_experiment:
                sw_pos = sentiwordnet[word]["pos"]
                sw_neg = sentiwordnet[word]["neg"]
            else:
                if np.random.uniform(0,1) > 0.5:
                    sw_pos = 0.0
                    sw_neg = np.random.uniform(0,1)
                else:
                    sw_neg = 0.0
                    sw_pos = np.random.uniform(0,1)

            # Per-band shares. A word with no occurrences at all gets 0.0 in
            # every band instead of raising ZeroDivisionError, matching the
            # n == 0 convention of acum_probability. float(n) keeps this a
            # true division even without `from __future__ import division`.
            if n == 0:
                shares = [0.0 for band in band_labels]
            else:
                shares = [counts[band] / float(n) for band in band_labels]

            # Write row to file
            writer.writerow([word, sw_pos, sw_neg] + shares)

logger.info(' Bands distribution %s' % (total_bands))


INFO:analyzing: 510 labeled words loaded
INFO:analyzing: Bands distribution {'5.0': 126337, '2.0': 32496, '1.0': 42538, '4.0': 93482, '3.0': 41045}

Band probability analysis


In [63]:
def acum_probability(data, bands):
    """
    Return the accumulated probability of `bands` within `data`.

    data:  dict mapping a band label to its count.
    bands: iterable of band labels whose combined share is wanted. Every
           label must be a key of `data`, otherwise KeyError is raised.

    Returns the summed count of `bands` divided by the total count in
    `data`, or 0.0 when that total is zero.
    """
    n = sum(data[band] for band in data)
    if n == 0:
        return 0.0
    # Add the counts first and divide once. Summing per-band quotients
    # accumulates rounding error (3/10 + 3/10 + 3/10 evaluates to
    # 0.8999999999999999), which makes a word at exactly 90% fail a
    # `>= 0.9` comparison. float() keeps this a true division even without
    # `from __future__ import division`.
    return sum(data[band] for band in bands) / float(n)
        
    
# Print every labeled word whose accumulated probability over the bands
# 3.0, 4.0 and 5.0 is at least 0.9, together with its per-band counts.
with open('data/output/words_labeled.p', 'rb') as ifile:
    words = pickle.load(ifile)

for word in words:
    value = acum_probability(words[word], ["3.0", "4.0", "5.0"])
    if not value >= 0.9:
        continue
    print(word," ",words[word])


('thoughtful', ' ', {'5.0': 14, '2.0': 1, '1.0': 1, '4.0': 7, '3.0': 3})
('attentive', ' ', {'5.0': 5, '2.0': 0, '1.0': 0, '4.0': 2, '3.0': 0})
('yummy', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('overdone', ' ', {'5.0': 3, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('gloomy', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('darling', ' ', {'5.0': 3, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('marvellous', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('nondescript', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 3, '3.0': 1})
('unexciting ', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('prosaic', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('apologetic', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('smoky', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('commendable', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('whimsical', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('faultless', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 1})
('excellent', ' ', {'5.0': 2824, '2.0': 206, '1.0': 158, '4.0': 1416, '3.0': 340})
('outmoded', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('awesome', ' ', {'5.0': 1106, '2.0': 80, '1.0': 73, '4.0': 382, '3.0': 111})
('contented', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('colorful', ' ', {'5.0': 48, '2.0': 7, '1.0': 2, '4.0': 59, '3.0': 21})
('flawless', ' ', {'5.0': 387, '2.0': 32, '1.0': 28, '4.0': 172, '3.0': 43})
('amazing', ' ', {'5.0': 1033, '2.0': 59, '1.0': 90, '4.0': 393, '3.0': 119})
('undersized', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('garish ', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('panoramic', ' ', {'5.0': 5, '2.0': 0, '1.0': 0, '4.0': 2, '3.0': 0})
('muggy', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('splendid', ' ', {'5.0': 4, '2.0': 0, '1.0': 0, '4.0': 3, '3.0': 1})
('rowdy', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('classy', ' ', {'5.0': 59, '2.0': 4, '1.0': 2, '4.0': 18, '3.0': 9})
('satisfied', ' ', {'5.0': 700, '2.0': 64, '1.0': 78, '4.0': 486, '3.0': 103})
('perfect', ' ', {'5.0': 4306, '2.0': 288, '1.0': 289, '4.0': 1757, '3.0': 473})
('understated', ' ', {'5.0': 15, '2.0': 0, '1.0': 0, '4.0': 5, '3.0': 0})
('spacious', ' ', {'5.0': 7, '2.0': 0, '1.0': 0, '4.0': 2, '3.0': 2})
('savory', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('handy', ' ', {'5.0': 506, '2.0': 52, '1.0': 30, '4.0': 361, '3.0': 112})
('dingy', ' ', {'5.0': 3, '2.0': 0, '1.0': 0, '4.0': 4, '3.0': 0})
('exquisite', ' ', {'5.0': 4, '2.0': 0, '1.0': 0, '4.0': 2, '3.0': 0})
('unoriginal', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('unabashed', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('pleasant', ' ', {'5.0': 207, '2.0': 18, '1.0': 12, '4.0': 96, '3.0': 17})
('bland', ' ', {'5.0': 1, '2.0': 1, '1.0': 0, '4.0': 7, '3.0': 2})
('soundproof ', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('monotonous', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('plentiful', ' ', {'5.0': 5, '2.0': 1, '1.0': 0, '4.0': 7, '3.0': 4})
('grimy', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('airy', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 2})
('offhand', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('dreary', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('sultry', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('outstanding', ' ', {'5.0': 249, '2.0': 12, '1.0': 16, '4.0': 108, '3.0': 36})
('tidy', ' ', {'5.0': 10, '2.0': 0, '1.0': 1, '4.0': 2, '3.0': 0})
('juicy', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 1})
('mawkish', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('fashionable', ' ', {'5.0': 28, '2.0': 0, '1.0': 3, '4.0': 23, '3.0': 6})
('uncouth', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('demolished', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('multilingual', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 4, '3.0': 0})
('surprising', ' ', {'5.0': 157, '2.0': 19, '1.0': 14, '4.0': 143, '3.0': 43})
('astonishing', ' ', {'5.0': 5, '2.0': 0, '1.0': 1, '4.0': 8, '3.0': 2})
('orderly', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('discontent', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('scorched', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('contemporary', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 22, '3.0': 0})
('gratified', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('pleased', ' ', {'5.0': 1116, '2.0': 70, '1.0': 87, '4.0': 660, '3.0': 104})
('delinquent', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('gratis', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('exceptional', ' ', {'5.0': 170, '2.0': 11, '1.0': 14, '4.0': 94, '3.0': 30})
('unbeatable', ' ', {'5.0': 51, '2.0': 0, '1.0': 2, '4.0': 20, '3.0': 4})
('sassy', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('unworthy', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('fascinating', ' ', {'5.0': 3, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 1})
('fluffy', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 2})
('satisfying', ' ', {'5.0': 31, '2.0': 2, '1.0': 4, '4.0': 19, '3.0': 5})
('superb', ' ', {'5.0': 210, '2.0': 18, '1.0': 13, '4.0': 80, '3.0': 26})
('affordable', ' ', {'5.0': 169, '2.0': 12, '1.0': 12, '4.0': 89, '3.0': 19})
('gorgeous', ' ', {'5.0': 59, '2.0': 8, '1.0': 3, '4.0': 50, '3.0': 13})
('homely', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('welcoming', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('pretentious', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 3, '3.0': 1})
('impeccable', ' ', {'5.0': 5, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('stinky', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('mislaid', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('extortionate', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('humdrum', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('fantastic', ' ', {'5.0': 522, '2.0': 32, '1.0': 49, '4.0': 207, '3.0': 63})
('keen', ' ', {'5.0': 7, '2.0': 0, '1.0': 1, '4.0': 5, '3.0': 4})
('roomy', ' ', {'5.0': 10, '2.0': 0, '1.0': 0, '4.0': 3, '3.0': 1})
('repetitious', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('easy', ' ', {'5.0': 5311, '2.0': 666, '1.0': 436, '4.0': 3603, '3.0': 1069})
('dreamy', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('informative', ' ', {'5.0': 8, '2.0': 0, '1.0': 1, '4.0': 5, '3.0': 1})
('accessible', ' ', {'5.0': 114, '2.0': 12, '1.0': 7, '4.0': 83, '3.0': 30})
('stuffy', ' ', {'5.0': 5, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('amicable', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('nauseating', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 1})
('appreciative', ' ', {'5.0': 6, '2.0': 0, '1.0': 0, '4.0': 2, '3.0': 0})
('economical', ' ', {'5.0': 29, '2.0': 3, '1.0': 2, '4.0': 18, '3.0': 4})
('palatable', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('undeserving ', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('deranged', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('lively', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('refined', ' ', {'5.0': 11, '2.0': 0, '1.0': 1, '4.0': 5, '3.0': 4})
('unfashionable', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 2})
('immaculate', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 1})
('refreshing', ' ', {'5.0': 13, '2.0': 1, '1.0': 0, '4.0': 9, '3.0': 0})
('romantic', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 1})
('cordial', ' ', {'5.0': 0, '2.0': 0, '1.0': 0, '4.0': 3, '3.0': 0})
('tasty', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 0})
('stagnant', ' ', {'5.0': 1, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('energetic', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 0, '3.0': 0})
('astounding', ' ', {'5.0': 12, '2.0': 1, '1.0': 0, '4.0': 6, '3.0': 0})
('drab', ' ', {'5.0': 2, '2.0': 0, '1.0': 0, '4.0': 1, '3.0': 1})
('deluxe', ' ', {'5.0': 26, '2.0': 4, '1.0': 0, '4.0': 15, '3.0': 5})

In [14]:
# Count how many "review/score" lines of the Amazon file fall in each band.
total_bands = {"1.0": 0, "2.0": 0, "3.0": 0, "4.0": 0, "5.0": 0}
with open('data/amazon/Cell_Phones_&_Accessories.txt', 'rb') as ifile:
    # Iterate over the file object directly: readlines() would first load
    # the whole file into memory as a list.
    for line in ifile:
        if "review/score" in line:
            # Parse the last whitespace-separated token instead of a fixed
            # 4-character tail slice, so a "\r\n" line ending does not shift
            # the slice and turn "5.0" into ".0".
            score = float(line.split()[-1])
            total_bands[str(score)] += 1
print(total_bands)


{'5.0': 30253, '2.0': 7566, '1.0': 14675, '4.0': 17717, '3.0': 8719}

In [ ]: