In [1]:
import nltk
from nltk.collocations import *
import string
import MySQLdb
from collections import Counter
from nltk.corpus import stopwords
import numpy as np
from nltk.stem.porter import *
from Queue import PriorityQueue
import datetime as dt
# Association-measure scorers reused by the collocation-finder cells below
# (likelihood ratio for bigrams, raw frequency for trigrams).
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [2]:
def avg_rating(rating):
    """Running (cumulative) mean of a sequence of ratings.

    avg[k] is the mean of rating[0..k] inclusive.  The original computed
    np.mean(rating[:k]) for k >= 1, which excludes the k-th element, so
    avg[1] always duplicated avg[0]; the slice now includes the current
    rating, consistent with avg[0] == rating[0].

    Parameters
    ----------
    rating : sequence of numeric ratings, ordered by review time.

    Returns
    -------
    list of float, same length as `rating`; empty input yields [].
    """
    return [float(np.mean(rating[:k + 1])) for k in range(len(rating))]

In [3]:
def pop_time(time):
    """Locate the least- and most-popular stretches of a review timeline.

    Slides a window covering a quarter of the reviews across the sorted
    timestamps.  The window spanning the MOST wall-clock time contains the
    sparsest (least popular) reviews; the window spanning the LEAST time
    contains the densest (most popular) reviews.

    Parameters
    ----------
    time : sequence of review timestamps (unix seconds), sorted ascending.

    Returns
    -------
    (unpopmin, unpopmax, popmin, popmax) : boundary timestamps of the
    least-popular and most-popular windows.
    """
    unpopmin = time[0]
    unpopmax = time[0]
    # Seed the "popular" window with the full span so any real window beats it.
    popmin = time[0]
    popmax = time[len(time) - 1]
    # Guard against slidersize == 0 for histories shorter than 4 reviews: a
    # zero-width window always spans 0 seconds and would win every "popular"
    # comparison, collapsing the result to a single timestamp.
    slidersize = max(1, len(time) // 4)
    for i in range(slidersize, len(time)):  # i marks the end of the slider
        windowsize = time[i] - time[i - slidersize]
        if windowsize > unpopmax - unpopmin:  # slowest stretch so far
            unpopmax = time[i]
            unpopmin = time[i - slidersize]
        if windowsize < popmax - popmin:  # fastest stretch so far
            popmax = time[i]
            popmin = time[i - slidersize]
    # Single-argument print() behaves identically under Python 2 and 3.
    print('%s %s' % (dt.datetime.fromtimestamp(popmin),
                     dt.datetime.fromtimestamp(popmax)))
    return unpopmin, unpopmax, popmin, popmax

In [4]:
def first_pop_time(time):
    """Find the first window in which reviews arrive quickly (the product's
    first burst of popularity).

    Scans a window of a quarter of the reviews (minimum 4) left-to-right and
    returns the first window that (a) spans less than a quarter of the total
    review period and (b) is not immediately preceded by a long quiet gap
    (the three timestamps before the window start within 3x the average
    review spacing).

    Parameters
    ----------
    time : sequence of review timestamps (unix seconds), sorted ascending.

    Returns
    -------
    (firstpopmin, firstpopmax) : boundaries of the first popular window, or
    (time[0], time[-1]) when no window qualifies.
    """
    # Fall back to the whole span when no window qualifies.
    firstpopmin = time[0]
    firstpopmax = time[len(time) - 1]
    slidersize = max(len(time) // 4, 4)
    avtime = (time[len(time) - 1] - time[0]) / len(time)  # mean review spacing
    total_quarter = (time[len(time) - 1] - time[0]) / 4   # loop-invariant bound
    for i in range(slidersize, len(time)):  # i marks the end of the slider
        windowsize = time[i] - time[i - slidersize]
        # BUG FIX: the original indexed time[i - slidersize - 3], which is
        # negative for the first few windows and silently wraps around to the
        # END of the array, comparing against future timestamps.  Clamp the
        # look-back index to 0 instead.
        lookback = max(0, i - slidersize - 3)
        if windowsize < total_quarter and (time[i - slidersize] - time[lookback]) < 3 * avtime:
            firstpopmax = time[i]
            firstpopmin = time[i - slidersize]
            break
    # Single-argument print() behaves identically under Python 2 and 3.
    print('%s %s' % (dt.datetime.fromtimestamp(firstpopmin),
                     dt.datetime.fromtimestamp(firstpopmax)))
    return firstpopmin, firstpopmax

In [5]:
def get_data(PID, cursor, tablename):
    """Fetch (rating, time) arrays for one product, sorted by review time.

    PID is bound as a query parameter (the original interpolated it into the
    SQL string, which is injection-prone).  `tablename` cannot be
    parameterized and must therefore be a trusted identifier.

    Parameters
    ----------
    PID : product id string (as stored in the table, incl. any leading space).
    cursor : DB-API cursor into the review database.
    tablename : trusted table name to query.

    Returns
    -------
    rating : np.ndarray of int review scores, ordered by review time.
    time : np.ndarray of float review timestamps, ascending.
    Both arrays are empty when the product has no reviews.
    """
    sql = "Select RTime, RScore From " + tablename + " Where PID = %s;"
    cursor.execute(sql, (PID,))
    data = sorted(cursor.fetchall())  # rows sorted by RTime (first column)
    if not data:  # no reviews: return empty arrays instead of raising IndexError
        return np.array([], dtype=int), np.array([], dtype=float)
    # list(...) keeps the transposed columns indexable on Python 2 and 3.
    columns = list(zip(*data))
    rating = np.array(columns[1], dtype=int)
    time = np.array(columns[0], dtype=float)
    return rating, time

In [6]:
def get_tokens(text):
    """Lower-case `text`, strip all punctuation, and split it into word
    tokens with NLTK.

    Note: str.translate(None, deletechars) is the Python 2 deletion form,
    matching the rest of this notebook.
    """
    lowered = text.lower()
    # Delete every punctuation character before tokenizing.
    depunctuated = lowered.translate(None, string.punctuation)
    return nltk.word_tokenize(depunctuated)

In [7]:
def get_all_review_text(PID, cursor, tablename):
    """Concatenate every review summary and body for one product.

    Queries RSummary first, then RText, preserving the original ordering of
    the concatenated text.  PID is bound as a query parameter instead of
    being spliced into the SQL (injection-safe); `tablename` must still be a
    trusted identifier.  Also avoids the original's shadowing of the
    `string` module by a loop variable.

    Returns
    -------
    str : all summaries followed by all review bodies, concatenated.
    """
    parts = []
    for column in ("RSummary", "RText"):
        sql = "Select " + column + " From " + tablename + " Where PID = %s;"
        cursor.execute(sql, (PID,))
        # Each fetched row is a 1-tuple holding the text column.
        parts.extend(row[0] for row in cursor.fetchall())
    return ''.join(parts)

In [8]:
def get_popular_review_text(PID, cursor, tablename):
    """Concatenate review summaries and bodies falling inside the product's
    most popular (densest) review window, as located by pop_time().

    PID and the window bounds are bound as query parameters (the original
    built the SQL by string concatenation, which is injection-prone);
    `tablename` must still be a trusted identifier.

    Returns
    -------
    str : summaries then bodies of reviews with popmin < rtime < popmax.
    """
    rating, time = get_data(PID, cursor, tablename)
    # Only the "popular" bounds are needed; the unpopular pair is discarded.
    _unpopmin, _unpopmax, popmin, popmax = pop_time(time)
    parts = []
    for column in ("RSummary", "RText"):
        sql = ("Select " + column + " From " + tablename +
               " Where PID = %s and rtime > %s and rtime < %s;")
        cursor.execute(sql, (PID, popmin, popmax))
        parts.extend(row[0] for row in cursor.fetchall())
    return ''.join(parts)

In [9]:
def get_firstpop_review_text(PID, cursor, tablename):
    """Concatenate review summaries and bodies falling inside the product's
    FIRST popular review window, as located by first_pop_time().

    PID and the window bounds are bound as query parameters (the original
    built the SQL by string concatenation, which is injection-prone);
    `tablename` must still be a trusted identifier.

    Returns
    -------
    str : summaries then bodies of reviews with popmin < rtime < popmax.
    """
    rating, time = get_data(PID, cursor, tablename)
    popmin, popmax = first_pop_time(time)
    parts = []
    for column in ("RSummary", "RText"):
        sql = ("Select " + column + " From " + tablename +
               " Where PID = %s and rtime > %s and rtime < %s;")
        cursor.execute(sql, (PID, popmin, popmax))
        parts.extend(row[0] for row in cursor.fetchall())
    return ''.join(parts)

In [10]:
def stem_tokens(tokens, stemmer):
    """Apply `stemmer.stem` to every token, returning the stems as a list."""
    return [stemmer.stem(token) for token in tokens]

In [84]:
def text_to_counter(text):
    """Tokenize `text` and count stemmed, stopword-free tokens, discarding
    words that occur only once.

    The original expressed the hapax filter as a decrement / `+ Counter()` /
    increment dance (subtract 1, drop non-positive counts, add 1 back); this
    states the same filter directly.  It also drops an unused pre-stemming
    Counter and hoists the stopword list into a set so the membership test
    is O(1) per token instead of a linear scan of a freshly fetched list.

    Returns
    -------
    Counter mapping stem -> count, containing only stems with count >= 2.
    """
    stemmer = PorterStemmer()
    tokens_all = get_tokens(text)
    english_stopwords = set(stopwords.words('english'))
    filtered_all = [w for w in tokens_all if w not in english_stopwords]

    stemmed_all = stem_tokens(filtered_all, stemmer)
    counts = Counter(stemmed_all)

    # Keep only stems seen at least twice (single occurrences are noise here).
    return Counter({stem: n for stem, n in counts.items() if n > 1})

In [12]:
# Target MySQL database and review table -- presumably the Amazon
# "Home & Kitchen" review dump; TODO(review): confirm the data source.
database = "home_kitchen"
tablename = "all_hk"

In [13]:
# NOTE(review): connects as root with no password over localhost -- acceptable
# for a local scratch DB only; credentials should come from env/config, not code.
db = MySQLdb.connect(host="localhost", user="root", db = database)
cursor = db.cursor()

In [72]:
# Product under analysis.  NOTE(review): each PID carries a leading space --
# this looks intentional (matching how PIDs are stored in the table), but
# confirm against the data before "fixing" it.
#PID = ' B00005AQ9Q' #bad reviews so no one buys it. 99 reviews. 
PID = ' B0000E2PEI' #featured in consumer reports example
#PID = ' B0000X7CMQ'

In [73]:
# NOTE(review): this `stemmer` appears unused in this cell --
# text_to_counter builds its own PorterStemmer internally.
stemmer = PorterStemmer()

# Concatenated review text (summaries + bodies) for the whole history, for
# the densest ("popular") window, and for the first burst of popularity.
all_text = get_all_review_text(PID, cursor, tablename)
pop_text = get_popular_review_text(PID, cursor, tablename)
first_pop_text = get_firstpop_review_text(PID, cursor, tablename)

# Stemmed, stopword-free token counts for each text slice (words occurring
# only once are dropped -- see text_to_counter).
count_all = text_to_counter(all_text)
count_pop = text_to_counter(pop_text)
count_firstpop = text_to_counter(first_pop_text)

#print count_all.most_common(100), '\n'
#print count_pop.most_common(100), '\n' 
#print  count_firstpop.most_common(100)
    
# Character lengths of each slice, used below to normalize counts into
# per-character frequencies.
all_len = len(all_text)
pop_len = len(pop_text)
firstpop_len = len(first_pop_text)

#print all_len, pop_len, firstpop_len


2012-10-12 17:00:00 2013-02-15 16:00:00
2010-02-05 16:00:00 2010-12-31 16:00:00

In [74]:
# Per-character frequency of each stem within its text slice.
ratio_all = {x: float(n)/float(all_len) for x, n in count_all.items()}
ratio_pop = {x: float(n)/float(pop_len) for x, n in count_pop.items()}
ratio_firstpop = {x: float(n)/float(firstpop_len) for x, n in count_firstpop.items()}
#print ratio_all, '\n\n\n', ratio_pop
# "Lift" of each stem in the popular windows relative to the full history
# (ratio > 1 means over-represented in the window).
# NOTE(review): assumes every stem in the window counters also appears in
# ratio_all -- true while the window text is a substring of all_text --
# otherwise ratio_all[x] raises KeyError.
popratios = Counter({x: n/ratio_all[x] for x, n in ratio_pop.items()})
firstpopratios = Counter({x: n/ratio_all[x] for x, n in ratio_firstpop.items()})
#print count_all.most_common(100), '\n'
#print count_pop.most_common(100), '\n' 
#print  count_firstpop.most_common(100)
#print {word for word, n in popratios.most_common(30)}, '\n'
#print {word for word, n in firstpopratios.most_common(30)}

In [75]:
#finder = BigramCollocationFinder.from_words(pop_text)

In [76]:
tokens = get_tokens(all_text)
# Hoist the stopword list into a set: the original re-fetched and linearly
# scanned stopwords.words('english') once per token.
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]
finder = BigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(3)  # ignore bigrams seen fewer than 3 times
finder.nbest(bigram_measures.likelihood_ratio, 10)  # top 10 by log-likelihood


Out[76]:
[('consumer', 'reports'),
 ('taylor', '7506'),
 ('easy', 'read'),
 ('easy', 'use'),
 ('stopped', 'working'),
 ('doctors', 'office'),
 ('glass', 'chrome'),
 ('looks', 'great'),
 ('customer', 'service'),
 ('would', 'recommend')]

In [77]:
tokens = get_tokens(all_text)
# Set membership is O(1) per token; the original re-fetched and linearly
# scanned the stopword list for every token.
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]
finder = TrigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(2)  # ignore trigrams seen fewer than 2 times
finder.nbest(trigram_measures.raw_freq, 10)  # top 10 by raw frequency


Out[77]:
[('taylor', '7506', 'scale'),
 ('7506', 'glass', 'chrome'),
 ('taylor', '7506', 'glass'),
 ('glass', 'chrome', 'digital'),
 ('chrome', 'digital', 'scale'),
 ('scale', 'looks', 'great'),
 ('scale', 'consumer', 'reports'),
 ('accurate', 'easy', 'use'),
 ('accurate', 'easy', 'read'),
 ('scale', 'great', 'scale')]

In [82]:
tokens = get_tokens(first_pop_text)
# Set lookup beats re-fetching + linearly scanning the stopword list per token.
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]
finder = BigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(3)
# Single-argument print() renders the tuple identically under Python 2 and 3.
for bigram in finder.nbest(bigram_measures.likelihood_ratio, 5):
    print(bigram)


('consumer', 'reports')
('taylor', '7506')
('easy', 'read')
('doctors', 'office')
('highly', 'recommend')

In [83]:
tokens = get_tokens(first_pop_text)
# Set lookup beats re-fetching + linearly scanning the stopword list per token.
stop_words = set(stopwords.words('english'))
filtered = [w for w in tokens if w not in stop_words]
finder = TrigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(2)
finder.nbest(trigram_measures.raw_freq, 5)  # top 5 trigrams by raw frequency


Out[83]:
[('taylor', '7506', 'scale'),
 ('7506', 'bathroom', 'scale'),
 ('taylor', '7506', 'bathroom'),
 ('scale', 'consumer', 'reports'),
 ('accurate', 'easy', 'read')]

In [85]:
# Single-argument print() is valid under both Python 2 and 3.  Note this only
# shows the object repr; the nbest() output above is the informative view.
print(finder)


<nltk.collocations.TrigramCollocationFinder object at 0x106541150>

In [ ]: