In [1]:
from flask import render_template, _app_ctx_stack, jsonify
from app import app, host, port, user, passwd, db
from app.helpers.database import con_db
import sys
import simplejson
import jinja2
import nltk
from nltk.collocations import *
import string
import MySQLdb
from collections import Counter
from nltk.corpus import stopwords
import numpy as np
from nltk.stem.porter import *
from Queue import PriorityQueue
import datetime as dt
import heapq
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [2]:
def get_db():
    # Open a fresh connection to the home_kitchen database; the Flask
    # app-context caching below is left disabled.
    db = MySQLdb.connect(host="localhost", user="root", db="home_kitchen")
    #top = _app_ctx_stack.top
    #if not hasattr(top, 'home_kitchen_db'):
    #    top.home_kitchen_db = MySQLdb.connect(host="localhost", user="root", db="home_kitchen")
    return db

In [3]:

def query_db(query):
    # Run a query against the home_kitchen database and return all rows,
    # or the sentinel string "error" when nothing matches.
    cursor = get_db().cursor()
    cursor.execute(query)
    data = cursor.fetchall()
    if data:
        return data
    else:
        return "error"

In [4]:
def avg_rating(rating):
    # Running average of the ratings: avg[k] is the mean of the first k
    # ratings (the first entry simply seeds itself with rating[0]).
    avg = [0]*len(rating)
    avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        avg[k] = float(np.mean(rating[:k]))
    return avg
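
A quick sanity check on made-up ratings (the sample list below is hypothetical, not from the review tables); note that each entry averages only the ratings before it, so the first two values repeat:

sample = [5, 3, 4, 1, 5]      # hypothetical rating sequence
print avg_rating(sample)      # [5.0, 5.0, 4.0, 4.0, 3.25]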

In [5]:
def first_pop_time(time):
    # Slide a window of slidersize consecutive timestamps over the sorted
    # series and stop at the first window compressed into less than a quarter
    # of the product's whole review span, provided the three reviews just
    # before the window also arrived quickly (i.e. the ramp-up has begun).
    firstpopmin = time[0]
    firstpopmax = time[len(time)-1]
    slidersize = max(int(len(time)/4), 4)
    avtime = (time[len(time)-1] - time[0])/len(time)
    for i in range(slidersize, len(time)):  # i marks the end of the slider
        windowsize = time[i] - time[i - slidersize]
        # Caveat: for the first few i, the index i - slidersize - 3 is negative
        # and wraps to the end of the list, so the ramp-up test is trivially
        # satisfied there.
        if windowsize < ((time[len(time)-1] - time[0])/4) and (time[i - slidersize] - time[i - slidersize - 3]) < 3*avtime:
            firstpopmax = time[i]
            firstpopmin = time[i - slidersize]
            break
    print dt.datetime.fromtimestamp(firstpopmin), dt.datetime.fromtimestamp(firstpopmax)
    return firstpopmin, firstpopmax
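
A minimal sketch of the window heuristic on synthetic Unix timestamps (day and demo_times are hypothetical, not from the review tables): five scattered early reviews followed by a dense run of fifteen daily reviews. The detected window starts a few reviews into the dense run, once the ramp-up test is also satisfied.

day = 86400
demo_times = sorted([0, 40*day, 90*day, 150*day, 200*day] +
                    [(300 + k)*day for k in range(15)])
popmin, popmax = first_pop_time(demo_times)   # prints the detected window bounds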

In [18]:
# Returns the ratings, timestamps, summaries, and review texts for a given
# PID in tablename, sorted by review time. (The SQL is built by string
# concatenation throughout this notebook; parameterized queries would be
# safer outside an exploratory setting.)
def get_data(PID, tablename):
    sql = "Select RTime, RScore, RSummary, RText From " + tablename + " Where PID = " + '"' + PID + '";'
    data = query_db(sql)
    data = sorted(data)
    cols = zip(*data)
    rating = np.array(cols[1], dtype=int)
    time = np.array(cols[0], dtype=float)
    return rating, time, cols[2], cols[3]

In [7]:
def get_tokens(text):
    # Lowercase, strip punctuation, tokenize, and drop English stopwords.
    lowers = text.lower()
    no_punctuation = lowers.translate(None, string.punctuation)
    tokens = nltk.word_tokenize(no_punctuation)
    filtered = [w for w in tokens if w not in stopwords.words('english')]
    return filtered

In [9]:
def get_tot_review_text(PID, tablename):
    # Concatenate all review summaries and texts posted before the end of
    # the first popularity window.
    rating, time, summaries, texts = get_data(PID, tablename)
    popmin, popmax = first_pop_time(time)
    text = ''
    sql = "Select RSummary From " + tablename + " Where PID = " + '"' + PID + '"' + " and rtime < " + str(popmax) + ";"
    rtext = query_db(sql)
    rtext = tuple(x[0] for x in rtext)
    for chunk in rtext:
        text = text + chunk
    sql = "Select RText From " + tablename + " Where PID = " + '"' + PID + '"' + " and rtime < " + str(popmax) + ";"
    rtext = query_db(sql)
    rtext = tuple(x[0] for x in rtext)
    for chunk in rtext:
        text = text + chunk
    return text

In [10]:
def get_pop_review_text(PID, tablename):
    # Concatenate the summaries and texts of reviews posted inside the first
    # popularity window; also return the raw review texts and the window start.
    rating, time, summaries, texts = get_data(PID, tablename)
    popmin, popmax = first_pop_time(time)
    text = ''
    sql = "Select RSummary From " + tablename + " Where PID = " + '"' + PID + '"' + ' and rtime > ' + str(popmin) + " and rtime < " + str(popmax) + ";"
    rtext = query_db(sql)
    rtext = tuple(x[0] for x in rtext)
    for chunk in rtext:
        text = text + chunk
    sql = "Select RText From " + tablename + " Where PID = " + '"' + PID + '"' + ' and rtime > ' + str(popmin) + " and rtime < " + str(popmax) + ";"
    rtext = query_db(sql)
    rtext = tuple(x[0] for x in rtext)
    for chunk in rtext:
        text = text + chunk
    return text, rtext, popmin

In [8]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [9]:
def best_bigram_collector(finder, n, Ptitle):
    # Keep the top-n Jaccard-ranked bigrams whose words have not already been
    # used by a higher-ranked bigram and do not appear in the product title.
    candidates = finder.nbest(bigram_measures.jaccard, n*10)
    Ptitle = get_tokens(str(Ptitle))
    Ptitle.append("used")   # extra over-represented word to exclude
    bests = []
    count = 0
    words = []
    for item in candidates:
        if count < n:
            if item[0] not in words and item[1] not in words and item[0] not in Ptitle and item[1] not in Ptitle:
                bests.append(item)
                count = count + 1
            words.append(item[0])
            words.append(item[1])
        else:
            break
    return bests

In [10]:
def best_trigram_collector(finder, n, Ptitle):
    # Same idea as best_bigram_collector, but ranked by raw frequency and
    # checking all three words of each trigram.
    candidates = finder.nbest(trigram_measures.raw_freq, n*10)
    Ptitle = get_tokens(str(Ptitle))
    # Add any other words that come up a lot and should be excluded.
    Ptitle.append("used")
    bests = []
    count = 0
    words = []
    for item in candidates:
        if count < n:
            if item[0] not in words and item[1] not in words and item[2] not in words and item[0] not in Ptitle and item[1] not in Ptitle and item[2] not in Ptitle:
                bests.append(item)
                count = count + 1
            words.append(item[0])
            words.append(item[1])
            words.append(item[2])
        else:
            break
    return bests

In [13]:
def get_better_review_text(reviews, timemin, timemax):
    # Concatenate the text of an already-fetched tuple of review rows.
    text = ''
    rtext = tuple(x[0] for x in reviews)
    for chunk in rtext:
        text = text + chunk
    return text, rtext

In [ ]:


In [14]:
database = "home_kitchen"
tablename = "all_hk"
db = MySQLdb.connect(host="localhost", user="root", db = database)
cursor = db.cursor()

In [15]:
#PID = ' B00005AQ9Q' #bad reviews so no one buys it. 99 reviews. 
PID = ' B0000E2PEI' #featured in consumer reports example
#PID = ' B0000X7CMQ'
#PID = ' B000S5XYI2'
#PID = ' B00019G8IS'
#PID = ' B000GTR2F6'
#PID = ' B000GXZ2GS'  #really good!
#PID = ' B0000DIU49'

In [19]:
#tot_text = get_tot_review_text(PID, tablename)
pop_text, pop_revs, popmin = get_pop_review_text(PID, tablename)
ratings, time, RSummary, RText = get_data(PID, tablename)

In [17]:
#tot_tokens = get_tokens(tot_text)
pop_tokens = get_tokens(pop_text)

In [18]:
cursor.execute("Select distinct PTitle from all_hk where PID = " + '"' + PID + '";')
ptitle = cursor.fetchall()
ptitle = tuple(x[0] for x in ptitle)
ptitle = get_tokens(ptitle[0])
print ptitle


['zojirushi', 'ecbd15ba', 'fresh', 'brew', 'thermal', 'carafe', 'coffee', 'maker']

In [19]:
finder = BigramCollocationFinder.from_words(pop_tokens)
finder.apply_freq_filter(4)
finder.nbest(bigram_measures.jaccard, 20)


Out[19]:
[('consumer', 'reports'),
 ('black', 'decker'),
 ('stopped', 'working'),
 ('stainless', 'steel'),
 ('customer', 'service'),
 ('level', 'indicator'),
 ('opening', 'closing'),
 ('200', 'degrees'),
 ('coffee', 'maker'),
 ('30', 'minutes'),
 ('thermal', 'carafe'),
 ('ecbd15', 'fresh'),
 ('job', 'keeping'),
 ('warming', 'plate'),
 ('zojirushi', 'ecbd15'),
 ('something', 'else'),
 ('easy', 'clean'),
 ('room', 'temp'),
 ('4', 'filters'),
 ('highly', 'recommend')]

In [20]:
finder = TrigramCollocationFinder.from_words(pop_tokens)
finder.apply_freq_filter(2)
finder.nbest(trigram_measures.raw_freq, 20)


Out[20]:
[('great', 'coffee', 'maker'),
 ('keeps', 'coffee', 'hot'),
 ('thermal', 'carafe', 'coffee'),
 ('good', 'cup', 'coffee'),
 ('carafe', 'coffee', 'maker'),
 ('coffee', 'stays', 'hot'),
 ('makes', 'good', 'coffee'),
 ('keep', 'coffee', 'hot'),
 ('brew', 'thermal', 'carafe'),
 ('fresh', 'brew', 'thermal'),
 ('carafe', 'keeps', 'coffee'),
 ('coffee', 'maker', 'makes'),
 ('ecbd15', 'fresh', 'brew'),
 ('hot', 'long', 'time'),
 ('coffee', 'hot', 'hours'),
 ('keep', 'coffee', 'warm'),
 ('makes', 'great', 'coffee'),
 ('best', 'coffee', 'maker'),
 ('coffee', 'maker', 'good'),
 ('love', 'coffee', 'maker')]

In [21]:
finder = BigramCollocationFinder.from_words(pop_tokens)
finder.apply_freq_filter(4)
if finder:
    bestbigrams = best_bigram_collector(finder, 5, ptitle)
print bestbigrams



[('consumer', 'reports'), ('black', 'decker'), ('stopped', 'working'), ('stainless', 'steel'), ('customer', 'service')]

In [22]:
finder = TrigramCollocationFinder.from_words(pop_tokens)
#finder.apply_freq_filter(2)
if finder:
    besttrigrams = best_trigram_collector(finder, 5, ptitle)
print besttrigrams


[('4', 'cone', 'filters')]

In [23]:
##### Now I'm going to try to snag representative reviews.

In [24]:
keywords = [item for sublist in bestbigrams for item in sublist] + [item for sublist in besttrigrams for item in sublist]
print keywords


['consumer', 'reports', 'black', 'decker', 'stopped', 'working', 'stainless', 'steel', 'customer', 'service', '4', 'cone', 'filters']

In [25]:
print len(pop_revs)
# Score each review in the popularity window by keyword density
# (keyword hits divided by review character length).
count = [0]*len(pop_revs)
for i, rev in enumerate(pop_revs):
    if len(rev) > 0:
        for word in get_tokens(rev):
            if word in keywords:
                count[i] += 1
        count[i] = float(count[i])/float(len(rev))


288

In [26]:
#print count

In [27]:
# Pick the four reviews with the highest keyword density.
bestrevs = []
top_scores = heapq.nlargest(4, count)
for idx, score in enumerate(count):
    if score in top_scores:
        bestrevs.append(pop_revs[idx])
        print pop_revs[idx], '\n'
#print bestrevs


 A well built,good looking coffee maker,and it makes excellent coffee. The #4 filters work ok,so I wouldn't let that deter you. 

 It doesn't pour well. The lid is crazy and hard to line up. The #4 filters the mfg. recommends have to be cut down because they're too tall. The Bunn stainless steel pour-over is a much better coffee maker but it's also twice the money. 

 Quit working entirely within 2 weeks of purchase, probably made 25 pots of coffee before it stopped. 

 This brewer stopped working in early Dec. 2011, total life of just over 5 years. 


In [28]:
cursor.execute("Select distinct PTitle from all_hk where PID = " + '"' + PID + '";')
Ptitle = cursor.fetchall()
print Ptitle


((' Zojirushi EC-BD15BA Fresh Brew Thermal Carafe Coffee Maker',),)

In [29]:
sql = "Select distinct PTitle from all_hk where PID = " + '"' + PID + '";'
prodname = query_db(sql)
prodname = tuple(x[0] for x in prodname)

In [30]:
print dt.datetime.fromtimestamp(popmin)


2007-09-28 17:00:00

In [31]:
newrevs = []
# Wrap each keyword occurrence in <b> tags for display.
for rev in bestrevs:
    hold = ''
    for word in rev.split():
        if word in keywords:
            hold = hold + " <b>" + word + "</b> "
        else:
            hold = hold + ' ' + word + ' '
    newrevs.append(hold)

In [32]:
print newrevs[0]


 A  well  built,good  looking  coffee  maker,and  it  makes  excellent  coffee.  The  #4  <b>filters</b>  work  ok,so  I  wouldn't  let  that  deter  you. 

In [33]:
product_id = "Wall Clock"
PID = ' ' + product_id
tablename = 'all_hk'
query = "Select RTime, RScore, RSummary, RText From " + tablename + " Where PID = " + '"' + PID + '" ORDER BY RTime ASC;'


# Did they input a PID, a title, or neither?
data = query_db(query)
if data == "error":
    print "not a pid"
    query = "Select PID, PTitle From " + tablename + " Where PTitle Like " + '"%' + product_id + '%" Limit 11'
    prodlist = query_db(query)
    PID = prodlist[0][0]


not a pid

In [ ]: