In [1]:
from flask import render_template, _app_ctx_stack, jsonify
from app import app, host, port, user, passwd, db
from app.helpers.database import con_db
import sys
import simplejson
import jinja2
import nltk
from nltk.collocations import *
import string
import MySQLdb
from collections import Counter
from nltk.corpus import stopwords
import numpy as np
from nltk.stem.porter import *
from Queue import PriorityQueue
import datetime as dt
import heapq
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
In [2]:
def get_db():
db = MySQLdb.connect(host="localhost", user="root", db = "home_kitchen")
#print "Getting DB"
#top = _app_ctx_stack.top
#if not hasattr(top, 'home_kitchen_db'):
#top.home_kitchen_db = MySQLdb.connect(host="localhost", user="root", db = "home_kitchen")
return db
In [3]:
#def query_db(query):
# #sys.stderr.write("Querying Database with: " + query)
# cursor = get_db().cursor()
# cursor.execute(query)
# return cursor.fetchall()
def query_db(query):
#sys.stderr.write("Querying Database with: " + query)
cursor = get_db().cursor()
cursor.execute(query)
data = cursor.fetchall()
if data:
return data
else:
return "error"
In [4]:
def avg_rating(rating):
avg = [0]*len(rating)
avg[0] = float(rating[0])
for k in range(1, len(rating)):
avg[k]= float(np.mean(rating[:k]))
return avg
In [5]:
def first_pop_time(time):
firstpopmin = time[0]
firstpopmax = time[len(time)-1]
slidermin = 0
slidersize = max(int(len(time)/4), 4)
avtime = (time[len(time)-1] - time[0])/len(time)
for i in range(slidersize, len(time)): #i marks the end of the slider
windowsize = time[i] - time[i - slidersize]
if windowsize < ((time[len(time)-1] - time[0])/4) and (time[i-slidersize]-time[i - slidersize - 3]) < 3*avtime:
firstpopmax = time[i]
firstpopmin = time[i - slidersize]
break;
#dates=[dt.datetime.fromtimestamp(ts) for ts in time]
print dt.datetime.fromtimestamp(firstpopmin), dt.datetime.fromtimestamp(firstpopmax)
return firstpopmin, firstpopmax
In [18]:
#returns ratings and time for a given pid in tablename with cursor pointing toward the database
def get_data(PID, tablename):
sql = "Select RTime, RScore, RSummary, RText From " +tablename + " Where PID = " + '"' + PID +'";'
data = query_db(sql)
data = sorted(data)
rating = np.array(zip(*data)[1], dtype = int)
time = np.array(zip(*data)[0], dtype = float)
#dates=[dt.datetime.fromtimestamp(ts) for ts in time]
return rating, time, data[2], data[3]#, dates
In [7]:
def get_tokens(text):
# with open('/opt/datacourse/data/parts/shakes-1.txt', 'r') as shakes:
#text = shakes.read()
lowers = text.lower()
#remove the punctuation using the character deletion step of translate
no_punctuation = lowers.translate(None, string.punctuation)
tokens = nltk.word_tokenize(no_punctuation)
filtered = [w for w in tokens if not w in stopwords.words('english')]
return filtered
In [9]:
def get_tot_review_text(PID, tablename):
rating, time = get_data(PID, tablename)
popmin, popmax = first_pop_time(time)
text = ''
sql = "Select RSummary From " +tablename + " Where PID = " + '"' + PID +'"' + " and rtime < " + str(popmax) + ";"
rtext = query_db(sql)
rtext = tuple(x[0] for x in rtext)
for string in rtext:
text = text + string
sql = "Select RText From " +tablename + " Where PID = " + '"' + PID +'"' + " and rtime < " + str(popmax) + ";"
cursor.execute(sql)
rtext = cursor.fetchall()
rtext = tuple(x[0] for x in rtext)
for string in rtext:
text = text + string
return text
In [10]:
def get_pop_review_text(PID, tablename):
rating, time = get_data(PID, tablename)
popmin, popmax = first_pop_time(time)
text = ''
sql = "Select RSummary From " +tablename + " Where PID = " + '"' + PID +'"' + ' and rtime > ' + str(popmin)+ " and rtime < " + str(popmax) + ";"
rtext = query_db(sql)
rtext = tuple(x[0] for x in rtext)
for string in rtext:
text = text + string
sql = "Select RText From " +tablename + " Where PID = " + '"' + PID +'"' + ' and rtime > ' + str(popmin)+ " and rtime < " + str(popmax) + ";"
cursor.execute(sql)
rtext = cursor.fetchall()
rtext = tuple(x[0] for x in rtext)
for string in rtext:
text = text + string
return text, rtext, popmin
In [8]:
def stem_tokens(tokens, stemmer):
stemmed = []
for item in tokens:
stemmed.append(stemmer.stem(item))
return stemmed
In [9]:
def best_bigram_collector(finder, n, Ptitle):
#finder.apply_freq_filter(4)
list = finder.nbest(bigram_measures.jaccard, n*10)
Ptitle = get_tokens(str(Ptitle))
Ptitle.append("used")
bests = []
count = 0
words = []
for item in list:
if count < n:
if item[0] not in words and item[1] not in words and item[0] not in Ptitle and item[1] not in Ptitle:
bests.append(item)
words.append(item[0])
words.append(item[1])
#words.append(item[2])
count = count + 1
else:
words.append(item[0])
words.append(item[1])
#words.append(item[2])
else:
break;
return bests
In [10]:
def best_trigram_collector(finder, n, Ptitle):
#finder.apply_freq_filter(4)
list = finder.nbest(trigram_measures.raw_freq, n*10)
Ptitle = get_tokens(str(Ptitle))
#add any other words that come up a lot and want to exclude
Ptitle.append("used")
bests = []
count = 0
words = []
for item in list:
if count < n:
if item[0] not in words and item[1] not in words and item[2] not in words and item[0] not in Ptitle and item[1] not in Ptitle and item[2] not in Ptitle:
bests.append(item)
words.append(item[0])
words.append(item[1])
words.append(item[2])
count = count + 1
else:
words.append(item[0])
words.append(item[1])
words.append(item[2])
else:
break;
return bests
In [13]:
def get_better_review_text(reviews, timemin, timemax):
text = ''
rtext = tuple(x[0] for x in reviews)
for string in rtext:
text = text + string
#sql = "Select RText From " +tablename + " Where PID = " + '"' + PID +'"' + ' and rtime > ' + str(timemin)+ " and rtime < " + str(timemax) + ";"
#rtext = query_db(sql)
#rtext = tuple(x[0] for x in rtext)
#for string in rtext:
#text = text + string
return text, rtext
In [ ]:
In [14]:
database = "home_kitchen"
tablename = "all_hk"
db = MySQLdb.connect(host="localhost", user="root", db = database)
cursor = db.cursor()
In [15]:
#PID = ' B00005AQ9Q' #bad reviews so no one buys it. 99 reviews.
PID = ' B0000E2PEI' #featured in consumer reports example
#PID = ' B0000X7CMQ'
#PID = ' B000S5XYI2'
#PID = ' B00019G8IS'
#PID = ' B000GTR2F6'
#PID = ' B000GXZ2GS' #really good!
#PID = ' B0000DIU49'
In [19]:
#tot_text = get_tot_review_text(PID, tablename)
#pop_text, pop_revs, popmin = get_pop_review_text(PID, tablename)
ratings, time, RSummary, RText = get_data(PID, tablename)
In [17]:
#tot_tokens = get_tokens(tot_text)
pop_tokens = get_tokens(pop_text)
In [18]:
cursor.execute("Select distinct PTitle from all_hk where PID = " + '"' + PID + '";')
ptitle = cursor.fetchall()
ptitle = tuple(x[0] for x in ptitle)
ptitle = get_tokens(ptitle[0])
print ptitle
In [19]:
finder = BigramCollocationFinder.from_words(pop_tokens)
finder.apply_freq_filter(4)
finder.nbest(bigram_measures.jaccard, 20)
Out[19]:
In [20]:
finder = TrigramCollocationFinder.from_words(pop_tokens)
finder.apply_freq_filter(2)
finder.nbest(trigram_measures.raw_freq, 20)
Out[20]:
In [21]:
finder = BigramCollocationFinder.from_words(pop_tokens)
finder.apply_freq_filter(4)
if finder:
bestbigrams = best_bigram_collector(finder, 5, ptitle)
print bestbigrams
#finder.apply_freq_filter(4)
#finder.nbest(bigram_measures.jaccard, 20)
In [22]:
finder = TrigramCollocationFinder.from_words(pop_tokens)
#finder.apply_freq_filter(2)
if finder:
besttrigrams = best_trigram_collector(finder, 5, ptitle)
print besttrigrams
In [23]:
##### now I"m going to try and snag representative reviews.
In [24]:
keywords = [item for sublist in bestbigrams for item in sublist] + [item for sublist in besttrigrams for item in sublist]
print keywords
In [25]:
print len(pop_revs)
count = [0]*len(pop_revs)
i = 0
for rev in pop_revs:
#print rev
if len(rev)>0:
for word in get_tokens(rev):
#print word
if word in keywords:
count[i] += 1
count[i] = float(count[i])/float(len(rev))
i += 1
In [26]:
#print count
In [27]:
bestrevs = []
for x, i in enumerate(count):
if i in heapq.nlargest(4, count):
bestrevs.append(pop_revs[x])
print pop_revs[x], '\n'
#print bestrevs
In [28]:
cursor.execute("Select distinct PTitle from all_hk where PID = " + '"' + PID + '";')
Ptitle = cursor.fetchall()
print Ptitle
In [29]:
sql = "Select distinct PTitle from all_hk where PID = " + '"' + PID + '";'
prodname = str(query_db(sql))
prodname = tuple(x[0] for x in prodname)
In [30]:
print dt.datetime.fromtimestamp(popmin)
In [31]:
newrevs = []
for rev in bestrevs:
hold = ''
for word in rev.split():
if word in keywords:
hold = hold + " <b>"+word + "</b> "
else:
hold = hold + ' ' + word + ' '
newrevs.append(hold)
In [32]:
print newrevs[0]
In [33]:
product_id = "Wall Clock"
PID = ' ' + product_id
tablename = 'all_hk'
query = "Select RTime, RScore, RSummary, RText From " + tablename +" Where PID = " +'"' + PID +'" ORDER BY RTime ASC;'
#did they input a pid, title, or neither?
data = query_db(query)
if data == "error":
print "not a pid"
query = "Select PID, PTitle From " + tablename +" Where PTitle Like " +'"%' + product_id +'%" Limit 11'
prodlist = query_db(query)
PID = prodlist[0][0]
In [ ]: