In [1]:
import nltk
from nltk.collocations import *
import string
import MySQLdb
from collections import Counter
from nltk.corpus import stopwords
import numpy as np
from nltk.stem.porter import *
from Queue import PriorityQueue
import datetime as dt
# Association-measure scorers used by the collocation finders in later cells.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
In [2]:
def avg_rating(rating):
    """Running average of a rating sequence.

    avg[k] is the mean of rating[0..k] inclusive.  The original computed
    np.mean(rating[:k]) — exclusive of element k — which contradicted its own
    explicit base case avg[0] = rating[0]; that off-by-one is fixed here.

    Returns a list of floats; an empty list for empty input (the original
    raised IndexError on the avg[0] assignment).
    """
    ratings = np.asarray(rating, dtype=float)
    if ratings.size == 0:
        return []
    # Cumulative mean in O(n) instead of re-averaging a growing prefix (O(n^2)).
    return (np.cumsum(ratings) / np.arange(1, ratings.size + 1)).tolist()
In [3]:
def pop_time(time):
    """Locate the least- and most-popular stretches of a sorted review-time series.

    Slides a window of len(time)//4 consecutive reviews across `time`
    (ascending timestamps); the widest such window is the sparsest
    ("unpopular") stretch and the narrowest is the densest ("popular") one.

    Returns (unpopmin, unpopmax, popmin, popmax) timestamps.
    """
    unpopmin = time[0]
    unpopmax = time[0]
    popmin = time[0]
    popmax = time[len(time) - 1]
    # Guard against a zero-width slider for very short series (len(time) < 4).
    slidersize = max(int(len(time) / 4), 1)
    for i in range(slidersize, len(time)):  # i marks the end of the slider
        windowsize = time[i] - time[i - slidersize]
        # Widest window of `slidersize` reviews = sparsest (least popular) stretch.
        if windowsize > unpopmax - unpopmin:
            unpopmax = time[i]
            unpopmin = time[i - slidersize]
        # Narrowest window = densest (most popular) stretch.
        if windowsize < popmax - popmin:
            popmax = time[i]
            popmin = time[i - slidersize]
    # %-formatting keeps this print working under both Python 2 and 3
    # with identical output to the original `print a, b`.
    print("%s %s" % (dt.datetime.fromtimestamp(popmin),
                     dt.datetime.fromtimestamp(popmax)))
    return unpopmin, unpopmax, popmin, popmax
In [4]:
def first_pop_time(time):
    """Find the first "burst" of reviews in a sorted review-time series.

    Scans a window of max(len//4, 4) consecutive reviews; the first window
    that is dense (narrower than a quarter of the full time span) and is not
    preceded by a long quiet gap (the 3 reviews before it arrive within 3x
    the average inter-review time) is taken as the first popularity burst.

    Returns (firstpopmin, firstpopmax); falls back to the full span if no
    qualifying window exists.
    """
    firstpopmin = time[0]
    firstpopmax = time[len(time) - 1]
    slidersize = max(int(len(time) / 4), 4)
    avtime = (time[len(time) - 1] - time[0]) / len(time)
    for i in range(slidersize, len(time)):  # i marks the end of the slider
        windowsize = time[i] - time[i - slidersize]
        start = i - slidersize
        # Clamp the lookback index at 0: the original used
        # time[i - slidersize - 3], which goes negative for early i and
        # silently wraps to the END of the list (Python negative indexing).
        prev = time[max(start - 3, 0)]
        if windowsize < ((time[len(time) - 1] - time[0]) / 4) and (time[start] - prev) < 3 * avtime:
            firstpopmax = time[i]
            firstpopmin = time[start]
            break
    # %-formatting keeps this print working under both Python 2 and 3.
    print("%s %s" % (dt.datetime.fromtimestamp(firstpopmin),
                     dt.datetime.fromtimestamp(firstpopmax)))
    return firstpopmin, firstpopmax
In [5]:
#returns ratings and time for a given pid in tablename with cursor pointing toward the database
def get_data(PID, cursor, tablename):
sql = "Select RTime, RScore From " +tablename + " Where PID = " + '"' + PID +'";'
cursor.execute(sql)
data = cursor.fetchall()
data = sorted(data)
rating = np.array(zip(*data)[1], dtype = int)
time = np.array(zip(*data)[0], dtype = float)
#dates=[dt.datetime.fromtimestamp(ts) for ts in time]
return rating, time#, dates
In [6]:
def get_tokens(text):
    """Lower-case `text`, strip all punctuation, and tokenize with NLTK."""
    lowered = text.lower()
    # Py2 str.translate with a None table deletes every punctuation character.
    stripped = lowered.translate(None, string.punctuation)
    return nltk.word_tokenize(stripped)
In [7]:
def get_all_review_text(PID, cursor, tablename):
    """Concatenate every review summary and review body for one product.

    Fetches all RSummary values first, then all RText values, preserving the
    original concatenation order.
    """
    parts = []
    for column in ("RSummary", "RText"):
        # PID is a bound parameter (injection-safe); the table/column names
        # cannot be parameterized, so they are interpolated.
        sql = "Select " + column + " From " + tablename + " Where PID = %s"
        cursor.execute(sql, (PID,))
        parts.extend(row[0] for row in cursor.fetchall())
    # ''.join is linear; the original `text = text + s` loop was quadratic and
    # its loop variable also shadowed the stdlib `string` module.
    return ''.join(parts)
In [8]:
def get_popular_review_text(PID, cursor, tablename):
    """Concatenate summaries and bodies of reviews posted during the densest
    ("most popular") review window, as located by pop_time().
    """
    rating, time = get_data(PID, cursor, tablename)
    _, _, popmin, popmax = pop_time(time)
    parts = []
    for column in ("RSummary", "RText"):
        # PID and the window bounds are bound parameters (injection-safe);
        # table/column names cannot be parameterized.
        sql = ("Select " + column + " From " + tablename +
               " Where PID = %s and rtime > %s and rtime < %s")
        cursor.execute(sql, (PID, popmin, popmax))
        parts.extend(row[0] for row in cursor.fetchall())
    # Linear join instead of the original quadratic `text = text + s` loop,
    # whose loop variable also shadowed the stdlib `string` module.
    return ''.join(parts)
In [9]:
def get_firstpop_review_text(PID, cursor, tablename):
    """Concatenate summaries and bodies of reviews posted during the first
    popularity burst, as located by first_pop_time().
    """
    rating, time = get_data(PID, cursor, tablename)
    popmin, popmax = first_pop_time(time)
    parts = []
    for column in ("RSummary", "RText"):
        # PID and the window bounds are bound parameters (injection-safe);
        # table/column names cannot be parameterized.
        sql = ("Select " + column + " From " + tablename +
               " Where PID = %s and rtime > %s and rtime < %s")
        cursor.execute(sql, (PID, popmin, popmax))
        parts.extend(row[0] for row in cursor.fetchall())
    # Linear join instead of the original quadratic `text = text + s` loop,
    # whose loop variable also shadowed the stdlib `string` module.
    return ''.join(parts)
In [10]:
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving order."""
    return [stemmer.stem(token) for token in tokens]
In [84]:
def text_to_counter(text):
    """Tokenize, stop-word-filter, and stem `text`; return a Counter of stem
    frequencies restricted to stems occurring more than once.

    The original subtracted 1 from every count, dropped non-positive entries
    via `+= Counter()`, then added 1 back — i.e. "keep stems with count >= 2".
    That intent is expressed directly here.  An unused Counter of the
    unstemmed tokens was also removed.
    """
    stemmer = PorterStemmer()
    tokens = get_tokens(text)
    filtered = [w for w in tokens if w not in stopwords.words('english')]
    counts = Counter(stem_tokens(filtered, stemmer))
    return Counter({stem: n for stem, n in counts.items() if n > 1})
In [12]:
# MySQL database/table holding the Amazon "Home & Kitchen" review data.
database = "home_kitchen"
tablename = "all_hk"
In [13]:
# NOTE(review): connects as root with no password — assumes a local
# passwordless account; prefer credentials from env/config over hardcoding.
db = MySQLdb.connect(host="localhost", user="root", db = database)
cursor = db.cursor()
In [72]:
# Product under analysis.  NOTE(review): the PID strings carry a leading
# space — presumably matching how PIDs are stored in the table; verify.
#PID = ' B00005AQ9Q' #bad reviews so no one buys it. 99 reviews.
PID = ' B0000E2PEI' #featured in consumer reports example
#PID = ' B0000X7CMQ'
In [73]:
# Build the three corpora (all reviews / densest window / first burst) and
# their stem-frequency counters.
stemmer = PorterStemmer()  # NOTE(review): unused here — text_to_counter makes its own
all_text = get_all_review_text(PID, cursor, tablename)
pop_text = get_popular_review_text(PID, cursor, tablename)
first_pop_text = get_firstpop_review_text(PID, cursor, tablename)
count_all = text_to_counter(all_text)
count_pop = text_to_counter(pop_text)
count_firstpop = text_to_counter(first_pop_text)
#print count_all.most_common(100), '\n'
#print count_pop.most_common(100), '\n'
#print count_firstpop.most_common(100)
# Corpus sizes in CHARACTERS (len of the raw text), not token counts.
all_len = len(all_text)
pop_len = len(pop_text)
firstpop_len = len(first_pop_text)
#print all_len, pop_len, firstpop_len
In [74]:
# Normalize stem counts by corpus length.  NOTE(review): the denominators are
# character counts of the raw text, not token counts — confirm intended.
ratio_all = {x: float(n)/float(all_len) for x, n in count_all.items()}
ratio_pop = {x: float(n)/float(pop_len) for x, n in count_pop.items()}
ratio_firstpop = {x: float(n)/float(firstpop_len) for x, n in count_firstpop.items()}
#print ratio_all, '\n\n\n', ratio_pop
# How over-represented each stem is in the popular / first-burst window
# relative to the whole corpus (every windowed stem also appears in
# count_all, since the windows are substrings of all_text).
popratios = Counter({x: n/ratio_all[x] for x, n in ratio_pop.items()})
firstpopratios = Counter({x: n/ratio_all[x] for x, n in ratio_firstpop.items()})
#print count_all.most_common(100), '\n'
#print count_pop.most_common(100), '\n'
#print count_firstpop.most_common(100)
#print {word for word, n in popratios.most_common(30)}, '\n'
#print {word for word, n in firstpopratios.most_common(30)}
In [75]:
#finder = BigramCollocationFinder.from_words(pop_text)
In [76]:
# Top-10 bigram collocations over ALL reviews (likelihood ratio, min freq 3).
tokens = get_tokens(all_text)
filtered = [w for w in tokens if not w in stopwords.words('english')]
finder = BigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(3)
finder.nbest(bigram_measures.likelihood_ratio, 10)
Out[76]:
In [77]:
# Top-10 trigram collocations over ALL reviews (raw frequency, min freq 2).
tokens = get_tokens(all_text)
filtered = [w for w in tokens if not w in stopwords.words('english')]
finder = TrigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(2)
finder.nbest(trigram_measures.raw_freq, 10)
Out[77]:
In [82]:
# Top-5 bigram collocations within the FIRST popularity burst
# (likelihood ratio, min freq 3).
tokens = get_tokens(first_pop_text)
filtered = [w for w in tokens if not w in stopwords.words('english')]
finder = BigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(3)
for bigram in finder.nbest(bigram_measures.likelihood_ratio, 5):
    print bigram
In [83]:
# Top-5 trigram collocations within the FIRST popularity burst
# (raw frequency, min freq 2).
tokens = get_tokens(first_pop_text)
filtered = [w for w in tokens if not w in stopwords.words('english')]
finder = TrigramCollocationFinder.from_words(filtered)
finder.apply_freq_filter(2)
finder.nbest(trigram_measures.raw_freq, 5)
Out[83]:
In [85]:
print finder
In [ ]: