In [267]:
import MySQLdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.cluster import KMeans, DBSCAN
%matplotlib inline

In [169]:
# Returns the ratings and timestamps (sorted chronologically) for a given PID in tablename,
# with cursor pointing at the database connection.
def get_data(PID, cursor, tablename):
    sql = "SELECT RTime, RScore FROM " + tablename + " WHERE PID = " + '"' + PID + '";'
    cursor.execute(sql)
    data = cursor.fetchall()
    data = sorted(data)
    rating = np.array(zip(*data)[1], dtype = int)
    time = np.array(zip(*data)[0], dtype = float)
    #dates=[dt.datetime.fromtimestamp(ts) for ts in time]
    return rating, time#, dates
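
The string concatenation above works, but it leaves the quoting of PID to us. MySQLdb can also substitute parameters itself; a minimal sketch of that variant, assuming the same table layout (the table name still has to be concatenated, since only values can be parameterized):

In [ ]:
# Variant of get_data that lets the driver quote the PID (a sketch; same columns assumed).
def get_data_param(PID, cursor, tablename):
    sql = "SELECT RTime, RScore FROM " + tablename + " WHERE PID = %s;"
    cursor.execute(sql, (PID,))   # MySQLdb escapes and substitutes the PID
    data = sorted(cursor.fetchall())
    rating = np.array([row[1] for row in data], dtype=int)
    time = np.array([row[0] for row in data], dtype=float)
    return rating, time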

In [170]:
# Running average of the ratings: avg[k] is the mean of the first k ratings
# (the review at index k itself is not included; avg[0] is just the first rating).
def avg_rating(rating):
    avg = [0]*len(rating)
    avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        avg[k] = float(np.mean(rating[:k]))
    return avg
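
The same running average can be computed without the Python loop. A numpy sketch that reproduces the convention above (entry k averages the first k ratings only, so the k-th review itself is excluded); it returns an array rather than a list:

In [ ]:
# Vectorized version of avg_rating; matches the loop above entry for entry.
def avg_rating_vec(rating):
    avg = np.empty(len(rating))
    avg[0] = rating[0]
    avg[1:] = np.cumsum(rating, dtype=float)[:-1] / np.arange(1, len(rating))
    return avg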

In [171]:
# Returns the longest time span covered by any 1/4 of consecutive reviews (the "unpopular" stretch)
# and the shortest such span (the "popular" stretch), using a sliding window of len(time)/4 reviews.
def pop_time(time):
    unpopmin = time[0]
    unpopmax = time[0]
    popmin = time[0]
    popmax = time[-1]
    slidersize = int(len(time)/4)
    for i in range(slidersize, len(time)):  # i marks the end of the sliding window
        windowsize = time[i] - time[i - slidersize]
        if windowsize > unpopmax - unpopmin:
            unpopmax = time[i]
            unpopmin = time[i - slidersize]
        if windowsize < popmax - popmin:
            popmax = time[i]
            popmin = time[i - slidersize]
    return unpopmin, unpopmax, popmin, popmax
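
The same windows can be found with array arithmetic. A sketch that assumes at least four reviews and ignores how ties are broken, so its endpoints can differ from pop_time when several windows have exactly the same width:

In [ ]:
# Vectorized sketch of pop_time: the width of every window of slidersize consecutive
# reviews, then the widest ("unpopular") and narrowest ("popular") ones.
def pop_time_vec(time):
    time = np.asarray(time, dtype=float)
    slidersize = int(len(time)/4)
    widths = time[slidersize:] - time[:-slidersize]   # widths[j] is the window ending at j + slidersize
    wide = np.argmax(widths) + slidersize
    narrow = np.argmin(widths) + slidersize
    return time[wide - slidersize], time[wide], time[narrow - slidersize], time[narrow]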

In [172]:
# Average slope of the cumulative avg rating (or whatever series you pass it) over each quarter of the
# reviews. Because of sparseness effects the quarters are split by number of reviews rather than by
# elapsed time; it is not clear that this is the best choice.

# The first-quarter slope starts at review 10 to dodge early noise, so it needs a reasonable number of
# reviews (the code requires more than 45) and is returned as 0 otherwise.

# Note: the input timestamps must be unix timestamps (seconds); slopes come out in stars per year.
def quarterly_slopes(ratings, timestamp):
    time = [float(ts) / (60*60*24*365) for ts in timestamp]  # seconds -> years
    q = (len(ratings)/4) - 1
    q1 = 0
    if len(ratings) > 45:
        q1 = float((ratings[q] - ratings[10])/(time[q] - time[10]))
    q2 = float((ratings[2*q] - ratings[q])/(time[2*q] - time[q]))
    q3 = float((ratings[3*q] - ratings[2*q])/(time[3*q] - time[2*q]))
    q4 = float((ratings[4*q] - ratings[3*q])/(time[4*q] - time[3*q]))
    return q1, q2, q3, q4
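
A quick sanity check on made-up numbers: if the running average falls by exactly one star per year, every returned slope should come out near -1 (synthetic data, not from the database):

In [ ]:
# Hypothetical check: 80 weekly "reviews" whose running average drops one star per year.
week = 60.0 * 60 * 24 * 7
year = 60.0 * 60 * 24 * 365
toy_time = [1.2e9 + k * week for k in range(80)]
toy_avg = [4.5 - (k * week) / year for k in range(80)]
print(quarterly_slopes(toy_avg, toy_time))   # roughly (-1.0, -1.0, -1.0, -1.0)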

In [173]:
# Checks whether the first ten entries are "significantly" higher than the overall average,
# meaning by more than .5 stars.
def starts_high(ratings, time):
    startavg = float(sum(ratings[:10])) / 10
    avg = float(sum(ratings)) / len(ratings)
    return startavg > avg + .5

In [263]:
# Feature vector; needs at least ~50 reviews for all of the entries to be meaningful.
# Entries 1-4: does the least / most popular quarter of reviews start in the first 10% or end in the last 10% of the history?
# Entries 5-8: quarterly slopes of the running average, scaled by 10.
# Entry 9: do the first reviews sit more than .5 stars above average? Entry 10: the final running average.
def make_features_vec(ratings, time):
    vec = []
    n = len(ratings)
    unpopmin, unpopmax, popmin, popmax = pop_time(time)
    q1, q2, q3, q4 = quarterly_slopes(ratings, time)
    #bit 1
    if unpopmin <= time[n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 2
    if unpopmax >= time[9*n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 3
    if popmin <= time[n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 4
    if popmax >= time[9*n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 5
    vec.append(q1*10)
    #bit 6
    vec.append(q2*10)
    #bit 7
    vec.append(q3*10)
    #bit 8
    vec.append(q4*10)
    # (an earlier version thresholded the four slopes into -1/0/+1 instead of scaling them by 10)
    #bit 9
    if starts_high(ratings, time):
        vec.append(1)
    else:
        vec.append(0)
    #bit 10
    vec.append(ratings[-1])
    return vec

In [248]:
database = "home_kitchen"
tablename = "all_hk"
numids = 10

In [249]:
db = MySQLdb.connect(host="localhost", user="root", db = database)
cursor = db.cursor()

In [250]:
sql = "Select PID from (SELECT distinct PID, count(*) as magnitude from " + tablename + " group by pid having magnitude > 100) as x limit " +str(numids) +";"

In [251]:
cursor.execute(sql)


Out[251]:
100L

In [252]:
pids = cursor.fetchall()

In [253]:
pids = tuple(x[0] for x in pids)

In [254]:
pid_test = ' B000GXZ2GS'

In [255]:
rating_test, time_test = get_data(pid_test, cursor, tablename)

In [256]:
rating_test = avg_rating(rating_test)   # replace the raw ratings with their running average

In [257]:
print pop_time(time_test)


(1153267200.0, 1238803200.0, 1333497600.0, 1360886400.0)

In [258]:
print max(time_test), min(time_test)


1360886400.0 1153267200.0

In [259]:
print starts_high(rating_test, time_test)


False

In [260]:
print quarterly_slopes(rating_test, time_test)


(-0.11295193668731641, -0.03352036792272728, -0.016066588048383186, -0.01803819363162063)

In [261]:
starts_high(rating_test, time_test) == False


Out[261]:
True

In [264]:
print make_features_vec(rating_test, time_test)


[1, 0, 0, 1, -1.1295193668731642, -0.3352036792272728, -0.16066588048383185, -0.18038193631620628, 0, 4.63490099009901]
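
The same vector can be built for every product pulled above and handed to the clustering algorithms imported at the top. A sketch of that step, reusing pids, cursor, and tablename from this session; the choice of three clusters is an arbitrary placeholder, not a tuned value:

In [ ]:
# Sketch of the clustering step: one feature vector per PID, then k-means on the stack.
features = []
for pid in pids:
    r, t = get_data(pid, cursor, tablename)
    features.append(make_features_vec(avg_rating(r), t))
features = np.array(features)
km = KMeans(n_clusters=3)            # 3 clusters is a placeholder choice
labels = km.fit_predict(features)
print(labels)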

In [265]:
dates=[dt.datetime.fromtimestamp(ts) for ts in time_test]

In [266]:
plt.scatter(dates, rating_test)


Out[266]:
<matplotlib.collections.PathCollection at 0x10f689e50>
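
The scatter works, but the raw date tick labels tend to overlap. A small sketch that redraws it with rotated date ticks and axis labels, reusing the same dates and rating_test variables:

In [ ]:
# Same scatter with readable date ticks (sketch; reuses dates and rating_test).
fig, ax = plt.subplots()
ax.scatter(dates, rating_test)
ax.set_xlabel("review date")
ax.set_ylabel("running average rating")
fig.autofmt_xdate()                  # rotate the date tick labels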
