In [39]:
import MySQLdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.cluster import KMeans, DBSCAN
from matplotlib.backends.backend_pdf import PdfPages

In [40]:
def make_vec_list(pid_list):
    vec_list = []
    for pid in pid_list:
        ratings, time = get_data(pid, cursor, tablename)
        vector = make_features_vec(avg_rating(ratings), time)
        vec_list.append(vector)
    return vec_list

In [41]:
#returns ratings and time for a given pid in tablename with cursor pointing toward the database
def get_data(PID, cursor, tablename):
    sql = "Select RTime, RScore From " +tablename + " Where PID = " + '"' + PID +'";'
    cursor.execute(sql)
    data = cursor.fetchall()
    data = sorted(data)
    rating = np.array(zip(*data)[1], dtype = int)
    time = np.array(zip(*data)[0], dtype = float)
    #dates=[dt.datetime.fromtimestamp(ts) for ts in time]
    return rating, time#, dates

In [42]:
def avg_rating(rating):
    avg = [0]*len(rating)
    avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        avg[k]= float(np.mean(rating[:k]))
    return avg

In [43]:
# This returns the longest time span covering 1/4 of reviews and the shortest time span covering 1/4 of reviews
def pop_time(time): 
    unpopmin = time[0]
    unpopmax = time[0]
    popmin = time[0]
    popmax = time[len(time)-1]
    slidermin = 0
    slidersize = int(len(time)/4)
    for i in range(slidersize, len(time)): #i marks the end of the slider
        windowsize = time[i] - time[i - slidersize]
        if windowsize > unpopmax - unpopmin:
            unpopmax = time[i]
            unpopmin = time[i - slidersize]
        if windowsize < popmax - popmin:
            popmax = time[i]
            popmin = time[i - slidersize]
            
            
    return unpopmin, unpopmax, popmin, popmax

In [44]:
#this gives the average slope of the cumulative avg rating (or whatever you pass it) in the 2nd, 3rd, and 4th quarter of reviews
#on account of sparseness effects I divided by num of reviews rather than time, not clear if that's the best choice

#need a reasonable number of reviews (>40) for this to work 

#note: input must be unix timestamp (seconds)
def quarterly_slopes(ratings, timestamp):
    time = map(lambda foo: float(foo/(60*60*24*365)), timestamp)
    q = (len(ratings)/4)-1
    q1 = 0
    if len(ratings)>45:
        q1 = float((ratings[q]-ratings[10])/(time[q] - time[10]))
    return q1, float((ratings[2*q] - ratings[q])/(time[2*q]-time[q])), float((ratings[3*q]-ratings[2*q])/(time[3*q]-time[2*q])), float((ratings[4*q]-ratings[3*q])/(time[4*q]-time[3*q]))

In [53]:
#This should check if the reviews at the start are "significantly" higher than average; by more than .5 stars
def starts_high(ratings, time):
    startavg = sum(ratings[:10])/10
    avg = sum(ratings)/len(ratings)
    if startavg > avg + .5:
        return True
    else: 
        return False

In [54]:
#features vector, need at least 50? reviews for all these to be meaningful
def make_features_vec(ratings, time):
    vec = []
    n = len(ratings)
    unpopmin, unpopmax, popmin, popmax = pop_time(time)
    q1, q2, q3, q4 = quarterly_slopes(ratings, time)
    #bit 1
    #unpopular at beginning
    if unpopmin <= time[n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 2
    #unpopular at end
    if unpopmax >= time[9*n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 3
    #popular at beginning
    if popmin <= time[n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 4
    #popular at end
    if popmax >= time[9*n/10]:
        vec.append(1)
    else:
        vec.append(0)
    #bit 5
    #qn is slope in quarter n
    #vec.append(q1*10)
    #bit 6
    #vec.append(q2*10)
    #bit 7
    #vec.append(q3*10)
    #bit 8
    #vec.append(q4*10)
#    #bit 5               #this is not pythonic. get better at python. 
    if q1 < -.12:
        vec.append(-1)
    elif q1 > .12:
        vec.append(1)
    else:
        vec.append(0)
    
    # bit 6
    if q2 < -.12:
        vec.append(-1)
    elif q2 > .12:
        vec.append(1)
    else:
        vec.append(0)
    #bit 7
    if q3 < -.12:
        vec.append(-1)
    elif q3 > .12:
        vec.append(1)
    else:
        vec.append(0)
    #bit 8
    if q4 < -.12:
        vec.append(-1)
    elif q4 > 1:
        vec.append(1)
    else:
        vec.append(0)
    #bit 9
    #initial ratings start high
    if starts_high(ratings, time) == True:
        vec.append(1)
    else:
        vec.append(0)
    #bit 10
    #final cumulative average
    vec.append(ratings[len(ratings)-1])
    return vec

In [55]:
def dist_from_centroid(vec_list, centroid):
    centroid = np.array(centroid)
    dists = []
    for vec in vec_list:
        dist = np.linalg.norm(centroid - np.array(vec))
        dists.append(dist)
    return dists

In [55]:


In [56]:
database = "home_kitchen"
tablename = "all_hk"
numids = 1000

In [57]:
db = MySQLdb.connect(host="localhost", user="root", db = database)
cursor = db.cursor()

In [58]:
sql = "Select PID from (SELECT distinct PID, count(*) as magnitude from " + tablename + " group by pid having magnitude > 100) as x limit " +str(numids) +";"

In [59]:
cursor.execute(sql)


Out[59]:
1000L

In [60]:
pids = cursor.fetchall()

In [61]:
pids = tuple(x[0] for x in pids)

In [62]:
vec_list = make_vec_list(pids)

In [90]:
#initializing kmeans
v1 = np.array([1, 0, 0, 1, -1, 0, 0, 0, 1, 3])
v2 = np.array([0, 1, 1, 0, -1, 0, -1, -1, 1, 3])
v3 = np.array([1, 0, 0, 1, -1, 1, 1, 1, 1, 4])
v4 = np.array([1, 0, 0, 1, 0, 1, 1, 0, 0, 4])
v5 = np.array([0, 1, 1, 0, -1, 0, -1, 0, 1, 2])
centroids = np.array([v1, v2, v3, v4, v5])

In [91]:
kmeans = KMeans(n_clusters = 6, init = centroids)
kmeans.fit(vec_list)


Out[91]:
KMeans(copy_x=True,
    init=array([[ 1.,  0.,  0.,  1., -1.,  0.,  0.,  0.,  1.,  3.],
       [ 0.,  1.,  1.,  0., -1.,  0., -1., -1.,  1.,  3.],
       [ 1.,  0.,  0.,  1., -1.,  1.,  1.,  1.,  1.,  4.],
       [ 1.,  0.,  0.,  1.,  0.,  1.,  1.,  0.,  0.,  4.],
       [ 0.,  1.,  1.,  0., -1.,  0., -1.,  0.,  1.,  2.]]),
    max_iter=300, n_clusters=5, n_init=10, n_jobs=1,
    precompute_distances=True, random_state=None, tol=0.0001, verbose=0)

In [92]:
centroids = kmeans.cluster_centers_

In [93]:
print centroids


[[  9.12195122e-01   0.00000000e+00   1.38777878e-16   6.24390244e-01
   -8.04878049e-01  -7.80487805e-02  -8.29268293e-02  -1.21951220e-01
    6.19512195e-01   3.92831082e+00]
 [ -1.33226763e-15   7.85714286e-01   2.57142857e-01   3.57142857e-02
   -9.92857143e-01  -2.07142857e-01  -2.85714286e-02  -2.85714286e-02
    5.00000000e-01   3.84494382e+00]
 [ -6.66133815e-16   8.86178862e-01   8.94308943e-02   5.69105691e-02
    4.87804878e-01   8.13008130e-02   3.25203252e-02  -2.43902439e-02
    4.87804878e-02   4.11942063e+00]
 [  9.52890792e-01   0.00000000e+00   6.42398287e-03   7.15203426e-01
    2.71948608e-01  -3.21199143e-02  -6.85224839e-02  -9.20770878e-02
    1.71306210e-02   4.22695820e+00]
 [  4.15384615e-01   4.76923077e-01   1.23076923e-01   1.38461538e-01
   -1.23076923e-01  -7.23076923e-01  -4.61538462e-01  -1.38461538e-01
    5.84615385e-01   2.67024778e+00]]

In [94]:
labels = kmeans.labels_

In [96]:
pp0 = PdfPages('cluster0.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster 0'

In [97]:
pp1 = PdfPages('cluster1.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster 1'

pp2 = PdfPages('cluster2.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster 1'

pp3 = PdfPages('cluster3.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster'

pp4 = PdfPages('cluster4.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster 0'

pp5 = PdfPages('cluster5.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster 0'

pp6 = PdfPages('cluster6.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster 0'

pp7 = PdfPages('cluster7.pdf')
#metadata = pp.infodict()
#metadata['Title'] = 'plots for cluster 0'

In [98]:
for i in range(200):
    fig = plt.figure(figsize=(10, 5), dpi=100)
    ratings, time = get_data(pids[i], cursor, tablename)
    dates = [dt.datetime.fromtimestamp(ts) for ts in time]
    ratings = avg_rating(ratings)
    filename = 'cluster' + str(labels[i]) + '.pdf'
    if labels[i] == 0:
        plt.scatter(time, ratings)
        plt.savefig(pp0, format = 'pdf')   
    if labels[i] == 1:
        plt.scatter(time, ratings)
        plt.savefig(pp1, format = 'pdf')
    if labels[i] == 2:
        plt.scatter(time, ratings)
        plt.savefig(pp2, format = 'pdf')
    if labels[i] == 3:
        plt.scatter(time, ratings)
        plt.savefig(pp3, format = 'pdf')
    if labels[i] == 4:
        plt.scatter(time, ratings)
        plt.savefig(pp4, format = 'pdf')
    if labels[i] == 5:
        plt.scatter(time, ratings)
        plt.savefig(pp5, format = 'pdf')
    if labels[i] == 6:
        plt.scatter(time, ratings)
        plt.savefig(pp6, format = 'pdf')
    if labels[i] == 7:
        plt.scatter(time, ratings)
        plt.savefig(pp7, format = 'pdf')

In [99]:
pp0.close()
pp1.close()
pp2.close()
pp3.close()
pp4.close()
pp5.close()
pp6.close()
pp7.close()

In [89]:
print centroids


[[  9.10714286e-01   0.00000000e+00   1.31838984e-16   7.20238095e-01
   -7.91666667e-01  -2.97619048e-02  -2.97619048e-02  -7.73809524e-02
    5.77380952e-01   4.06704558e+00]
 [ -8.88178420e-16   8.12500000e-01   2.89062500e-01   3.12500000e-02
   -9.21875000e-01  -4.29687500e-01  -9.37500000e-02  -2.34375000e-02
    5.00000000e-01   3.86226560e+00]
 [  1.30000000e-01   7.70000000e-01   5.00000000e-02   3.00000000e-02
   -3.00000000e-02   6.80000000e-01   1.30000000e-01  -2.00000000e-02
    1.50000000e-01   4.05583337e+00]
 [  6.88311688e-01   1.81818182e-01   3.89610390e-02   2.20779221e-01
   -8.05194805e-01  -5.58441558e-01  -4.28571429e-01  -2.07792208e-01
    8.44155844e-01   2.98536517e+00]
 [  9.61187215e-01   0.00000000e+00   2.28310502e-03   7.37442922e-01
    2.53424658e-01  -4.79452055e-02  -6.84931507e-02  -9.36073059e-02
    2.28310502e-03   4.25191456e+00]
 [  2.13483146e-01   6.17977528e-01   1.34831461e-01   1.68539326e-01
    8.98876404e-01  -4.60674157e-01  -1.34831461e-01  -1.01123596e-01
    7.86516854e-02   3.62260101e+00]]

In [78]:
v1 = np.array([1, 0, 0, 1, -1, 0, 0, 0, 1, 3])
v2 = np.array([0, 1, 1, 0, -1, 0, -1, -1, 1, 3])
v3 = np.array([1, 0, 0, 1, -1, 1, 1, 1, 1, 4])
v5 = np.array([1, 0, 0, 1, 0, 1, 1, 0, 0, 4])
v6 = np.array([0, 1, 1, 0, -1, 0, -1, 0, 1, 2])

In [ ]:


In [ ]:


In [ ]:


In [46]:
print vec_list[0]


[1, 0, 0, 0, -2.5726219308804565, 0.16669276338108985, 0.40647172993095526, -0.021361018774286387, 1, 2.4055299539170507]

In [48]:
min_dist = np.linalg.norm(np.array(centroids[0]) - np.array(vec_list[0]))

In [55]:
dists0 = dist_from_centroid(vec_list, centroids[0])

In [63]:
rep = 0
for i in range(len(dists0)):
    if dists0[i] == min(dists0):
        rep = i
        print i


52

In [64]:
ratings, time = get_data(pids[i], cursor, tablename)

In [65]:
dates = [dt.datetime.fromtimestamp(ts) for ts in time]

In [66]:
ratings = avg_rating(ratings)

In [67]:
plt.scatter(dates, ratings)


Out[67]:
<matplotlib.collections.PathCollection at 0x10aa4d8d0>

In [ ]:
################ copied code I want to refer to later ##################

In [265]:
dates=[dt.datetime.fromtimestamp(ts) for ts in time_test]

In [266]:
plt.scatter(dates, rating_test)


Out[266]:
<matplotlib.collections.PathCollection at 0x10f689e50>

In [52]:
print centroids


[[  7.59722222e-01   1.63888889e-01   2.36111111e-02   5.59722222e-01
   -2.11233153e-01  -2.94976640e-01  -3.07609070e-01   3.44095796e-02
    1.79166667e-01   4.11238338e+00]
 [  3.22580645e-02   8.06451613e-01   4.51612903e-01   3.22580645e-02
   -1.57916447e+01  -3.82032829e+00  -6.78307808e-01  -2.33581470e-01
    7.09677419e-01   3.34185417e+00]
 [  3.91891892e-01   4.52702703e-01   1.28378378e-01   2.43243243e-01
   -4.54204213e+00  -1.38983229e+00  -2.76686709e-01  -1.14800273e-01
    6.41891892e-01   3.65414430e+00]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   0.00000000e+00
    2.02278325e+01  -2.41161053e+01   3.05793434e-01   3.35083511e-01
    0.00000000e+00   3.68568841e+00]
 [  0.00000000e+00   1.00000000e+00   3.33333333e-01   5.55111512e-17
    3.35144556e+01   2.13853318e+00   2.61725096e+00   1.65711261e-01
    0.00000000e+00   3.22297031e+00]
 [  5.52083333e-01   3.64583333e-01   7.29166667e-02   4.47916667e-01
    4.65743586e+00   1.37654770e+00  -8.69836082e-02   5.28867342e-01
    3.12500000e-02   3.91117729e+00]]

In [ ]: