In [39]:
import MySQLdb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.cluster import KMeans, DBSCAN
from matplotlib.backends.backend_pdf import PdfPages
In [40]:
# build one feature vector per product id; relies on the module-level
# cursor and tablename set up below
def make_vec_list(pid_list):
    vec_list = []
    for pid in pid_list:
        ratings, time = get_data(pid, cursor, tablename)
        vector = make_features_vec(avg_rating(ratings), time)
        vec_list.append(vector)
    return vec_list
In [41]:
# returns ratings and times for a given PID in tablename, with cursor pointing at the database
def get_data(PID, cursor, tablename):
    sql = 'SELECT RTime, RScore FROM ' + tablename + ' WHERE PID = "' + PID + '";'
    cursor.execute(sql)
    data = sorted(cursor.fetchall())
    # rows are (RTime, RScore) pairs; unzip into parallel arrays
    rating = np.array(zip(*data)[1], dtype=int)
    time = np.array(zip(*data)[0], dtype=float)
    #dates = [dt.datetime.fromtimestamp(ts) for ts in time]
    return rating, time  #, dates
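In [ ]:
# Aside: string-interpolating PID into the query is injection-prone. A sketch
# of a safer variant (get_data_safe is a hypothetical name) using MySQLdb's
# %s parameter binding; the table name still has to be interpolated, since
# placeholders bind values, not identifiers.
def get_data_safe(PID, cursor, tablename):
    sql = "SELECT RTime, RScore FROM " + tablename + " WHERE PID = %s;"
    cursor.execute(sql, (PID,))
    data = sorted(cursor.fetchall())
    rating = np.array(zip(*data)[1], dtype=int)
    time = np.array(zip(*data)[0], dtype=float)
    return rating, time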
In [42]:
# cumulative average: avg[k] is the mean of the first k+1 ratings
def avg_rating(rating):
    avg = [0]*len(rating)
    avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        avg[k] = float(np.mean(rating[:k+1]))
    return avg
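In [ ]:
# Vectorized sketch of the same cumulative average, assuming rating is a 1-D
# numeric sequence; returns an ndarray rather than a list.
def avg_rating_vec(rating):
    rating = np.asarray(rating, dtype=float)
    return np.cumsum(rating) / np.arange(1, len(rating) + 1)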
In [43]:
# returns the longest and the shortest time spans that each cover 1/4 of the reviews
def pop_time(time):
    unpopmin = time[0]
    unpopmax = time[0]
    popmin = time[0]
    popmax = time[len(time)-1]
    slidersize = int(len(time)/4)
    for i in range(slidersize, len(time)):  # i marks the end of the sliding window
        windowsize = time[i] - time[i - slidersize]
        if windowsize > unpopmax - unpopmin:  # widest window: reviews arrived slowly
            unpopmax = time[i]
            unpopmin = time[i - slidersize]
        if windowsize < popmax - popmin:  # narrowest window: reviews arrived quickly
            popmax = time[i]
            popmin = time[i - slidersize]
    return unpopmin, unpopmax, popmin, popmax
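In [ ]:
# Quick sanity check on hypothetical timestamps (seconds): reviews bunch up
# near t=0 and thin out later, so the shortest ("popular") window should sit
# at the start and the longest ("unpopular") window at the end.
t = np.array([0, 1, 2, 3, 4, 100, 200, 400, 800, 1600, 3200, 6400], dtype=float)
print pop_time(t)  # expect roughly (800.0, 6400.0, 0.0, 3.0)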
In [44]:
# average slope of the cumulative avg rating (or whatever you pass it) in each quarter of the reviews
# quarters are split by review count rather than elapsed time, on account of sparseness effects; not clear if that's the best choice
# the first-quarter slope skips the first 10 (noisy) reviews and needs > 45 reviews to be computed at all
# note: input must be unix timestamps (seconds); slopes come out as stars per year
def quarterly_slopes(ratings, timestamp):
    time = map(lambda foo: float(foo/(60*60*24*365)), timestamp)  # seconds -> years
    q = (len(ratings)/4) - 1
    q1 = 0
    if len(ratings) > 45:
        q1 = float((ratings[q] - ratings[10])/(time[q] - time[10]))
    q2 = float((ratings[2*q] - ratings[q])/(time[2*q] - time[q]))
    q3 = float((ratings[3*q] - ratings[2*q])/(time[3*q] - time[2*q]))
    q4 = float((ratings[4*q] - ratings[3*q])/(time[4*q] - time[3*q]))
    return q1, q2, q3, q4
In [53]:
# checks whether the first 10 reviews average "significantly" higher than the overall average (by more than .5 stars)
def starts_high(ratings, time):
    startavg = sum(ratings[:10])/10.0
    avg = sum(ratings)/len(ratings)
    return startavg > avg + .5
In [54]:
# features vector; need at least ~50 reviews for all entries to be meaningful
# layout: [unpopular at start, unpopular at end, popular at start, popular at end,
#          sign of slope in each quarter (4 entries), starts high, final cumulative average]
def make_features_vec(ratings, time):
    vec = []
    n = len(ratings)
    unpopmin, unpopmax, popmin, popmax = pop_time(time)
    q1, q2, q3, q4 = quarterly_slopes(ratings, time)
    # bit 1: unpopular at beginning
    vec.append(1 if unpopmin <= time[n/10] else 0)
    # bit 2: unpopular at end
    vec.append(1 if unpopmax >= time[9*n/10] else 0)
    # bit 3: popular at beginning
    vec.append(1 if popmin <= time[n/10] else 0)
    # bit 4: popular at end
    vec.append(1 if popmax >= time[9*n/10] else 0)
    # bits 5-8: sign of the slope in each quarter, thresholded at +/- .12
    # (an earlier variant appended the raw slopes scaled by 10, e.g. vec.append(q1*10))
    for q in (q1, q2, q3, q4):
        if q < -.12:
            vec.append(-1)
        elif q > .12:
            vec.append(1)
        else:
            vec.append(0)
    # bit 9: initial ratings start high
    vec.append(1 if starts_high(ratings, time) else 0)
    # bit 10: final cumulative average
    vec.append(ratings[-1])
    return vec
In [55]:
# Euclidean distance from each feature vector to the given centroid
def dist_from_centroid(vec_list, centroid):
    centroid = np.array(centroid)
    dists = []
    for vec in vec_list:
        dists.append(np.linalg.norm(centroid - np.array(vec)))
    return dists
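In [ ]:
# Equivalent vectorized sketch, assuming vec_list is rectangular: broadcast
# the centroid against the stacked vectors and take row-wise norms.
def dist_from_centroid_vec(vec_list, centroid):
    vecs = np.asarray(vec_list, dtype=float)
    return np.linalg.norm(vecs - np.asarray(centroid, dtype=float), axis=1)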
In [56]:
database = "home_kitchen"
tablename = "all_hk"
numids = 1000
In [57]:
db = MySQLdb.connect(host="localhost", user="root", db = database)
cursor = db.cursor()
In [58]:
sql = "Select PID from (SELECT distinct PID, count(*) as magnitude from " + tablename + " group by pid having magnitude > 100) as x limit " +str(numids) +";"
In [59]:
cursor.execute(sql)
Out[59]:
In [60]:
pids = cursor.fetchall()
In [61]:
pids = tuple(x[0] for x in pids)
In [62]:
vec_list = make_vec_list(pids)
In [90]:
#initializing kmeans
v1 = np.array([1, 0, 0, 1, -1, 0, 0, 0, 1, 3])
v2 = np.array([0, 1, 1, 0, -1, 0, -1, -1, 1, 3])
v3 = np.array([1, 0, 0, 1, -1, 1, 1, 1, 1, 4])
v4 = np.array([1, 0, 0, 1, 0, 1, 1, 0, 0, 4])
v5 = np.array([0, 1, 1, 0, -1, 0, -1, 0, 1, 2])
centroids = np.array([v1, v2, v3, v4, v5])
In [91]:
# n_clusters must match the number of seed centroids when init is given explicitly
kmeans = KMeans(n_clusters=len(centroids), init=centroids, n_init=1)
kmeans.fit(vec_list)
Out[91]:
In [92]:
centroids = kmeans.cluster_centers_
In [93]:
print centroids
In [94]:
labels = kmeans.labels_
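In [ ]:
# Optional sanity check (not part of the original run): the mean silhouette
# score from sklearn.metrics gives a rough sense of cluster separation
# (closer to 1 is better; near 0 means overlapping clusters).
from sklearn.metrics import silhouette_score
print silhouette_score(np.array(vec_list, dtype=float), labels)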
In [96]:
# one multi-page PDF per cluster (8 to be safe); each pp[k].infodict() could
# set a Title like 'plots for cluster k'
pp = [PdfPages('cluster%d.pdf' % k) for k in range(8)]
In [98]:
for i in range(200):
    fig = plt.figure(figsize=(10, 5), dpi=100)
    ratings, time = get_data(pids[i], cursor, tablename)
    ratings = avg_rating(ratings)
    #dates = [dt.datetime.fromtimestamp(ts) for ts in time]  # swap in for a calendar axis
    plt.scatter(time, ratings)
    plt.savefig(pp[labels[i]], format='pdf')  # a PdfPages object is a valid savefig target
    plt.close(fig)
In [99]:
for p in pp:
    p.close()
In [89]:
print centroids
In [78]:
v1 = np.array([1, 0, 0, 1, -1, 0, 0, 0, 1, 3])
v2 = np.array([0, 1, 1, 0, -1, 0, -1, -1, 1, 3])
v3 = np.array([1, 0, 0, 1, -1, 1, 1, 1, 1, 4])
v5 = np.array([1, 0, 0, 1, 0, 1, 1, 0, 0, 4])
v6 = np.array([0, 1, 1, 0, -1, 0, -1, 0, 1, 2])
In [46]:
print vec_list[0]
In [48]:
min_dist = np.linalg.norm(np.array(centroids[0]) - np.array(vec_list[0]))
In [55]:
dists0 = dist_from_centroid(vec_list, centroids[0])
In [63]:
# index of the product whose vector is closest to centroid 0
rep = int(np.argmin(dists0))
print rep
In [64]:
ratings, time = get_data(pids[rep], cursor, tablename)
In [65]:
dates = [dt.datetime.fromtimestamp(ts) for ts in time]
In [66]:
ratings = avg_rating(ratings)
In [67]:
plt.scatter(dates, ratings)
Out[67]:
In [ ]:
################ copied code I want to refer to later ##################
In [265]:
dates=[dt.datetime.fromtimestamp(ts) for ts in time_test]
In [266]:
plt.scatter(dates, rating_test)
Out[266]:
In [52]:
print centroids