In [350]:
#omg I can fit more than one data set, do that!

In [3]:
import numpy as np
from sklearn.hmm import GaussianHMM
from scipy import stats

In [4]:
f = open('temp', 'w')

In [5]:
import datetime as dt

In [6]:
import matplotlib.pyplot as plt

In [7]:
import MySQLdb

In [8]:
%matplotlib inline

In [9]:
def time_data_clean(time_data):
    rating = [0.0]*len(time_data)
    time = [0]*len(time_data)
    rating = [x[0] for x in time_data]
    time = [x[1] for x in time_data]
    return rating, time

In [10]:
def time_since_last(time):
    time_since_last = np.zeros(len(time))
    for i in range(1, len(time)):
        time_since_last[i] = time[i] - time[i-1]
        if time_since_last[i] == 0: 
            time_since_last[i] = 1
    time_since_last[0] = time_since_last[1]
    return time_since_last

In [11]:
def get_data(PID, cursor, tablename):
    sql = "Select RTime, RScore From " +tablename + " Where PID = " + '"' + PID +'";'
    cursor.execute(sql)
    data = cursor.fetchall()
    data = sorted(data)
    rating = np.array(zip(*data)[1], dtype = int)
    time = np.array(zip(*data)[0], dtype = float)
    dates=[dt.datetime.fromtimestamp(ts) for ts in time]
    return rating, time, dates

In [12]:
def avg_rating(rating):
    avg = [0]*len(rating)
    avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        avg[k]= float(np.mean(rating[:k]))
    return avg

In [34]:
def rolling_avg_rating(rating):
    limited_avg = [0]*len(rating)
    limited_avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        if k<40:
            limited_avg[k]= float(np.mean(rating[:k]))
        else:
            limited_avg[k]=float(np.mean(rating[k-40:k]))
    return limited_avg

In [14]:
db = MySQLdb.connect(host="localhost", user="root", db = "home_kitchen")

In [15]:
cursor = db.cursor()

In [16]:
tablename = 'all_hk'

In [17]:
PIDlist = [' B0000X7CMQ', ' B00005AQ9Q', ' B000GTR2F6', ' B000AQSMPO', ' B00005MF9C', ' B0000E2PEI', ' B0006SFFAQ', ' B00005AQ9Q', ' B00005R19P', ' B000FFQ554', ' B0006ZUHR0']

In [52]:
pid1 = ' B0000E2PEI' #PIDlist[0]
pid2 = PIDlist[1]
pid3 = PIDlist[3]

In [53]:
r1, t1, d1 = get_data(pid1, cursor, tablename)
#r2, t2, d2 = get_data(pid2, cursor, tablename)
#r3, t3, d3 = get_data(pid3, cursor, tablename)

In [54]:
#ratings = [r1, r2, r3]

In [55]:
#avgs = map(avg_rating, ratings)
#limavs = map(rolling_avg_rating, ratings)
avg = avg_rating(r1)
limav = rolling_avg_rating(r1)

In [56]:
for k in range(len(t1)):
    f.write(str(t1[k])+',') 
    
f.write('\n\n\n\n')

for k in range(len(r1)):
    f.write(str(r1[k])+',')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-56-09088a6c5ae8> in <module>()
      1 for k in range(len(t1)):
----> 2     f.write(str(t1[k])+',')
      3 
      4 f.write('\n\n\n\n')
      5 

ValueError: I/O operation on closed file

In [57]:
f.close()

In [58]:
plt.scatter(d1, r1)


Out[58]:
<matplotlib.collections.PathCollection at 0x1066fb290>

In [59]:
plt.scatter(d1, avg)


Out[59]:
<matplotlib.collections.PathCollection at 0x106750a90>

In [60]:
plt.scatter(d1, limav)


Out[60]:
<matplotlib.collections.PathCollection at 0x106e5b450>

In [61]:
#Y1 = np.array(limavs[0]).reshape(len(limavs[0]), 1)
#Y2 = np.array(limavs[1]).reshape(len(limavs[1]), 1)
#Y3 = np.array(limavs[2]).reshape(len(limavs[2]), 1)
Y1 = np.array(limav).reshape(len(limav), 1)

In [62]:
n_components = 3

In [63]:
# make an HMM instance and execute fit
model = GaussianHMM(n_components, "full")
#model.fit([X])

# predict the optimal sequence of internal hidden state
#hidden_states = model.predict(X)

In [64]:
model.fit([Y1])


Out[64]:
GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
      covars_weight=1,
      init_params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      means_prior=None, means_weight=0, n_components=3, n_iter=10,
      params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      random_state=None, startprob=None, startprob_prior=1.0, thresh=0.01,
      transmat=None, transmat_prior=1.0)

In [65]:
hidden_states = model.predict(Y1)

In [66]:
print model.score(Y1)


263.867023326

In [67]:
#colors = np.concatenate([np.array([6 for i in range(10)]), hidden_states])

In [68]:
#print colors

In [68]:


In [69]:
plt.scatter(d1, avg, c = hidden_states) #used to say avgs[0]=0?


Out[69]:
<matplotlib.collections.PathCollection at 0x106d510d0>

In [541]:
plt.show()

In [666]:
data = zip(hidden_states, avgs[0])

In [667]:
print data


[(2, 4.0), (2, 4.0), (2, 2.5), (2, 2.3333333333333335), (2, 2.25), (2, 2.8), (2, 3.1666666666666665), (2, 2.857142857142857), (2, 2.75), (2, 2.5555555555555554), (2, 2.6), (2, 2.5454545454545454), (2, 2.4166666666666665), (2, 2.3076923076923075), (2, 2.2142857142857144), (2, 2.3333333333333335), (2, 2.4375), (2, 2.3529411764705883), (2, 2.2777777777777777), (2, 2.210526315789474), (2, 2.15), (2, 2.0952380952380953), (2, 2.227272727272727), (2, 2.1739130434782608), (2, 2.125), (2, 2.12), (2, 2.230769230769231), (2, 2.185185185185185), (2, 2.2142857142857144), (2, 2.206896551724138), (2, 2.1666666666666665), (2, 2.129032258064516), (2, 2.09375), (2, 2.0606060606060606), (2, 2.0294117647058822), (2, 2.0), (2, 1.9722222222222223), (2, 1.945945945945946), (2, 2.0), (2, 2.076923076923077), (2, 2.05), (2, 2.024390243902439), (2, 2.0), (2, 1.9767441860465116), (2, 1.9545454545454546), (2, 1.9333333333333333), (2, 1.9130434782608696), (2, 1.8936170212765957), (2, 1.875), (2, 1.9387755102040816), (2, 1.92), (2, 1.9803921568627452), (2, 1.9807692307692308), (2, 1.9622641509433962), (2, 1.962962962962963), (2, 1.9454545454545455), (2, 1.9285714285714286), (2, 1.9298245614035088), (2, 1.9137931034482758), (2, 1.9661016949152543), (2, 1.9833333333333334), (2, 2.0), (2, 1.9838709677419355), (2, 1.9682539682539681), (2, 1.953125), (2, 1.9384615384615385), (2, 1.9242424242424243), (2, 1.9253731343283582), (2, 1.9411764705882353), (2, 1.9275362318840579), (2, 1.9142857142857144), (2, 1.9577464788732395), (2, 2.0), (2, 2.0), (2, 2.0), (2, 1.9866666666666666), (2, 1.9736842105263157), (2, 1.9610389610389611), (2, 1.9871794871794872), (2, 1.9746835443037976), (2, 2.0), (2, 1.9876543209876543), (2, 1.975609756097561), (2, 1.963855421686747), (2, 1.9880952380952381), (2, 1.988235294117647), (2, 1.9883720930232558), (2, 1.9770114942528736), (2, 1.9659090909090908), (2, 1.9775280898876404), (2, 1.9666666666666666), (2, 1.956043956043956), (2, 1.9456521739130435), (2, 1.935483870967742), (2, 1.925531914893617), (2, 1.9157894736842105), (2, 1.90625), (2, 1.8969072164948453), (2, 1.8877551020408163)]

In [2]:
PID = " B0000X7CMQ"
sql = "Select distinct PTitle from all_hk where PID = " + '"' + PID + '";'
prodname = str(query_db(sql))
prodname = tuple(x[0] for x in prodname)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-da511feee8aa> in <module>()
      1 PID = " B0000X7CMQ"
      2 sql = "Select distinct PTitle from all_hk where PID = " + '"' + PID + '";'
----> 3 prodname = str(query_db(sql))
      4 prodname = tuple(x[0] for x in prodname)

NameError: name 'query_db' is not defined

In [ ]: