notebook.community

Edit and run



In [350]:

    
#omg I can fit more than one data set, do that!



In [3]:

    
import numpy as np
from sklearn.hmm import GaussianHMM
from scipy import stats



In [4]:

    
f = open('temp', 'w')



In [5]:

    
import datetime as dt



In [6]:

    
import matplotlib.pyplot as plt



In [7]:

    
import MySQLdb



In [8]:

    
%matplotlib inline



In [9]:

    
def time_data_clean(time_data):
    rating = [0.0]*len(time_data)
    time = [0]*len(time_data)
    rating = [x[0] for x in time_data]
    time = [x[1] for x in time_data]
    return rating, time



In [10]:

    
def time_since_last(time):
    time_since_last = np.zeros(len(time))
    for i in range(1, len(time)):
        time_since_last[i] = time[i] - time[i-1]
        if time_since_last[i] == 0: 
            time_since_last[i] = 1
    time_since_last[0] = time_since_last[1]
    return time_since_last



In [11]:

    
def get_data(PID, cursor, tablename):
    sql = "Select RTime, RScore From " +tablename + " Where PID = " + '"' + PID +'";'
    cursor.execute(sql)
    data = cursor.fetchall()
    data = sorted(data)
    rating = np.array(zip(*data)[1], dtype = int)
    time = np.array(zip(*data)[0], dtype = float)
    dates=[dt.datetime.fromtimestamp(ts) for ts in time]
    return rating, time, dates



In [12]:

    
def avg_rating(rating):
    avg = [0]*len(rating)
    avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        avg[k]= float(np.mean(rating[:k]))
    return avg



In [34]:

    
def rolling_avg_rating(rating):
    limited_avg = [0]*len(rating)
    limited_avg[0] = float(rating[0])
    for k in range(1, len(rating)):
        if k<40:
            limited_avg[k]= float(np.mean(rating[:k]))
        else:
            limited_avg[k]=float(np.mean(rating[k-40:k]))
    return limited_avg



In [14]:

    
db = MySQLdb.connect(host="localhost", user="root", db = "home_kitchen")



In [15]:

    
cursor = db.cursor()



In [16]:

    
tablename = 'all_hk'



In [17]:

    
PIDlist = [' B0000X7CMQ', ' B00005AQ9Q', ' B000GTR2F6', ' B000AQSMPO', ' B00005MF9C', ' B0000E2PEI', ' B0006SFFAQ', ' B00005AQ9Q', ' B00005R19P', ' B000FFQ554', ' B0006ZUHR0']



In [52]:

    
pid1 = ' B0000E2PEI' #PIDlist[0]
pid2 = PIDlist[1]
pid3 = PIDlist[3]



In [53]:

    
r1, t1, d1 = get_data(pid1, cursor, tablename)
#r2, t2, d2 = get_data(pid2, cursor, tablename)
#r3, t3, d3 = get_data(pid3, cursor, tablename)



In [54]:

    
#ratings = [r1, r2, r3]



In [55]:

    
#avgs = map(avg_rating, ratings)
#limavs = map(rolling_avg_rating, ratings)
avg = avg_rating(r1)
limav = rolling_avg_rating(r1)



In [56]:

    
for k in range(len(t1)):
    f.write(str(t1[k])+',') 
    
f.write('\n\n\n\n')

for k in range(len(r1)):
    f.write(str(r1[k])+',')









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-56-09088a6c5ae8> in <module>()
      1 for k in range(len(t1)):
----> 2     f.write(str(t1[k])+',')
      3 
      4 f.write('\n\n\n\n')
      5 

ValueError: I/O operation on closed file



In [57]:

    
f.close()



In [58]:

    
plt.scatter(d1, r1)









    Out[58]:





<matplotlib.collections.PathCollection at 0x1066fb290>



In [59]:

    
plt.scatter(d1, avg)









    Out[59]:





<matplotlib.collections.PathCollection at 0x106750a90>



In [60]:

    
plt.scatter(d1, limav)









    Out[60]:





<matplotlib.collections.PathCollection at 0x106e5b450>



In [61]:

    
#Y1 = np.array(limavs[0]).reshape(len(limavs[0]), 1)
#Y2 = np.array(limavs[1]).reshape(len(limavs[1]), 1)
#Y3 = np.array(limavs[2]).reshape(len(limavs[2]), 1)
Y1 = np.array(limav).reshape(len(limav), 1)



In [62]:

    
n_components = 3



In [63]:

    
# make an HMM instance and execute fit
model = GaussianHMM(n_components, "full")
#model.fit([X])

# predict the optimal sequence of internal hidden state
#hidden_states = model.predict(X)



In [64]:

    
model.fit([Y1])









    Out[64]:





GaussianHMM(algorithm='viterbi', covariance_type='full', covars_prior=0.01,
      covars_weight=1,
      init_params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      means_prior=None, means_weight=0, n_components=3, n_iter=10,
      params='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ',
      random_state=None, startprob=None, startprob_prior=1.0, thresh=0.01,
      transmat=None, transmat_prior=1.0)



In [65]:

    
hidden_states = model.predict(Y1)



In [66]:

    
print model.score(Y1)









    



263.867023326



In [67]:

    
#colors = np.concatenate([np.array([6 for i in range(10)]), hidden_states])



In [68]:

    
#print colors



In [68]:



In [69]:

    
plt.scatter(d1, avg, c = hidden_states) #used to say avgs[0]=0?









    Out[69]:





<matplotlib.collections.PathCollection at 0x106d510d0>



In [541]:

    
plt.show()



In [666]:

    
data = zip(hidden_states, avgs[0])



In [667]:

    
print data









    



[(2, 4.0), (2, 4.0), (2, 2.5), (2, 2.3333333333333335), (2, 2.25), (2, 2.8), (2, 3.1666666666666665), (2, 2.857142857142857), (2, 2.75), (2, 2.5555555555555554), (2, 2.6), (2, 2.5454545454545454), (2, 2.4166666666666665), (2, 2.3076923076923075), (2, 2.2142857142857144), (2, 2.3333333333333335), (2, 2.4375), (2, 2.3529411764705883), (2, 2.2777777777777777), (2, 2.210526315789474), (2, 2.15), (2, 2.0952380952380953), (2, 2.227272727272727), (2, 2.1739130434782608), (2, 2.125), (2, 2.12), (2, 2.230769230769231), (2, 2.185185185185185), (2, 2.2142857142857144), (2, 2.206896551724138), (2, 2.1666666666666665), (2, 2.129032258064516), (2, 2.09375), (2, 2.0606060606060606), (2, 2.0294117647058822), (2, 2.0), (2, 1.9722222222222223), (2, 1.945945945945946), (2, 2.0), (2, 2.076923076923077), (2, 2.05), (2, 2.024390243902439), (2, 2.0), (2, 1.9767441860465116), (2, 1.9545454545454546), (2, 1.9333333333333333), (2, 1.9130434782608696), (2, 1.8936170212765957), (2, 1.875), (2, 1.9387755102040816), (2, 1.92), (2, 1.9803921568627452), (2, 1.9807692307692308), (2, 1.9622641509433962), (2, 1.962962962962963), (2, 1.9454545454545455), (2, 1.9285714285714286), (2, 1.9298245614035088), (2, 1.9137931034482758), (2, 1.9661016949152543), (2, 1.9833333333333334), (2, 2.0), (2, 1.9838709677419355), (2, 1.9682539682539681), (2, 1.953125), (2, 1.9384615384615385), (2, 1.9242424242424243), (2, 1.9253731343283582), (2, 1.9411764705882353), (2, 1.9275362318840579), (2, 1.9142857142857144), (2, 1.9577464788732395), (2, 2.0), (2, 2.0), (2, 2.0), (2, 1.9866666666666666), (2, 1.9736842105263157), (2, 1.9610389610389611), (2, 1.9871794871794872), (2, 1.9746835443037976), (2, 2.0), (2, 1.9876543209876543), (2, 1.975609756097561), (2, 1.963855421686747), (2, 1.9880952380952381), (2, 1.988235294117647), (2, 1.9883720930232558), (2, 1.9770114942528736), (2, 1.9659090909090908), (2, 1.9775280898876404), (2, 1.9666666666666666), (2, 1.956043956043956), (2, 1.9456521739130435), (2, 1.935483870967742), (2, 1.925531914893617), (2, 1.9157894736842105), (2, 1.90625), (2, 1.8969072164948453), (2, 1.8877551020408163)]



In [2]:

    
PID = " B0000X7CMQ"
sql = "Select distinct PTitle from all_hk where PID = " + '"' + PID + '";'
prodname = str(query_db(sql))
prodname = tuple(x[0] for x in prodname)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-da511feee8aa> in <module>()
      1 PID = " B0000X7CMQ"
      2 sql = "Select distinct PTitle from all_hk where PID = " + '"' + PID + '";'
----> 3 prodname = str(query_db(sql))
      4 prodname = tuple(x[0] for x in prodname)

NameError: name 'query_db' is not defined



In [ ]: