In [1]:

    
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from __future__ import division
from functools import partial
import matplotlib.cm as cm
import pickle

from keras.preprocessing import image

import bayesian_changepoint_detection.online_changepoint_detection as oncd
import bayesian_changepoint_detection.offline_changepoint_detection as offcd
from scipy.stats import multivariate_normal,t

import pandas as pd
import sys
sys.path.append('../')

from src import feature_extraction

%matplotlib inline
%load_ext autoreload
%autoreload 2









    



Using TensorFlow backend.






    



Use scipy logsumexp().

Bayesian Switch-Point

does online and offline detection.
the online is the adam,mckay 2007 method.

http://nbviewer.jupyter.org/github/hildensia/bayesian_changepoint_detection/blob/master/Example%20Code.ipynb

Based on:

[1] Paul Fearnhead, Exact and Efficient Bayesian Inference for Multiple Changepoint problems, Statistics and computing 16.2 (2006), pp. 203--213
[2] Xuan Xiang, Kevin Murphy, Modeling Changing Dependency Structure in Multivariate Time Series, ICML (2007), pp. 1055--1062
[3] Ryan P. Adams, David J.C. MacKay, Bayesian Online Changepoint Detection, arXiv 0710.3742 (2007)

Testing on 1-D Generated Data



In [51]:

    
def generate_normal_time_series(num, minl=50, maxl=1000):
    data = np.array([], dtype=np.float64)
    partition = np.random.randint(minl, maxl, num)
    for p in partition:
        mean = np.random.randn()*2 # new random mean 
        var = np.random.randn()*1 # new random variance 
        if var < 0:
            var = var * -1
        tdata = np.random.normal(mean, var, p)
        data = np.concatenate((data, tdata))
    return data



In [52]:

    
np.random.seed(1000)
data = generate_normal_time_series(7, 50, 200)



In [53]:

    
%time Q, P, Pcp = offcd.offline_changepoint_detection(data, partial(offcd.const_prior, l=(len(data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)









    



CPU times: user 23.6 s, sys: 107 ms, total: 23.7 s
Wall time: 23.7 s

prior over how likely it is to have two successive change-points.
- here they have a uniform prior over length of sequences. So seq len=1,=2,=3 etc. are all equally likely.
- I might want to use a poisson.
liklihood for data in run sequences (where there is no change-point)
- here it's a guassian log lik model.



In [ ]:

    
Pcp.shape # need to sum to get the probabilities.



In [55]:

    
fig, ax = plt.subplots(figsize=[12,8])
ax = fig.add_subplot(2, 1, 1)
ax.plot(data[:])
ax.set_ylabel('data')
ax = fig.add_subplot(2, 1, 2, sharex=ax)
ax.plot(np.exp(Pcp).sum(0))
ax.set_ylabel('p(change-point)')









    Out[55]:





<matplotlib.text.Text at 0x13f7c1090>



In [56]:

    
import bayesian_changepoint_detection.online_changepoint_detection as oncd
reload(oncd)
%time R, maxes = oncd.online_changepoint_detection(data, partial(oncd.constant_hazard, 250), oncd.StudentT(0.1, .01, 1, 0))









    



CPU times: user 319 ms, sys: 6.42 ms, total: 325 ms
Wall time: 326 ms

the online c.pd. is much faster because it loops through once and updates the probability of change-points.



In [57]:

    
R.shape # are the probabilities over run-lengths









    Out[57]:





(913, 913)



In [58]:

    
import matplotlib.cm as cm
fig, ax = plt.subplots(figsize=[12,12])
ax = fig.add_subplot(3, 1, 1)
ax.plot(data)
ax = fig.add_subplot(3, 1, 2, sharex=ax)
sparsity = 5  # only plot every fifth data for faster display
ax.pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax = fig.add_subplot(3, 1, 3, sharex=ax)
Nw=10;
ax.plot(R[Nw,Nw:-1])









    



/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:9: RuntimeWarning: divide by zero encountered in log
  if __name__ == '__main__':






    Out[58]:





[<matplotlib.lines.Line2D at 0x1405179d0>]



In [66]:

    
plt.plot(R[4,:])









    Out[66]:





[<matplotlib.lines.Line2D at 0x13de17190>]

Testing on 2-D Generated Data

generate new dataset



In [3]:

    
def generate_2D_normal_time_series(number_partitions, minl=50, maxl=1000,dim=2):
    partition = np.random.randint(minl, maxl, number_partitions)
    data = np.zeros((1,dim))
    for p in partition:
        mean = np.random.randn(dim)*2 # new random mean 
        Sigma = np.diag(np.ones(dim)).copy()
        tdata = multivariate_normal.rvs(mean, Sigma, size=p)
        data = np.vstack((data,tdata.copy()))
    data=data[1:,:]
    return data,partition.cumsum()[0:-1]



In [21]:

    
# generate data
np.random.seed(102) # this works 
data,partition = generate_2D_normal_time_series(3,10,100)

Offline Method



In [22]:

    
Q, P, Pcp = offcd.offline_changepoint_detection(data, partial(offcd.const_prior, l=(len(data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)



In [31]:

    
fig, ax = plt.subplots(figsize=[12,8])

ax = fig.add_subplot(2, 1, 1)
ax.plot(data[:])
ax.scatter(partition,np.zeros(len(partition)),c='k') # change points 
ax.set_ylabel('data')
ax = fig.add_subplot(2, 1, 2, sharex=ax)
ax.plot(np.exp(Pcp).sum(0))
ax.set_ylabel('p(change-point)')
ax.set_xlabel('time point')









    Out[31]:





<matplotlib.text.Text at 0x115255490>

Online Method (adapted by chris for multiple dimensions)



In [23]:

    
# fit model 
R, maxes = oncd.online_changepoint_detection(data, partial(oncd.constant_hazard, 250), oncd.MV_Norm(mu=np.zeros(2),Sigma=np.diag(np.ones(2)),n=np.array([1.0])))
# sometimes need to re-run because of underflow? there are inf's that appear



In [24]:

    
R









    Out[24]:





array([[  1.00000000e+00,   4.00000000e-03,   4.00000000e-03, ...,
          4.00000000e-03,   4.00000000e-03,   4.00000000e-03],
       [  0.00000000e+00,   9.96000000e-01,   3.98400000e-03, ...,
          1.08484064e-03,   1.23774557e-02,   1.07295785e-03],
       [  0.00000000e+00,   0.00000000e+00,   9.92016000e-01, ...,
          8.33851970e-04,   2.46604558e-04,   1.71260033e-03],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   2.67558967e-84,   1.42027863e-86],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   8.79588576e-85],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])



In [25]:

    
# Plot 
fig, ax = plt.subplots(figsize=[12,12])
ax = fig.add_subplot(3, 1, 1)
ax.plot(data) # time series 
ax.scatter(partition,np.zeros(len(partition)),c='k') # change points 


ax = fig.add_subplot(3, 1, 2, sharex=ax)
sparsity = 5  # only plot every fifth data for faster display
ax.pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)

ax = fig.add_subplot(3, 1, 3, sharex=ax)
Nw=10;
ax.plot(R[Nw,Nw:-1])









    



/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:12: RuntimeWarning: divide by zero encountered in log
  if sys.path[0] == '':






    Out[25]:





[<matplotlib.lines.Line2D at 0x113c636d0>]

Testing on black dress - white dress



In [4]:

    
# get features for 2 folders 
feature_list_black_imgs = feature_extraction.layer_feature_extraction('../data_img_classes/class_black/')
feature_list_white_imgs = feature_extraction.layer_feature_extraction('../data_img_classes/class_white/')









    



K.image_dim_ordering: tf
K.image_dim_ordering: tf



In [15]:

    
# concatenate features into a vector (black dresses first, then white dresses)
n_blk_imgs = len(feature_list_black_imgs.keys())
n_white_imgs = len(feature_list_white_imgs.keys())
feature_vec = np.empty((n_blk_imgs+n_white_imgs,2048))
img_files = []
for i,img_file in enumerate(feature_list_black_imgs.keys()+feature_list_white_imgs.keys()):
    if i<n_blk_imgs:
        feature_vec[i,:]=feature_list_black_imgs[img_file]
    else:
        feature_vec[i,:]=feature_list_white_imgs[img_file]
    img_files.append(img_file)
feature_vec.shape









    Out[15]:





(51, 2048)



In [6]:

    
n_blk_imgs









    Out[6]:





23



In [7]:

    
# reduce dimensionality 
pca_all = pickle.load(open('../data_nn_features/pca_all_items_sample1000.pkl','rb'))



In [55]:

    
projected_feature_vec = pca_all.transform(feature_vec)
print(projected_feature_vec.shape)









    



(51, 2048)
(51, 5)

Plotting the dress sequence



In [31]:

    
fig,axes =plt.subplots(1,len(img_files),figsize=(50,4))
for i,img_file in enumerate(img_files):
    img = image.load_img(img_file, target_size=(224, 224))
    axes[i].imshow(img)
    axes[i].set_title('image:'+str(i))
    axes[i].get_xaxis().set_visible(False)
    axes[i].get_yaxis().set_visible(False)
plt.savefig('../figures/test.png',dpi=300)

The dresses switch from black to white at image 22-23

Offline Method



In [78]:

    
reduced_projected_feature_vec = projected_feature_vec[:,0:5] 
print(reduced_projected_feature_vec.shape)
input_data=reduced_projected_feature_vec
Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)



In [79]:

    
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data[:])
ax[0].set_ylabel('data')
ax[0].set_xlim(0,input_data.shape[0])

ax[1].plot(np.exp(Pcp).sum(0))
ax[1].set_ylabel('p(change-point)')
ax[1].set_xlim(0,input_data.shape[0])









    Out[79]:





(0, 51)



In [82]:

    
print('change point at image {0}').format(np.argmax(np.exp(Pcp).sum(0)))









    



change point at image 22

with 5 PC's, the algorithm picks up the switch from black dresses to white dresses.



In [73]:

    
reduced_projected_feature_vec = projected_feature_vec[:,0:50] 
print(reduced_projected_feature_vec.shape)
input_data=reduced_projected_feature_vec
Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)



In [74]:

    
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data[:])
ax[0].set_ylabel('data')
ax[0].set_xlim(0,input_data.shape[0])

ax[1].plot(np.exp(Pcp).sum(0))
ax[1].set_ylabel('p(change-point)')
ax[1].set_xlim(0,input_data.shape[0])









    Out[74]:





(0, 51)

with too many PC's, the method fails.

Online Method (McKay 2007)



In [84]:

    
reduced_projected_feature_vec = projected_feature_vec[:,0:50] 
print(reduced_projected_feature_vec.shape)
input_data = reduced_projected_feature_vec
R, maxes = oncd.online_changepoint_detection(input_data, partial(oncd.constant_hazard, 250), oncd.MV_Norm(mu=np.zeros(input_data.shape[1]),Sigma=np.diag(np.ones(input_data.shape[1])),n=np.array([1.0])))
# sometimes have to run this twice due to underflow issues



In [85]:

    
# Plot 
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data) # time series 
ax[0].set_xlim(0,input_data.shape[0])


sparsity = 1  # only plot every fifth data for faster display
ax[1].pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax[1].set_xlim(0,input_data.shape[0])

# ax = fig.add_subplot(3, 1, 3, sharex=ax)
# Nw=5;
# ax.plot(R[Nw,Nw:-1])









    



/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:11: RuntimeWarning: divide by zero encountered in log
  # This is added back by InteractiveShellApp.init_path()






    Out[85]:





(0, 51)

here the algorithm estimate that at image 23, there is a 23 image run length with high probability.
at image 24 the probability of the run length drops back to 1.
this algorithm does better with the higher dimensional input



In [97]:

    
reduced_projected_feature_vec = projected_feature_vec[:,0:5] 
print(reduced_projected_feature_vec.shape)
input_data = reduced_projected_feature_vec
R, maxes = oncd.online_changepoint_detection(input_data, partial(oncd.constant_hazard, 250), oncd.MV_Norm(mu=np.zeros(input_data.shape[1]),Sigma=np.diag(np.ones(input_data.shape[1])),n=np.array([1.0])))
# sometimes have to run this twice due to underflow issues



In [98]:

    
# Plot 
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data) # time series 
ax[0].set_xlim(0,input_data.shape[0])


sparsity = 1  # only plot every fifth data for faster display
ax[1].pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax[1].set_xlim(0,input_data.shape[0])

# ax = fig.add_subplot(3, 1, 3, sharex=ax)
# Nw=5;
# ax.plot(R[Nw,Nw:-1])









    



/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:11: RuntimeWarning: divide by zero encountered in log
  # This is added back by InteractiveShellApp.init_path()






    Out[98]:





(0, 51)

though it appears to do better in with 5 top PC's (hard to tell).

Testing on smaller, real trajectories



In [58]:

    
# instantiate the model
base_model = ResNet50(include_top=False, weights='imagenet') #this will pull the weights from the folder 

# cut the model to lower levels only 
model = Model(input=base_model.input, output=base_model.get_layer('avg_pool').output)









    



K.image_dim_ordering: tf






    



/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:5: UserWarning: Update your `Model` call to the Keras 2 API: `Model(outputs=Tensor("av..., inputs=Tensor("in...)`
  """



In [141]:

    
user_id = 106144465 

#get images 
folder = '../data_img_sample_item_view_sequences/'
img_files = glob.glob(folder+'*'+str(user_id)+'*')
print(img_files)


# make features 
trajectory_features = np.empty((len(img_files),2048))
for i,img_file in enumerate(img_files):
    x,img = preprocess_img(img_file) # preprocess
    trajectory_features[i,:] = model.predict(x)[0,0,0,:]









    



['../data_img_sample_item_view_sequences/106144465_1_82548633538732181_0_view_.jpg', '../data_img_sample_item_view_sequences/106144465_2_310261911240101889_11_view_.jpg', '../data_img_sample_item_view_sequences/106144465_3_75793218577661959_2_view_.jpg', '../data_img_sample_item_view_sequences/106144465_4_76074708321099781_5_view_.jpg', '../data_img_sample_item_view_sequences/106144465_5_98029795868614676_4_view_.jpg', '../data_img_sample_item_view_sequences/106144465_6_22313013691846692_3_view_.jpg', '../data_img_sample_item_view_sequences/106144465_7_467887899726037038_4_view_.jpg', '../data_img_sample_item_view_sequences/106144465_8_216812239953170449_21_buy_.jpg']



In [142]:

    
trajectory_features.shape









    Out[142]:





(8, 2048)



In [143]:

    
# stack example trajectory_features
trajectory_features = np.vstack((trajectory_features,trajectory_features,trajectory_features))
trajectory_features.shape









    Out[143]:





(24, 2048)



In [144]:

    
projection = pca_all.transform(trajectory_features)
projection.shape









    Out[144]:





(24, 1000)



In [160]:

    
input_data = trajectory_features
input_data = trajectory_features[:,0:100]
input_data = trajectory_features[:,0:10]

#Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)


input_data = projection[:,0:5] # 10 PC's
Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)



In [161]:

    
plt.plot(np.exp(Pcp).sum(0))
plt.ylabel('p(change-point)')









    Out[161]:





<matplotlib.text.Text at 0x1495e0890>



In [162]:

    
plt.plot(input_data)









    Out[162]:





[<matplotlib.lines.Line2D at 0x149d33610>,
 <matplotlib.lines.Line2D at 0x149d33810>,
 <matplotlib.lines.Line2D at 0x149d33950>,
 <matplotlib.lines.Line2D at 0x149d33a90>,
 <matplotlib.lines.Line2D at 0x149d33bd0>]



In [149]:

    
#R



In [159]:

    
R, maxes = oncd.online_changepoint_detection(input_data, partial(oncd.constant_hazard,250), oncd.MV_Norm(mu=np.zeros(input_data.shape[1]),Sigma=np.diag(np.ones(input_data.shape[1])),n=np.array([1.0])))


fig, ax = plt.subplots(figsize=[12,12])
ax = fig.add_subplot(3, 1, 2, sharex=ax)
sparsity = 1  # only plot every fifth data for faster display
ax.pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax = fig.add_subplot(3, 1, 3, sharex=ax)
Nw=4;
ax.plot(R[Nw,Nw:-1])









    



/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in log
  # Remove the CWD from sys.path while we load stuff.






    Out[159]:





[<matplotlib.lines.Line2D at 0x1493ac190>]



In [113]:

    
print('target class')
plt.figure(figsize=(12,6))
len_seq = len(img_files)

fig,axes = plt.subplots(2,len_seq)

# make color
#color_red_black = pd.Series(red_traj>0).map({False:'k',True:'r'}).as_matrix()

for i in range(len_seq):
    img = image.load_img(img_files[i], target_size=(224, 224))

    # images
    axes[0,i].imshow(img)
    axes[0,i].set_xticklabels([])
    #axes[0,i].get_xaxis().set_visible(False)
    axes[0,i].get_xaxis().set_ticks([])
    axes[0,i].get_yaxis().set_visible(False) 
    if i<(len_seq-1):
        axes[0,i].set_xlabel('view '+str(i))
    else:
        axes[0,i].set_xlabel('buy')
         
    # bar 
#     axes[1,i].bar(0,red_traj[i],color=color_red_black[i])
#     axes[1,i].set_ylim([-10,5])
#     axes[1,i].get_xaxis().set_visible(False)
#     axes[1,i].axhline(y=0,linestyle='--',color='w')
#     if i==0:
#         print('here')
#         axes[1,i].set_ylabel('red classification')
#     else:
#         axes[1,i].get_yaxis().set_visible(False) 
#     sns.despine()
# savefile = '../figures/example_sequence_interpretable_features_ui_'+str(user_id)+'.png'
# plt.savefig(savefile,dpi=300)









    



target class






    





<matplotlib.figure.Figure at 0x1323adc50>

Save



In [ ]:

    
%%bash 
jupyter nbconvert --to html Change_Point_Detection_in_Trajectories.ipynb && mv Change_Point_Detection_in_Trajectories.html ../notebook_htmls/Change_Point_Detection_in_Trajectories_v1.html
cp Change_Point_Detection_in_Trajectories.ipynb ../notebook_versions/Change_Point_Detection_in_Trajectories_v1.ipynb