In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn
from __future__ import division
from functools import partial
import matplotlib.cm as cm
import pickle

from keras.preprocessing import image

import bayesian_changepoint_detection.online_changepoint_detection as oncd
import bayesian_changepoint_detection.offline_changepoint_detection as offcd
from scipy.stats import multivariate_normal,t

import pandas as pd
import sys
sys.path.append('../')

from src import feature_extraction

%matplotlib inline
%load_ext autoreload
%autoreload 2


Using TensorFlow backend.
Use scipy logsumexp().

Bayesian Switch-Point

  • does online and offline detection.
  • the online is the adam,mckay 2007 method.

http://nbviewer.jupyter.org/github/hildensia/bayesian_changepoint_detection/blob/master/Example%20Code.ipynb

Based on:

  • [1] Paul Fearnhead, Exact and Efficient Bayesian Inference for Multiple Changepoint problems, Statistics and computing 16.2 (2006), pp. 203--213
  • [2] Xuan Xiang, Kevin Murphy, Modeling Changing Dependency Structure in Multivariate Time Series, ICML (2007), pp. 1055--1062
  • [3] Ryan P. Adams, David J.C. MacKay, Bayesian Online Changepoint Detection, arXiv 0710.3742 (2007)

Testing on 1-D Generated Data


In [51]:
def generate_normal_time_series(num, minl=50, maxl=1000):
    data = np.array([], dtype=np.float64)
    partition = np.random.randint(minl, maxl, num)
    for p in partition:
        mean = np.random.randn()*2 # new random mean 
        var = np.random.randn()*1 # new random variance 
        if var < 0:
            var = var * -1
        tdata = np.random.normal(mean, var, p)
        data = np.concatenate((data, tdata))
    return data

In [52]:
np.random.seed(1000)
data = generate_normal_time_series(7, 50, 200)

In [53]:
%time Q, P, Pcp = offcd.offline_changepoint_detection(data, partial(offcd.const_prior, l=(len(data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)


CPU times: user 23.6 s, sys: 107 ms, total: 23.7 s
Wall time: 23.7 s
  • prior over how likely it is to have two successive change-points.
    • here they have a uniform prior over length of sequences. So seq len=1,=2,=3 etc. are all equally likely.
    • I might want to use a poisson.
  • liklihood for data in run sequences (where there is no change-point)
    • here it's a guassian log lik model.

In [ ]:
Pcp.shape # need to sum to get the probabilities.

In [55]:
fig, ax = plt.subplots(figsize=[12,8])
ax = fig.add_subplot(2, 1, 1)
ax.plot(data[:])
ax.set_ylabel('data')
ax = fig.add_subplot(2, 1, 2, sharex=ax)
ax.plot(np.exp(Pcp).sum(0))
ax.set_ylabel('p(change-point)')


Out[55]:
<matplotlib.text.Text at 0x13f7c1090>

In [56]:
import bayesian_changepoint_detection.online_changepoint_detection as oncd
reload(oncd)
%time R, maxes = oncd.online_changepoint_detection(data, partial(oncd.constant_hazard, 250), oncd.StudentT(0.1, .01, 1, 0))


CPU times: user 319 ms, sys: 6.42 ms, total: 325 ms
Wall time: 326 ms
  • the online c.pd. is much faster because it loops through once and updates the probability of change-points.

In [57]:
R.shape # are the probabilities over run-lengths


Out[57]:
(913, 913)

In [58]:
import matplotlib.cm as cm
fig, ax = plt.subplots(figsize=[12,12])
ax = fig.add_subplot(3, 1, 1)
ax.plot(data)
ax = fig.add_subplot(3, 1, 2, sharex=ax)
sparsity = 5  # only plot every fifth data for faster display
ax.pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax = fig.add_subplot(3, 1, 3, sharex=ax)
Nw=10;
ax.plot(R[Nw,Nw:-1])


/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:9: RuntimeWarning: divide by zero encountered in log
  if __name__ == '__main__':
Out[58]:
[<matplotlib.lines.Line2D at 0x1405179d0>]

In [66]:
plt.plot(R[4,:])


Out[66]:
[<matplotlib.lines.Line2D at 0x13de17190>]

Testing on 2-D Generated Data

  • generate new dataset

In [3]:
def generate_2D_normal_time_series(number_partitions, minl=50, maxl=1000,dim=2):
    partition = np.random.randint(minl, maxl, number_partitions)
    data = np.zeros((1,dim))
    for p in partition:
        mean = np.random.randn(dim)*2 # new random mean 
        Sigma = np.diag(np.ones(dim)).copy()
        tdata = multivariate_normal.rvs(mean, Sigma, size=p)
        data = np.vstack((data,tdata.copy()))
    data=data[1:,:]
    return data,partition.cumsum()[0:-1]

In [21]:
# generate data
np.random.seed(102) # this works 
data,partition = generate_2D_normal_time_series(3,10,100)

Offline Method


In [22]:
Q, P, Pcp = offcd.offline_changepoint_detection(data, partial(offcd.const_prior, l=(len(data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)

In [31]:
fig, ax = plt.subplots(figsize=[12,8])

ax = fig.add_subplot(2, 1, 1)
ax.plot(data[:])
ax.scatter(partition,np.zeros(len(partition)),c='k') # change points 
ax.set_ylabel('data')
ax = fig.add_subplot(2, 1, 2, sharex=ax)
ax.plot(np.exp(Pcp).sum(0))
ax.set_ylabel('p(change-point)')
ax.set_xlabel('time point')


Out[31]:
<matplotlib.text.Text at 0x115255490>

Online Method (adapted by chris for multiple dimensions)


In [23]:
# fit model 
R, maxes = oncd.online_changepoint_detection(data, partial(oncd.constant_hazard, 250), oncd.MV_Norm(mu=np.zeros(2),Sigma=np.diag(np.ones(2)),n=np.array([1.0])))
# sometimes need to re-run because of underflow? there are inf's that appear

In [24]:
R


Out[24]:
array([[  1.00000000e+00,   4.00000000e-03,   4.00000000e-03, ...,
          4.00000000e-03,   4.00000000e-03,   4.00000000e-03],
       [  0.00000000e+00,   9.96000000e-01,   3.98400000e-03, ...,
          1.08484064e-03,   1.23774557e-02,   1.07295785e-03],
       [  0.00000000e+00,   0.00000000e+00,   9.92016000e-01, ...,
          8.33851970e-04,   2.46604558e-04,   1.71260033e-03],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   2.67558967e-84,   1.42027863e-86],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   8.79588576e-85],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [25]:
# Plot 
fig, ax = plt.subplots(figsize=[12,12])
ax = fig.add_subplot(3, 1, 1)
ax.plot(data) # time series 
ax.scatter(partition,np.zeros(len(partition)),c='k') # change points 


ax = fig.add_subplot(3, 1, 2, sharex=ax)
sparsity = 5  # only plot every fifth data for faster display
ax.pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)

ax = fig.add_subplot(3, 1, 3, sharex=ax)
Nw=10;
ax.plot(R[Nw,Nw:-1])


/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:12: RuntimeWarning: divide by zero encountered in log
  if sys.path[0] == '':
Out[25]:
[<matplotlib.lines.Line2D at 0x113c636d0>]

Testing on black dress - white dress


In [4]:
# get features for 2 folders 
feature_list_black_imgs = feature_extraction.layer_feature_extraction('../data_img_classes/class_black/')
feature_list_white_imgs = feature_extraction.layer_feature_extraction('../data_img_classes/class_white/')


K.image_dim_ordering: tf
K.image_dim_ordering: tf

In [15]:
# concatenate features into a vector (black dresses first, then white dresses)
n_blk_imgs = len(feature_list_black_imgs.keys())
n_white_imgs = len(feature_list_white_imgs.keys())
feature_vec = np.empty((n_blk_imgs+n_white_imgs,2048))
img_files = []
for i,img_file in enumerate(feature_list_black_imgs.keys()+feature_list_white_imgs.keys()):
    if i<n_blk_imgs:
        feature_vec[i,:]=feature_list_black_imgs[img_file]
    else:
        feature_vec[i,:]=feature_list_white_imgs[img_file]
    img_files.append(img_file)
feature_vec.shape


Out[15]:
(51, 2048)

In [6]:
n_blk_imgs


Out[6]:
23

In [7]:
# reduce dimensionality 
pca_all = pickle.load(open('../data_nn_features/pca_all_items_sample1000.pkl','rb'))

In [55]:
projected_feature_vec = pca_all.transform(feature_vec)
print(projected_feature_vec.shape)


(51, 2048)
(51, 5)

Plotting the dress sequence


In [31]:
fig,axes =plt.subplots(1,len(img_files),figsize=(50,4))
for i,img_file in enumerate(img_files):
    img = image.load_img(img_file, target_size=(224, 224))
    axes[i].imshow(img)
    axes[i].set_title('image:'+str(i))
    axes[i].get_xaxis().set_visible(False)
    axes[i].get_yaxis().set_visible(False)
plt.savefig('../figures/test.png',dpi=300)


  • The dresses switch from black to white at image 22-23

Offline Method


In [78]:
reduced_projected_feature_vec = projected_feature_vec[:,0:5] 
print(reduced_projected_feature_vec.shape)
input_data=reduced_projected_feature_vec
Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)


(51, 5)

In [79]:
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data[:])
ax[0].set_ylabel('data')
ax[0].set_xlim(0,input_data.shape[0])

ax[1].plot(np.exp(Pcp).sum(0))
ax[1].set_ylabel('p(change-point)')
ax[1].set_xlim(0,input_data.shape[0])


Out[79]:
(0, 51)

In [82]:
print('change point at image {0}').format(np.argmax(np.exp(Pcp).sum(0)))


change point at image 22
  • with 5 PC's, the algorithm picks up the switch from black dresses to white dresses.

In [73]:
reduced_projected_feature_vec = projected_feature_vec[:,0:50] 
print(reduced_projected_feature_vec.shape)
input_data=reduced_projected_feature_vec
Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)


(51, 50)

In [74]:
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data[:])
ax[0].set_ylabel('data')
ax[0].set_xlim(0,input_data.shape[0])

ax[1].plot(np.exp(Pcp).sum(0))
ax[1].set_ylabel('p(change-point)')
ax[1].set_xlim(0,input_data.shape[0])


Out[74]:
(0, 51)
  • with too many PC's, the method fails.

Online Method (McKay 2007)


In [84]:
reduced_projected_feature_vec = projected_feature_vec[:,0:50] 
print(reduced_projected_feature_vec.shape)
input_data = reduced_projected_feature_vec
R, maxes = oncd.online_changepoint_detection(input_data, partial(oncd.constant_hazard, 250), oncd.MV_Norm(mu=np.zeros(input_data.shape[1]),Sigma=np.diag(np.ones(input_data.shape[1])),n=np.array([1.0])))
# sometimes have to run this twice due to underflow issues


(51, 50)

In [85]:
# Plot 
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data) # time series 
ax[0].set_xlim(0,input_data.shape[0])


sparsity = 1  # only plot every fifth data for faster display
ax[1].pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax[1].set_xlim(0,input_data.shape[0])

# ax = fig.add_subplot(3, 1, 3, sharex=ax)
# Nw=5;
# ax.plot(R[Nw,Nw:-1])


/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:11: RuntimeWarning: divide by zero encountered in log
  # This is added back by InteractiveShellApp.init_path()
Out[85]:
(0, 51)
  • here the algorithm estimate that at image 23, there is a 23 image run length with high probability.
  • at image 24 the probability of the run length drops back to 1.
  • this algorithm does better with the higher dimensional input

In [97]:
reduced_projected_feature_vec = projected_feature_vec[:,0:5] 
print(reduced_projected_feature_vec.shape)
input_data = reduced_projected_feature_vec
R, maxes = oncd.online_changepoint_detection(input_data, partial(oncd.constant_hazard, 250), oncd.MV_Norm(mu=np.zeros(input_data.shape[1]),Sigma=np.diag(np.ones(input_data.shape[1])),n=np.array([1.0])))
# sometimes have to run this twice due to underflow issues


(51, 5)

In [98]:
# Plot 
fig, ax = plt.subplots(2,1,figsize=[12,8])

ax[0].plot(input_data) # time series 
ax[0].set_xlim(0,input_data.shape[0])


sparsity = 1  # only plot every fifth data for faster display
ax[1].pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax[1].set_xlim(0,input_data.shape[0])

# ax = fig.add_subplot(3, 1, 3, sharex=ax)
# Nw=5;
# ax.plot(R[Nw,Nw:-1])


/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:11: RuntimeWarning: divide by zero encountered in log
  # This is added back by InteractiveShellApp.init_path()
Out[98]:
(0, 51)
  • though it appears to do better in with 5 top PC's (hard to tell).

Testing on smaller, real trajectories


In [58]:
# instantiate the model
base_model = ResNet50(include_top=False, weights='imagenet') #this will pull the weights from the folder 

# cut the model to lower levels only 
model = Model(input=base_model.input, output=base_model.get_layer('avg_pool').output)


K.image_dim_ordering: tf
/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:5: UserWarning: Update your `Model` call to the Keras 2 API: `Model(outputs=Tensor("av..., inputs=Tensor("in...)`
  """

In [141]:
user_id = 106144465 

#get images 
folder = '../data_img_sample_item_view_sequences/'
img_files = glob.glob(folder+'*'+str(user_id)+'*')
print(img_files)


# make features 
trajectory_features = np.empty((len(img_files),2048))
for i,img_file in enumerate(img_files):
    x,img = preprocess_img(img_file) # preprocess
    trajectory_features[i,:] = model.predict(x)[0,0,0,:]


['../data_img_sample_item_view_sequences/106144465_1_82548633538732181_0_view_.jpg', '../data_img_sample_item_view_sequences/106144465_2_310261911240101889_11_view_.jpg', '../data_img_sample_item_view_sequences/106144465_3_75793218577661959_2_view_.jpg', '../data_img_sample_item_view_sequences/106144465_4_76074708321099781_5_view_.jpg', '../data_img_sample_item_view_sequences/106144465_5_98029795868614676_4_view_.jpg', '../data_img_sample_item_view_sequences/106144465_6_22313013691846692_3_view_.jpg', '../data_img_sample_item_view_sequences/106144465_7_467887899726037038_4_view_.jpg', '../data_img_sample_item_view_sequences/106144465_8_216812239953170449_21_buy_.jpg']

In [142]:
trajectory_features.shape


Out[142]:
(8, 2048)

In [143]:
# stack example trajectory_features
trajectory_features = np.vstack((trajectory_features,trajectory_features,trajectory_features))
trajectory_features.shape


Out[143]:
(24, 2048)

In [144]:
projection = pca_all.transform(trajectory_features)
projection.shape


Out[144]:
(24, 1000)

In [160]:
input_data = trajectory_features
input_data = trajectory_features[:,0:100]
input_data = trajectory_features[:,0:10]

#Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)


input_data = projection[:,0:5] # 10 PC's
Q, P, Pcp = offcd.offline_changepoint_detection(input_data, partial(offcd.const_prior, l=(len(input_data)+1)), offcd.gaussian_obs_log_likelihood, truncate=-40)

In [161]:
plt.plot(np.exp(Pcp).sum(0))
plt.ylabel('p(change-point)')


Out[161]:
<matplotlib.text.Text at 0x1495e0890>

In [162]:
plt.plot(input_data)


Out[162]:
[<matplotlib.lines.Line2D at 0x149d33610>,
 <matplotlib.lines.Line2D at 0x149d33810>,
 <matplotlib.lines.Line2D at 0x149d33950>,
 <matplotlib.lines.Line2D at 0x149d33a90>,
 <matplotlib.lines.Line2D at 0x149d33bd0>]

In [149]:
#R

In [159]:
R, maxes = oncd.online_changepoint_detection(input_data, partial(oncd.constant_hazard,250), oncd.MV_Norm(mu=np.zeros(input_data.shape[1]),Sigma=np.diag(np.ones(input_data.shape[1])),n=np.array([1.0])))


fig, ax = plt.subplots(figsize=[12,12])
ax = fig.add_subplot(3, 1, 2, sharex=ax)
sparsity = 1  # only plot every fifth data for faster display
ax.pcolor(np.array(range(0, len(R[:,0]), sparsity)), 
          np.array(range(0, len(R[:,0]), sparsity)), 
          -np.log(R[0:-1:sparsity, 0:-1:sparsity]), 
          cmap=cm.Greys, vmin=0, vmax=30)
ax = fig.add_subplot(3, 1, 3, sharex=ax)
Nw=4;
ax.plot(R[Nw,Nw:-1])


/Users/chris/anaconda/envs/virtenv/lib/python2.7/site-packages/ipykernel_launcher.py:10: RuntimeWarning: divide by zero encountered in log
  # Remove the CWD from sys.path while we load stuff.
Out[159]:
[<matplotlib.lines.Line2D at 0x1493ac190>]

In [113]:
print('target class')
plt.figure(figsize=(12,6))
len_seq = len(img_files)

fig,axes = plt.subplots(2,len_seq)

# make color
#color_red_black = pd.Series(red_traj>0).map({False:'k',True:'r'}).as_matrix()

for i in range(len_seq):
    img = image.load_img(img_files[i], target_size=(224, 224))

    # images
    axes[0,i].imshow(img)
    axes[0,i].set_xticklabels([])
    #axes[0,i].get_xaxis().set_visible(False)
    axes[0,i].get_xaxis().set_ticks([])
    axes[0,i].get_yaxis().set_visible(False) 
    if i<(len_seq-1):
        axes[0,i].set_xlabel('view '+str(i))
    else:
        axes[0,i].set_xlabel('buy')
         
    # bar 
#     axes[1,i].bar(0,red_traj[i],color=color_red_black[i])
#     axes[1,i].set_ylim([-10,5])
#     axes[1,i].get_xaxis().set_visible(False)
#     axes[1,i].axhline(y=0,linestyle='--',color='w')
#     if i==0:
#         print('here')
#         axes[1,i].set_ylabel('red classification')
#     else:
#         axes[1,i].get_yaxis().set_visible(False) 
#     sns.despine()
# savefile = '../figures/example_sequence_interpretable_features_ui_'+str(user_id)+'.png'
# plt.savefig(savefile,dpi=300)


target class
<matplotlib.figure.Figure at 0x1323adc50>

Save


In [ ]:
%%bash 
jupyter nbconvert --to html Change_Point_Detection_in_Trajectories.ipynb && mv Change_Point_Detection_in_Trajectories.html ../notebook_htmls/Change_Point_Detection_in_Trajectories_v1.html
cp Change_Point_Detection_in_Trajectories.ipynb ../notebook_versions/Change_Point_Detection_in_Trajectories_v1.ipynb