In [92]:
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# `keras.layers.embeddings` was removed in newer Keras; Embedding is exported from keras.layers
from keras.layers import Embedding
from keras.preprocessing import sequence
import pandas as pd
from sklearn import cross_validation  # deprecated (removed in sklearn 0.20); prefer sklearn.model_selection

# Configuration
top_words = 4214  # embedding vocabulary size: max item index in the data + 1
max_length = 10   # number of sequential purchase columns (P1..P10) fed to the LSTM

# Load data
# NOTE(review): hardcoded absolute Windows path — consider a configurable DATA_DIR
df = pd.read_csv("c:/dev/dl/purchacedata_base.csv", header=0)

df.head()


Out[92]:
Y CustomerID P1 P2 P3 P4 P5 P6 P7 P8 ... P91 P92 P93 P94 P95 P96 P97 P98 P99 P100
0 0 12346 604 604.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 0 12348 120 238.0 60.0 60.0 109.0 109.0 181.0 553.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 0 12349 603 2259.0 1333.0 649.0 1072.0 488.0 344.0 140.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 0 12350 217 1117.0 1073.0 1847.0 384.0 2139.0 102.0 148.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 0 12352 1155 630.0 779.0 1325.0 402.0 348.0 524.0 1839.0 ... 749.0 537.0 1066.0 1196.0 11.0 NaN NaN NaN NaN NaN

5 rows × 102 columns


In [93]:
# Keep Y, CustomerID and the first `max_length` purchase columns, then drop
# customers with fewer than `max_length` purchases (rows containing NaN).
df = df[df.columns[0:(max_length + 2)]].dropna().astype(int)

# sklearn.cross_validation was removed in sklearn 0.20; model_selection is the
# current home of train_test_split.
from sklearn.model_selection import train_test_split

# Split purchase-sequence features (columns 2..max_length+1) from the binary
# target Y. random_state pins the split so the run is reproducible.
train_X, test_X, train_Y, test_Y = train_test_split(
    df[df.columns[2:(max_length + 2)]], df["Y"], random_state=42
)

# Keras expects plain numpy arrays, not DataFrames/Series.
train_X = train_X.values
train_Y = train_Y.values
test_X = test_X.values
test_Y = test_Y.values

print(train_X.shape)
print(test_X.shape)
print(test_Y)


(2835, 10)
(946, 10)
[0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0
 0 0 0 0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0
 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 0
 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0
 1 0 1 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1
 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0
 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1
 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 1
 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1
 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 1 0 1 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1
 0 1 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0
 0 1 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0
 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0
 1 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 1 1
 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 1
 1 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 0 1]

In [107]:
# Model: item-index sequences -> 5-dim embeddings -> LSTM -> sigmoid P(Y=1)
embedding_vector_length = 5
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_length))
model.add(LSTM(3))  # small hidden state — only a few thousand training rows
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.summary() prints the table itself and returns None, so wrapping it in
# print() emits a stray "None" line (visible in the captured output).
model.summary()
model.fit(train_X, train_Y, validation_data=(test_X, test_Y), epochs=3, batch_size=64)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_29 (Embedding)     (None, 10, 5)             21070     
_________________________________________________________________
lstm_29 (LSTM)               (None, 3)                 108       
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 4         
=================================================================
Total params: 21,182
Trainable params: 21,182
Non-trainable params: 0
_________________________________________________________________
None
Train on 2835 samples, validate on 946 samples
Epoch 1/3
2835/2835 [==============================] - 1s - loss: 0.6792 - acc: 0.6942 - val_loss: 0.6594 - val_acc: 0.7241
Epoch 2/3
2835/2835 [==============================] - 0s - loss: 0.6411 - acc: 0.7012 - val_loss: 0.6106 - val_acc: 0.7241
Epoch 3/3
2835/2835 [==============================] - 0s - loss: 0.6002 - acc: 0.7012 - val_loss: 0.5859 - val_acc: 0.7241
Out[107]:
<keras.callbacks.History at 0x5e178668>

In [108]:
# Final evaluation of the model on the held-out test split.
# evaluate() returns [loss, accuracy] given the compiled metrics above.
loss, accuracy = model.evaluate(test_X, test_Y, verbose=0)
print("Accuracy: %.2f%%" % (accuracy * 100))


Accuracy: 72.41%

In [ ]: