In [56]:
import pandas as pd
import numpy as np
from keras.datasets import imdb
from matplotlib import pyplot as plt
from __future__ import print_function
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
np.random.seed(1)
%matplotlib nbagg

In [17]:
# Load the IMDB dataset and merge the train/test splits into a single feature
# array X and a single label array y.
(X_train, y_train), (X_test, y_test) = imdb.load_data()
# NumPy is imported under the alias `np`; the bare name `numpy` is not bound
# and would raise NameError on a fresh kernel.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

In [18]:
# Preview the first rows of the features, then of the labels.
for _frame_source in (X, y):
    print(pd.DataFrame(_frame_source).head())


                                                   0
0  [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, ...
1  [1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463,...
2  [1, 14, 47, 8, 30, 31, 7, 4, 249, 108, 7, 4, 5...
3  [1, 4, 18609, 16085, 33, 2804, 4, 2040, 432, 1...
4  [1, 249, 1323, 7, 61, 113, 10, 10, 13, 1637, 1...
   0
0  1
1  0
2  0
3  1
4  0

In [19]:
# Report the shape of the feature array followed by the shape of the label
# array, one per line.
print("Training data: ")
print(*(_arr.shape for _arr in (X, y)), sep="\n")


Training data: 
(50000,)
(50000,)

In [20]:
# Summary of the classes: list the distinct label values found in y.
# NumPy is imported as `np`; the bare name `numpy` is not bound.
print("Classes:{}".format(np.unique(y)))


Classes:[0 1]

In [21]:
# Summarize number of words: flatten every review into one array and count
# the distinct values in it.
# NumPy is imported as `np`; the bare name `numpy` is not bound.
print("Number of words: {}".format(len(np.unique(np.hstack(X)))))


Number of words: 88585

In [33]:
# Summarize review length
print("Basic Statistics on the length of the reviews: ")
# Build a real list of per-review lengths. On Python 3 `map` returns a
# one-shot iterator, which np.mean/np.std cannot reduce and which has no
# len(); a list behaves the same on Python 2 and 3 and can be reused later.
result = [len(review) for review in X]
print("Mean Review Length {:.2f} words, with standard deviation of {:.2f}".format(np.mean(result), np.std(result)))


Basic Statistics on the length of the reviews: 
Mean Review Length 234.76 words, with standard deviation of 172.91

In [55]:
# Plot review length.
# Figure 1: length of each review against its position in `result`.
f = plt.figure(1)
ax_scatter = f.gca()
ax_scatter.plot(range(len(result)), result, 'bo')
ax_scatter.set_title("Review length per review")
ax_scatter.set_xlabel("Review index")
ax_scatter.set_ylabel("Length (words)")
f.show()

# Figure 2: box plot of the same review lengths.
g = plt.figure(2)
ax_box = g.gca()
ax_box.boxplot(result)
ax_box.set_title("Distribution of review lengths")
ax_box.set_ylabel("Length (words)")
g.show()