In [56]:
from __future__ import print_function

import logging

import numpy as np
import pandas as pd
from keras.datasets import imdb
from matplotlib import pyplot as plt
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
np.random.seed(1)
%matplotlib nbagg
In [17]:
# load the dataset
(X_train, y_train), (X_test, y_test) = imdb.load_data()
# Merge the train and test splits into one feature array and one label array.
# NumPy is imported under the alias `np`; the bare name `numpy` is never
# bound in this notebook and raises NameError on a fresh kernel.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)
In [18]:
# Peek at the first rows of the features, then of the labels, by wrapping
# each array in a DataFrame.
for preview_source in (X, y):
    print(pd.DataFrame(preview_source).head())
In [19]:
# Report the dimensions of the feature array and of the label array.
# NOTE(review): X and y appear to hold the train and test splits combined,
# so the "Training data" heading may be misleading -- confirm.
print("Training data: ")
for dataset_array in (X, y):
    print(dataset_array.shape)
In [20]:
# Summary of the classes: list the distinct label values found in y.
# Uses the `np` alias; the bare name `numpy` is not imported in this notebook.
print("Classes:{}".format(np.unique(y)))
In [21]:
# Summarize number of words: flatten every review into one long array and
# count the distinct values in it.
# Uses the `np` alias; the bare name `numpy` is not imported in this notebook.
print("Number of words: {}".format(len(np.unique(np.hstack(X)))))
In [33]:
# Summarize review length
print("Basic Statistics on the length of the reviews: ")
# Materialise the per-review lengths as a list. On Python 3, map() returns a
# one-shot iterator that np.mean/np.std cannot reduce and that has no len();
# a list behaves the same on Python 2 and Python 3.
result = [len(review) for review in X]
print("Mean Review Length {:.2f} words, with standard deviation of {:.2f}".format(np.mean(result), np.std(result)))
In [55]:
# plot review length
# Recompute the lengths from X so this cell does not rely on `result` being a
# materialised sequence (a Python 3 map object has no len() and can only be
# iterated once).
review_lengths = [len(review) for review in X]

# Figure 1: length of every review, plotted against its position in X.
f = plt.figure(1)
plt.plot(range(len(review_lengths)), review_lengths, 'bo')
plt.xlabel("Review index")
plt.ylabel("Review length (words)")
plt.title("Length of each review")
f.show()

# Figure 2: box plot of the review-length distribution.
g = plt.figure(2)
plt.boxplot(review_lengths)
plt.ylabel("Review length (words)")
plt.title("Distribution of review lengths")
g.show()