In [1]:
# Core analysis stack: pandas/numpy for data handling, matplotlib for plots.
import pandas as pd
import numpy as np
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
# Use the ggplot style sheet for all subsequent figures.
plt.style.use('ggplot')
In [2]:
# Load the pipe-delimited sentiment file. The source has no header row, so the
# column names are supplied directly; engine='python' is needed because the
# separator is given as a regex (escaped '|').
data = pd.read_csv('imdb_labelled.txt', sep='\|', engine='python',
                   header=None, names=['text', 'label'])
In [3]:
# Preview the first rows of the raw reviews and labels.
data.head()
Out[3]:
In [4]:
# Bar chart of class balance: how many reviews carry each sentiment label.
ax = data.label.value_counts().plot(kind='bar', rot=0, title='Counts per Label')
In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NOTE(review): this rebinding shadows the imported `stopwords` MODULE with a
# plain set of English stop words. The helper functions below depend on the
# set, so renaming it would require touching every consumer; flagged only.
stopwords = set(stopwords.words('english'))
from nltk.tokenize import RegexpTokenizer
def remove_stopwords(sent):
    """Tokenize a sentence and rejoin only the tokens that are not
    English stop words (per the module-level `stopwords` set)."""
    kept = (tok for tok in word_tokenize(sent) if tok not in stopwords)
    return ' '.join(kept)
def remove_punctuations(sent):
    """Strip punctuation by keeping only word-character runs.

    Equivalent to ``RegexpTokenizer(r'\\w+').tokenize(sent)`` joined with
    spaces, but uses the stdlib ``re`` module and avoids constructing a new
    tokenizer object on every call (the original rebuilt one per invocation).

    Parameters
    ----------
    sent : str
        Input sentence; may be empty.

    Returns
    -------
    str
        The ``\\w+`` runs of `sent` joined by single spaces.
    """
    import re  # local import keeps this block self-contained
    return ' '.join(re.findall(r'\w+', sent))
def preprocess_sent(sent):
    """Full cleaning pipeline for one review: lower-case, then drop stop
    words, then strip punctuation."""
    lowered = sent.lower()
    without_stop = remove_stopwords(lowered)
    return remove_punctuations(without_stop)
# Run the cleaning pipeline over every review and overwrite the text column
# in place on the existing frame.
data.loc[:, 'text'] = data['text'].map(preprocess_sent)
In [6]:
# Confirm the text column now holds the cleaned (lower-cased, filtered) reviews.
data.head()
Out[6]:
In [7]:
# Stratified 80/20 split: sample 20% of the rows WITHIN each label so the
# class balance of the test set mirrors the full data. random_state pins the
# split for reproducibility.
test_df = (data
           .groupby('label')
           .apply(lambda grp: grp.sample(frac=0.2, random_state=1000)))
# groupby().apply adds the label as an outer index level; drop it to recover
# the original row ids so they can be removed from the training frame.
test_df.index = test_df.index.droplevel(0)
train_df = data.drop(index=test_df.index)
print("Train and test lengths: ", len(train_df), len(test_df))
In [8]:
# NOTE(review): `keras.preprocessing` is a legacy module path in newer
# TF/Keras releases — confirm against the installed version.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Keep at most the 40k most frequent words; rarer words are dropped when
# texts are converted to integer sequences.
vocab_size = 40000
tokenizer = Tokenizer(lower=True, num_words=vocab_size)
# Fit the word index on the TRAINING split only, so test-set vocabulary
# cannot leak into the model.
tokenizer.fit_on_texts(train_df.text.values)
# Pad/truncate every review to a fixed length of 15 tokens.
pad_len = 15
train_sequences = pad_sequences(tokenizer.texts_to_sequences(train_df.text.values), pad_len)
test_sequences = pad_sequences(tokenizer.texts_to_sequences(test_df.text.values), pad_len)
In [9]:
# Sanity-check: both splits should be (n_rows, pad_len).
print("Shape of train sequences: ", train_sequences.shape)
print("Shape of test sequences : ", test_sequences.shape)
In [10]:
# Peek at the first five padded integer sequences.
train_sequences[:5]
Out[10]:
In [66]:
from keras.models import Sequential
from keras.layers import Dense
# NOTE(review): LSTM is imported here but never used in this model.
from keras.layers import SimpleRNN, LSTM
# NOTE(review): `keras.layers.embeddings` is an old import path; newer Keras
# exposes Embedding directly from `keras.layers` — confirm against the
# installed version.
from keras.layers.embeddings import Embedding
embedding_size = 60
model = Sequential()
# input_dim is vocab_size+1 so index 0 can stay reserved for padding;
# mask_zero=True marks the zero-padded timesteps for masking downstream.
model.add(Embedding(input_dim=vocab_size+1, output_dim=embedding_size, mask_zero=True))
model.add(SimpleRNN(units=10, activation='tanh'))
# Single sigmoid unit: binary sentiment classification (probability of label 1).
model.add(Dense(1, activation='sigmoid'))
In [67]:
# Review layer shapes and parameter counts before training.
model.summary()
In [68]:
from keras import optimizers
# NOTE(review): the `lr` keyword was renamed `learning_rate` in newer Keras
# optimizers — confirm against the installed version before upgrading.
sgd = optimizers.SGD(lr=0.008)
model.compile(optimizer=sgd, loss='binary_crossentropy', metrics=['accuracy'])
# Train for 10 epochs, holding out 10% of the training data for validation.
history = model.fit(train_sequences, train_df.label.values,
batch_size=32,
epochs=10,
verbose=1,
validation_split=0.1,
shuffle=True)
In [61]:
# Visualise the training-loss trajectory recorded by model.fit().
ax = pd.Series(history.history['loss']).plot(
    kind='bar', title='Loss per epoch', rot=0)
In [73]:
# Convert sigmoid probabilities into hard 0/1 class labels at a 0.5 threshold.
raw_probs = model.predict(test_sequences).squeeze()
predictions = (raw_probs > 0.5).astype(int)
In [75]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred) — ground truth FIRST.
# The original call passed the model's predictions as y_true, which silently
# transposes per-class precision and recall in the printed report.
report = classification_report(test_df.label.values, predictions)
print(report)