The dataset consists of tab-separated files containing phrases from the Rotten Tomatoes dataset. The train/test split has been preserved for benchmarking, but the sentences have been shuffled from their original order. Each sentence has been parsed into many phrases by the Stanford parser. Each phrase has a PhraseId and each sentence has a SentenceId. Phrases that are repeated (such as short/common words) are only included once in the data.
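For example, a sentence and its sub-phrases appear as separate rows that share a SentenceId but carry their own PhraseId and label (illustrative values only, not actual rows from the file):

PhraseId  SentenceId  Phrase                    Sentiment
101       4           a visually stunning film  4
102       4           visually stunning         4
103       4           film                      2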
Before downloading the data, ensure that the terms of the competition are accepted.
In [0]:
import os
In [0]:
colab_mode = True
download_rawData = True
setup = True
ROOT_DIR = '/content/'
WEIGHTS_FILENAME = 'RT_LSTM.h5'
WEIGHTS_FILE = os.path.join(ROOT_DIR, WEIGHTS_FILENAME)
In [0]:
from google.colab import files
In [0]:
if colab_mode and download_rawData:
    files.upload()
In [0]:
if colab_mode and download_rawData:
    ! mkdir /root/.kaggle/
    ! mv /content/kaggle.json /root/.kaggle/
In [0]:
if setup:
    ! pip install kaggle
In [0]:
! kaggle competitions download -c movie-review-sentiment-analysis-kernels-only
In [0]:
! kaggle datasets download -d terenceliu4444/glove6b100dtxt
In [0]:
! rm /root/.kaggle/kaggle.json
In [0]:
if setup:
    ! unzip -q /content/train.tsv.zip
    ! unzip -q /content/test.tsv.zip
In [0]:
if setup:
    ! unzip -q /content/glove6b100dtxt.zip
In [0]:
import nltk
import os
import gc
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from keras.preprocessing import sequence,text
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense,Dropout,Embedding,LSTM,Conv1D,GlobalMaxPooling1D,Flatten,MaxPooling1D,GRU,SpatialDropout1D,Bidirectional
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.losses import categorical_crossentropy
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
warnings.filterwarnings("ignore")
#pd.set_option('display.max_colwidth',100)
pd.set_option('display.max_colwidth', -1)
In [0]:
train=pd.read_csv('/content/train.tsv',sep='\t')
print(train.shape)
train.head()
Out[0]:
The sentiment labels are:
In [0]:
train['Sentiment'].unique()
Out[0]:
In [0]:
Sent_dic={0:'negative', 1:'somewhat negative', 2:'neutral', 3:'somewhat positive', 4:'positive'}
In [0]:
len(train['Sentiment'])
Out[0]:
In [0]:
train.groupby('Sentiment')['PhraseId'].nunique()
Out[0]:
In [0]:
import seaborn as sns
In [0]:
sns.countplot(data=train, x='Sentiment')
Out[0]:
In [0]:
test=pd.read_csv('/content/test.tsv',sep='\t')
print(test.shape)
test.head()
Out[0]:
In [0]:
sub=pd.read_csv('/content/sampleSubmission.csv')
sub.head()
Out[0]:
In [0]:
test['Sentiment']=-999
test.head()
Out[0]:
In [0]:
df=pd.concat([train,
test], ignore_index=True)
print(df.shape)
df.tail()
Out[0]:
In [0]:
del train,test
gc.collect()
Out[0]:
In [0]:
In [0]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer,WordNetLemmatizer
stemmer=SnowballStemmer('english')
lemma=WordNetLemmatizer()
from string import punctuation
import re
In [0]:
if setup:
    # download the tokenizer and lemmatizer data used by clean_review below
    nltk.download('punkt')
    nltk.download('wordnet')
In [0]:
def clean_review(review_col):
    review_corpus=[]
    for i in range(0,len(review_col)):
        review=str(review_col[i])
        # keep letters only
        review=re.sub('[^a-zA-Z]',' ',review)
        #review=[stemmer.stem(w) for w in word_tokenize(str(review).lower())]
        # lowercase, tokenize and lemmatize each word
        review=[lemma.lemmatize(w) for w in word_tokenize(str(review).lower())]
        review=' '.join(review)
        review_corpus.append(review)
    return review_corpus
In [0]:
df['clean_review']=clean_review(df.Phrase.values)
df.head()
Out[0]:
In [0]:
df_train=df[df.Sentiment!=-999]
print (df_train.shape)
df_train.head()
Out[0]:
In [0]:
df_test=df[df.Sentiment==-999]
df_test.drop('Sentiment',axis=1,inplace=True)
print(df_test.shape)
df_test.head()
Out[0]:
In [0]:
del df
gc.collect()
Out[0]:
In [0]:
train_text=df_train.clean_review.values
test_text=df_test.clean_review.values
target=df_train.Sentiment.values
In [0]:
In [0]:
y=to_categorical(target)
print(train_text.shape,target.shape,y.shape)
In [0]:
X_train_text,X_val_text,y_train,y_val=train_test_split(train_text,y,test_size=0.2,stratify=y,random_state=123)
print(X_train_text.shape,y_train.shape)
print(X_val_text.shape,y_val.shape)
In [0]:
all_words=' '.join(X_train_text)
all_words=word_tokenize(all_words)
dist=FreqDist(all_words)
num_unique_word=len(dist)
num_unique_word
Out[0]:
In [0]:
r_len=[]
for text in X_train_text:
    word=word_tokenize(text)
    l=len(word)
    r_len.append(l)
MAX_REVIEW_LEN=np.max(r_len)
MAX_REVIEW_LEN
Out[0]:
In [0]:
max_features = num_unique_word
max_words = MAX_REVIEW_LEN
batch_size = 128
epochs = 3
num_classes=y.shape[1]
print ('Total number of sentiment classes: {} ...'.format(num_classes))
Tokenizing using the Keras text pre-processor. The Tokenizer class vectorizes a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or into a vector where the coefficient for each token can be binary, count-based, or tf-idf-based.
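As a rough sketch of what the Tokenizer produces, consider two toy sentences (invented here for illustration; the real tokenizer below is fit on X_train_text):

toy = Tokenizer(num_words=10)
toy.fit_on_texts(['the movie was great', 'the movie was horrible'])
print(toy.word_index)                                    # e.g. {'the': 1, 'movie': 2, 'was': 3, 'great': 4, 'horrible': 5}
print(toy.texts_to_sequences(['the movie was great']))   # [[1, 2, 3, 4]]

Word indices start at 1, so index 0 stays free to act as the padding value that pad_sequences uses later.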
In [0]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train_text))
X_train = tokenizer.texts_to_sequences(X_train_text)
X_val = tokenizer.texts_to_sequences(X_val_text)
X_test = tokenizer.texts_to_sequences(test_text)
In [0]:
X_train = sequence.pad_sequences(X_train, maxlen=max_words)
X_val = sequence.pad_sequences(X_val, maxlen=max_words)
X_test = sequence.pad_sequences(X_test, maxlen=max_words)
print(X_train.shape,X_val.shape,X_test.shape)
One-hot encoded vectors are high-dimensional and sparse. Suppose we are doing Natural Language Processing (NLP) with a dictionary of 2000 words: with one-hot encoding, each word is represented by a vector of 2000 integers, 1999 of which are zeros. On a large dataset this is not computationally efficient.
Embedding vectors, by contrast, are dense and low-dimensional, and they are updated while the neural network trains. Words that are used in similar ways end up close together in the embedding space, which makes it possible to reason about relationships between words, and between anything else that can be turned into a vector through an embedding layer.
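A minimal numpy sketch of the contrast (the sizes and the word index are hypothetical, chosen to match the 2000-word example above and the 100-dimensional embeddings used later):

import numpy as np

vocab_size, embed_dim = 2000, 100
word_idx = 42                                  # hypothetical index of one word

# One-hot: 2000 numbers per word, all but one of them zero
one_hot = np.zeros(vocab_size)
one_hot[word_idx] = 1.0

# Embedding: a dense 100-number row of a trainable (2000 x 100) matrix
embedding_matrix = np.random.normal(size=(vocab_size, embed_dim))
dense_vec = embedding_matrix[word_idx]

print(one_hot.shape, dense_vec.shape)          # (2000,) (100,)

During training, only the matrix rows for words actually seen in a batch receive gradient updates, which is what the Embedding layers in the models below do internally.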
In [0]:
def model_LSTM():
    model=Sequential()
    model.add(Embedding(max_features,100,mask_zero=True))
    model.add(LSTM(64,dropout=0.4, recurrent_dropout=0.4,return_sequences=True))
    model.add(LSTM(32,dropout=0.5, recurrent_dropout=0.5,return_sequences=False))
    model.add(Dense(4096, activation='tanh'))
    model.add(Dense(num_classes,activation='softmax'))
    return model
In [0]:
model1 = model_LSTM()
In [0]:
model1.compile(loss='categorical_crossentropy',
optimizer=Adam(lr=0.001),
metrics=['accuracy'])
model1.summary()
In [0]:
! wget https://github.com/rahulremanan/python_tutorial/raw/master/NLP/10-Sentiment_analysis/weights/RT_LSTM.h5
try:
    model1.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
%%time
history1=model1.fit(X_train,
y_train,
validation_data=(X_val, y_val),
epochs=epochs,
batch_size=batch_size,
verbose=1)
In [0]:
model1.save_weights(WEIGHTS_FILE)
In [0]:
files.download(WEIGHTS_FILE)
In [0]:
try:
    model1.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
test_element=5
input_sequence = np.asarray([list(X_test[test_element])])
y_pred_LSTM=model1.predict(input_sequence,verbose=1)
print ('Input string: {} ...'.format(test_text[test_element]))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[np.argmax(y_pred_LSTM)]))
In [0]:
print(y_pred_LSTM)
Sent_dic[np.argmax(y_pred_LSTM)]
Out[0]:
In [0]:
input_string = ['This movie was horrible']
input_text = tokenizer.texts_to_sequences(input_string)
input_sequence = sequence.pad_sequences(input_text, maxlen=max_words)
y_pred_LSTM=model1.predict(input_sequence,verbose=1)
print ('Input string: {} ...'.format(input_string))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[np.argmax(y_pred_LSTM)]))
In [0]:
input_string = ['this movie was great']
input_text = tokenizer.texts_to_sequences(input_string)
input_sequence = sequence.pad_sequences(input_text, maxlen=max_words)
y_pred_LSTM=model1.predict(input_sequence,verbose=1)
print ('Input string: {} ...'.format(input_string))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[np.argmax(y_pred_LSTM)]))
In [0]:
In [0]:
def model_CNN():
    model = Sequential()
    model.add(Embedding(max_features,100,input_length=max_words))
    model.add(Dropout(0.2))
    model.add(Conv1D(64,kernel_size=3,padding='same',activation='relu',strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128,activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes,activation='softmax'))
    return model
In [0]:
model2 = model_CNN()
In [0]:
model2.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model2.summary()
In [0]:
#! wget https://github.com/rahulremanan/python_tutorial/raw/master/NLP/10-Sentiment_analysis/weights/RT_LSTM.h5
try:
    model2.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
%%time
history2=model2.fit(X_train,
y_train,
validation_data=(X_val, y_val),
epochs=epochs,
batch_size=batch_size,
verbose=1)
In [0]:
model2.save_weights(WEIGHTS_FILE)
In [0]:
files.download(WEIGHTS_FILE)
In [0]:
try:
    model2.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
test_element=5
input_sequence = np.asarray([list(X_test[test_element])])
y_pred=model2.predict(input_sequence,verbose=1)
print ('Input string: {} ...'.format(test_text[test_element]))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[np.argmax(y_pred)]))
In [0]:
In [0]:
def model_CNN_GRU():
    model = Sequential()
    model.add(Embedding(max_features,100,input_length=max_words))
    model.add(Conv1D(64,kernel_size=3,padding='same',activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(GRU(128,return_sequences=True))
    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(128,activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(5,activation='softmax'))
    return model
In [0]:
model3 = model_CNN_GRU()
In [0]:
model3.compile(loss='categorical_crossentropy',optimizer=Adam(lr=0.001),metrics=['accuracy'])
model3.summary()
In [0]:
#! wget https://github.com/rahulremanan/python_tutorial/raw/master/NLP/10-Sentiment_analysis/weights/RT_LSTM.h5
try:
    model3.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
%%time
history3=model3.fit(X_train,
y_train,
validation_data=(X_val, y_val),
epochs=epochs,
batch_size=batch_size,
verbose=1)
In [0]:
model3.save_weights(WEIGHTS_FILE)
In [0]:
files.download(WEIGHTS_FILE)
In [0]:
try:
    model3.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
test_element=5
input_sequence = np.asarray([list(X_test[test_element])])
y_pred=model3.predict(input_sequence,verbose=1)
print ('Input string: {} ...'.format(test_text[test_element]))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[np.argmax(y_pred)]))
In [0]:
In [0]:
def model_BiDir_GRU():
    model = Sequential()
    model.add(Embedding(max_features, 100, input_length=max_words))
    model.add(SpatialDropout1D(0.25))
    model.add(Bidirectional(GRU(128)))
    model.add(Dropout(0.5))
    model.add(Dense(5, activation='softmax'))
    return model
In [0]:
model4 = model_BiDir_GRU()
In [0]:
model4.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model4.summary()
In [0]:
#! wget https://github.com/rahulremanan/python_tutorial/raw/master/NLP/10-Sentiment_analysis/weights/RT_LSTM.h5
try:
    model4.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
%%time
history4=model4.fit(X_train,
y_train,
validation_data=(X_val, y_val),
epochs=epochs,
batch_size=batch_size,
verbose=1)
In [0]:
model4.save_weights(WEIGHTS_FILE)
In [0]:
files.download(WEIGHTS_FILE)
In [0]:
try:
    model4.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
test_element=5
input_sequence = np.asarray([list(X_test[test_element])])
y_pred=model4.predict(input_sequence,verbose=1)
print ('Input string: {} ...'.format(test_text[test_element]))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[np.argmax(y_pred)]))
In [0]:
In [0]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def get_embed_mat(EMBEDDING_FILE, max_features, embed_dim):
    # word vectors
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding='utf8'))
    print('Found %s word vectors.' % len(embeddings_index))
    # embedding matrix
    word_index = tokenizer.word_index
    num_words = min(max_features, len(word_index) + 1)
    all_embs = np.stack(embeddings_index.values())  # for random init
    embedding_matrix = np.random.normal(all_embs.mean(), all_embs.std(),
                                        (num_words, embed_dim))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    max_features = embedding_matrix.shape[0]
    return embedding_matrix
In [0]:
# embedding matrix
EMBEDDING_FILE = '/content/glove.6B.100d.txt'
embed_dim = 100 #word vector dim
embedding_matrix = get_embed_mat(EMBEDDING_FILE,max_features,embed_dim)
print(embedding_matrix.shape)
In [0]:
In [0]:
def model_Glove():
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=X_train.shape[1],weights=[embedding_matrix],trainable=True))
    model.add(SpatialDropout1D(0.25))
    model.add(Bidirectional(GRU(128,return_sequences=True)))
    model.add(Bidirectional(GRU(64,return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    return model
In [0]:
model5 = model_Glove()
In [0]:
model5.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model5.summary()
In [0]:
#! wget https://github.com/rahulremanan/python_tutorial/raw/master/NLP/10-Sentiment_analysis/weights/RT_LSTM.h5
try:
    model5.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
%%time
history5=model5.fit(X_train,
y_train,
validation_data=(X_val, y_val),
epochs=4,
batch_size=batch_size,
verbose=1)
In [0]:
model5.save_weights(WEIGHTS_FILE)
In [0]:
files.download(WEIGHTS_FILE)
In [0]:
try:
    model5.load_weights(WEIGHTS_FILE)
    print ('Loaded model weights from: {} ...'.format(WEIGHTS_FILE))
except:
    print ('No model weights file: {} found ...'.format(WEIGHTS_FILE))
In [0]:
test_element=5
input_sequence = np.asarray([list(X_test[test_element])])
y_pred=model5.predict(input_sequence,verbose=1)
print ('Input string: {} ...'.format(test_text[test_element]))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[np.argmax(y_pred)]))
In [0]:
In [0]:
test_element=5
input_sequence = np.asarray([list(X_test[test_element])])
y_pred1=model1.predict(input_sequence,verbose=1)
y_pred2=model2.predict(input_sequence,verbose=1)
y_pred3=model3.predict(input_sequence,verbose=1)
y_pred4=model4.predict(input_sequence,verbose=1)
y_pred5=model5.predict(input_sequence,verbose=1)
pred1=np.argmax(y_pred1)
pred2=np.argmax(y_pred2)
pred3=np.argmax(y_pred3)
pred4=np.argmax(y_pred4)
pred5=np.argmax(y_pred5)
Sent_all=stats.mode([pred1,pred2,pred3,pred4,pred5],axis=0)[0][0]
print ('Input string: {} ...'.format(test_text[test_element]))
print ('Sentiment for the input string: {} ...'.format(Sent_dic[Sent_all]))
In [0]:
y_pred
Out[0]: