In [1]:
# movie review dataset homepage: http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Download the dataset used here: http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz
# In this notebook:
## Step 1 - generate the vocabulary from the raw text data
## Step 2 - basic data preprocessing (stemming added here) + a neural network with word embeddings
In [ ]:
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')

stemmer = PorterStemmer()
stopwords = set(nltk.corpus.stopwords.words('english'))  # set membership checks are faster than list lookups
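In [ ]:
# Optional sanity check, not part of the original pipeline (the example words below are made up):
# see how the Porter stemmer collapses inflected forms and which words count as stopwords.
for w in ['movies', 'acting', 'performances', 'terrible']:
    print(w, '->', stemmer.stem(w))
print('the' in stopwords, 'film' in stopwords)  # 'the' is a stopword, 'film' is not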
In [4]:
# STEP 1 - Generate the vocabulary (word -> count)
import string
from collections import Counter
from os import listdir

neg_reviews_folder = "review_polarity/txt_sentoken/neg/"
pos_reviews_folder = "review_polarity/txt_sentoken/pos/"

# convert one raw review file into a cleaned token list
def clean_text(raw_text_path):
    with open(raw_text_path) as text_in:
        raw_text = text_in.read()
    tokens = raw_text.split()
    # strip punctuation from each token
    tokens = [w.strip(string.punctuation) for w in tokens]
    # keep only alphabetic, non-stopword tokens
    tokens = [w for w in tokens if w.isalpha() and w not in stopwords]
    # drop short words
    tokens = [w for w in tokens if len(w) >= 3]
    # stemming
    tokens = [stemmer.stem(w) for w in tokens]
    return tokens

vocabCounter = Counter()  # the vocabulary is built from training-data tokens only
training_neg_docs, testing_neg_docs = [], []
training_pos_docs, testing_pos_docs = [], []

for f_name in listdir(neg_reviews_folder):
    if f_name.startswith('cv7'):  # files starting with 'cv7' become the testing data
        testing_neg_docs.append(clean_text(neg_reviews_folder + f_name))
    else:
        tks = clean_text(neg_reviews_folder + f_name)
        training_neg_docs.append(' '.join(tks))  # join into one string for texts_to_sequences() later
        vocabCounter.update(tks)

for f_name in listdir(pos_reviews_folder):
    if f_name.startswith('cv9'):  # files starting with 'cv9' become the testing data
        testing_pos_docs.append(clean_text(pos_reviews_folder + f_name))
    else:
        tks = clean_text(pos_reviews_folder + f_name)
        training_pos_docs.append(' '.join(tks))  # join into one string for texts_to_sequences() later
        vocabCounter.update(tks)
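In [ ]:
# Illustration only (the sample sentence is invented): the same cleaning steps as clean_text(),
# applied to an in-memory string instead of a review file. The raw reviews in this dataset are
# already lowercased, so the sample is lowercase too.
sample = "this movie's acting was surprisingly good , but the plot ... not so much !"
toks = sample.split()
toks = [w.strip(string.punctuation) for w in toks]
toks = [w for w in toks if w.isalpha() and w not in stopwords]
toks = [w for w in toks if len(w) >= 3]
toks = [stemmer.stem(w) for w in toks]
print(toks)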
In [6]:
print(len(vocabCounter))
print(vocabCounter.most_common(79))  # the 79 most frequent tokens
In [7]:
print(vocabCounter.most_common()[:-79:-1])  # the least frequent tokens, in reverse order
In [9]:
# remove tokens with low occurrence (keep only tokens that appear at least twice)
vocab_tokens = [w for w, c in vocabCounter.items() if c >= 2]
print(len(vocab_tokens))
vocabset = set(vocab_tokens)
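In [ ]:
# Optional (not in the original flow): persist the pruned vocabulary so the same
# preprocessing can be reproduced later. The file name 'vocab.txt' is arbitrary.
with open('vocab.txt', 'w') as f_out:
    f_out.write('\n'.join(vocab_tokens))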
In [10]:
# For testing documents, keep only tokens that are in the vocabulary
cleaned_testing_neg_docs = []
cleaned_testing_pos_docs = []
for d in testing_neg_docs:
    cleaned_testing_neg_docs.append(' '.join([w for w in d if w in vocabset]))
for d in testing_pos_docs:
    cleaned_testing_pos_docs.append(' '.join([w for w in d if w in vocabset]))
In [12]:
# STEP 2 - Neural Network with word embedding
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers import Conv1D, MaxPooling1D
np.random.seed(410)
In [14]:
tokenizer = Tokenizer()  # a throwaway tokenizer, used only for the short demo below
In [15]:
# A quick demo of how the Keras Tokenizer's texts_to_sequences() works:
# the same word gets the same integer index across documents,
# but the match has to be on the exact (lowercased) word form
tokenizer.fit_on_texts(['I love Emmanuel', 'I love ice-cream', 'love I'])
encoded_docs = tokenizer.texts_to_sequences(['I love Emmanuel', 'I love ice-cream', 'love I'])
encoded_docs
Out[15]:
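In [ ]:
# Optional peek at the mapping behind the encoding above: word_index assigns smaller
# integers to more frequent words, and index 0 is never used (it is reserved for padding).
tokenizer.word_index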
In [16]:
training_docs = training_neg_docs + training_pos_docs
testing_docs = cleaned_testing_neg_docs + cleaned_testing_pos_docs
tokenizer = Tokenizer()  # start from a fresh tokenizer so the toy demo vocabulary above does not leak in
tokenizer.fit_on_texts(training_docs)  # the tokenizer is fit on the training data only
In [17]:
training_docs[7]
Out[17]:
In [18]:
## TRAINING DATA
# encode documents as integer sequences
training_encoded_seq = tokenizer.texts_to_sequences(training_docs)
print('Encoded Sequence: ', training_encoded_seq[7])
# pad sequences to the same length
max_length = max(len(s.split()) for s in training_docs)  # length of the longest training document, in tokens
Xtrain = pad_sequences(training_encoded_seq, maxlen=max_length, padding='post')
print(Xtrain[7])
print('Xtrain Shape: ', Xtrain.shape)
ytrain = np.array([0 for _ in range(900)] + [1 for _ in range(900)])  # first 900 are negative, last 900 are positive
print(ytrain[4:10])
print('ytrain Shape: ', ytrain.shape)
In [19]:
## TESTING DATA
# encode documents as integer sequences
testing_encoded_seq = tokenizer.texts_to_sequences(testing_docs)
print('Encoded Sequence: ', testing_encoded_seq[9])
# pad sequences
Xtest = pad_sequences(testing_encoded_seq, maxlen=max_length, padding='post')  # still use the training max_length here
print(Xtest[9])
print('Xtest Shape: ', Xtest.shape)
ytest = np.array([0 for _ in range(100)] + [1 for _ in range(100)])  # first 100 are negative, last 100 are positive
print(ytest[4:10])
print('ytest Shape: ', ytest.shape)
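In [ ]:
# Optional illustration (the toy sequences are made up): with padding='post',
# pad_sequences right-pads shorter sequences with zeros and truncates longer ones
# to maxlen (from the front, by default).
pad_sequences([[5, 3, 8], [2, 9], [7, 1, 4, 6, 2, 3]], maxlen=5, padding='post')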
In [20]:
vocab_size = len(tokenizer.word_index) + 1  # +1 because Keras word indices start at 1; index 0 is reserved for padding
print(vocab_size)
print(max_length)
In [21]:
# build the NN model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))  # map each word index to a 100-dimensional vector
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())
In [22]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate model
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))
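In [ ]:
# Optional: score a brand-new review with the trained model (the review text below is made up).
# A new text must go through the same pipeline as the training data:
# clean -> keep only vocabulary words -> encode with the fitted tokenizer -> pad to max_length.
new_review = "one of the most entertaining films of the year , with terrific acting and a clever script"
toks = [w.strip(string.punctuation) for w in new_review.split()]
toks = [w for w in toks if w.isalpha() and w not in stopwords and len(w) >= 3]
toks = [stemmer.stem(w) for w in toks]
toks = [w for w in toks if w in vocabset]
x_new = pad_sequences(tokenizer.texts_to_sequences([' '.join(toks)]), maxlen=max_length, padding='post')
prob = model.predict(x_new)[0, 0]  # sigmoid output: estimated probability of the positive class
print('P(positive) = %f -> %s' % (prob, 'positive' if prob >= 0.5 else 'negative'))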