In [1]:
# movie review dataset homepage: http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Download the dataset used here: http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

# In this notebook
## Step 1 - generate the vocabulary from the raw text data
## Step 2 - basic data preprocessing (stemming added here) + a neural network

In [ ]:
import nltk
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stemmer = PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
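
The Porter stemmer is why truncated-looking tokens such as 'movi' and 'charact' appear in the vocabulary counts below. A quick illustrative check (not part of the original run):

In [ ]:
# inflected forms collapse to a common stem
for w in ['movies', 'movie', 'characters', 'running']:
    print('%s -> %s' % (w, stemmer.stem(w)))

# movies -> movi
# movie -> movi
# characters -> charact
# running -> run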

In [4]:
# STEP 1 - Generate Vocabulary (word, count)

import string
from collections import Counter
from os import listdir

neg_reviews_folder = "review_polarity/txt_sentoken/neg/"
pos_reviews_folder = "review_polarity/txt_sentoken/pos/"

# convert each raw text to cleaned token list
def clean_text(raw_text_path):
    with open(raw_text_path) as text_in:
        raw_text = text_in.read()
        tokens = raw_text.split()
        # remove punctuation
        tokens = [w.strip(string.punctuation) for w in tokens]
        # only keep alphabetic non-stop words
        tokens = [w for w in tokens if w.isalpha() and w not in stopwords]
        # remove short words
        tokens = [w for w in tokens if len(w) >= 3]
        # stemming
        tokens = [stemmer.stem(w) for w in tokens]
    return tokens

vocabCounter = Counter()  # the vocabulary counts are built from training-data tokens only
training_neg_docs, testing_neg_docs = [], []
training_pos_docs, testing_pos_docs = [], []

for f_name in listdir(neg_reviews_folder):
    if f_name.startswith('cv7'):  # files starting with 'cv7' become the testing data
        testing_neg_docs.append(clean_text(neg_reviews_folder + f_name))
    else: 
        tks = clean_text(neg_reviews_folder + f_name)  
        training_neg_docs.append(' '.join(tks))  # This format helps later texts_to_sequences
        vocabCounter.update(tks)
           
for f_name in listdir(pos_reviews_folder):
    if f_name.startswith('cv9'):  # files starting with 'cv9' become the testing data
        testing_pos_docs.append(clean_text(pos_reviews_folder + f_name))
    else:
        tks = clean_text(pos_reviews_folder + f_name)  
        training_pos_docs.append(' '.join(tks))  # This format helps later texts_to_sequences
        vocabCounter.update(tks)
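
To see what the cleaning steps in clean_text() do, here is the same pipeline applied to an in-memory sentence (illustrative only; the reviews in this corpus are already lowercased and tokenized, so no lower() call is needed):

In [ ]:
sample = "the actors' performances were good , but the plot went nowhere ."
toks = sample.split()
toks = [w.strip(string.punctuation) for w in toks]              # strip surrounding punctuation
toks = [w for w in toks if w.isalpha() and w not in stopwords]  # keep alphabetic non-stop words
toks = [w for w in toks if len(w) >= 3]                         # drop 1-2 character words
toks = [stemmer.stem(w) for w in toks]                          # reduce to stems
print(toks)
# ['actor', 'perform', 'good', 'plot', 'went', 'nowher']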

In [6]:
print(len(vocabCounter))
print(vocabCounter.most_common(79))


23451
[('film', 9387), (u'movi', 5830), ('one', 5137), ('like', 3609), (u'charact', 3377), ('get', 2870), ('make', 2812), ('time', 2612), ('scene', 2365), ('even', 2329), (u'play', 2148), ('good', 2134), (u'stori', 2060), ('see', 1958), ('would', 1833), ('much', 1821), ('also', 1751), (u'seem', 1669), ('way', 1667), ('two', 1637), (u'take', 1629), (u'end', 1627), ('look', 1625), ('first', 1603), ('come', 1576), ('work', 1541), ('well', 1530), ('thing', 1468), (u'realli', 1397), ('know', 1397), (u'perform', 1379), ('plot', 1372), (u'littl', 1340), ('life', 1331), ('year', 1294), ('love', 1284), (u'peopl', 1270), ('never', 1239), ('bad', 1234), ('could', 1226), (u'tri', 1201), ('show', 1183), ('best', 1176), ('man', 1168), ('new', 1145), (u'give', 1143), (u'mani', 1131), (u'star', 1109), ('say', 1092), (u'actor', 1083), ('want', 1081), ('find', 1061), (u'watch', 1057), (u'becom', 1056), ('great', 1055), ('role', 1048), (u'action', 1034), (u'think', 1034), ('director', 1032), ('use', 1022), (u'anoth', 978), (u'act', 966), (u'effect', 957), ('back', 948), ('still', 943), (u'someth', 934), (u'audienc', 929), ('turn', 927), ('made', 918), (u'interest', 893), (u'actual', 888), ('feel', 886), ('big', 879), ('world', 878), (u'howev', 878), (u'live', 864), (u'everi', 854), ('though', 849), ('set', 847)]

In [7]:
print(vocabCounter.most_common()[:-79:-1])  # the 78 rarest tokens


[(u'emeri', 1), (u'snif', 1), ('contenda', 1), ('counteract', 1), ('dearth', 1), ('tilvern', 1), ('jaffa', 1), ('cairo', 1), ('fudd', 1), ('mith', 1), ('lear', 1), ('pheneomena', 1), ('holley', 1), ('nigga', 1), (u'unplay', 1), (u'faldwel', 1), (u'diminuit', 1), ('knob', 1), ('wren', 1), ('breaker', 1), (u'outgrow', 1), (u'unconvention', 1), ('comaprison', 1), (u'actualis', 1), (u'rectitud', 1), (u'ashle', 1), ('christmastown', 1), ('decter', 1), (u'coteri', 1), (u'creedenc', 1), ('wile', 1), ('vanilla', 1), ('wilt', 1), (u'butcheri', 1), (u'everglad', 1), ('takehiro', 1), ('partment', 1), ('salesgirl', 1), (u'closes', 1), (u'hudgeon', 1), (u'quaternari', 1), (u'afeminit', 1), (u'sappili', 1), (u'watercolor', 1), (u'dopi', 1), (u'oned', 1), ('escobar', 1), (u'syring', 1), (u'telli', 1), (u'hornbi', 1), (u'siphon', 1), ('frontman', 1), ('adamson', 1), (u'unexperienc', 1), ('deadlier', 1), (u'outstrip', 1), ('mongolian', 1), ('huevelman', 1), (u'beatif', 1), ('frugal', 1), (u'droog', 1), (u'motorcad', 1), ('hitherto', 1), ('renna', 1), ('anyhoo', 1), (u'archimed', 1), ('elizando', 1), ('kritic', 1), (u'arkansa', 1), ('greedier', 1), ('dariush', 1), ('tram', 1), (u'pallbear', 1), ('flour', 1), ('specliast', 1), ('mentorship', 1), (u'trelkin', 1), (u'fratern', 1)]

In [9]:
# remove tokens with low occurrence (keep only those appearing at least twice)

vocab_tokens = [w for w, c in vocabCounter.items() if c >= 2]
print(len(vocab_tokens))

vocabset = set(vocab_tokens)


15906

In [10]:
# For testing tokens, keep only those in the vocabulary
cleaned_testing_neg_docs = []
cleaned_testing_pos_docs = []

for d in testing_neg_docs:
    cleaned_testing_neg_docs.append(' '.join([w for w in d if w in vocabset]))

for d in testing_pos_docs:
    cleaned_testing_pos_docs.append(' '.join([w for w in d if w in vocabset]))
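
A quick illustration of the filtering above (not part of the original run): any test token outside the training vocabulary is simply dropped.

In [ ]:
toy_doc = ['great', 'stori', 'zzzunseen']  # 'zzzunseen' is a made-up out-of-vocabulary token
print(' '.join(w for w in toy_doc if w in vocabset))
# great stori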

In [12]:
# STEP 2 - Neural Network with word embedding

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

np.random.seed(410)

In [14]:
tokenizer = Tokenizer()

In [15]:
# This is how texts_to_sequences() in the Keras Tokenizer works:
# the same word in different documents gets the same index,
# but the tokens have to match exactly (the stemmed strings here do)

tokenizer.fit_on_texts(['I love Emmanuel', 'I love ice-cream', 'love I'])
encoded_docs = tokenizer.texts_to_sequences(['I love Emmanuel', 'I love ice-cream', 'love I'])
encoded_docs


Out[15]:
[[1, 2, 3], [1, 2, 4, 5], [2, 1]]
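
The mapping behind those sequences lives in tokenizer.word_index; indices are assigned by descending frequency, so 'i' and 'love' (three occurrences each) get the lowest indices, and the default filters split 'ice-cream' on the hyphen. Inspecting it (dict print order may vary):

In [ ]:
print(tokenizer.word_index)
# {'i': 1, 'love': 2, 'emmanuel': 3, 'ice': 4, 'cream': 5}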

In [16]:
training_docs = training_neg_docs + training_pos_docs
testing_docs = cleaned_testing_neg_docs + cleaned_testing_pos_docs

tokenizer.fit_on_texts(training_docs)  # the tokenizer must be fit on training data only
# note: fit_on_texts() accumulates counts across calls, so the toy fit above leaks its
# few words into word_index (which is why vocab_size below comes out slightly above
# len(vocabCounter) + 1); a fresh Tokenizer() here would be cleaner

In [17]:
training_docs[7]


Out[17]:
u'main problem martin pet project thin line love hate like fatal attract variat protagonist man charact irrespons jerk case seem anyth except justifi action especi case darnel wright one macho guy women line mile long think condon male philosophi one heterosexu male lucki enough get hand beauti kind girl treat like princess respect darnel think like sleep girl dump period film discov newest target beauti wealthi brandi web play nice whitfield run success real estat busi kill dump childdhood friend mia king found think suppos feel sorri guy larg fault line women think enough common sens think would backfir one day mother say get sens irrespons also pretti dumb especi get sens dumb brandi tell kill husband allegedli abus put pant two second brandi psycho bitch hard agre say darnel final guy like treat like garbag one final flaw though let brandi fall darnel begin brandi classi intellig woman mba harvard init resist immatur play call woman like would real life sudden say ye listen four letter word vocabulari watch action wonder intellig woman would fall guy like darnel period much less unhealthi obsess lawrenc good want order prove need let peopl write direct movi look movi four writer result long two hour easili work nineti minut lot subplot charact appear disappear quickli came make think writer get along necessari bright spot though whitfield regina king della rees bobbi brown lazili written part help lot funni part like exampl scene rees attempt fight whitfeld harrass son damag properti enough sustain film'

In [18]:
## TRAINING DATA

# encode as sequence
training_encoded_seq = tokenizer.texts_to_sequences(training_docs)
print('Encoded Sequence: ', training_encoded_seq[7])

# pad sequences
max_length = max([len(s.split()) for s in training_docs])  # max number of tokens in any training document
Xtrain = pad_sequences(training_encoded_seq, maxlen=max_length, padding='post')
print(Xtrain[7])
print('Xtrain Shape: ', Xtrain.shape)

ytrain = np.array([0 for _ in range(900)] + [1 for _ in range(900)])  # first 900 are negative, last 900 are positive
print(ytrain[4:10])
print('ytrain Shape: ', ytrain.shape)


Encoded Sequence:  [288, 131, 782, 2477, 495, 1070, 123, 36, 576, 4, 1918, 655, 3048, 1071, 44, 5, 7982, 2703, 228, 18, 150, 287, 2425, 58, 247, 228, 7983, 3613, 3, 3049, 94, 470, 123, 1354, 108, 57, 7984, 1085, 2426, 3, 5196, 1085, 1691, 87, 6, 182, 233, 166, 153, 736, 4, 1323, 515, 7983, 57, 4, 1291, 153, 2064, 903, 1, 392, 3746, 1177, 233, 1919, 3207, 3747, 11, 261, 12784, 98, 232, 93, 4420, 561, 116, 2064, 15929, 110, 4275, 462, 294, 57, 277, 72, 1737, 94, 543, 1273, 123, 470, 57, 87, 962, 156, 57, 15, 6844, 3, 80, 310, 49, 6, 156, 7982, 17, 192, 780, 247, 6, 156, 780, 3207, 124, 116, 538, 5727, 1814, 164, 3403, 20, 258, 3207, 1953, 3516, 204, 904, 49, 7983, 102, 94, 4, 736, 4, 2375, 3, 102, 584, 78, 282, 3207, 185, 7983, 96, 3207, 6845, 393, 183, 15930, 6045, 10967, 2704, 6432, 11, 127, 183, 4, 15, 93, 34, 2007, 49, 484, 1304, 475, 1888, 259, 7442, 53, 58, 129, 393, 183, 15, 185, 94, 4, 7983, 903, 16, 242, 7985, 900, 1920, 12, 51, 339, 439, 140, 282, 37, 336, 89, 2, 23, 2, 475, 417, 328, 108, 20, 189, 617, 26, 3748, 114, 106, 693, 5, 139, 1367, 493, 668, 7, 57, 417, 6, 190, 1368, 1292, 987, 78, 12784, 9665, 462, 10968, 2427, 1815, 978, 15931, 322, 84, 119, 106, 99, 84, 4, 375, 9, 2427, 174, 225, 15932, 12785, 358, 2428, 3305, 87, 2836, 1]
[288 131 782 ...,   0   0   0]
Xtrain Shape:  (1800, 1277)
[0 0 0 0 0 0]
ytrain Shape:  (1800,)
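
A toy illustration of pad_sequences (not part of the original run): with padding='post', short sequences are zero-padded at the end, while sequences longer than maxlen are truncated from the front by default (truncating='pre'):

In [ ]:
print(pad_sequences([[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]], maxlen=4, padding='post'))

# [[ 1  2  3  0]
#  [ 4  5  0  0]
#  [ 7  8  9 10]]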

In [19]:
## TESTING DATA

# sequence encode
testing_encoded_seq = tokenizer.texts_to_sequences(testing_docs)
print('Encoded Sequence: ', testing_encoded_seq[9])

# pad sequences
Xtest = pad_sequences(testing_encoded_seq, maxlen=max_length, padding='post')  # still use training max_length here
print(Xtest[9])
print('Xtest Shape: ', Xtest.shape)

ytest = np.array([0 for _ in range(100)] + [1 for _ in range(100)])  # first 100 are negative, last 100 are positive
print(ytest[4:10])
print('ytest Shape: ', ytest.shape)


Encoded Sequence:  [4327, 808, 43, 1, 1116, 162, 252, 17, 24, 1, 81, 61, 3, 1593, 2, 142, 584, 61, 3, 1593, 2, 142, 584, 1071, 3226, 2112, 73, 301, 22, 6887, 1992, 55, 928, 2344, 239, 298, 68, 2112, 349, 552, 2488, 301, 1856, 4476, 5889, 363, 218, 117, 265, 519, 5262, 5793, 3457, 911, 11770, 196, 307, 11, 4327, 1883, 349, 29, 213, 6, 6199, 978, 944, 6, 298, 132, 226, 101, 200, 111, 911, 11770, 2016, 446, 1, 2058, 1220, 223, 298, 736, 4704, 1883, 301, 258, 25, 2733, 3, 9, 925, 2327, 3424, 2672, 911, 11770, 73, 552, 197, 48, 5540, 236, 5021, 1462, 1873, 117, 52, 21, 947, 114, 2816, 88, 108, 96, 131, 610, 1507, 748, 392, 945, 10, 61, 153, 2736, 5019, 469, 197, 1404, 1119, 1793, 5635, 606, 1, 446, 1718, 2022, 349, 542, 301, 164, 117, 10484, 2112, 1748, 3093, 4, 1522, 3292, 14623, 1505, 5631, 56, 39, 94, 518, 165, 2112, 1171, 1351, 6199, 497, 552, 102, 301, 32, 4327, 808, 115, 926, 620, 77, 421, 281, 3102, 38, 406, 274, 53, 422, 645, 118, 163, 390, 40, 29, 57, 908, 1742, 13, 1, 55, 327, 235, 1728, 101, 191, 3, 854, 131, 1, 33, 70, 5, 71, 132, 325, 117, 11, 519, 5262, 5793, 4136, 192, 16, 274, 126, 772, 521, 195, 1223, 519, 5262, 5793, 111, 11898, 408, 482, 119, 65, 325, 1816, 500, 1464, 101, 191, 13, 1676, 85, 117, 1002, 1070, 390, 485, 98, 8, 1609, 2493, 248, 1408, 715, 14, 237, 2645, 1113, 26, 4646, 867, 2659, 660, 117, 610, 593, 2144, 693, 17, 299, 33, 252, 111, 1507, 748, 57, 81, 348, 1, 195, 5, 322, 368, 393, 748, 449, 5, 54, 5966, 610, 176, 25, 120, 183, 200, 1103, 39, 334, 3082, 108, 87, 40, 603, 235, 652, 5, 195, 1241, 518, 111, 158, 11, 1791, 298, 206, 109, 1883, 243, 509, 17, 5598, 162, 1390, 256, 79, 54, 1883, 764, 944, 117, 51, 84, 310, 383, 1571, 73, 466, 538, 5966, 855, 102, 1748, 3093, 11, 1638, 2722, 2112, 5, 109, 7, 5511, 56, 219, 4327, 808, 528, 744, 141, 155, 3, 79, 237, 2645, 803, 6, 147, 9, 11, 1883, 517, 14623, 1505, 47, 1593, 1, 185, 713, 175, 369, 218, 231, 1, 140, 69, 18, 276, 218, 128, 106, 499, 111, 115, 13, 18, 7086, 201, 29, 140, 61, 1593, 1, 325, 3, 727, 4, 169, 2496, 84, 16, 83, 578, 1, 4327, 808, 595, 24, 3, 75, 155, 4222, 9, 3, 5, 791, 4040, 3849, 804, 84]
[4327  808   43 ...,    0    0    0]
Xtest Shape:  (200, 1277)
[0 0 0 0 0 0]
ytest Shape:  (200,)

In [20]:
vocab_size = len(tokenizer.word_index) + 1  # +1 because Keras word indices start at 1; index 0 is reserved for padding
print(vocab_size)
print(max_length)


23453
1277

In [21]:
# build the NN model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))  # embed each word index into a 100-dimensional vector space
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 1277, 100)         2345300   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1270, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 635, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20320)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                203210    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
=================================================================
Total params: 2,574,153
Trainable params: 2,574,153
Non-trainable params: 0
_________________________________________________________________
None
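
The parameter counts can be verified by hand (a worked check, not part of the original run):

In [ ]:
# Embedding: vocab_size * 100             -> 23453 * 100       = 2,345,300
# Conv1D:    kernel * channels * filters
#            + filter biases              -> 8 * 100 * 32 + 32 = 25,632
#            output length 1277 - 8 + 1 = 1270; pooled 1270 // 2 = 635
# Flatten:   635 * 32 = 20320 units
# Dense(10): 20320 * 10 + 10              -> 203,210
# Dense(1):  10 * 1 + 1                   -> 11
print(23453 * 100 + 8 * 100 * 32 + 32 + 20320 * 10 + 10 + 10 * 1 + 1)
# 2574153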

In [22]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate model
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))


Epoch 1/10
19s - loss: 0.6883 - acc: 0.5350
Epoch 2/10
18s - loss: 0.5176 - acc: 0.7794
Epoch 3/10
18s - loss: 0.1135 - acc: 0.9606
Epoch 4/10
17s - loss: 0.0115 - acc: 1.0000
Epoch 5/10
17s - loss: 0.0031 - acc: 0.9994
Epoch 6/10
16s - loss: 0.0016 - acc: 1.0000
Epoch 7/10
17s - loss: 0.0011 - acc: 1.0000
Epoch 8/10
17s - loss: 8.0485e-04 - acc: 1.0000
Epoch 9/10
17s - loss: 6.1696e-04 - acc: 1.0000
Epoch 10/10
18s - loss: 4.9296e-04 - acc: 1.0000
Test Accuracy: 84.500000
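
Finally, a minimal sketch (not in the original notebook) of scoring a brand-new review with the trained pipeline; predict_review is a hypothetical helper named here for illustration. The raw text must pass through the same cleaning, vocabulary filtering, tokenizer encoding, and padding as the training data:

In [ ]:
def predict_review(raw_text):
    toks = raw_text.split()
    toks = [w.strip(string.punctuation) for w in toks]
    toks = [w for w in toks if w.isalpha() and w not in stopwords]
    toks = [stemmer.stem(w) for w in toks if len(w) >= 3]
    doc = ' '.join(w for w in toks if w in vocabset)   # drop out-of-vocabulary stems
    padded = pad_sequences(tokenizer.texts_to_sequences([doc]),
                           maxlen=max_length, padding='post')
    return model.predict(padded)[0, 0]                 # sigmoid output; > 0.5 -> positive

print(predict_review("one of the best films of the year , a moving and memorable story"))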