In [1]:
# movie review dataset homepage: http://www.cs.cornell.edu/people/pabo/movie-review-data/
# Download the dataset used here: http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz

# In this notebook:
## Step 1 - Generate the vocabulary from the raw text data
## Step 2 - Basic data preprocessing + a convolutional neural network with word embeddings

In [ ]:
import nltk

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

In [76]:
# STEP 1 - Generate Vocabulary (word, count)

import string
from collections import Counter
from os import listdir

neg_reviews_folder = "review_polarity/txt_sentoken/neg/"
pos_reviews_folder = "review_polarity/txt_sentoken/pos/"

# convert each raw text file into a cleaned token list
def clean_text(raw_text_path):
    with open(raw_text_path) as text_in:
        raw_text = text_in.read()
    tokens = raw_text.split()
    # strip surrounding punctuation
    tokens = [w.strip(string.punctuation) for w in tokens]
    # keep only alphabetic tokens that are not stop words
    tokens = [w for w in tokens if w.isalpha() and w not in stopwords]
    # remove short words
    tokens = [w for w in tokens if len(w) >= 3]
    return tokens

vocabCounter = Counter()  # the vocabulary is built from training-data tokens only
training_neg_docs, testing_neg_docs = [], []
training_pos_docs, testing_pos_docs = [], []

for f_name in listdir(neg_reviews_folder):
    if f_name.startswith('cv7'):  # files starting with 'cv7' are held out as testing data
        testing_neg_docs.append(clean_text(neg_reviews_folder + f_name))
    else:
        tks = clean_text(neg_reviews_folder + f_name)
        training_neg_docs.append(' '.join(tks))  # join into a single string for texts_to_sequences() later
        vocabCounter.update(tks)
           
for f_name in listdir(pos_reviews_folder):
    if f_name.startswith('cv9'):  # files starting with 'cv9' are held out as testing data
        testing_pos_docs.append(clean_text(pos_reviews_folder + f_name))
    else:
        tks = clean_text(pos_reviews_folder + f_name)
        training_pos_docs.append(' '.join(tks))  # join into a single string for texts_to_sequences() later
        vocabCounter.update(tks)
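
A quick sanity check of the cleaning pipeline, applied to an in-memory string instead of a file (a minimal sketch; the review text is made up, and the dataset itself is already lowercased):

In [ ]:
# sketch: the same cleaning steps as clean_text(), on a hypothetical example string
sample = "This movie wasn't great -- the plot, frankly, is a mess!"
toks = [w.strip(string.punctuation) for w in sample.split()]
toks = [w for w in toks if w.isalpha() and w not in stopwords]
toks = [w for w in toks if len(w) >= 3]
print(toks)  # expected: ['This', 'movie', 'great', 'plot', 'frankly', 'mess']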

In [78]:
print(len(vocabCounter))
print(vocabCounter.most_common(79))


35969
[('film', 7916), ('one', 4980), ('movie', 4794), ('like', 3216), ('even', 2286), ('good', 2078), ('time', 2048), ('story', 1903), ('would', 1833), ('much', 1820), ('also', 1751), ('get', 1719), ('character', 1715), ('characters', 1661), ('two', 1637), ('first', 1603), ('see', 1542), ('way', 1495), ('well', 1474), ('make', 1424), ('really', 1397), ('films', 1338), ('little', 1337), ('life', 1331), ('plot', 1291), ('people', 1258), ('never', 1239), ('scene', 1236), ('could', 1226), ('bad', 1226), ('best', 1173), ('man', 1155), ('new', 1145), ('many', 1131), ('scenes', 1129), ('know', 1084), ('movies', 1036), ('great', 1024), ('love', 991), ('another', 978), ('action', 946), ('still', 940), ('seems', 939), ('end', 937), ('director', 936), ('something', 934), ('back', 925), ('made', 918), ('work', 915), ('makes', 882), ('big', 879), ('however', 878), ('world', 859), ('every', 854), ('though', 849), ('seen', 819), ('around', 811), ('better', 805), ('take', 803), ('performance', 800), ('enough', 797), ('role', 793), ('audience', 783), ('gets', 782), ('may', 776), ('real', 770), ('going', 768), ('years', 765), ('think', 754), ('last', 753), ('things', 750), ('funny', 749), ('look', 743), ('actually', 741), ('although', 729), ('comedy', 725), ('almost', 724), ('played', 721), ('thing', 718)]

In [79]:
print(vocabCounter.most_common()[:-79:-1])  # the least common tokens


[('emery', 1), ('upholds', 1), ('contenda', 1), ('moguls', 1), ('figueras', 1), ('boos', 1), ('counteract', 1), ('bombarding', 1), ('heralding', 1), ('tilvern', 1), ('tab', 1), ('jaffa', 1), ('volatility', 1), ('cairo', 1), ('brushstrokes', 1), ('hypnotism', 1), ('grapevine', 1), ('hjejle', 1), ('registration', 1), ('argumentative', 1), ('fudd', 1), ('stereotypically', 1), ('mith', 1), ('uneffective', 1), ('lear', 1), ('pheneomena', 1), ('obscurities', 1), ('spoofy', 1), ('holley', 1), ('holler', 1), ('nigga', 1), ('hitchhike', 1), ('bedfellows', 1), ('baffels', 1), ('thereus', 1), ('unsigned', 1), ('heisting', 1), ('disturbance', 1), ('temerity', 1), ('danced', 1), ('wren', 1), ('breaker', 1), ('equidistant', 1), ('theses', 1), ('ironsides', 1), ('tristine', 1), ('comaprison', 1), ('tactful', 1), ('mambos', 1), ('elbows', 1), ('christmastown', 1), ('sympathizers', 1), ('shapeshifting', 1), ('merciful', 1), ('actionless', 1), ('decter', 1), ('glistens', 1), ('wile', 1), ('pornogrpahy', 1), ('divinely', 1), ('undertsanding', 1), ('vanilla', 1), ('wilt', 1), ('readied', 1), ('dearth', 1), ('bandies', 1), ('thrusting', 1), ('butchery', 1), ('takehiro', 1), ('playstation', 1), ('butchers', 1), ('ashmore', 1), ('validating', 1), ('torments', 1), ('centrestage', 1), ('broadside', 1), ('direcing', 1), ('cellulars', 1)]

In [80]:
# remove tokens with low occurrence (keep only words seen at least twice)

vocab_tokens = [w for w, c in vocabCounter.items() if c >= 2]
print(len(vocab_tokens))

vocabset = set(vocab_tokens)


23201
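
Over a third of the vocabulary consists of words that appear only once; the min-count filter above drops exactly those (a quick check):

In [ ]:
# how many words the min-count filter removes
singletons = sum(1 for w, c in vocabCounter.items() if c < 2)
print(singletons)                              # 35969 - 23201 = 12768
print(len(vocabCounter) - len(vocab_tokens))   # same number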

In [81]:
# For testing documents, only keep tokens that are in the vocabulary
cleaned_testing_neg_docs = []
cleaned_testing_pos_docs = []

for d in testing_neg_docs:
    cleaned_testing_neg_docs.append(' '.join([w for w in d if w in vocabset]))

for d in testing_pos_docs:
    cleaned_testing_pos_docs.append(' '.join([w for w in d if w in vocabset]))

In [82]:
# STEP 2 - Neural Network with word embedding

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

np.random.seed(410)

In [ ]:
tokenizer = Tokenizer()

In [84]:
# A quick demonstration of how the Tokenizer's texts_to_sequences() works:
# the same word appearing in different documents gets the same integer index,
# but the match is exact (string equality), so spelling variants get different indices.

tokenizer.fit_on_texts(['I love Emmanuel', 'I love ice-cream', 'love I'])
encoded_docs = tokenizer.texts_to_sequences(['I love Emmanuel', 'I love ice-cream', 'love I'])
encoded_docs


Out[84]:
[[1, 2, 6], [1, 2, 7, 8], [2, 1]]
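
To see the underlying word-to-index mapping, one can inspect the fitted tokenizer directly (the exact indices depend on everything fit_on_texts() has seen so far in the session):

In [ ]:
# inspect the word -> integer index mapping and the accumulated word counts
print(tokenizer.word_index)   # e.g. {'i': 1, 'love': 2, ...}
print(tokenizer.word_counts)  # raw counts across all fit_on_texts() calls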

In [85]:
training_docs = training_neg_docs + training_pos_docs
testing_docs = cleaned_testing_neg_docs + cleaned_testing_pos_docs

tokenizer.fit_on_texts(training_docs)  # the tokenizer is fit on the training data only

In [86]:
training_docs[7]


Out[86]:
'main problem martin pet project thin line love hate like fatal attraction variation protagonist man character irresponsible jerk case seem anything except justify actions especially case darnell wright one macho guys women lined mile long think condone male philosophy one heterosexual males lucky enough get hands beautiful kind girl treat like princess respect darnell think like sleeps girls dumps period film discovered newest target beautiful wealthy brandi web played nicely whitfield runs successful real estate business kill dumping childdhood friend mia king found thinking supposed feel sorry guy largely fault lining women think enough common sense think would backfire one day mother says get sense irresponsible also pretty dumb especially get sense dumb brandi tells killed husband allegedly abusing put pants two seconds brandi psycho bitch hard agree says darnell finale guys like treating like garbage one final flaw though letting brandi fall darnell begin brandi classy intelligent woman mba harvard initally resists immature play calls woman like would real life sudden says yes listen four letter word vocabulary watch actions wonder intelligent woman would fall guy like darnell period much less unhealthy obsession lawrence good wants order prove needs let people write direct movies look movie four writers result long two hours easily worked ninety minutes lot subplots characters appear disappear quickly came makes think writers get along necessary bright spots though whitfield regina king della reese bobby brown lazily written parts help lot funny parts like example scene reese attempts fight whitfeld harrassing son damaging property enough sustain film'

In [89]:
## TRAINING DATA

# encode as integer sequences
training_encoded_seq = tokenizer.texts_to_sequences(training_docs)
print('Encoded Sequence: ', training_encoded_seq[7])

# pad sequences
max_length = max([len(s.split()) for s in training_docs])  # max number of tokens in any training document
Xtrain = pad_sequences(training_encoded_seq, maxlen=max_length, padding='post')
print(Xtrain[7])
print('Xtrain Shape: ', Xtrain.shape)

ytrain = np.array([0 for _ in range(900)] + [1 for _ in range(900)])  # first 900 are negative, last 900 are positive
print(ytrain[4:10])
print('ytrain Shape: ', ytrain.shape)


Encoded Sequence:  [223, 221, 718, 3118, 596, 965, 216, 39, 872, 4, 2953, 2877, 4793, 1568, 32, 13, 11563, 4253, 193, 127, 115, 500, 4254, 1252, 181, 193, 10426, 4255, 2, 3380, 376, 381, 15110, 2487, 88, 69, 11564, 1160, 3033, 2, 7132, 5462, 1700, 61, 12, 774, 325, 144, 218, 1623, 4, 1254, 1412, 10426, 69, 4, 6362, 468, 11565, 975, 1, 2112, 4256, 1434, 325, 1969, 10427, 4414, 78, 1178, 18078, 552, 623, 66, 5463, 600, 343, 6030, 23242, 242, 4995, 424, 224, 676, 288, 174, 1748, 123, 1894, 1622, 13049, 381, 69, 61, 880, 134, 69, 9, 13050, 2, 124, 241, 389, 12, 134, 11563, 11, 152, 741, 181, 12, 134, 741, 10427, 398, 465, 472, 7133, 18079, 207, 3955, 15, 1647, 10427, 2113, 4609, 165, 1935, 389, 10426, 1379, 376, 4, 6363, 4, 2554, 2, 234, 2006, 55, 2488, 10427, 444, 10426, 584, 10427, 8721, 636, 147, 23243, 7574, 23244, 13051, 8722, 178, 1144, 147, 4, 9, 66, 24, 2076, 389, 392, 2805, 384, 2878, 495, 9505, 128, 1252, 432, 636, 147, 9, 444, 123, 4, 10426, 975, 10, 177, 10428, 2609, 1970, 6, 171, 346, 1145, 473, 328, 26, 941, 1723, 37, 73, 3, 384, 1015, 423, 88, 15, 437, 533, 989, 6031, 111, 102, 1864, 14, 748, 4415, 403, 581, 50, 69, 1015, 12, 150, 1295, 1255, 2382, 55, 18078, 13052, 424, 15111, 2618, 1816, 963, 23245, 250, 550, 140, 102, 72, 550, 4, 345, 28, 2618, 476, 295, 23246, 18080, 298, 15112, 4110, 61, 3956, 1]
[223 221 718 ...,   0   0   0]
Xtrain Shape:  (1800, 1277)
[0 0 0 0 0 0]
ytrain Shape:  (1800,)
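
pad_sequences() with padding='post' appends zeros on the right until every sequence reaches maxlen; a minimal sketch with made-up sequences:

In [ ]:
# sketch: post-padding variable-length integer sequences to a fixed length
demo = pad_sequences([[5, 3, 8], [2, 9]], maxlen=6, padding='post')
print(demo)
# expected:
# [[5 3 8 0 0 0]
#  [2 9 0 0 0 0]]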

In [91]:
## TESTING DATA

# encode as integer sequences
testing_encoded_seq = tokenizer.texts_to_sequences(testing_docs)
print('Encoded Sequence: ', testing_encoded_seq[9])

# pad sequences
Xtest = pad_sequences(testing_encoded_seq, maxlen=max_length, padding='post')  # reuse the training max_length here
print(Xtest[9])
print('Xtest Shape: ', Xtest.shape)

ytest = np.array([0 for _ in range(100)] + [1 for _ in range(100)])  # first 100 are negative, last 100 are positive
print(ytest[4:10])
print('ytest Shape: ', ytest.shape)


Encoded Sequence:  [5080, 2207, 31, 1, 1014, 122, 231, 11, 16, 1, 56, 40, 2, 2311, 37, 1149, 2723, 40, 2, 2311, 37, 1149, 2723, 1568, 3851, 2579, 51, 302, 44, 10483, 2323, 38, 3594, 3549, 175, 232, 268, 2579, 337, 1183, 2749, 302, 7157, 7376, 448, 355, 92, 196, 426, 6461, 7234, 4867, 810, 16422, 159, 247, 90, 5080, 1921, 337, 21, 1539, 12, 9774, 963, 3890, 12, 232, 101, 170, 83, 244, 86, 810, 16422, 2090, 1453, 1, 2157, 1267, 168, 232, 6363, 5616, 1921, 1678, 253, 402, 2977, 2, 28, 909, 4126, 4279, 9086, 810, 16422, 51, 1183, 215, 110, 6832, 6984, 6463, 10001, 4914, 92, 275, 413, 986, 111, 3090, 70, 88, 226, 344, 564, 1602, 700, 1068, 839, 2755, 40, 218, 2981, 6127, 3015, 215, 2686, 2392, 2775, 7932, 1441, 1, 1453, 3491, 17096, 337, 1427, 302, 2081, 92, 2579, 1771, 3436, 4, 1487, 3678, 1574, 62, 29, 123, 420, 126, 2579, 2361, 3332, 9774, 4225, 1183, 234, 302, 3350, 5080, 2207, 93, 3075, 1044, 54, 1931, 281, 3444, 27, 330, 1456, 167, 662, 1154, 91, 3552, 309, 30, 21, 69, 1266, 1903, 8, 1142, 38, 497, 21898, 5711, 83, 850, 2, 769, 344, 1, 23, 484, 14, 74, 101, 254, 92, 78, 426, 6461, 7234, 4804, 152, 10, 2331, 96, 1640, 2346, 156, 1301, 426, 6461, 7234, 86, 21024, 353, 397, 140, 42, 254, 1974, 409, 1501, 83, 850, 8, 12598, 57, 92, 895, 965, 309, 393, 307, 7, 12624, 2965, 219, 1331, 1795, 17, 180, 5747, 1091, 517, 5800, 903, 3137, 630, 92, 564, 654, 3050, 967, 11, 1943, 23, 1996, 86, 1602, 700, 69, 56, 276, 22, 156, 13, 250, 364, 1025, 700, 1458, 13, 151, 7483, 564, 133, 402, 94, 147, 802, 6594, 29, 289, 8611, 88, 61, 30, 881, 2042, 1074, 14, 156, 1508, 420, 86, 121, 178, 3898, 232, 1527, 82, 1921, 248, 521, 11, 10860, 122, 3665, 190, 141, 974, 1921, 1611, 3890, 92, 142, 98, 241, 1013, 6077, 51, 374, 472, 7483, 813, 261, 1771, 3436, 90, 1774, 4130, 2579, 13, 82, 20, 6783, 62, 1070, 5080, 2207, 1414, 911, 347, 284, 2, 141, 180, 5747, 847, 12, 454, 35, 263, 1921, 1757, 1574, 34, 2311, 22, 444, 1146, 626, 455, 355, 214, 1, 306, 48, 127, 1880, 355, 106, 102, 5024, 86, 93, 645, 127, 20948, 158, 21, 306, 40, 2311, 1, 254, 2, 659, 4, 146, 3721, 550, 10, 58, 593, 22, 5080, 2207, 629, 16, 2, 52, 1128, 5681, 28, 2, 14, 2341, 11747, 4798, 747, 550]
[5080 2207   31 ...,    0    0    0]
Xtest Shape:  (200, 1277)
[0 0 0 0 0 0]
ytest Shape:  (200,)

In [92]:
vocab_size = len(tokenizer.word_index) + 1  # +1 because word indices start at 1; index 0 is reserved for padding
print(vocab_size)
print(max_length)


35970
1277

In [93]:
# build the NN model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))  # 100-dimensional embedding vectors
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_9 (Embedding)      (None, 1277, 100)         3597000   
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 1270, 32)          25632     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 635, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20320)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                203210    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
=================================================================
Total params: 3,825,853
Trainable params: 3,825,853
Non-trainable params: 0
_________________________________________________________________
None
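
The parameter counts in the summary can be verified by hand from the layer shapes above:

In [ ]:
# sanity-check the parameter counts reported by model.summary()
emb_params    = 35970 * 100           # vocab_size * embedding_dim                       = 3,597,000
conv_params   = (8 * 100 + 1) * 32    # (kernel_size * input_channels + bias) * filters  = 25,632
dense1_params = 635 * 32 * 10 + 10    # flattened pooled features * units + bias         = 203,210
dense2_params = 10 * 1 + 1            # units_in * units_out + bias                      = 11
print(emb_params + conv_params + dense1_params + dense2_params)  # 3,825,853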

In [94]:
# compile network
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
# evaluate model
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))


Epoch 1/10
23s - loss: 0.6902 - acc: 0.5183
Epoch 2/10
23s - loss: 0.4866 - acc: 0.7989
Epoch 3/10
19s - loss: 0.0629 - acc: 0.9828
Epoch 4/10
19s - loss: 0.0054 - acc: 1.0000
Epoch 5/10
19s - loss: 0.0020 - acc: 1.0000
Epoch 6/10
19s - loss: 0.0013 - acc: 1.0000
Epoch 7/10
20s - loss: 9.2170e-04 - acc: 1.0000
Epoch 8/10
19s - loss: 7.0686e-04 - acc: 1.0000
Epoch 9/10
19s - loss: 5.5977e-04 - acc: 1.0000
Epoch 10/10
19s - loss: 4.1090e-04 - acc: 1.0000
Test Accuracy: 84.000000
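
Once trained, the model can score a new review by passing it through the same cleaning, encoding, and padding pipeline. A minimal sketch, where clean_raw_string is a hypothetical helper mirroring clean_text() and the review text is made up:

In [ ]:
# sketch: score a new (hypothetical) review with the trained model
def clean_raw_string(raw_text):
    tokens = [w.strip(string.punctuation) for w in raw_text.split()]
    tokens = [w for w in tokens if w.isalpha() and w not in stopwords]
    return ' '.join([w for w in tokens if len(w) >= 3 and w in vocabset])

new_review = "a surprisingly warm and funny film with great performances"
seq = tokenizer.texts_to_sequences([clean_raw_string(new_review)])
padded = pad_sequences(seq, maxlen=max_length, padding='post')
score = model.predict(padded)[0][0]  # sigmoid output: closer to 1 means more positive
print('Positive probability: %f' % score)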