In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re


Using TensorFlow backend.

In [2]:
data = pd.read_csv('Sentiment.csv')
data = data[['text', 'sentiment']]
data.columns


Out[2]:
Index(['text', 'sentiment'], dtype='object')

In [3]:
data['text'] = data['text'].apply(lambda x: str(x))
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print(data[ data['sentiment'] == 'pos'].size)
print(data[ data['sentiment'] == 'cons'].size)

max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)


42004
39944

In [4]:
print(data)
data.iloc[[1]]['text']


                                                    text sentiment
0                                     stylish work great       pos
1      style speaker phone voice recorder twoway dire...       pos
2                                      fast high quality       pos
3                            high quality easy transfers       pos
4                fun completely customizable works great       pos
5                                     to good to be true       pos
6                            easy to use and lightweight       pos
7        little lag time between shots good battery life       pos
8                          fairly easy set up and to use       pos
9                          price dual cartridges quality       pos
10                        great pictures lots of storage       pos
11                  screen looks reception voice quality       pos
12     print results are very good for a 100 model  w...       pos
13                       efficient and quiet inexpensive       pos
14     works great as a standalone copier  but the fu...       pos
15             size sound internet ready signal strength       pos
16                   easy to use images are nice quality       pos
17      its unbelievably fast quiet and simple to set up       pos
18     speed speed speed and not at the cost of quali...       pos
19     excellent price solid quality output flexible ...       pos
20                10x zoom a lot of pic effects and more       pos
21     versatile with good features for a realworld o...       pos
22     long battery life and excellent call reliabili...       pos
23      instant pictures to share with family on the www       pos
24                       usb connectivity easy operation       pos
25                     good camera lots of features cute       pos
26                    compact highresolution easy to use       pos
27     sleek thin design  big bright display  perhaps...       pos
28     small lightweight great reception clear speake...       pos
29     fast reliable easeofuse  ink cartridges last a...       pos
...                                                  ...       ...
40944                      weird flash bad with distance      cons
40945    awful battery life clumsy software full of bugs      cons
40946  noisy uses ink fast unable to find ink replace...      cons
40947  digital zoom 8mb memory stick enter button usb...      cons
40948             middle of the road reception bad shape      cons
40949   watch that battery strength if you use bluetooth      cons
40950                             inferior sound quality      cons
40951             noise feeding problems rough operation      cons
40952       accessories not included occasional lag time      cons
40953                   ink cartridge dont hold much ink      cons
40954                                 no flash on camera      cons
40955                  takes a little time to turn onoff      cons
40956                      no internet capabilities size      cons
40957                                 price optional bag      cons
40958  poor battery life prone to redeye when using f...      cons
40959  loves those batteries buy the nickelmetal hydr...      cons
40960  lcd screen auto shutoff 16mb card no discount ...      cons
40961       design isnt on par with other epson printers      cons
40962                             no bluetooth expensive      cons
40963  zoom delay before the picture is taken and bat...      cons
40964  battery life cost 49999 web not in color wont ...      cons
40965              only gsmfor c331g and can lose signal      cons
40966  sometimes hard to hear if you are in a noisy area      cons
40967                       slow low resolution printing      cons
40968                   battery cover seems rather loose      cons
40969                 minor annoyances for phone junkies      cons
40970                    no eyepiece only an lcd display      cons
40971                        inefficient and unreliable       cons
40972  flaky reception frequent phone and sim card fr...      cons
40973      too many menus button to view last shot taken      cons

[40974 rows x 2 columns]
Out[4]:
1    style speaker phone voice recorder twoway dire...
Name: text, dtype: object

In [5]:
embed_dim = 128
lstm_out = 196

max_features=2000
model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length=X.shape[1], dropout=0.2))
model.add(LSTM(lstm_out, dropout_U=0.2, dropout_W=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


/home/zitao/py3/lib/python3.5/site-packages/ipykernel_launcher.py:6: UserWarning: The `dropout` argument is no longer support in `Embedding`. You can apply a `keras.layers.SpatialDropout1D` layer right after the `Embedding` layer to get the same behavior.
  
/home/zitao/py3/lib/python3.5/site-packages/ipykernel_launcher.py:7: UserWarning: Update your `LSTM` call to the Keras 2 API: `LSTM(196, recurrent_dropout=0.2, dropout=0.2)`
  import sys
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 53, 128)           256000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
=================================================================
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None

In [6]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)


(27452, 53) (27452, 2)
(13522, 53) (13522, 2)

In [7]:
batch_size = 128
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)


Epoch 1/7
 - 9s - loss: 0.2641 - acc: 0.8880
Epoch 2/7
 - 9s - loss: 0.1588 - acc: 0.9391
Epoch 3/7
 - 9s - loss: 0.1431 - acc: 0.9449
Epoch 4/7
 - 9s - loss: 0.1303 - acc: 0.9490
Epoch 5/7
 - 9s - loss: 0.1201 - acc: 0.9531
Epoch 6/7
 - 9s - loss: 0.1095 - acc: 0.9575
Epoch 7/7
 - 9s - loss: 0.1030 - acc: 0.9583
Out[7]:
<keras.callbacks.History at 0x7f370524bf28>

In [8]:
validation_size = 10000

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))


3522/3522 [==============================] - 0s 106us/step
score: 0.18
acc: 0.93

In [9]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")


pos_acc 92.918961447679 %
neg_acc 93.28722538649309 %

In [11]:
model.save('lstm.h5')

In [ ]: