In [1]:
%pylab inline
import pandas as pd
from keras import layers, models, optimizers, losses, regularizers
from keras.preprocessing import sequence
import re
from collections import Counter
from sklearn.model_selection import train_test_split


Populating the interactive namespace from numpy and matplotlib
Using TensorFlow backend.

In [3]:
df_all = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None, 
                       names=['pol', 'id', 'date', 'lyx', 'user', 'txt']).append(
           pd.read_csv('testdata.manual.2009.06.14.csv', header=None, 
                       names=['pol', 'id', 'date', 'lyx', 'user', 'txt']))

In [4]:
# drop the neutral tweets (pol == 2, found only in the manually labelled test
# file) and convert the polarity to a boolean: 4 (positive) -> True
df_all = df_all.loc[df_all.pol != 2]
df_all.pol = (df_all.pol == 4)
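
A quick sanity check, not part of the original run: with the neutral class dropped, the labels should be roughly balanced.

In [ ]:
# fraction of positive tweets; the Sentiment140 training set is split 50/50
df_all.pol.mean()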

In [7]:
# train/test sizes, computed after the neutral tweets are dropped; the training
# file contains no neutral tweets, so only n_test shrinks
n_train = 1600000
n_test = df_all.shape[0] - n_train

In [5]:
date_re = re.compile(r'(?P<week_day>\w{3}) (?P<month>\w{3}) (?P<day>\d\d) (?P<hour>\d\d):(?P<minute>\d\d):(?P<second>\d\d) (?P<tz>\w{3}) (?P<year>\d{4})')

In [6]:
# pull the week day, day of month and hour out of the raw date string
df_all[['week_day', 'day', 'hour']] = \
    df_all.date.str.extract(date_re, expand=True).loc[:, ['week_day', 'day', 'hour']]

In [8]:
df_all = pd.get_dummies(df_all, columns=['week_day', 'day', 'hour'])

In [9]:
# Normalize tweets along the lines of the GloVe Twitter preprocessing rules,
# replacing URLs, mentions, hashtags, hearts, repeated punctuation and
# elongated words with special tokens; the trailing .str.lower() also
# lowercases the tokens to match the GloVe vocabulary (<url>, <user>, ...).
df_all.txt = (df_all.txt
              .str.replace(r'https?:\/\/\S+\b|www\.(\w+\.)+\S*', r' <URL> ')
              .str.replace(r'@\w*', r' <USER> ')
              .str.replace(r'#([A-Z0-9\-]+)', r' <HASHTAG> \1 <ALLCAPS> ')
              .str.replace(r'#(\S+)', lambda x: ' <HASHTAG> ' + ' '.join(re.split(r'(?=[A-Z])', x.group(1))))
              .str.replace(r'<3', r' <HEART> ')
              .str.replace(r'([!?.]){2,}', r' \1 <REPEAT> ')
              .str.replace(r'\b(\S*?)(.)\2{2,}\b', r' \1\2 <ELONG> ')
              .str.replace(r'\s([^a-z0-9()<>\'`\-]){2,}\s',
                           lambda x: x.group(0).lower() + '<ALLCAPS> ')).str.lower()
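
A quick way to eyeball the normalization (not part of the original run):

In [ ]:
# a few normalized tweets; URLs, mentions and hashtags should now be special tokens
for t in df_all.txt.head():
    print t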

In [10]:
df_test = df_all[n_train:].copy()
df_train = df_all[:n_train].copy()
del df_all

In [11]:
# shuffle the training rows (positional indexing; the train index happens to be
# the default RangeIndex, but .iloc is robust either way)
df_train = df_train.iloc[np.random.permutation(df_train.shape[0])]

In [12]:
split_re = re.compile(r'[^\w<>]')

In [13]:
# word frequencies over the whole normalized training corpus
words = Counter(split_re.split(df_train.txt.str.cat(sep=' ')))

In [14]:
# load the 200-dimensional GloVe Twitter vectors, skipping malformed lines
# (a valid line is a word followed by exactly 200 floats)
words_dict = {}
with open('glove.twitter.27B.200d.txt') as f:
    for line in f:
        text = line.decode('utf-8').split()
        if len(text) != 201:
            continue
        words_dict[text[0]] = np.array([float(x) for x in text[1:]], dtype='float32')

In [15]:
# index the GloVe words that occur at least 5 times in the training corpus;
# index 0 is reserved for padding
word_num = {word: i + 1 for i, word in enumerate(words_dict) if words[word] >= 5}

In [16]:
# embedding matrix: row i holds the GloVe vector of the word with index i;
# unused rows (padding and filtered rare words) stay zero
word_matrix = np.zeros((len(words_dict) + 2, 200))
for word, i in word_num.items():
    word_matrix[i] = words_dict[word]
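
Before moving on, it is worth checking how much of the frequent vocabulary the GloVe file actually covers (a sketch, not in the original run; the threshold of 5 mirrors the filter above):

In [ ]:
# fraction of words occurring >= 5 times in the corpus that have a GloVe vector
frequent = [w for w, c in words.items() if c >= 5]
print sum(w in words_dict for w in frequent) / float(len(frequent))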

In [17]:
def convert_txt(df):
    """Map each tweet to an array of word indices, dropping OOV words."""
    # membership in word_num already implies a corpus count >= 5
    tweets = []
    for tweet in df.txt:
        tweets.append(np.array([word_num[word]
                                for word in split_re.split(tweet)
                                if word in word_num]))
    return tweets

In [18]:
maxlen = 60
# pad (or truncate) every tweet to exactly maxlen indices, zero-padded at the end
X_train = sequence.pad_sequences(convert_txt(df_train), maxlen=maxlen, padding='post')

In [19]:
X_test = sequence.pad_sequences(convert_txt(df_test), maxlen=maxlen, padding='post')

In [20]:
y_test = df_test.pol

In [21]:
y_train = df_train.pol

In [22]:
# hold out 10% of the training data as a validation set
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, test_size=.1)

In [23]:
# average number of non-padding tokens per training tweet
np.mean(np.count_nonzero(X_train, axis=1))


Out[23]:
14.106870138888889
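
The average tweet uses about 14 of the 60 positions, so the cap rarely bites; a quick check (rows that fill all 60 positions had at least maxlen tokens before truncation):

In [ ]:
# fraction of training tweets at the 60-token cap
print (np.count_nonzero(X_train, axis=1) == maxlen).mean()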

In [24]:
tweet = layers.Input((maxlen,), dtype='int32')

In [25]:
# embedding layer initialized with the GloVe matrix and kept frozen
embedded = layers.Embedding(word_matrix.shape[0], 200, input_length=maxlen,
                            weights=[word_matrix], trainable=False)(tweet)

In [26]:
embedded_normalized = layers.BatchNormalization()(embedded)

In [27]:
# bidirectional LSTM over the embedded sequence, with input and recurrent dropout
lstm = layers.Bidirectional(layers.LSTM(150, dropout=.2, recurrent_dropout=.2))(embedded_normalized)

In [28]:
lstm_dropout = layers.Dropout(.5)(layers.BatchNormalization()(lstm))

In [29]:
result = layers.Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-3))(lstm_dropout)

In [30]:
model = models.Model(tweet, result)

In [31]:
model.compile(optimizer=optimizers.Adam(lr=1e-3), loss=losses.binary_crossentropy, metrics=['accuracy'])
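
Before training, a quick look at the architecture and parameter counts (not in the original run):

In [ ]:
model.summary()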

In [45]:
model.fit(X_train, y_train, epochs=20, batch_size=256, 
          validation_data=(X_cv, y_cv))


Train on 1440000 samples, validate on 160000 samples
Epoch 1/20
1440000/1440000 [==============================] - 1215s - loss: 0.3590 - acc: 0.8415 - val_loss: 0.3339 - val_acc: 0.8546
Epoch 2/20
1440000/1440000 [==============================] - 1213s - loss: 0.3586 - acc: 0.8417 - val_loss: 0.3361 - val_acc: 0.8533
Epoch 3/20
1440000/1440000 [==============================] - 1200s - loss: 0.3578 - acc: 0.8421 - val_loss: 0.3389 - val_acc: 0.8522
Epoch 4/20
1440000/1440000 [==============================] - 1196s - loss: 0.3573 - acc: 0.8424 - val_loss: 0.3401 - val_acc: 0.8518
Epoch 5/20
   6144/1440000 [..............................] - ETA: 1165s - loss: 0.3608 - acc: 0.8381

KeyboardInterrupt Traceback (most recent call last)
<ipython-input-45-b50908686898> in <module>()
      1 model.fit(X_train, y_train, epochs=20, batch_size=256, 
----> 2           validation_data=(X_cv, y_cv))

KeyboardInterrupt: 
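
Validation loss rises from the first epoch onward, so interrupting manually is reasonable; an EarlyStopping callback automates the same decision. A minimal sketch (the patience of 2 epochs is an assumption, not from the original run):

In [ ]:
from keras.callbacks import EarlyStopping
# stop once val_loss has failed to improve for 2 consecutive epochs
model.fit(X_train, y_train, epochs=20, batch_size=256,
          validation_data=(X_cv, y_cv),
          callbacks=[EarlyStopping(monitor='val_loss', patience=2, verbose=1)])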

In [33]:
model.load_weights('lstm-glove.model')

In [ ]:
import keras
# ratio of trainable parameters to training examples (a rough capacity check)
sum([np.prod(keras.backend.get_value(w).shape) for w in model.trainable_weights]) / float(X_train.shape[0])

In [34]:
model.history.history


Out[34]:
{}

In [46]:
y_pred = model.predict(X_test, batch_size=256, verbose=1)


359/359 [==============================] - 0s     

In [47]:
X_test[0]


Out[47]:
array([  14165,  847053, 1050169,  464094,  330482,  677694,  733354,
        889739,  992092, 1050169,  626053, 1095865,   32850,  889739,
        626053,  828915,  626066,  729819,  484095,  545272,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0,       0,       0,       0,
             0,       0,       0,       0], dtype=int32)

In [48]:
from sklearn import metrics

In [49]:
metrics.accuracy_score(y_test, y_pred > .5)


Out[49]:
0.84679665738161558

In [50]:
metrics.roc_auc_score(y_test, y_pred)


Out[50]:
0.92621220587322273
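
For a fuller picture than accuracy and AUC alone (a sketch; the 0.5 threshold matches the accuracy computation above):

In [ ]:
print metrics.confusion_matrix(y_test, y_pred.ravel() > .5)
print metrics.classification_report(y_test, y_pred.ravel() > .5)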

In [44]:
# this cell was executed earlier (In [44] predates the interrupted fit above),
# hence the slightly different scores from Out[49]/Out[50]
y_pred = model.predict(X_test, batch_size=256)
print metrics.roc_auc_score(y_test, y_pred)
print metrics.accuracy_score(y_test, y_pred > .5)


0.930371887999
0.844011142061

In [51]:
model.save_weights('lstm-glove.model.new')

In [64]:
df_train.shape


Out[64]:
(1600000, 67)

In [65]:
X_train.shape


Out[65]:
(1440000, 60)

In [66]:
X_cv.shape


Out[66]:
(160000, 60)

In [ ]: