In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import re
from collections import Counter

from keras import layers, models, optimizers, losses, regularizers
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
In [3]:
# The Sentiment140 CSVs are latin-1 encoded; concatenate train and test.
cols = ['pol', 'id', 'date', 'lyx', 'user', 'txt']
df_all = pd.read_csv('training.1600000.processed.noemoticon.csv', header=None,
                     names=cols, encoding='latin-1').append(
    pd.read_csv('testdata.manual.2009.06.14.csv', header=None,
                names=cols, encoding='latin-1'))
In [4]:
# Drop neutral tweets (polarity 2, present only in the manually labelled
# test file) and binarize polarity: 4 (positive) -> True, 0 (negative) -> False.
df_all = df_all.loc[df_all.pol != 2]
df_all.pol = (df_all.pol == 4)
In [7]:
# Split sizes: the first 1.6M rows are the training set; the rest
# (minus the dropped neutral tweets) are the test set.
n_train = 1600000
n_test = df_all.shape[0] - n_train
In [5]:
date_re = re.compile(r'(?P<week_day>\w{3}) (?P<month>\w{3}) (?P<day>\d\d) (?P<hour>\d\d):(?P<minute>\d\d):(?P<second>\d\d) (?P<tz>\w{3}) (?P<year>\d{4})')
In [6]:
df_all[['week_day', 'day', 'hour']] = \
    df_all.date.str.extract(date_re, expand=True).loc[:, ['week_day', 'day', 'hour']]
In [8]:
# One-hot encode the date features (these columns are not actually fed to
# the LSTM below, which uses the tweet text only).
df_all = pd.get_dummies(df_all, columns=['week_day', 'day', 'hour'])
In [9]:
# GloVe-Twitter-style normalization: replace URLs, mentions, hashtags,
# hearts, repeated punctuation, elongated words and all-caps runs with
# special tokens, then lowercase everything so the tokens match the
# lowercase <url>, <user>, ... entries in the GloVe vocabulary.
df_all.txt = (df_all.txt
    .str.replace(r'https?:\/\/\S+\b|www\.(\w+\.)+\S*', r' <URL> ')
    .str.replace(r'@\w*', r' <USER> ')
    .str.replace(r'#([A-Z0-9\-]+)', r' <HASHTAG> \1 <ALLCAPS> ')
    .str.replace(r'#(\S+)', lambda x: ' <HASHTAG> ' + ' '.join(re.split(r'(?=[A-Z])', x.group(1))))
    .str.replace(r'<3', r' <HEART> ')
    .str.replace(r'([!?.]){2,}', r' \1 <REPEAT> ')
    .str.replace(r'\b(\S*?)(.)\2{2,}\b', r' \1\2 <ELONG> ')
    .str.replace(r'\s([^a-z0-9()<>\'`\-]){2,}\s',
                 lambda x: x.group(0).lower() + '<ALLCAPS> ')
    .str.lower())
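As a quick sanity check, the repeat and elongation rules can be exercised on a made-up string (the sample text below is invented, not taken from the dataset):

In [ ]:
s = 'soooo cool!!!'
s = re.sub(r'([!?.]){2,}', r' \1 <REPEAT> ', s)           # collapse repeated punctuation
s = re.sub(r'\b(\S*?)(.)\2{2,}\b', r' \1\2 <ELONG> ', s)  # collapse elongated words
print(s)  # something like ' so <ELONG>  cool ! <REPEAT> '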
In [10]:
df_test = df_all[n_train:].copy()
df_train = df_all[:n_train].copy()
del df_all
In [11]:
# Shuffle the training rows (positional indexing, so it does not depend on index labels).
df_train = df_train.iloc[np.random.permutation(df_train.shape[0])]
In [12]:
# Tokenize on anything that is not a word character or '<'/'>', so the
# <url>-style special tokens survive as single words.
split_re = re.compile(r'[^\w<>]')
In [13]:
words = Counter(split_re.split(df_train.txt.str.cat(sep=' ')))
In [14]:
# Load the 200-dimensional GloVe Twitter vectors into a dict.
words_dict = {}
with open('glove.twitter.27B.200d.txt', encoding='utf-8') as f:
    for line in f:
        text = line.split()
        if len(text) != 201:  # skip the few malformed lines
            continue
        words_dict[text[0]] = np.array([float(x) for x in text[1:]], dtype='float32')
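The loaded vectors can be sanity-checked with a quick cosine similarity (a sketch; it assumes 'good' and 'great' are in the GloVe vocabulary, which holds for the 27B Twitter release):

In [ ]:
def cos_sim(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

cos_sim(words_dict['good'], words_dict['great'])  # should be clearly positive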
In [15]:
# Keep only words that occur at least 5 times in the training corpus and
# have a GloVe vector; index 0 is reserved for padding.
word_num = {word: i + 1
            for i, word in enumerate(w for w in words_dict if words[w] >= 5)}
In [16]:
# Embedding matrix: row i holds the vector of the word with index i;
# row 0 (padding) stays all-zero. Sizing it to the kept vocabulary avoids
# a mostly-empty matrix spanning the full 1.2M-word GloVe vocabulary.
word_matrix = np.zeros((len(word_num) + 1, 200))
for word, i in word_num.items():
    word_matrix[i] = words_dict[word]
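It is worth checking how much of the frequent corpus vocabulary the pretrained vectors actually cover (a rough sketch):

In [ ]:
frequent = [w for w in words if words[w] >= 5]
len(word_num) / len(frequent)  # fraction of frequent corpus words with a GloVe vector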
In [17]:
def convert_txt(df):
    """Map each tweet to an array of word indices, dropping OOV words.

    The frequency filter is already baked into word_num, so membership
    is the only check needed."""
    tweets = []
    for tweet in df.txt:
        tweets.append(np.array([word_num[word]
                                for word in split_re.split(tweet)
                                if word in word_num]))
    return tweets
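A quick spot check on a few rows (the exact indices depend on the shuffle and the vocabulary built above):

In [ ]:
convert_txt(df_train.head(3))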
In [18]:
# Pad (or truncate) every tweet to exactly maxlen tokens; 0 is the pad value.
maxlen = 60
X_train = sequence.pad_sequences(convert_txt(df_train), maxlen=maxlen, padding='post')
In [19]:
X_test = sequence.pad_sequences(convert_txt(df_test), maxlen=maxlen, padding='post')
In [20]:
y_test = df_test.pol
In [21]:
y_train = df_train.pol
In [22]:
# Hold out 10% of the training data as a validation set.
X_train, X_cv, y_train, y_cv = train_test_split(
    X_train, y_train, test_size=.1)
In [23]:
# Average number of non-padding tokens per tweet.
np.mean(np.count_nonzero(X_train, axis=1))
In [24]:
tweet = layers.Input((maxlen,), dtype='int32')
In [25]:
# Frozen embedding layer initialized with the pretrained GloVe matrix.
embedded = layers.Embedding(word_matrix.shape[0], 200, input_length=maxlen,
                            weights=[word_matrix], trainable=False)(tweet)
In [26]:
embedded_normalized = layers.BatchNormalization()(embedded)
In [27]:
# Bidirectional LSTM over the embedded sequence, with input and recurrent dropout.
lstm = layers.Bidirectional(layers.LSTM(150, dropout=.2, recurrent_dropout=.2))(embedded_normalized)
In [28]:
lstm_dropout = layers.Dropout(.5)(layers.BatchNormalization()(lstm))
In [29]:
result = layers.Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(1e-3))(lstm_dropout)
In [30]:
model = models.Model(tweet, result)
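Printing the architecture makes the layer shapes and parameter counts easy to verify:

In [ ]:
model.summary()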
In [31]:
model.compile(optimizer=optimizers.Adam(lr=1e-3), loss=losses.binary_crossentropy, metrics=['accuracy'])
In [45]:
model.fit(X_train, y_train, epochs=20, batch_size=256,
validation_data=(X_cv, y_cv))
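Twenty epochs can overfit; one common refinement (not used in the run above) is to stop training once validation loss stops improving:

In [ ]:
from keras.callbacks import EarlyStopping

# A sketch: give up after two epochs without improvement on val_loss.
model.fit(X_train, y_train, epochs=20, batch_size=256,
          validation_data=(X_cv, y_cv),
          callbacks=[EarlyStopping(monitor='val_loss', patience=2)])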
In [33]:
# Restore weights saved from an earlier training run.
model.load_weights('lstm-glove.model')
In [ ]:
import keras
# Ratio of trainable parameters to training examples -- a rough capacity check.
sum(np.prod(keras.backend.get_value(w).shape) for w in model.trainable_weights) / float(X_train.shape[0])
In [34]:
# Per-epoch losses and metrics recorded by the last fit() call.
model.history.history
In [46]:
y_pred = model.predict(X_test, batch_size=256, verbose=1)
In [47]:
X_test[0]
In [48]:
from sklearn import metrics
In [49]:
# Accuracy at a 0.5 decision threshold.
metrics.accuracy_score(y_test, y_pred > .5)
In [50]:
metrics.roc_auc_score(y_test, y_pred)
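A confusion matrix gives a fuller picture than accuracy alone (a sketch, using the same 0.5 threshold):

In [ ]:
# Rows are true classes, columns are predicted classes.
metrics.confusion_matrix(y_test, y_pred.ravel() > .5)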
In [44]:
y_pred = model.predict(X_test, batch_size=256)
print(metrics.roc_auc_score(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred > .5))
In [51]:
# Save the retrained weights under a new name.
model.save_weights('lstm-glove.model.new')
In [64]:
df_train.shape
In [65]:
X_train.shape
In [66]:
X_cv.shape