In [1]:
%matplotlib inline
from __future__ import print_function
import time, datetime
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Embedding, Dense, GRU, Dropout, Reshape, Merge, Bidirectional
from keras.callbacks import Callback, ModelCheckpoint
from sklearn.manifold import TSNE
In [2]:
! wget -q -O emoji_joined.txt https://raw.githubusercontent.com/uclmr/emoji2vec/master/data/raw_training_data/emoji_joined.txt
! wget -q http://nlp.stanford.edu/data/glove.6B.zip
! unzip -q -o glove.6B.zip
! rm -f glove.6B.zip glove.6B.50d.txt glove.6B.100d.txt glove.6B.200d.txt
In [3]:
GLOVE_FILE = 'glove.6B.300d.txt'
EMOJI_DESCRIPTIONS_FILE = 'emoji_joined.txt'
EMOJI_EMB_VIZ_FILE = 'emoji_emb_viz.csv'
MODEL_WEIGHTS_FILE = 'weights.h5'
EMOJI_EMBEDDINGS_FILE = 'emoji_embeddings.txt'
MAX_SEQUENCE_LENGTH = 15
MAX_NB_WORDS = 5000
MAX_NB_EMOJIS = 2000
EMBEDDING_DIM = 300
RNG_SEED_1 = 1446557
RNG_SEED_2 = 1337603
VALIDATION_SPLIT = 0.1
In [4]:
emoji_descriptions = pd.read_csv(EMOJI_DESCRIPTIONS_FILE,
                                 sep='\t',
                                 engine='python',
                                 encoding='utf_8',
                                 names=['description', 'emoji'])
print('Emoji descriptions: %d' % len(emoji_descriptions))
In [5]:
emoji_descriptions.head(5)
Out[5]:
In [6]:
neg_emoji_descriptions = pd.DataFrame({'emoji': emoji_descriptions['emoji'].values,
                                       'description': emoji_descriptions.sample(frac=1,
                                                                                random_state=RNG_SEED_1)['description'].values})
In [7]:
neg_emoji_descriptions.head(5)
Out[7]:
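Because the negative examples come from shuffling the description column, a few shuffled rows can land back on their original description by chance. A quick sanity check (a minimal sketch, assuming the two frames stay row-aligned as constructed above) counts those accidental positives:

collisions = (neg_emoji_descriptions['description'].values ==
              emoji_descriptions['description'].values).sum()
print('Negative pairs that still match the original description: %d' % collisions)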
In [8]:
emoji_descriptions['label'] = 1
neg_emoji_descriptions['label'] = 0
emoji_data = pd.concat([emoji_descriptions, neg_emoji_descriptions]).sample(frac=1, random_state=RNG_SEED_2)
In [9]:
emoji_data.head(10)
Out[9]:
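A quick check that the concatenated training data is balanced between positive and negative pairs (a one-line sketch):

print(emoji_data['label'].value_counts())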
In [10]:
emoji_series = emoji_descriptions.groupby('emoji')['description'].apply(lambda x: ', '.join(x))
emojis_combined_desc = pd.DataFrame({'emoji': emoji_series.index, 'description': emoji_series.values})
print('Emojis: %d' % len(emojis_combined_desc))
In [11]:
emojis_combined_desc[emojis_combined_desc['description'].str.contains('new york')]
Out[11]:
In [12]:
emojis = emojis_combined_desc['emoji'].values
# index emojis starting at 1; index 0 is left unused so it can serve as the embedding layer's padding row
emoji_index = {}
emoji_reverse_index = {}
for i, e in enumerate(emojis, start=1):
    emoji_index[e] = i
    emoji_reverse_index[i] = e
print("Emojis in index: %d" % len(emoji_index))
In [13]:
descriptions = emoji_data['description'].values
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(descriptions.tolist())
desc_word_sequences = tokenizer.texts_to_sequences(descriptions.tolist())
word_index = tokenizer.word_index
print("Words in index: %d" % len(word_index))
In [14]:
embeddings_index = {}
with open(GLOVE_FILE) as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding
print('Word embeddings: %d' % len(embeddings_index))
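As a sanity check on the loaded GloVe vectors, cosine similarity between a few common words can be computed directly (a small sketch, assuming the three words are in the GloVe 6B vocabulary; cosine similarity is not used elsewhere in this notebook):

def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print('cos(cat, dog): %.3f' % cosine(embeddings_index['cat'], embeddings_index['dog']))
print('cos(cat, car): %.3f' % cosine(embeddings_index['cat'], embeddings_index['car']))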
In [15]:
nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    # words not found in the embedding index stay all-zeros
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
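Beyond counting them, it can be useful to see which description words have no GloVe vector; a short sketch that lists a few:

missing = [w for w, i in word_index.items()
           if i <= MAX_NB_WORDS and w not in embeddings_index]
print('Words without a GloVe vector: %d' % len(missing))
print(missing[:10])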
In [16]:
e_data = np.array([ emoji_index[e] for e in emoji_data['emoji'].values ])
d_data = pad_sequences(desc_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array([ [0, 1] if l == 0 else [1, 0] for l in emoji_data['label'].values ])
nb_emojis = min(MAX_NB_EMOJIS, len(emoji_index))
print('Shape of emoji data tensor:', e_data.shape)
print('Shape of description data tensor:', d_data.shape)
print('Shape of label tensor:', labels.shape)
print('Number of emojis:', nb_emojis)
In [17]:
# P: emoji branch, a trainable embedding looked up by emoji index
P = Sequential()
P.add(Embedding(nb_emojis + 1, EMBEDDING_DIM, input_length=1))
P.add(Reshape((EMBEDDING_DIM,)))
# Q: description branch, frozen GloVe vectors fed through a bidirectional GRU
Q = Sequential()
Q.add(Embedding(nb_words + 1,
                EMBEDDING_DIM,
                weights=[word_embedding_matrix],
                input_length=MAX_SEQUENCE_LENGTH,
                trainable=False))
Q.add(Bidirectional(GRU(EMBEDDING_DIM, dropout_W=0.5, dropout_U=0.5), merge_mode='sum'))
model = Sequential()
model.add(Merge([P, Q], mode='concat'))
model.add(Dropout(0.5))
model.add(Dense(EMBEDDING_DIM*2, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
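The Merge layer and the dropout_W/dropout_U arguments only exist in Keras 1. For readers on Keras 2, a rough functional-API equivalent of the same two-branch architecture might look like the sketch below; it is not run in this notebook, and names such as model2, emoji_input and desc_input are purely illustrative:

from keras.layers import Input, Embedding, Reshape, Bidirectional, GRU, Dropout, Dense, concatenate
from keras.models import Model

emoji_input = Input(shape=(1,))
e = Embedding(nb_emojis + 1, EMBEDDING_DIM, input_length=1)(emoji_input)
e = Reshape((EMBEDDING_DIM,))(e)

desc_input = Input(shape=(MAX_SEQUENCE_LENGTH,))
d = Embedding(nb_words + 1, EMBEDDING_DIM, weights=[word_embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH, trainable=False)(desc_input)
d = Bidirectional(GRU(EMBEDDING_DIM, dropout=0.5, recurrent_dropout=0.5), merge_mode='sum')(d)

merged = concatenate([e, d])
merged = Dropout(0.5)(merged)
merged = Dense(EMBEDDING_DIM * 2, activation='relu')(merged)
merged = Dropout(0.5)(merged)
output = Dense(2, activation='softmax')(merged)

model2 = Model([emoji_input, desc_input], output)
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])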
In [18]:
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_categorical_accuracy', save_best_only=True)]
print("Starting training at", datetime.datetime.now())
t0 = time.time()
history = model.fit([e_data, d_data],
                    labels,
                    nb_epoch=80,
                    validation_split=VALIDATION_SPLIT,
                    verbose=0,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))
In [19]:
acc = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                    'training': history.history['categorical_accuracy'],
                    'validation': history.history['val_categorical_accuracy']})
ax = acc.plot(x='epoch', figsize=(7, 10), grid=True)
ax.set_ylabel("categorical accuracy")
ax.set_ylim([0.0, 1.0]);
In [20]:
loss = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                     'training': history.history['loss'],
                     'validation': history.history['val_loss']})
ax = loss.plot(x='epoch', figsize=(7, 10), grid=True)
ax.set_ylabel("loss")
ax.set_ylim([0.0, 2.0]);
In [21]:
model.load_weights(MODEL_WEIGHTS_FILE)
weights = P.layers[0].get_weights()[0]
# drop row 0, which corresponds to the unused index 0 of the emoji embedding
embeddings = pd.DataFrame(weights[1:])
embeddings = pd.concat([emojis_combined_desc['emoji'], embeddings], axis=1)
embeddings.to_csv(EMOJI_EMBEDDINGS_FILE, sep=' ', header=False, index=False)
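With the emoji embeddings written out, one simple way to query them is to average the GloVe vectors of a phrase and rank emojis by cosine similarity against it. This is only a rough sketch: the query phrase is arbitrary, and plain GloVe averaging is a stand-in for the trained GRU description branch, so the ranking is only indicative:

def phrase_vector(phrase):
    # average the GloVe vectors of the in-vocabulary words of the phrase
    vecs = [embeddings_index[w] for w in phrase.lower().split() if w in embeddings_index]
    return np.mean(vecs, axis=0)

def nearest_emojis(phrase, top_n=5):
    v = phrase_vector(phrase)
    emb = weights[1:]  # row 0 is the unused index
    sims = emb.dot(v) / np.maximum(np.linalg.norm(emb, axis=1) * np.linalg.norm(v), 1e-8)
    best = np.argsort(-sims)[:top_n]
    return [(emoji_reverse_index[i + 1], float(sims[i])) for i in best]

print(nearest_emojis('happy birthday'))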
In [22]:
tsne2 = TSNE(n_components=2, perplexity=30, init='pca', n_iter=5000)
fit = tsne2.fit_transform(weights)
visualization = pd.DataFrame(fit[1:], columns=['x', 'y'])
visualization['emoji'] = emojis_combined_desc['emoji'].values
visualization.plot('x', 'y', kind='scatter', figsize=(7, 10), grid=True);
visualization.to_csv(EMOJI_EMB_VIZ_FILE)
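The bare scatter plot is hard to interpret without labels; a short sketch that annotates each point with its emoji (whether the glyphs render depends on local font support):

import matplotlib.pyplot as plt

ax = visualization.plot('x', 'y', kind='scatter', figsize=(10, 10), grid=True)
for _, row in visualization.iterrows():
    ax.annotate(row['emoji'], (row['x'], row['y']), fontsize=8)
plt.show()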
In [ ]: