In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
EPOCHS = 15

# Wikipedia name/race labels
df = pd.read_csv('../data/wiki/wiki_name_race.csv')
df.dropna(subset=['name_first', 'name_last'], inplace=True)
sdf = df

# Normalize first-name casing
sdf['name_first'] = sdf.name_first.str.title()

# Class distribution
sdf.groupby('race').agg({'name_first': 'count'})


Out[1]:
                                              name_first
race
Asian,GreaterEastAsian,EastAsian                    5497
Asian,GreaterEastAsian,Japanese                     7334
Asian,IndianSubContinent                            7861
GreaterAfrican,Africans                             3672
GreaterAfrican,Muslim                               6242
GreaterEuropean,British                            41445
GreaterEuropean,EastEuropean                        8329
GreaterEuropean,Jewish                             10239
GreaterEuropean,WestEuropean,French                12293
GreaterEuropean,WestEuropean,Germanic               3869
GreaterEuropean,WestEuropean,Hispanic              10412
GreaterEuropean,WestEuropean,Italian               11867
GreaterEuropean,WestEuropean,Nordic                 4813

Preprocessing the input data


In [2]:
# only the last name will be used to train the model
sdf['name_last_name_first'] = sdf['name_last']

# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_

# sort n-grams by frequency (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n):
    """Map text to a list of indices into words_list, one per character n-gram.
    n-grams missing from the vocabulary fall back to index 0."""
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except ValueError:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.race.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)


num_words = 1946
/opt/venv/lib/python2.7/site-packages/ipykernel/__main__.py:28: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal
Max feature len = 71, Avg. feature len = 5
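
A quick sanity check of the encoding (an illustrative sketch, not part of the original run): the cell below maps a sample surname to its bigram indices using the find_ngrams and words_list defined above. The exact indices depend on the fitted vocabulary, and the name is just an example.

In [ ]:
# Illustrative only: see how a surname is encoded as bigram indices.
sample = 'Smith'  # hypothetical example name
bigrams = [sample[i:i+NGRAMS] for i in range(len(sample) - NGRAMS + 1)]
print(list(zip(bigrams, find_ngrams(sample, NGRAMS))))
# Bigrams missing from words_list fall back to index 0.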

In [3]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.

(A TF-IDF + LogReg baseline sketch appears after this cell's output.)
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = 20 # avg_feature_len # cut names after this number of n-grams (among the top max_features most common n-grams)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)


Using TensorFlow backend.
107098 train sequences
26775 test sequences
Pad sequences (samples x time)
X_train shape: (107098, 20)
X_test shape: (26775, 20)
13 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (107098, 13)
y_test shape: (26775, 13)
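
The note at the top of the previous code cell points to TF-IDF + logistic regression as a simpler, faster baseline. Below is a minimal sketch of that baseline (not part of the original run): it refits its own character-bigram TF-IDF on the raw last names rather than reusing the padded index sequences, so its split and score are not directly comparable to the LSTM numbers.

In [ ]:
# Hedged baseline sketch: character-bigram TF-IDF + logistic regression.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

names_train, names_test, races_train, races_test = train_test_split(
    sdf.name_last_name_first, sdf.race, test_size=0.2, random_state=21, stratify=sdf.race)

baseline = make_pipeline(
    TfidfVectorizer(analyzer='char', ngram_range=(NGRAMS, NGRAMS), lowercase=False),
    LogisticRegression())
baseline.fit(names_train, races_train)
print('Baseline accuracy:', baseline.score(names_test, races_test))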

In [4]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# sigmoid kept as in the original run; softmax is the usual choice for
# mutually exclusive classes with categorical_crossentropy
model.add(Dense(num_classes, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())


Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 20, 32)            62272     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 13)                1677      
=================================================================
Total params: 146,381.0
Trainable params: 146,381
Non-trainable params: 0.0
_________________________________________________________________
None
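
The parameter counts in the summary can be reproduced by hand from the standard Keras formulas for these layers; the arithmetic below is just a sanity check, not extra model code.

In [ ]:
# Sanity check of the parameter counts reported by model.summary().
emb_params = num_words * 32                        # 1946*32 = 62,272
lstm_params = 4 * (32 * 128 + 128 * 128 + 128)     # 4*(input*units + units^2 + bias) = 82,432
dense_params = 128 * num_classes + num_classes     # weights + bias = 1,677
print(emb_params, lstm_params, dense_params,
      emb_params + lstm_params + dense_params)     # total = 146,381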

In [5]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)


Train...
Train on 96388 samples, validate on 10710 samples
Epoch 1/15
141s - loss: 1.6834 - acc: 0.4915 - val_loss: 1.4706 - val_acc: 0.5553
Epoch 2/15
139s - loss: 1.4253 - acc: 0.5746 - val_loss: 1.3817 - val_acc: 0.5919
Epoch 3/15
133s - loss: 1.3492 - acc: 0.6009 - val_loss: 1.3427 - val_acc: 0.6020
Epoch 4/15
149s - loss: 1.2990 - acc: 0.6174 - val_loss: 1.3053 - val_acc: 0.6162
Epoch 5/15
146s - loss: 1.2673 - acc: 0.6282 - val_loss: 1.2870 - val_acc: 0.6257
Epoch 6/15
149s - loss: 1.2412 - acc: 0.6346 - val_loss: 1.2748 - val_acc: 0.6282
Epoch 7/15
150s - loss: 1.2197 - acc: 0.6415 - val_loss: 1.2579 - val_acc: 0.6370
Epoch 8/15
149s - loss: 1.2004 - acc: 0.6455 - val_loss: 1.2527 - val_acc: 0.6402
Epoch 9/15
142s - loss: 1.1855 - acc: 0.6522 - val_loss: 1.2406 - val_acc: 0.6440
Epoch 10/15
158s - loss: 1.1721 - acc: 0.6549 - val_loss: 1.2411 - val_acc: 0.6414
Epoch 11/15
139s - loss: 1.1591 - acc: 0.6582 - val_loss: 1.2351 - val_acc: 0.6458
Epoch 12/15
140s - loss: 1.1484 - acc: 0.6623 - val_loss: 1.2288 - val_acc: 0.6511
Epoch 13/15
157s - loss: 1.1366 - acc: 0.6656 - val_loss: 1.2249 - val_acc: 0.6516
Epoch 14/15
151s - loss: 1.1262 - acc: 0.6676 - val_loss: 1.2216 - val_acc: 0.6539
Epoch 15/15
158s - loss: 1.1164 - acc: 0.6707 - val_loss: 1.2243 - val_acc: 0.6524
Test score: 1.20667794086
Test accuracy: 0.654939309077
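
The validation loss flattens out toward the end and ticks up slightly at epoch 15, so the fixed 15-epoch budget is roughly where this model stops improving. If you preferred to stop automatically, a hedged sketch using a Keras callback is below; it is an alternative to the fixed-epoch fit above, not what was run here.

In [ ]:
# Hedged alternative: stop when val_loss stops improving (would replace the fit call above).
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2, callbacks=[early_stop])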

Confusion Matrix


In [6]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))


                                       precision    recall  f1-score   support

     Asian,GreaterEastAsian,EastAsian       0.76      0.74      0.75      1099
      Asian,GreaterEastAsian,Japanese       0.82      0.84      0.83      1467
             Asian,IndianSubContinent       0.65      0.67      0.66      1572
              GreaterAfrican,Africans       0.50      0.36      0.42       734
                GreaterAfrican,Muslim       0.56      0.52      0.54      1248
              GreaterEuropean,British       0.72      0.85      0.78      8289
         GreaterEuropean,EastEuropean       0.75      0.62      0.68      1666
               GreaterEuropean,Jewish       0.43      0.37      0.40      2048
  GreaterEuropean,WestEuropean,French       0.56      0.47      0.51      2459
GreaterEuropean,WestEuropean,Germanic       0.38      0.29      0.33       774
GreaterEuropean,WestEuropean,Hispanic       0.63      0.49      0.55      2082
 GreaterEuropean,WestEuropean,Italian       0.61      0.74      0.67      2374
  GreaterEuropean,WestEuropean,Nordic       0.65      0.53      0.58       963

                          avg / total       0.65      0.65      0.65     26775

[[ 814   49   15    6   15  136   10   11   13    4   11   11    4]
 [  30 1233    9   15   14   50    6   10    9    2   46   36    7]
 [  16   15 1055   29  134  153   11   33   24    8   28   55   11]
 [  15   54   57  267   69  110   10   28   29    6   34   48    7]
 [  20   22  132   43  645  115   45   77   44    8   35   46   16]
 [  57   27   85   46   62 7076   51  276  294   60   62  130   63]
 [  20   17   38   13   40  136 1029  156   53   43   28   62   31]
 [  13   15   56   17   81  634   95  754  103   87   71   93   29]
 [  24   18   48   41   29  576   32  117 1160   53  135  201   25]
 [  11    1   16    5    8  206   14  142   54  223   13   31   50]
 [  12   36   56   22   25  244   22   65  133   39 1018  391   19]
 [  29   11   32   25   12  178   34   37  120   21  112 1751   12]
 [  16    2   13    4   16  205   19   62   35   40   20   20  511]]
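
The raw counts above are dominated by class size (GreaterEuropean,British alone has 8,289 test rows), so it can help to row-normalize the confusion matrix and read off per-class recall directly. A small sketch, not in the original notebook:

In [ ]:
# Row-normalize the confusion matrix to get per-class recall.
cm = confusion_matrix(np.argmax(y_test, axis=1), y_pred).astype('float')
recall = cm.diagonal() / cm.sum(axis=1)
for name, r in zip(target_names, recall):
    print('%-40s %.2f' % (name, r))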

Save model


In [7]:
model.save('./wiki/lstm/wiki_ln_lstm.h5')


<keras.models.Sequential object at 0x7f4fb10d0ed0>

In [10]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./wiki/lstm/wiki_ln_vocab.csv', index=False, encoding='utf-8')
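
To score a new name later, the saved model and the saved n-gram vocabulary have to be reloaded together, since the vocabulary order defines the integer encoding. The sketch below is a hedged usage example; the surname is illustrative, and it assumes NGRAMS, feature_len, and target_names are redefined (or still in memory).

In [ ]:
# Hedged usage sketch: reload the saved artifacts and score one surname.
model = load_model('./wiki/lstm/wiki_ln_lstm.h5')
vocab_list = pd.read_csv('./wiki/lstm/wiki_ln_vocab.csv')['vocab'].tolist()

name = 'Garcia'  # hypothetical input
grams = [name[i:i+NGRAMS] for i in range(len(name) - NGRAMS + 1)]
seq = [vocab_list.index(g) if g in vocab_list else 0 for g in grams]
proba = model.predict(sequence.pad_sequences([seq], maxlen=feature_len))[0]
print(target_names[np.argmax(proba)], proba.max())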
