In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
NGRAMS = 2
EPOCHS = 5
YEAR = '2000'
#YEAR = '2010'
df = pd.read_csv('./data/census/census_%s.csv' % YEAR)
df.dropna(subset=['name'], inplace=True)
df.replace('(S)', 0, inplace=True)
df
Out[1]:
In [2]:
sdf = df.sample(1000000, weights=df['count'], replace=True)
In [3]:
from numpy.random import choice
races = ['white', 'black', 'api', 'hispanic']
def to_race(c):
w = np.array(c).astype(float)
probs = w/w.sum()
return choice(races, p=probs)
sdf['race'] = sdf[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic']].apply(lambda c: to_race(c), axis=1)
sdf
Out[3]:
In [4]:
df[df.name == 'SMITH']
Out[4]:
In [5]:
xdf = sdf[sdf.name=='SMITH'].groupby(['race']).agg({'name': 'count'})
xdf * 100 / xdf.sum()
Out[5]:
In [6]:
# Additional features
sdf['name_last'] = sdf.name.str.title()
sdf.groupby('race').agg({'name_last': 'count'})
Out[6]:
In [7]:
len(sdf)
Out[7]:
In [8]:
# only last name in Census data
sdf['name_last_name_first'] = sdf['name_last']
# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False)
#vect = CountVectorizer(analyzer='char', ngram_range=(2, 2), lowercase=False)
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_
len(vocab)
Out[8]:
In [9]:
import operator
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
cols = list(map(operator.itemgetter(0), sorted_vocab))
In [10]:
count_df = pd.DataFrame(a.todense(), columns=cols)
count_df
Out[10]:
In [11]:
count_df.sum().sort_values(ascending=False).describe()
Out[11]:
In [12]:
pd.set_option('display.max_rows', 20)
count_df.sum().sort_values(ascending=False)
Out[12]:
In [13]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
c = vocab[b]
#print(b, c, a[:, c].sum())
words.append((a[:, c].sum(), b))
#break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)
def find_ngrams(text, n):
a = zip(*[text[i:] for i in range(n)])
wi = []
for i in a:
w = ''.join(i)
try:
idx = words_list.index(w)
except:
idx = 0
wi.append(idx)
return wi
# build X from index of n-gram sequence
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))
# check max/avg feature
X_len = []
for x in X:
X_len.append(len(x))
max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))
print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.race.astype('category').cat.codes)
# Split train and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
In [14]:
len(vocab)
Out[14]:
In [15]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:
- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.
- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model
max_features = num_words # 20000
feature_len = 20 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')
print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
num_classes = np.max(y_train) + 1
print(num_classes, 'classes')
print('Convert class vector to binary class matrix '
'(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)
In [16]:
print('Build model...')
model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='sigmoid'))
# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
print(model.summary())
In [17]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)
Train...
Train on 72000 samples, validate on 8000 samples
Epoch 1/5
124s - loss: 0.6793 - acc: 0.7678 - val_loss: 0.6147 - val_acc: 0.7917
Epoch 2/5
156s - loss: 0.5908 - acc: 0.7979 - val_loss: 0.5909 - val_acc: 0.8001
Epoch 3/5
157s - loss: 0.5725 - acc: 0.8043 - val_loss: 0.5792 - val_acc: 0.8031
Epoch 4/5
206s - loss: 0.5621 - acc: 0.8071 - val_loss: 0.5726 - val_acc: 0.8039
Epoch 5/5
156s - loss: 0.5556 - acc: 0.8087 - val_loss: 0.5741 - val_acc: 0.8044
Test score: 0.56178010149
Test accuracy: 0.80655
In [18]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))
In [19]:
model.save('./models/census/lstm/census%s_ln_lstm.h5' % YEAR)
In [20]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./models/census/lstm/census%s_ln_vocab.csv' % YEAR, index=False, encoding='utf-8')
In [ ]: