In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
EPOCHS = 5
YEAR = '2000'
#YEAR = '2010'

df = pd.read_csv('./data/census/census_%s.csv' % YEAR)
df.dropna(subset=['name'], inplace=True)
df.replace('(S)', 0, inplace=True)
df


Out[1]:
name rank count prop100k cum_prop100k pctwhite pctblack pctapi pctaian pct2prace pcthispanic
0 SMITH 1 2376206 880.85 880.85 73.35 22.22 0.40 0.85 1.63 1.56
1 JOHNSON 2 1857160 688.44 1569.30 61.55 33.80 0.42 0.91 1.82 1.50
2 WILLIAMS 3 1534042 568.66 2137.96 48.52 46.72 0.37 0.78 2.01 1.60
3 BROWN 4 1380145 511.62 2649.58 60.71 34.54 0.41 0.83 1.86 1.64
4 JONES 5 1362755 505.17 3154.75 57.69 37.73 0.35 0.94 1.85 1.44
5 MILLER 6 1127803 418.07 3572.82 85.81 10.41 0.42 0.63 1.31 1.43
6 DAVIS 7 1072335 397.51 3970.33 64.73 30.77 0.40 0.79 1.73 1.58
7 GARCIA 8 858289 318.17 4288.50 6.17 0.49 1.43 0.58 0.51 90.81
8 RODRIGUEZ 9 804240 298.13 4586.62 5.52 0.54 0.58 0.24 0.41 92.70
9 WILSON 10 783051 290.27 4876.90 69.72 25.32 0.46 1.03 1.74 1.73
10 MARTINEZ 11 775072 287.32 5164.22 6.04 0.52 0.60 0.64 0.46 91.72
11 ANDERSON 12 762394 282.62 5446.83 77.60 18.06 0.48 0.70 1.59 1.58
12 TAYLOR 13 720370 267.04 5713.87 67.80 27.67 0.39 0.75 1.78 1.61
13 THOMAS 14 710696 263.45 5977.33 55.53 38.17 1.63 1.01 2.00 1.66
14 HERNANDEZ 15 706372 261.85 6239.18 4.55 0.38 0.65 0.27 0.35 93.81
15 MOORE 16 698671 259.00 6498.17 68.85 26.92 0.37 0.65 1.70 1.50
16 MARTIN 17 672711 249.37 6747.54 77.47 15.30 0.71 0.94 1.59 3.99
17 JACKSON 18 666125 246.93 6994.47 41.93 53.02 0.31 1.04 2.18 1.53
18 THOMPSON 19 644368 238.87 7233.34 72.48 22.53 0.44 1.15 1.78 1.62
19 WHITE 20 639515 237.07 7470.40 67.91 27.38 0.39 1.01 1.76 1.55
20 LOPEZ 21 621536 230.40 7700.81 5.85 0.61 1.04 0.47 0.52 91.51
21 LEE 22 605860 224.59 7925.40 40.09 17.41 37.83 1.03 2.30 1.34
22 GONZALEZ 23 597718 221.57 8146.97 4.76 0.37 0.38 0.18 0.33 93.99
23 HARRIS 24 593542 220.02 8366.99 53.88 41.63 0.36 0.65 2.02 1.45
24 CLARK 25 548369 203.28 8570.27 76.84 18.53 0.41 0.94 1.60 1.68
25 LEWIS 26 509930 189.03 8759.30 60.97 33.83 0.45 1.14 1.97 1.64
26 ROBINSON 27 503028 186.47 8945.77 51.34 44.10 0.37 0.51 1.99 1.68
27 WALKER 28 501307 185.83 9131.61 61.25 34.17 0.35 0.83 1.80 1.60
28 PEREZ 29 488521 181.09 9312.70 5.95 0.48 1.18 0.26 0.48 91.65
29 HALL 30 473568 175.55 9488.25 75.11 20.75 0.48 0.63 1.63 1.40
... ... ... ... ... ... ... ... ... ... ... ...
151641 WRATH 150436 100 0.04 89752.48 85.00 0 0 0 7.00 0
151642 WRIXON 150436 100 0.04 89752.52 98.00 0.00 0.00 0 0.00 0
151643 WRZESIEN 150436 100 0.04 89752.56 90.00 0.00 0 0 6.00 0.00
151644 WULFECK 150436 100 0.04 89752.59 99.00 0.00 0.00 0.00 0.00 0.00
151645 WYMS 150436 100 0.04 89752.63 0 95.00 0.00 0 0 0
151646 YACCINO 150436 100 0.04 89752.67 99.00 0 0.00 0.00 0.00 0
151647 YAGGY 150436 100 0.04 89752.71 93.00 0.00 0.00 0.00 0.00 7.00
151648 YAKIM 150436 100 0.04 89752.74 97.00 0 0 0.00 0.00 0.00
151649 YAPO 150436 100 0.04 89752.78 21.00 18.00 48.00 0 9.00 0
151650 YARDY 150436 100 0.04 89752.82 81.00 0 0.00 13.00 0 0
151651 YAUKEY 150436 100 0.04 89752.85 99.00 0.00 0.00 0.00 0.00 0.00
151652 YERKA 150436 100 0.04 89752.89 97.00 0.00 0.00 0.00 0 0
151653 YOUSKO 150436 100 0.04 89752.93 99.00 0 0.00 0.00 0.00 0
151654 ZADRIMA 150436 100 0.04 89752.97 87.00 0.00 0.00 0.00 13.00 0.00
151655 ZAFT 150436 100 0.04 89753.00 94.00 0.00 0 0.00 0 0
151656 ZAITSEV 150436 100 0.04 89753.04 92.00 0 0.00 0.00 7.00 0
151657 ZALABAK 150436 100 0.04 89753.08 98.00 0.00 0.00 0.00 0 0
151658 ZALLA 150436 100 0.04 89753.11 99.00 0 0.00 0.00 0.00 0
151659 ZANJANI 150436 100 0.04 89753.15 76.00 0.00 10.00 0.00 14.00 0.00
151660 ZANOL 150436 100 0.04 89753.19 99.00 0.00 0.00 0.00 0 0
151661 ZAPKA 150436 100 0.04 89753.23 99.00 0.00 0.00 0.00 0.00 0.00
151662 ZELDES 150436 100 0.04 89753.26 99.00 0.00 0.00 0.00 0.00 0.00
151663 ZERBEY 150436 100 0.04 89753.30 99.00 0 0.00 0.00 0.00 0
151664 ZIEGELHOFER 150436 100 0.04 89753.34 99.00 0.00 0.00 0.00 0.00 0.00
151665 ZIELESCH 150436 100 0.04 89753.37 99.00 0.00 0.00 0.00 0.00 0.00
151666 ZILK 150436 100 0.04 89753.41 90.00 9.00 0.00 0.00 0 0
151667 ZINNANTI 150436 100 0.04 89753.45 98.00 0.00 0.00 0.00 0 0
151668 ZITTERICH 150436 100 0.04 89753.48 98.00 0 0.00 0.00 0.00 0
151669 ZULU 150436 100 0.04 89753.52 6.00 90.00 0.00 0.00 0 0
151670 ZUSI 150436 100 0.04 89753.56 99.00 0.00 0.00 0.00 0.00 0.00

151670 rows × 11 columns

Resampling with weights (proportional to surname count)


In [2]:
sdf = df.sample(1000000, weights=df['count'], replace=True)

Assign race by percentage


In [3]:
from numpy.random import choice

# only these four groups are modeled; pctaian and pct2prace are dropped
races = ['white', 'black', 'api', 'hispanic']

def to_race(c):
    # draw one race label with probability proportional to the four percentages
    w = np.array(c).astype(float)
    probs = w / w.sum()  # renormalize over the four groups (rows where all four are 0 trigger the divide warning below)
    return choice(races, p=probs)

sdf['race'] = sdf[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic']].apply(lambda c: to_race(c), axis=1)
sdf


/opt/venv/lib/python2.7/site-packages/ipykernel/__main__.py:7: RuntimeWarning: invalid value encountered in divide
/opt/venv/lib/python2.7/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in less
Out[3]:
name rank count prop100k cum_prop100k pctwhite pctblack pctapi pctaian pct2prace pcthispanic race
1817 MEDEIROS 1818 18121 6.72 47759.96 85.14 0.33 3.06 0.14 7.27 4.07 white
26179 NOWLAN 26163 879 0.33 77509.45 95.22 0 0.91 0 1.82 1.02 white
121 STEVENS 122 181417 67.25 18684.77 83.40 11.40 0.53 1.23 1.64 1.79 white
1193 WOMACK 1194 26890 9.97 42674.71 68.30 27.53 0.33 0.71 1.72 1.40 white
49 TORRES 50 325169 120.54 12433.80 6.05 0.58 1.42 0.26 0.53 91.16 hispanic
31161 SKAFF 31126 705 0.26 78960.22 95.32 0.71 0 0 1.84 1.84 white
698 MOSLEY 699 44698 16.57 36382.71 42.69 52.83 0.26 0.53 2.18 1.51 white
21144 BRODE 21142 1157 0.43 75630.80 96.02 1.12 0 0 0.61 1.38 white
44726 TROTTI 44654 453 0.17 81777.92 96.69 0 0.00 0 1.32 0 white
12377 FUJII 12377 2302 0.85 70365.91 5.82 0 84.40 0 7.04 2.13 api
147823 OPLIGER 147095 103 0.04 89609.27 95.15 0.00 0 0.00 0.00 0 white
31316 KOLBECK 31301 700 0.26 79000.58 97.57 0 0 0 0 1.14 white
290 GUERRERO 291 94152 34.90 26824.07 5.16 0.32 2.36 0.25 0.60 91.32 hispanic
86 FOSTER 87 221040 81.94 16079.19 72.00 23.19 0.46 0.86 1.78 1.70 white
934 BAIRD 935 34233 12.69 39775.20 93.10 2.99 0.56 0.61 1.23 1.50 white
79199 FURTNEY 79115 224 0.08 85797.04 96.43 0.00 0 0.00 0 0 white
3468 MAAS 3469 9410 3.49 55687.78 95.61 0.45 0.55 0.36 1.04 1.99 white
932 PETTY 933 34278 12.71 39749.82 75.78 20.32 0.33 0.56 1.56 1.46 white
3959 KETCHUM 3960 8229 3.05 57288.24 89.29 3.58 0.56 2.60 1.98 1.98 white
39634 HETZ 39617 522 0.19 80860.45 95.79 0 0 0 1.34 1.15 white
75564 KUHEL 75466 238 0.09 85485.81 97.90 0.00 0 0.00 0 0.00 white
1738 CODY 1739 18853 6.99 47218.22 79.40 14.11 0.34 2.80 1.72 1.63 white
26219 CONSTANTE 26215 877 0.33 77522.47 13.23 0.80 0.91 0.00 0.68 84.38 hispanic
21 LEE 22 605860 224.59 7925.40 40.09 17.41 37.83 1.03 2.30 1.34 black
585 BRADFORD 586 51726 19.17 34361.20 67.27 28.47 0.35 0.60 1.72 1.60 white
42166 IDRIS 42134 485 0.18 81333.05 18.14 47.63 18.76 0 13.81 0 black
104 JORDAN 105 197212 73.11 17490.73 64.30 30.06 0.44 0.68 1.67 2.86 white
33542 KORINEK 33528 642 0.24 79553.42 96.73 0 0 0.00 0.78 1.71 white
228 BURKE 229 119175 44.18 24395.04 87.74 8.40 0.48 0.49 1.31 1.57 white
43 MITCHELL 44 367433 136.21 11661.38 63.55 31.52 0.39 0.98 1.93 1.63 black
... ... ... ... ... ... ... ... ... ... ... ... ...
568 ROBERSON 569 53198 19.72 34031.17 53.85 42.11 0.28 0.59 1.88 1.29 black
9423 SPITLER 9422 3165 1.17 67409.31 97.31 0 0.51 0 0.63 1.14 white
17831 SHEATS 17823 1448 0.54 74046.01 68.09 29.07 0 0 1.17 1.04 white
3982 CREAMER 3983 8185 3.03 57358.26 90.85 5.24 0.38 0.42 1.26 1.86 white
7163 GARFIELD 7163 4296 1.59 64316.68 77.56 12.85 0.93 4.84 1.56 2.26 white
37770 SEVERTSON 37728 554 0.21 80489.05 96.03 0.00 0 0.00 0 3.25 white
88133 CAPIN 88083 196 0.07 86491.41 80.10 0.00 0 0.00 0 16.33 white
80 REYES 81 232511 86.19 15575.57 5.26 0.75 6.02 0.30 0.76 86.90 hispanic
2588 HILLS 2589 12867 4.77 52108.02 66.82 29.18 0.54 0.61 1.47 1.38 white
2107 FAUST 2108 15799 5.86 49576.20 86.80 9.91 0.41 0.36 0.94 1.59 white
3214 CENTENO 3215 10188 3.78 54764.08 5.12 0.74 4.57 0.24 1.00 88.33 hispanic
1980 LUNSFORD 1981 16736 6.20 48810.54 88.16 8.08 0.31 0.75 1.45 1.25 white
18560 BADEAUX 18560 1371 0.51 74426.93 97.45 0.44 0 0 0.36 1.31 white
4257 CHA 4258 7698 2.85 58168.75 2.46 0.36 93.19 0.06 2.51 1.42 api
499 PRATT 500 59801 22.17 32592.72 80.27 14.77 0.53 1.03 1.63 1.77 white
5 MILLER 6 1127803 418.07 3572.82 85.81 10.41 0.42 0.63 1.31 1.43 white
12547 SENSENIG 12542 2266 0.84 70509.81 98.63 0 0.26 0 0.35 0.62 white
3 BROWN 4 1380145 511.62 2649.58 60.71 34.54 0.41 0.83 1.86 1.64 white
5120 ALBANESE 5121 6286 2.33 60390.34 96.50 0 0.59 0 0.78 1.83 white
36 GREEN 37 413477 153.27 10631.29 59.33 36.23 0.34 0.61 1.78 1.71 white
121 STEVENS 122 181417 67.25 18684.77 83.40 11.40 0.53 1.23 1.64 1.79 white
24 CLARK 25 548369 203.28 8570.27 76.84 18.53 0.41 0.94 1.60 1.68 white
1486 GROVE 1487 21969 8.14 45310.90 92.55 4.03 0.43 0.40 1.14 1.45 white
953 ANDERSEN 954 33508 12.42 40014.03 95.51 0.57 0.63 0.41 1.16 1.72 white
126220 SANDRIK 125639 126 0.05 88699.48 95.24 0.00 0.00 0 0 0 white
49625 MCCLENATHAN 49563 398 0.15 82548.45 95.98 0.00 0 0 1.76 1.26 white
113 GRIFFIN 114 190636 70.67 18135.72 66.46 29.77 0.34 0.50 1.56 1.36 white
47 EVANS 48 342237 126.87 12188.83 70.65 25.05 0.40 0.68 1.67 1.55 black
1209 LOCKE 1210 26516 9.83 42832.88 83.26 11.77 1.06 0.81 1.59 1.52 white
2385 YAZZIE 2386 13915 5.16 51104.25 1.44 0.10 0.05 96.10 1.37 0.93 white

1000000 rows × 12 columns
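
As a quick sanity check of the sampler (an illustrative call, not part of the original run), to_race can be applied directly to SMITH's four tracked percentages from Out[1]; after renormalizing 73.35/22.22/0.40/1.56 it returns 'white' roughly 75% of the time:

# illustrative only: SMITH's pctwhite, pctblack, pctapi, pcthispanic
to_race([73.35, 22.22, 0.40, 1.56])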

Check the correctness of race assignment


In [4]:
df[df.name == 'SMITH']


Out[4]:
name rank count prop100k cum_prop100k pctwhite pctblack pctapi pctaian pct2prace pcthispanic
0 SMITH 1 2376206 880.85 880.85 73.35 22.22 0.40 0.85 1.63 1.56

In [5]:
xdf = sdf[sdf.name=='SMITH'].groupby(['race']).agg({'name': 'count'})
xdf * 100 / xdf.sum()


Out[5]:
name
race
api 0.525465
black 22.564673
hispanic 1.546079
white 75.363783

In [6]:
# Additional features
sdf['name_last'] = sdf.name.str.title()
sdf.groupby('race').agg({'name_last': 'count'})


Out[6]:
name_last
race
api 34970
black 125879
hispanic 128146
white 711005

In [7]:
len(sdf)


Out[7]:
1000000

Preprocessing the input data


In [8]:
# the Census data has only last names, so the combined field is just the last name
sdf['name_last_name_first'] = sdf['name_last']

# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
#vect = CountVectorizer(analyzer='char', ngram_range=(2, 2), lowercase=False) 
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_
len(vocab)


Out[8]:
963
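
To inspect what the fitted vectorizer extracts from a single surname, the analyzer can be called directly (an illustrative call that is not part of the original run):

vect.build_analyzer()('Smith')   # ['Sm', 'mi', 'it', 'th'] -- overlapping character bigrams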

In [9]:
import operator
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
cols = list(map(operator.itemgetter(0), sorted_vocab))

In [10]:
count_df = pd.DataFrame(a.todense(), columns=cols)
count_df


Out[10]:
Aa Ab Ac Ad Ae Af Ag Ah Ai Aj ... zp zq zr zs zt zu zv zw zy zz
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
7 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
9 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
11 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
13 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
15 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
17 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
18 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
20 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
21 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
22 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
24 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
25 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
26 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
27 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
28 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
29 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
999970 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999971 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999972 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999973 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999974 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999975 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999976 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999977 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999978 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999979 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999980 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999981 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999982 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999983 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999984 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999985 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999986 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999987 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999988 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999989 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999990 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999991 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999992 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999993 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999994 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999995 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999996 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999997 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999998 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
999999 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

1000000 rows × 963 columns


In [11]:
count_df.sum().sort_values(ascending=False).describe()


Out[11]:
count       963.000000
mean       5508.892004
std       12662.615933
min           3.000000
25%          79.500000
50%         911.000000
75%        5417.500000
max      175908.000000
dtype: float64

In [12]:
pd.set_option('display.max_rows', 20)
count_df.sum().sort_values(ascending=False)


Out[12]:
er    175908
on    124264
an    106667
ar    100173
ll     86314
in     82185
le     73190
en     67003
el     61376
so     57480
       ...  
Gf         3
kv         3
fv         3
Vn         3
Tc         3
Gd         3
xv         3
gj         3
Gm         3
Iu         3
dtype: int64

In [13]:
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n):
    # encode text as the frequency-rank index of each character n-gram
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except ValueError:
            # n-gram not in the vocabulary: fall back to index 0
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.race.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)


num_words = 963
Max feature len = 14, Avg. feature len = 5
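
Each surname is therefore encoded as the frequency ranks of its bigrams, with 0 standing in for unseen bigrams (colliding with the rank of the most frequent bigram, 'er'). As an illustration (the actual indices depend on the sampled data):

find_ngrams('Smith', NGRAMS)   # one frequency-rank index per bigram: [rank('Sm'), rank('mi'), rank('it'), rank('th')]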

In [14]:
len(vocab)


Out[14]:
963

In [15]:
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words
feature_len = 20  # pad/truncate every n-gram index sequence to this length (max observed above is 14)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)


Using TensorFlow backend.
800000 train sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (800000, 20)
X_test shape: (200000, 20)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (800000, 4)
y_test shape: (200000, 4)
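
pad_sequences left-pads (and left-truncates) every index sequence with zeros to the fixed length feature_len = 20, so all samples share one shape. A minimal sketch of what it does to a short sequence:

sequence.pad_sequences([[3, 7, 2]], maxlen=feature_len)
# array([[0, 0, ..., 0, 3, 7, 2]]) -- 17 leading zeros, shape (1, 20)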

In [16]:
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())


Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 20, 32)            30816     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 516       
=================================================================
Total params: 113,764.0
Trainable params: 113,764
Non-trainable params: 0.0
_________________________________________________________________
None

In [17]:
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)


Train...
Train on 720000 samples, validate on 80000 samples
Epoch 1/5
1214s - loss: 0.5751 - acc: 0.8031 - val_loss: 0.5465 - val_acc: 0.8114
Epoch 2/5
1277s - loss: 0.5359 - acc: 0.8152 - val_loss: 0.5269 - val_acc: 0.8173
Epoch 3/5
1168s - loss: 0.5233 - acc: 0.8185 - val_loss: 0.5173 - val_acc: 0.8192
Epoch 4/5
1236s - loss: 0.5159 - acc: 0.8205 - val_loss: 0.5127 - val_acc: 0.8216
Epoch 5/5
1181s - loss: 0.5110 - acc: 0.8222 - val_loss: 0.5077 - val_acc: 0.8234
Test score: 0.50193548161
Test accuracy: 0.82522

Confusion Matrix


In [18]:
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))


             precision    recall  f1-score   support

        api       0.84      0.64      0.73      6994
      black       0.63      0.02      0.04     25176
   hispanic       0.86      0.83      0.85     25629
      white       0.82      0.98      0.89    142201

avg / total       0.80      0.83      0.77    200000

[[  4507      5    685   1797]
 [   160    472    179  24365]
 [    94     13  21387   4135]
 [   581    254   2688 138678]]
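
The 'black' class has very low recall (0.02): as the matrix shows, most of its 25,176 test samples are predicted as white, which is expected since many surnames frequently sampled as black (e.g. SMITH, JOHNSON) are shared with the white majority. For readability, the same matrix can be labeled with the class names (an optional sketch, not part of the original run):

pd.DataFrame(confusion_matrix(np.argmax(y_test, axis=1), y_pred),
             index=target_names, columns=target_names)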

Save model and n-gram vocabulary


In [19]:
model.save('./models/census/lstm/census%s_ln_lstm.h5' % YEAR)

In [20]:
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./models/census/lstm/census%s_ln_vocab.csv' % YEAR, index=False, encoding='utf-8')
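
To reuse the saved artifacts for prediction later, the model and the n-gram vocabulary can be reloaded and applied to a new surname. A minimal sketch, reusing find_ngrams, NGRAMS, feature_len, and target_names from above (the reload is illustrative and not part of the original run):

from keras.models import load_model

model = load_model('./models/census/lstm/census%s_ln_lstm.h5' % YEAR)
words_list = pd.read_csv('./models/census/lstm/census%s_ln_vocab.csv' % YEAR).vocab.tolist()

def name_to_X(name):
    # same encoding as training: title-case the surname, map bigrams to frequency-rank indices, pad to feature_len
    return sequence.pad_sequences([find_ngrams(name.title(), NGRAMS)], maxlen=feature_len)

probs = model.predict(name_to_X('GARCIA'))[0]
dict(zip(target_names, probs))   # per-class probabilities for the surname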

In [ ]: