In [1]:

    
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

NGRAMS = 2
EPOCHS = 5
YEAR = '2000'
#YEAR = '2010'

df = pd.read_csv('./data/census/census_%s.csv' % YEAR)
df.dropna(subset=['name'], inplace=True)
df.replace('(S)', 0, inplace=True)
df









    Out[1]:






  
    
      
      name
      rank
      count
      prop100k
      cum_prop100k
      pctwhite
      pctblack
      pctapi
      pctaian
      pct2prace
      pcthispanic
    
  
  
    
      0
      SMITH
      1
      2376206
      880.85
      880.85
      73.35
      22.22
      0.40
      0.85
      1.63
      1.56
    
    
      1
      JOHNSON
      2
      1857160
      688.44
      1569.30
      61.55
      33.80
      0.42
      0.91
      1.82
      1.50
    
    
      2
      WILLIAMS
      3
      1534042
      568.66
      2137.96
      48.52
      46.72
      0.37
      0.78
      2.01
      1.60
    
    
      3
      BROWN
      4
      1380145
      511.62
      2649.58
      60.71
      34.54
      0.41
      0.83
      1.86
      1.64
    
    
      4
      JONES
      5
      1362755
      505.17
      3154.75
      57.69
      37.73
      0.35
      0.94
      1.85
      1.44
    
    
      5
      MILLER
      6
      1127803
      418.07
      3572.82
      85.81
      10.41
      0.42
      0.63
      1.31
      1.43
    
    
      6
      DAVIS
      7
      1072335
      397.51
      3970.33
      64.73
      30.77
      0.40
      0.79
      1.73
      1.58
    
    
      7
      GARCIA
      8
      858289
      318.17
      4288.50
      6.17
      0.49
      1.43
      0.58
      0.51
      90.81
    
    
      8
      RODRIGUEZ
      9
      804240
      298.13
      4586.62
      5.52
      0.54
      0.58
      0.24
      0.41
      92.70
    
    
      9
      WILSON
      10
      783051
      290.27
      4876.90
      69.72
      25.32
      0.46
      1.03
      1.74
      1.73
    
    
      10
      MARTINEZ
      11
      775072
      287.32
      5164.22
      6.04
      0.52
      0.60
      0.64
      0.46
      91.72
    
    
      11
      ANDERSON
      12
      762394
      282.62
      5446.83
      77.60
      18.06
      0.48
      0.70
      1.59
      1.58
    
    
      12
      TAYLOR
      13
      720370
      267.04
      5713.87
      67.80
      27.67
      0.39
      0.75
      1.78
      1.61
    
    
      13
      THOMAS
      14
      710696
      263.45
      5977.33
      55.53
      38.17
      1.63
      1.01
      2.00
      1.66
    
    
      14
      HERNANDEZ
      15
      706372
      261.85
      6239.18
      4.55
      0.38
      0.65
      0.27
      0.35
      93.81
    
    
      15
      MOORE
      16
      698671
      259.00
      6498.17
      68.85
      26.92
      0.37
      0.65
      1.70
      1.50
    
    
      16
      MARTIN
      17
      672711
      249.37
      6747.54
      77.47
      15.30
      0.71
      0.94
      1.59
      3.99
    
    
      17
      JACKSON
      18
      666125
      246.93
      6994.47
      41.93
      53.02
      0.31
      1.04
      2.18
      1.53
    
    
      18
      THOMPSON
      19
      644368
      238.87
      7233.34
      72.48
      22.53
      0.44
      1.15
      1.78
      1.62
    
    
      19
      WHITE
      20
      639515
      237.07
      7470.40
      67.91
      27.38
      0.39
      1.01
      1.76
      1.55
    
    
      20
      LOPEZ
      21
      621536
      230.40
      7700.81
      5.85
      0.61
      1.04
      0.47
      0.52
      91.51
    
    
      21
      LEE
      22
      605860
      224.59
      7925.40
      40.09
      17.41
      37.83
      1.03
      2.30
      1.34
    
    
      22
      GONZALEZ
      23
      597718
      221.57
      8146.97
      4.76
      0.37
      0.38
      0.18
      0.33
      93.99
    
    
      23
      HARRIS
      24
      593542
      220.02
      8366.99
      53.88
      41.63
      0.36
      0.65
      2.02
      1.45
    
    
      24
      CLARK
      25
      548369
      203.28
      8570.27
      76.84
      18.53
      0.41
      0.94
      1.60
      1.68
    
    
      25
      LEWIS
      26
      509930
      189.03
      8759.30
      60.97
      33.83
      0.45
      1.14
      1.97
      1.64
    
    
      26
      ROBINSON
      27
      503028
      186.47
      8945.77
      51.34
      44.10
      0.37
      0.51
      1.99
      1.68
    
    
      27
      WALKER
      28
      501307
      185.83
      9131.61
      61.25
      34.17
      0.35
      0.83
      1.80
      1.60
    
    
      28
      PEREZ
      29
      488521
      181.09
      9312.70
      5.95
      0.48
      1.18
      0.26
      0.48
      91.65
    
    
      29
      HALL
      30
      473568
      175.55
      9488.25
      75.11
      20.75
      0.48
      0.63
      1.63
      1.40
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      151641
      WRATH
      150436
      100
      0.04
      89752.48
      85.00
      0
      0
      0
      7.00
      0
    
    
      151642
      WRIXON
      150436
      100
      0.04
      89752.52
      98.00
      0.00
      0.00
      0
      0.00
      0
    
    
      151643
      WRZESIEN
      150436
      100
      0.04
      89752.56
      90.00
      0.00
      0
      0
      6.00
      0.00
    
    
      151644
      WULFECK
      150436
      100
      0.04
      89752.59
      99.00
      0.00
      0.00
      0.00
      0.00
      0.00
    
    
      151645
      WYMS
      150436
      100
      0.04
      89752.63
      0
      95.00
      0.00
      0
      0
      0
    
    
      151646
      YACCINO
      150436
      100
      0.04
      89752.67
      99.00
      0
      0.00
      0.00
      0.00
      0
    
    
      151647
      YAGGY
      150436
      100
      0.04
      89752.71
      93.00
      0.00
      0.00
      0.00
      0.00
      7.00
    
    
      151648
      YAKIM
      150436
      100
      0.04
      89752.74
      97.00
      0
      0
      0.00
      0.00
      0.00
    
    
      151649
      YAPO
      150436
      100
      0.04
      89752.78
      21.00
      18.00
      48.00
      0
      9.00
      0
    
    
      151650
      YARDY
      150436
      100
      0.04
      89752.82
      81.00
      0
      0.00
      13.00
      0
      0
    
    
      151651
      YAUKEY
      150436
      100
      0.04
      89752.85
      99.00
      0.00
      0.00
      0.00
      0.00
      0.00
    
    
      151652
      YERKA
      150436
      100
      0.04
      89752.89
      97.00
      0.00
      0.00
      0.00
      0
      0
    
    
      151653
      YOUSKO
      150436
      100
      0.04
      89752.93
      99.00
      0
      0.00
      0.00
      0.00
      0
    
    
      151654
      ZADRIMA
      150436
      100
      0.04
      89752.97
      87.00
      0.00
      0.00
      0.00
      13.00
      0.00
    
    
      151655
      ZAFT
      150436
      100
      0.04
      89753.00
      94.00
      0.00
      0
      0.00
      0
      0
    
    
      151656
      ZAITSEV
      150436
      100
      0.04
      89753.04
      92.00
      0
      0.00
      0.00
      7.00
      0
    
    
      151657
      ZALABAK
      150436
      100
      0.04
      89753.08
      98.00
      0.00
      0.00
      0.00
      0
      0
    
    
      151658
      ZALLA
      150436
      100
      0.04
      89753.11
      99.00
      0
      0.00
      0.00
      0.00
      0
    
    
      151659
      ZANJANI
      150436
      100
      0.04
      89753.15
      76.00
      0.00
      10.00
      0.00
      14.00
      0.00
    
    
      151660
      ZANOL
      150436
      100
      0.04
      89753.19
      99.00
      0.00
      0.00
      0.00
      0
      0
    
    
      151661
      ZAPKA
      150436
      100
      0.04
      89753.23
      99.00
      0.00
      0.00
      0.00
      0.00
      0.00
    
    
      151662
      ZELDES
      150436
      100
      0.04
      89753.26
      99.00
      0.00
      0.00
      0.00
      0.00
      0.00
    
    
      151663
      ZERBEY
      150436
      100
      0.04
      89753.30
      99.00
      0
      0.00
      0.00
      0.00
      0
    
    
      151664
      ZIEGELHOFER
      150436
      100
      0.04
      89753.34
      99.00
      0.00
      0.00
      0.00
      0.00
      0.00
    
    
      151665
      ZIELESCH
      150436
      100
      0.04
      89753.37
      99.00
      0.00
      0.00
      0.00
      0.00
      0.00
    
    
      151666
      ZILK
      150436
      100
      0.04
      89753.41
      90.00
      9.00
      0.00
      0.00
      0
      0
    
    
      151667
      ZINNANTI
      150436
      100
      0.04
      89753.45
      98.00
      0.00
      0.00
      0.00
      0
      0
    
    
      151668
      ZITTERICH
      150436
      100
      0.04
      89753.48
      98.00
      0
      0.00
      0.00
      0.00
      0
    
    
      151669
      ZULU
      150436
      100
      0.04
      89753.52
      6.00
      90.00
      0.00
      0.00
      0
      0
    
    
      151670
      ZUSI
      150436
      100
      0.04
      89753.56
      99.00
      0.00
      0.00
      0.00
      0.00
      0.00
    
  

151670 rows × 11 columns

Resampling with weight



In [2]:

    
sdf = df.sample(1000000, weights=df['count'], replace=True)

Assign race by pertcentage



In [3]:

    
from numpy.random import choice

races = ['white', 'black', 'api', 'hispanic']

def to_race(c):
    w = np.array(c).astype(float)
    probs = w/w.sum()
    return choice(races, p=probs)

sdf['race'] = sdf[['pctwhite', 'pctblack', 'pctapi', 'pcthispanic']].apply(lambda c: to_race(c), axis=1)
sdf









    



/opt/venv/lib/python2.7/site-packages/ipykernel/__main__.py:7: RuntimeWarning: invalid value encountered in divide
/opt/venv/lib/python2.7/site-packages/ipykernel/__main__.py:8: RuntimeWarning: invalid value encountered in less






    Out[3]:






  
    
      
      name
      rank
      count
      prop100k
      cum_prop100k
      pctwhite
      pctblack
      pctapi
      pctaian
      pct2prace
      pcthispanic
      race
    
  
  
    
      1817
      MEDEIROS
      1818
      18121
      6.72
      47759.96
      85.14
      0.33
      3.06
      0.14
      7.27
      4.07
      white
    
    
      26179
      NOWLAN
      26163
      879
      0.33
      77509.45
      95.22
      0
      0.91
      0
      1.82
      1.02
      white
    
    
      121
      STEVENS
      122
      181417
      67.25
      18684.77
      83.40
      11.40
      0.53
      1.23
      1.64
      1.79
      white
    
    
      1193
      WOMACK
      1194
      26890
      9.97
      42674.71
      68.30
      27.53
      0.33
      0.71
      1.72
      1.40
      white
    
    
      49
      TORRES
      50
      325169
      120.54
      12433.80
      6.05
      0.58
      1.42
      0.26
      0.53
      91.16
      hispanic
    
    
      31161
      SKAFF
      31126
      705
      0.26
      78960.22
      95.32
      0.71
      0
      0
      1.84
      1.84
      white
    
    
      698
      MOSLEY
      699
      44698
      16.57
      36382.71
      42.69
      52.83
      0.26
      0.53
      2.18
      1.51
      white
    
    
      21144
      BRODE
      21142
      1157
      0.43
      75630.80
      96.02
      1.12
      0
      0
      0.61
      1.38
      white
    
    
      44726
      TROTTI
      44654
      453
      0.17
      81777.92
      96.69
      0
      0.00
      0
      1.32
      0
      white
    
    
      12377
      FUJII
      12377
      2302
      0.85
      70365.91
      5.82
      0
      84.40
      0
      7.04
      2.13
      api
    
    
      147823
      OPLIGER
      147095
      103
      0.04
      89609.27
      95.15
      0.00
      0
      0.00
      0.00
      0
      white
    
    
      31316
      KOLBECK
      31301
      700
      0.26
      79000.58
      97.57
      0
      0
      0
      0
      1.14
      white
    
    
      290
      GUERRERO
      291
      94152
      34.90
      26824.07
      5.16
      0.32
      2.36
      0.25
      0.60
      91.32
      hispanic
    
    
      86
      FOSTER
      87
      221040
      81.94
      16079.19
      72.00
      23.19
      0.46
      0.86
      1.78
      1.70
      white
    
    
      934
      BAIRD
      935
      34233
      12.69
      39775.20
      93.10
      2.99
      0.56
      0.61
      1.23
      1.50
      white
    
    
      79199
      FURTNEY
      79115
      224
      0.08
      85797.04
      96.43
      0.00
      0
      0.00
      0
      0
      white
    
    
      3468
      MAAS
      3469
      9410
      3.49
      55687.78
      95.61
      0.45
      0.55
      0.36
      1.04
      1.99
      white
    
    
      932
      PETTY
      933
      34278
      12.71
      39749.82
      75.78
      20.32
      0.33
      0.56
      1.56
      1.46
      white
    
    
      3959
      KETCHUM
      3960
      8229
      3.05
      57288.24
      89.29
      3.58
      0.56
      2.60
      1.98
      1.98
      white
    
    
      39634
      HETZ
      39617
      522
      0.19
      80860.45
      95.79
      0
      0
      0
      1.34
      1.15
      white
    
    
      75564
      KUHEL
      75466
      238
      0.09
      85485.81
      97.90
      0.00
      0
      0.00
      0
      0.00
      white
    
    
      1738
      CODY
      1739
      18853
      6.99
      47218.22
      79.40
      14.11
      0.34
      2.80
      1.72
      1.63
      white
    
    
      26219
      CONSTANTE
      26215
      877
      0.33
      77522.47
      13.23
      0.80
      0.91
      0.00
      0.68
      84.38
      hispanic
    
    
      21
      LEE
      22
      605860
      224.59
      7925.40
      40.09
      17.41
      37.83
      1.03
      2.30
      1.34
      black
    
    
      585
      BRADFORD
      586
      51726
      19.17
      34361.20
      67.27
      28.47
      0.35
      0.60
      1.72
      1.60
      white
    
    
      42166
      IDRIS
      42134
      485
      0.18
      81333.05
      18.14
      47.63
      18.76
      0
      13.81
      0
      black
    
    
      104
      JORDAN
      105
      197212
      73.11
      17490.73
      64.30
      30.06
      0.44
      0.68
      1.67
      2.86
      white
    
    
      33542
      KORINEK
      33528
      642
      0.24
      79553.42
      96.73
      0
      0
      0.00
      0.78
      1.71
      white
    
    
      228
      BURKE
      229
      119175
      44.18
      24395.04
      87.74
      8.40
      0.48
      0.49
      1.31
      1.57
      white
    
    
      43
      MITCHELL
      44
      367433
      136.21
      11661.38
      63.55
      31.52
      0.39
      0.98
      1.93
      1.63
      black
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      568
      ROBERSON
      569
      53198
      19.72
      34031.17
      53.85
      42.11
      0.28
      0.59
      1.88
      1.29
      black
    
    
      9423
      SPITLER
      9422
      3165
      1.17
      67409.31
      97.31
      0
      0.51
      0
      0.63
      1.14
      white
    
    
      17831
      SHEATS
      17823
      1448
      0.54
      74046.01
      68.09
      29.07
      0
      0
      1.17
      1.04
      white
    
    
      3982
      CREAMER
      3983
      8185
      3.03
      57358.26
      90.85
      5.24
      0.38
      0.42
      1.26
      1.86
      white
    
    
      7163
      GARFIELD
      7163
      4296
      1.59
      64316.68
      77.56
      12.85
      0.93
      4.84
      1.56
      2.26
      white
    
    
      37770
      SEVERTSON
      37728
      554
      0.21
      80489.05
      96.03
      0.00
      0
      0.00
      0
      3.25
      white
    
    
      88133
      CAPIN
      88083
      196
      0.07
      86491.41
      80.10
      0.00
      0
      0.00
      0
      16.33
      white
    
    
      80
      REYES
      81
      232511
      86.19
      15575.57
      5.26
      0.75
      6.02
      0.30
      0.76
      86.90
      hispanic
    
    
      2588
      HILLS
      2589
      12867
      4.77
      52108.02
      66.82
      29.18
      0.54
      0.61
      1.47
      1.38
      white
    
    
      2107
      FAUST
      2108
      15799
      5.86
      49576.20
      86.80
      9.91
      0.41
      0.36
      0.94
      1.59
      white
    
    
      3214
      CENTENO
      3215
      10188
      3.78
      54764.08
      5.12
      0.74
      4.57
      0.24
      1.00
      88.33
      hispanic
    
    
      1980
      LUNSFORD
      1981
      16736
      6.20
      48810.54
      88.16
      8.08
      0.31
      0.75
      1.45
      1.25
      white
    
    
      18560
      BADEAUX
      18560
      1371
      0.51
      74426.93
      97.45
      0.44
      0
      0
      0.36
      1.31
      white
    
    
      4257
      CHA
      4258
      7698
      2.85
      58168.75
      2.46
      0.36
      93.19
      0.06
      2.51
      1.42
      api
    
    
      499
      PRATT
      500
      59801
      22.17
      32592.72
      80.27
      14.77
      0.53
      1.03
      1.63
      1.77
      white
    
    
      5
      MILLER
      6
      1127803
      418.07
      3572.82
      85.81
      10.41
      0.42
      0.63
      1.31
      1.43
      white
    
    
      12547
      SENSENIG
      12542
      2266
      0.84
      70509.81
      98.63
      0
      0.26
      0
      0.35
      0.62
      white
    
    
      3
      BROWN
      4
      1380145
      511.62
      2649.58
      60.71
      34.54
      0.41
      0.83
      1.86
      1.64
      white
    
    
      5120
      ALBANESE
      5121
      6286
      2.33
      60390.34
      96.50
      0
      0.59
      0
      0.78
      1.83
      white
    
    
      36
      GREEN
      37
      413477
      153.27
      10631.29
      59.33
      36.23
      0.34
      0.61
      1.78
      1.71
      white
    
    
      121
      STEVENS
      122
      181417
      67.25
      18684.77
      83.40
      11.40
      0.53
      1.23
      1.64
      1.79
      white
    
    
      24
      CLARK
      25
      548369
      203.28
      8570.27
      76.84
      18.53
      0.41
      0.94
      1.60
      1.68
      white
    
    
      1486
      GROVE
      1487
      21969
      8.14
      45310.90
      92.55
      4.03
      0.43
      0.40
      1.14
      1.45
      white
    
    
      953
      ANDERSEN
      954
      33508
      12.42
      40014.03
      95.51
      0.57
      0.63
      0.41
      1.16
      1.72
      white
    
    
      126220
      SANDRIK
      125639
      126
      0.05
      88699.48
      95.24
      0.00
      0.00
      0
      0
      0
      white
    
    
      49625
      MCCLENATHAN
      49563
      398
      0.15
      82548.45
      95.98
      0.00
      0
      0
      1.76
      1.26
      white
    
    
      113
      GRIFFIN
      114
      190636
      70.67
      18135.72
      66.46
      29.77
      0.34
      0.50
      1.56
      1.36
      white
    
    
      47
      EVANS
      48
      342237
      126.87
      12188.83
      70.65
      25.05
      0.40
      0.68
      1.67
      1.55
      black
    
    
      1209
      LOCKE
      1210
      26516
      9.83
      42832.88
      83.26
      11.77
      1.06
      0.81
      1.59
      1.52
      white
    
    
      2385
      YAZZIE
      2386
      13915
      5.16
      51104.25
      1.44
      0.10
      0.05
      96.10
      1.37
      0.93
      white
    
  

1000000 rows × 12 columns

Check the correctness of race assignment



In [4]:

    
df[df.name == 'SMITH']









    Out[4]:






  
    
      
      name
      rank
      count
      prop100k
      cum_prop100k
      pctwhite
      pctblack
      pctapi
      pctaian
      pct2prace
      pcthispanic
    
  
  
    
      0
      SMITH
      1
      2376206
      880.85
      880.85
      73.35
      22.22
      0.40
      0.85
      1.63
      1.56



In [5]:

    
xdf = sdf[sdf.name=='SMITH'].groupby(['race']).agg({'name': 'count'})
xdf * 100 / xdf.sum()



In [6]:

    
# Additional features
sdf['name_last'] = sdf.name.str.title()
sdf.groupby('race').agg({'name_last': 'count'})



In [7]:

    
len(sdf)









    Out[7]:





1000000

Preprocessing the input data



In [8]:

    
# only last name in Census data
sdf['name_last_name_first'] = sdf['name_last']

# build n-gram list
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False) 
#vect = CountVectorizer(analyzer='char', ngram_range=(2, 2), lowercase=False) 
a = vect.fit_transform(sdf.name_last_name_first)
vocab = vect.vocabulary_
len(vocab)









    Out[8]:





963



In [9]:

    
import operator
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
cols = list(map(operator.itemgetter(0), sorted_vocab))



In [10]:

    
count_df = pd.DataFrame(a.todense(), columns=cols)
count_df









    Out[10]:






  
    
      
      Aa
      Ab
      Ac
      Ad
      Ae
      Af
      Ag
      Ah
      Ai
      Aj
      ...
      zp
      zq
      zr
      zs
      zt
      zu
      zv
      zw
      zy
      zz
    
  
  
    
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      5
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      6
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      7
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      8
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      9
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      10
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      11
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      12
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      13
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      14
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      15
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      16
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      17
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      18
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      19
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      20
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      21
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      22
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      23
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      24
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      25
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      26
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      27
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      28
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      29
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      999970
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999971
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999972
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999973
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999974
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999975
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999976
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999977
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999978
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999979
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999980
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999981
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999982
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999983
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999984
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999985
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999986
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999987
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999988
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999989
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999990
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999991
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999992
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999993
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999994
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999995
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999996
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999997
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999998
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      999999
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
  

1000000 rows × 963 columns



In [11]:

    
count_df.sum().sort_values(ascending=False).describe()









    Out[11]:





count       963.000000
mean       5508.892004
std       12662.615933
min           3.000000
25%          79.500000
50%         911.000000
75%        5417.500000
max      175908.000000
dtype: float64



In [12]:

    
pd.set_option('display.max_rows', 20)
count_df.sum().sort_values(ascending=False)









    Out[12]:





er    175908
on    124264
an    106667
ar    100173
ll     86314
in     82185
le     73190
en     67003
el     61376
so     57480
       ...  
Gf         3
kv         3
fv         3
Vn         3
Tc         3
Gd         3
xv         3
gj         3
Gm         3
Iu         3
dtype: int64



In [13]:

    
# sort n-gram by freq (highest -> lowest)
words = []
for b in vocab:
    c = vocab[b]
    #print(b, c, a[:, c].sum())
    words.append((a[:, c].sum(), b))
    #break
words = sorted(words, reverse=True)
words_list = [w[1] for w in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


def find_ngrams(text, n):
    a = zip(*[text[i:] for i in range(n)])
    wi = []
    for i in a:
        w = ''.join(i)
        try:
            idx = words_list.index(w)
        except:
            idx = 0
        wi.append(idx)
    return wi

# build X from index of n-gram sequence
X = np.array(sdf.name_last_name_first.apply(lambda c: find_ngrams(c, NGRAMS)))

# check max/avg feature
X_len = []
for x in X:
    X_len.append(len(x))

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))
y = np.array(sdf.race.astype('category').cat.codes)

# Split train and test dataset
X_train,  X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)









    



num_words = 963
Max feature len = 14, Avg. feature len = 5



In [14]:

    
len(vocab)









    Out[14]:





963

Train a LSTM model

ref: http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/



In [15]:

    
'''The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

max_features = num_words # 20000
feature_len = 20 # avg_feature_len # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

num_classes = np.max(y_train) + 1
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)









    



Using TensorFlow backend.






    



800000 train sequences
200000 test sequences
Pad sequences (samples x time)
X_train shape: (800000, 20)
X_test shape: (200000, 20)
4 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (800000, 4)
y_test shape: (200000, 4)



In [16]:

    
print('Build model...')

model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print(model.summary())









    



Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 20, 32)            30816     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 516       
=================================================================
Total params: 113,764.0
Trainable params: 113,764
Non-trainable params: 0.0
_________________________________________________________________
None



In [17]:

    
print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, epochs=EPOCHS,
          validation_split=0.1, verbose=2)
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)









    



Train...
Train on 720000 samples, validate on 80000 samples
Epoch 1/5
1214s - loss: 0.5751 - acc: 0.8031 - val_loss: 0.5465 - val_acc: 0.8114
Epoch 2/5
1277s - loss: 0.5359 - acc: 0.8152 - val_loss: 0.5269 - val_acc: 0.8173
Epoch 3/5
1168s - loss: 0.5233 - acc: 0.8185 - val_loss: 0.5173 - val_acc: 0.8192
Epoch 4/5
1236s - loss: 0.5159 - acc: 0.8205 - val_loss: 0.5127 - val_acc: 0.8216
Epoch 5/5
1181s - loss: 0.5110 - acc: 0.8222 - val_loss: 0.5077 - val_acc: 0.8234
Test score: 0.50193548161
Test accuracy: 0.82522

Train...
Train on 72000 samples, validate on 8000 samples
Epoch 1/5
124s - loss: 0.6793 - acc: 0.7678 - val_loss: 0.6147 - val_acc: 0.7917
Epoch 2/5
156s - loss: 0.5908 - acc: 0.7979 - val_loss: 0.5909 - val_acc: 0.8001
Epoch 3/5
157s - loss: 0.5725 - acc: 0.8043 - val_loss: 0.5792 - val_acc: 0.8031
Epoch 4/5
206s - loss: 0.5621 - acc: 0.8071 - val_loss: 0.5726 - val_acc: 0.8039
Epoch 5/5
156s - loss: 0.5556 - acc: 0.8087 - val_loss: 0.5741 - val_acc: 0.8044
Test score: 0.56178010149
Test accuracy: 0.80655

Confusion Matrix



In [18]:

    
y_pred = model.predict_classes(X_test, verbose=2)
p = model.predict_proba(X_test, verbose=2) # to predict probability
target_names = list(sdf.race.astype('category').cat.categories)
print(classification_report(np.argmax(y_test, axis=1), y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))









    



             precision    recall  f1-score   support

        api       0.84      0.64      0.73      6994
      black       0.63      0.02      0.04     25176
   hispanic       0.86      0.83      0.85     25629
      white       0.82      0.98      0.89    142201

avg / total       0.80      0.83      0.77    200000

[[  4507      5    685   1797]
 [   160    472    179  24365]
 [    94     13  21387   4135]
 [   581    254   2688 138678]]

Save model



In [19]:

    
model.save('./models/census/lstm/census%s_ln_lstm.h5' % YEAR)



In [20]:

    
words_df = pd.DataFrame(words_list, columns=['vocab'])
words_df.to_csv('./models/census/lstm/census%s_ln_vocab.csv' % YEAR, index=False, encoding='utf-8')



In [ ]:

	name	rank	count	prop100k	cum_prop100k	pctwhite	pctblack	pctapi	pctaian	pct2prace	pcthispanic
0	SMITH	1	2376206	880.85	880.85	73.35	22.22	0.40	0.85	1.63	1.56
1	JOHNSON	2	1857160	688.44	1569.30	61.55	33.80	0.42	0.91	1.82	1.50
2	WILLIAMS	3	1534042	568.66	2137.96	48.52	46.72	0.37	0.78	2.01	1.60
3	BROWN	4	1380145	511.62	2649.58	60.71	34.54	0.41	0.83	1.86	1.64
4	JONES	5	1362755	505.17	3154.75	57.69	37.73	0.35	0.94	1.85	1.44
5	MILLER	6	1127803	418.07	3572.82	85.81	10.41	0.42	0.63	1.31	1.43
6	DAVIS	7	1072335	397.51	3970.33	64.73	30.77	0.40	0.79	1.73	1.58
7	GARCIA	8	858289	318.17	4288.50	6.17	0.49	1.43	0.58	0.51	90.81
8	RODRIGUEZ	9	804240	298.13	4586.62	5.52	0.54	0.58	0.24	0.41	92.70
9	WILSON	10	783051	290.27	4876.90	69.72	25.32	0.46	1.03	1.74	1.73
10	MARTINEZ	11	775072	287.32	5164.22	6.04	0.52	0.60	0.64	0.46	91.72
11	ANDERSON	12	762394	282.62	5446.83	77.60	18.06	0.48	0.70	1.59	1.58
12	TAYLOR	13	720370	267.04	5713.87	67.80	27.67	0.39	0.75	1.78	1.61
13	THOMAS	14	710696	263.45	5977.33	55.53	38.17	1.63	1.01	2.00	1.66
14	HERNANDEZ	15	706372	261.85	6239.18	4.55	0.38	0.65	0.27	0.35	93.81
15	MOORE	16	698671	259.00	6498.17	68.85	26.92	0.37	0.65	1.70	1.50
16	MARTIN	17	672711	249.37	6747.54	77.47	15.30	0.71	0.94	1.59	3.99
17	JACKSON	18	666125	246.93	6994.47	41.93	53.02	0.31	1.04	2.18	1.53
18	THOMPSON	19	644368	238.87	7233.34	72.48	22.53	0.44	1.15	1.78	1.62
19	WHITE	20	639515	237.07	7470.40	67.91	27.38	0.39	1.01	1.76	1.55
20	LOPEZ	21	621536	230.40	7700.81	5.85	0.61	1.04	0.47	0.52	91.51
21	LEE	22	605860	224.59	7925.40	40.09	17.41	37.83	1.03	2.30	1.34
22	GONZALEZ	23	597718	221.57	8146.97	4.76	0.37	0.38	0.18	0.33	93.99
23	HARRIS	24	593542	220.02	8366.99	53.88	41.63	0.36	0.65	2.02	1.45
24	CLARK	25	548369	203.28	8570.27	76.84	18.53	0.41	0.94	1.60	1.68
25	LEWIS	26	509930	189.03	8759.30	60.97	33.83	0.45	1.14	1.97	1.64
26	ROBINSON	27	503028	186.47	8945.77	51.34	44.10	0.37	0.51	1.99	1.68
27	WALKER	28	501307	185.83	9131.61	61.25	34.17	0.35	0.83	1.80	1.60
28	PEREZ	29	488521	181.09	9312.70	5.95	0.48	1.18	0.26	0.48	91.65
29	HALL	30	473568	175.55	9488.25	75.11	20.75	0.48	0.63	1.63	1.40
...	...	...	...	...	...	...	...	...	...	...	...
151641	WRATH	150436	100	0.04	89752.48	85.00	0	0	0	7.00	0
151642	WRIXON	150436	100	0.04	89752.52	98.00	0.00	0.00	0	0.00	0
151643	WRZESIEN	150436	100	0.04	89752.56	90.00	0.00	0	0	6.00	0.00
151644	WULFECK	150436	100	0.04	89752.59	99.00	0.00	0.00	0.00	0.00	0.00
151645	WYMS	150436	100	0.04	89752.63	0	95.00	0.00	0	0	0
151646	YACCINO	150436	100	0.04	89752.67	99.00	0	0.00	0.00	0.00	0
151647	YAGGY	150436	100	0.04	89752.71	93.00	0.00	0.00	0.00	0.00	7.00
151648	YAKIM	150436	100	0.04	89752.74	97.00	0	0	0.00	0.00	0.00
151649	YAPO	150436	100	0.04	89752.78	21.00	18.00	48.00	0	9.00	0
151650	YARDY	150436	100	0.04	89752.82	81.00	0	0.00	13.00	0	0
151651	YAUKEY	150436	100	0.04	89752.85	99.00	0.00	0.00	0.00	0.00	0.00
151652	YERKA	150436	100	0.04	89752.89	97.00	0.00	0.00	0.00	0	0
151653	YOUSKO	150436	100	0.04	89752.93	99.00	0	0.00	0.00	0.00	0
151654	ZADRIMA	150436	100	0.04	89752.97	87.00	0.00	0.00	0.00	13.00	0.00
151655	ZAFT	150436	100	0.04	89753.00	94.00	0.00	0	0.00	0	0
151656	ZAITSEV	150436	100	0.04	89753.04	92.00	0	0.00	0.00	7.00	0
151657	ZALABAK	150436	100	0.04	89753.08	98.00	0.00	0.00	0.00	0	0
151658	ZALLA	150436	100	0.04	89753.11	99.00	0	0.00	0.00	0.00	0
151659	ZANJANI	150436	100	0.04	89753.15	76.00	0.00	10.00	0.00	14.00	0.00
151660	ZANOL	150436	100	0.04	89753.19	99.00	0.00	0.00	0.00	0	0
151661	ZAPKA	150436	100	0.04	89753.23	99.00	0.00	0.00	0.00	0.00	0.00
151662	ZELDES	150436	100	0.04	89753.26	99.00	0.00	0.00	0.00	0.00	0.00
151663	ZERBEY	150436	100	0.04	89753.30	99.00	0	0.00	0.00	0.00	0
151664	ZIEGELHOFER	150436	100	0.04	89753.34	99.00	0.00	0.00	0.00	0.00	0.00
151665	ZIELESCH	150436	100	0.04	89753.37	99.00	0.00	0.00	0.00	0.00	0.00
151666	ZILK	150436	100	0.04	89753.41	90.00	9.00	0.00	0.00	0	0
151667	ZINNANTI	150436	100	0.04	89753.45	98.00	0.00	0.00	0.00	0	0
151668	ZITTERICH	150436	100	0.04	89753.48	98.00	0	0.00	0.00	0.00	0
151669	ZULU	150436	100	0.04	89753.52	6.00	90.00	0.00	0.00	0	0
151670	ZUSI	150436	100	0.04	89753.56	99.00	0.00	0.00	0.00	0.00	0.00

	name	rank	count	prop100k	cum_prop100k	pctwhite	pctblack	pctapi	pctaian	pct2prace	pcthispanic	race
1817	MEDEIROS	1818	18121	6.72	47759.96	85.14	0.33	3.06	0.14	7.27	4.07	white
26179	NOWLAN	26163	879	0.33	77509.45	95.22	0	0.91	0	1.82	1.02	white
121	STEVENS	122	181417	67.25	18684.77	83.40	11.40	0.53	1.23	1.64	1.79	white
1193	WOMACK	1194	26890	9.97	42674.71	68.30	27.53	0.33	0.71	1.72	1.40	white
49	TORRES	50	325169	120.54	12433.80	6.05	0.58	1.42	0.26	0.53	91.16	hispanic
31161	SKAFF	31126	705	0.26	78960.22	95.32	0.71	0	0	1.84	1.84	white
698	MOSLEY	699	44698	16.57	36382.71	42.69	52.83	0.26	0.53	2.18	1.51	white
21144	BRODE	21142	1157	0.43	75630.80	96.02	1.12	0	0	0.61	1.38	white
44726	TROTTI	44654	453	0.17	81777.92	96.69	0	0.00	0	1.32	0	white
12377	FUJII	12377	2302	0.85	70365.91	5.82	0	84.40	0	7.04	2.13	api
147823	OPLIGER	147095	103	0.04	89609.27	95.15	0.00	0	0.00	0.00	0	white
31316	KOLBECK	31301	700	0.26	79000.58	97.57	0	0	0	0	1.14	white
290	GUERRERO	291	94152	34.90	26824.07	5.16	0.32	2.36	0.25	0.60	91.32	hispanic
86	FOSTER	87	221040	81.94	16079.19	72.00	23.19	0.46	0.86	1.78	1.70	white
934	BAIRD	935	34233	12.69	39775.20	93.10	2.99	0.56	0.61	1.23	1.50	white
79199	FURTNEY	79115	224	0.08	85797.04	96.43	0.00	0	0.00	0	0	white
3468	MAAS	3469	9410	3.49	55687.78	95.61	0.45	0.55	0.36	1.04	1.99	white
932	PETTY	933	34278	12.71	39749.82	75.78	20.32	0.33	0.56	1.56	1.46	white
3959	KETCHUM	3960	8229	3.05	57288.24	89.29	3.58	0.56	2.60	1.98	1.98	white
39634	HETZ	39617	522	0.19	80860.45	95.79	0	0	0	1.34	1.15	white
75564	KUHEL	75466	238	0.09	85485.81	97.90	0.00	0	0.00	0	0.00	white
1738	CODY	1739	18853	6.99	47218.22	79.40	14.11	0.34	2.80	1.72	1.63	white
26219	CONSTANTE	26215	877	0.33	77522.47	13.23	0.80	0.91	0.00	0.68	84.38	hispanic
21	LEE	22	605860	224.59	7925.40	40.09	17.41	37.83	1.03	2.30	1.34	black
585	BRADFORD	586	51726	19.17	34361.20	67.27	28.47	0.35	0.60	1.72	1.60	white
42166	IDRIS	42134	485	0.18	81333.05	18.14	47.63	18.76	0	13.81	0	black
104	JORDAN	105	197212	73.11	17490.73	64.30	30.06	0.44	0.68	1.67	2.86	white
33542	KORINEK	33528	642	0.24	79553.42	96.73	0	0	0.00	0.78	1.71	white
228	BURKE	229	119175	44.18	24395.04	87.74	8.40	0.48	0.49	1.31	1.57	white
43	MITCHELL	44	367433	136.21	11661.38	63.55	31.52	0.39	0.98	1.93	1.63	black
...	...	...	...	...	...	...	...	...	...	...	...	...
568	ROBERSON	569	53198	19.72	34031.17	53.85	42.11	0.28	0.59	1.88	1.29	black
9423	SPITLER	9422	3165	1.17	67409.31	97.31	0	0.51	0	0.63	1.14	white
17831	SHEATS	17823	1448	0.54	74046.01	68.09	29.07	0	0	1.17	1.04	white
3982	CREAMER	3983	8185	3.03	57358.26	90.85	5.24	0.38	0.42	1.26	1.86	white
7163	GARFIELD	7163	4296	1.59	64316.68	77.56	12.85	0.93	4.84	1.56	2.26	white
37770	SEVERTSON	37728	554	0.21	80489.05	96.03	0.00	0	0.00	0	3.25	white
88133	CAPIN	88083	196	0.07	86491.41	80.10	0.00	0	0.00	0	16.33	white
80	REYES	81	232511	86.19	15575.57	5.26	0.75	6.02	0.30	0.76	86.90	hispanic
2588	HILLS	2589	12867	4.77	52108.02	66.82	29.18	0.54	0.61	1.47	1.38	white
2107	FAUST	2108	15799	5.86	49576.20	86.80	9.91	0.41	0.36	0.94	1.59	white
3214	CENTENO	3215	10188	3.78	54764.08	5.12	0.74	4.57	0.24	1.00	88.33	hispanic
1980	LUNSFORD	1981	16736	6.20	48810.54	88.16	8.08	0.31	0.75	1.45	1.25	white
18560	BADEAUX	18560	1371	0.51	74426.93	97.45	0.44	0	0	0.36	1.31	white
4257	CHA	4258	7698	2.85	58168.75	2.46	0.36	93.19	0.06	2.51	1.42	api
499	PRATT	500	59801	22.17	32592.72	80.27	14.77	0.53	1.03	1.63	1.77	white
5	MILLER	6	1127803	418.07	3572.82	85.81	10.41	0.42	0.63	1.31	1.43	white
12547	SENSENIG	12542	2266	0.84	70509.81	98.63	0	0.26	0	0.35	0.62	white
3	BROWN	4	1380145	511.62	2649.58	60.71	34.54	0.41	0.83	1.86	1.64	white
5120	ALBANESE	5121	6286	2.33	60390.34	96.50	0	0.59	0	0.78	1.83	white
36	GREEN	37	413477	153.27	10631.29	59.33	36.23	0.34	0.61	1.78	1.71	white
121	STEVENS	122	181417	67.25	18684.77	83.40	11.40	0.53	1.23	1.64	1.79	white
24	CLARK	25	548369	203.28	8570.27	76.84	18.53	0.41	0.94	1.60	1.68	white
1486	GROVE	1487	21969	8.14	45310.90	92.55	4.03	0.43	0.40	1.14	1.45	white
953	ANDERSEN	954	33508	12.42	40014.03	95.51	0.57	0.63	0.41	1.16	1.72	white
126220	SANDRIK	125639	126	0.05	88699.48	95.24	0.00	0.00	0	0	0	white
49625	MCCLENATHAN	49563	398	0.15	82548.45	95.98	0.00	0	0	1.76	1.26	white
113	GRIFFIN	114	190636	70.67	18135.72	66.46	29.77	0.34	0.50	1.56	1.36	white
47	EVANS	48	342237	126.87	12188.83	70.65	25.05	0.40	0.68	1.67	1.55	black
1209	LOCKE	1210	26516	9.83	42832.88	83.26	11.77	1.06	0.81	1.59	1.52	white
2385	YAZZIE	2386	13915	5.16	51104.25	1.44	0.10	0.05	96.10	1.37	0.93	white

	Aa	Ab	Ac	Ad	Ae	Af	Ag	Ah	Ai	Aj	...	zp	zq	zr	zs	zt	zu	zv	zw	zy	zz
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
5	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
6	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
7	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
8	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
9	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
10	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
11	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
12	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
13	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
14	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
15	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
16	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
17	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
18	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
19	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
20	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
21	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
22	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
23	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
24	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
25	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
26	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
27	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
28	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
29	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
999970	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999971	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999972	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999973	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999974	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999975	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999976	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999977	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999978	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999979	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999980	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999981	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999982	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999983	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999984	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999985	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999986	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999987	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999988	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999989	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999990	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999991	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999992	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999993	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999994	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999995	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999996	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999997	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999998	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
999999	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	1