Let's work with the collection of state-level CSV files in `census_2010_sf1/`.


In [24]:
import csv

import numpy  as np
import pandas as pd
from pandas import (DataFrame, Series)

In [7]:
import glob

# List every path under census_2010_sf1/ whose name starts with "state".
state_file_pattern = "census_2010_sf1/state*"
glob.glob(state_file_pattern)


Out[7]:
['census_2010_sf1/state_diversity_measures.csv',
 'census_2010_sf1/state_five_categories.csv',
 'census_2010_sf1/state_P005.csv',
 'census_2010_sf1/state_population.csv']

In [14]:
# Load the state population table with pd.read_csv rather than
# DataFrame.from_csv, because read_csv accepts a `dtype` argument:
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
# Reading FIPS as str keeps the leading zeros of codes such as "01" and "06".
df = pd.read_csv(
    "census_2010_sf1/state_population.csv",
    dtype={"FIPS": str},
)
df.head()


Out[14]:
NAME P0050001 FIPS
0 Alabama 4779736 01
1 Alaska 710231 02
2 Arizona 6392017 04
3 Arkansas 2915918 05
4 California 37253956 06

In [33]:
# use some of the pre-written code
from census_api_utils import entropy

In [19]:
# Load the state diversity measures table; FIPS is read as str so that
# zero-padded codes (e.g. "01") are not converted to integers.
df = pd.read_csv(
    "census_2010_sf1/state_diversity_measures.csv",
    dtype={"FIPS": str},
)
df.head()


Out[19]:
NAME Total White Black Asian Hispanic Other p_White p_Black p_Asian p_Hispanic p_Other entropy5 entropy4 entropy_rice gini_simpson FIPS
0 Alabama 4779736 3204402 1244437 52937 185602 92358 0.670414 0.260357 0.011075 0.038831 0.019323 0.541001 0.570292 0.573075 0.480755 01
1 Alaska 710231 455320 21949 37459 39249 156254 0.641087 0.030904 0.052742 0.055262 0.220004 0.646677 0.475235 0.510480 0.533815 02
2 Arizona 6392017 3695647 239101 170509 1895149 391611 0.578166 0.037406 0.026675 0.296487 0.061266 0.663524 0.643529 0.646914 0.571955 04
3 Arkansas 2915918 2173469 447102 35647 186050 73650 0.745381 0.153331 0.012225 0.063805 0.025258 0.515025 0.526205 0.530902 0.416039 05
4 California 37253956 14956253 2163804 4775070 14013719 1345110 0.401468 0.058083 0.128176 0.376167 0.036107 0.796994 0.843670 0.838778 0.676216 06

In [26]:
# Sanity check: applying `entropy` to each row of the five category counts
# should reproduce the stored entropy5 column, within the default precision of
# assert_array_almost_equal.
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.testing.assert_array_almost_equal.html
five_category_counts = df[["White", "Black", "Asian", "Hispanic", "Other"]]
recomputed_entropy5 = five_category_counts.apply(entropy, axis=1)
np.testing.assert_array_almost_equal(recomputed_entropy5, df.entropy5)

In [34]:
# Show the five states with the highest five-category entropy.
# DataFrame.sort was deprecated in pandas 0.17 and removed in 0.20;
# sort_values is its replacement and yields the same ordering.
df.sort_values('entropy5', ascending=False).head()


Out[34]:
NAME Total White Black Asian Hispanic Other p_White p_Black p_Asian p_Hispanic p_Other entropy5 entropy4 entropy_rice gini_simpson FIPS
11 Hawaii 1360301 309343 19904 513294 120842 396918 0.227408 0.014632 0.377339 0.088835 0.291787 0.833108 0.750762 0.707954 0.712656 15
4 California 37253956 14956253 2163804 4775070 14013719 1345110 0.401468 0.058083 0.128176 0.376167 0.036107 0.796994 0.843670 0.838778 0.676216 06
28 Nevada 2700551 1462081 208058 191047 716501 122864 0.541401 0.077043 0.070744 0.265317 0.045496 0.751622 0.774363 0.771193 0.623482 32
32 New York 19378102 11304247 2783857 1406194 3416922 466882 0.583352 0.143660 0.072566 0.176329 0.024093 0.732727 0.787727 0.785917 0.602124 36
43 Texas 25145561 11397345 2886825 948426 9460921 452044 0.453255 0.114805 0.037717 0.376246 0.017977 0.727466 0.793870 0.792449 0.638073 48

In [ ]: