notebook.community

Edit and run



In [1]:

    
import pandas as pd
from pandas import DataFrame
import numpy as np

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline



In [3]:

    
import sys

# Point this to the folder that contains information.py.
sys.path.append('/Users/joe/Sites/SFI/stats_nhood/statistics_neighborhoods')
from information import CensusFrame

# If you are actively editing the information.py module, uncomment these to auto-reload it:
# %load_ext autoreload
# %autoreload 2



In [4]:

    
# Location of the census extract. NOTE(review): this is a machine-specific absolute path;
# adjust it for your environment.
DATA_DIR = '/Users/joe/Dropbox/SFI_CensusData/UnitedStates/'
CENSUS_FILE = '2010acs_edu.csv'

# Read ID as text, then index the frame by ID while keeping ID available as a regular column.
df = pd.read_csv(DATA_DIR + CENSUS_FILE, dtype={'ID': 'str'}).set_index('ID', drop=False)



In [5]:

    
# Select the raw columns to use as bins. DataFrame.filter(regex=...) does a regex *search*
# (not a glob match), so the pattern is anchored to match only names of the form
# Edu_Level_<number>. The previous pattern 'Edu_Level*' also matched the already-renamed
# '*_bin' columns, so re-running this cell appended '_bin' a second time.
# NOTE(review): assumes every raw bin column is named Edu_Level_<digits>, as the column
# listing printed after building the CensusFrame suggests -- confirm against the CSV header.
edu_regex = r'^Edu_Level_\d+$'
edu_cols = df.filter(regex=edu_regex).columns

# It's helpful to add _bin to the column names to ensure you include only those
df = df.rename(columns={col: col + '_bin' for col in edu_cols})



In [6]:

    
# Build the CensusFrame. Keyword arguments:
#   data      - the dataframe holding the census data
#   bin_regex - regex matching the bin columns
#   group_col - column to group rows by (e.g. a city id)
#   tot_col   - column holding each row's total; create it from the bins if necessary
bin_regex = 'Edu.*_bin$'
cf = CensusFrame(data=df, bin_regex=bin_regex, group_col='CITY_NAME', tot_col='TOTPOP')

# Sanity check: list the columns matched by bin_regex to confirm only the intended bins are included.
cf.filter(regex=bin_regex).columns









    Out[6]:





Index(['Edu_Level_0_bin', 'Edu_Level_1_bin', 'Edu_Level_2_bin',
       'Edu_Level_3_bin', 'Edu_Level_4_bin', 'Edu_Level_5_bin',
       'Edu_Level_6_bin', 'Edu_Level_7_bin', 'Edu_Level_8_bin',
       'Edu_Level_9_bin', 'Edu_Level_10_bin', 'Edu_Level_11_bin',
       'Edu_Level_12_bin', 'Edu_Level_13_bin', 'Edu_Level_14_bin',
       'Edu_Level_15_bin'],
      dtype='object')



In [ ]:

    
# NOTE(review): stray, never-executed call with no arguments. The populated construction
# earlier passes data, bin_regex, group_col and tot_col, so this may raise if those are
# required -- confirm against information.py and remove this cell if it is leftover scratch.
CensusFrame()



In [7]:

    
# Convenience wrapper that runs most of the calculations you will need

def run_information_calculations(censusframe, **kwargs):
    """Invoke the information-theory calculation methods of ``censusframe`` in order.

    Parameters
    ----------
    censusframe : object
        Must provide ``calculate_group_sums``, ``nhood_weights``, ``dkl_y``,
        ``entropy_y``, ``entropy_n`` and ``mutual_info`` methods.
    **kwargs
        Forwarded unchanged to ``calculate_group_sums`` only.

    Returns
    -------
    None
        Return values of the individual methods are discarded; presumably each
        method records its results on ``censusframe`` -- TODO confirm in information.py.
    """
    censusframe.calculate_group_sums(**kwargs)
    censusframe.nhood_weights()
    censusframe.dkl_y()
    # entropy_y is deliberately run twice: once with its defaults, once with conditional=False.
    censusframe.entropy_y()
    censusframe.entropy_y(conditional=False)
    censusframe.entropy_n()
    censusframe.mutual_info()

# Run the full set of calculations on cf; the keyword arguments go to calculate_group_sums.
run_information_calculations(cf, var_regex=bin_regex, var_list=['TOTPOP'])



In [8]:

    
# The information-theory values are now available as columns at two levels:

#   cf.nhood_df - neighborhood-level data
#   cf.city_df  - city-level data (indexed by whatever you passed as group_col)
# For example, preview the first rows of the city-level frame:
cf.city_df.head()









    Out[8]:






  
    
      
      DKL(n|y)_Edu_Level_0_bin
      DKL(n|y)_Edu_Level_10_bin
      DKL(n|y)_Edu_Level_11_bin
      DKL(n|y)_Edu_Level_12_bin
      DKL(n|y)_Edu_Level_13_bin
      DKL(n|y)_Edu_Level_14_bin
      DKL(n|y)_Edu_Level_15_bin
      DKL(n|y)_Edu_Level_1_bin
      DKL(n|y)_Edu_Level_2_bin
      DKL(n|y)_Edu_Level_3_bin
      ...
      H(n|y)_Edu_Level_3_bin
      H(n|y)_Edu_Level_4_bin
      H(n|y)_Edu_Level_5_bin
      H(n|y)_Edu_Level_6_bin
      H(n|y)_Edu_Level_7_bin
      H(n|y)_Edu_Level_8_bin
      H(n|y)_Edu_Level_9_bin
      H(y)
      TOTPOP
      MI
    
    
      CITY_NAME
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Abbeville, LA Micro Area
      1.037812
      0.217105
      0.564744
      0.301556
      0.973252
      1.525835
      2.771401
      0.138149
      1.213060
      0.494106
      ...
      4.927879
      4.952261
      4.745133
      4.993620
      4.562248
      5.266463
      4.926075
      2.970333
      57280
      0.207153
    
    
      Aberdeen, SD Micro Area
      2.606836
      0.075037
      0.095657
      0.140041
      0.464672
      1.015468
      2.154919
      0.030405
      2.802169
      0.696954
      ...
      4.429377
      4.292800
      3.751787
      3.367319
      3.538777
      5.030396
      4.923653
      2.802520
      40058
      0.170816
    
    
      Aberdeen, WA Micro Area
      1.695948
      0.133783
      0.179883
      0.292184
      0.677531
      1.553511
      2.329494
      0.074598
      1.970248
      0.629900
      ...
      5.162775
      4.749903
      5.254503
      5.356209
      5.191609
      5.706772
      5.650933
      3.021013
      72092
      0.211551
    
    
      Abilene, TX Metro Area
      1.995904
      0.173822
      0.433036
      0.463082
      0.824350
      2.083184
      2.348232
      0.157157
      1.754856
      1.252149
      ...
      5.794527
      5.606002
      5.708467
      5.719960
      6.007474
      6.865055
      6.779138
      3.051275
      163092
      0.350779
    
    
      Ada, OK Micro Area
      1.894614
      0.073756
      0.355089
      0.181776
      0.285433
      1.089228
      1.464415
      0.056870
      2.121542
      0.766861
      ...
      4.127014
      3.932562
      4.453291
      4.298811
      4.139683
      4.774106
      4.574255
      3.002102
      36644
      0.212329
    
  

5 rows × 52 columns



In [9]:

    
# The result frames are ordinary pandas dataframes, so the usual plotting API applies:
# scatter of MI (x) against TOTPOP (y), with a logarithmic y axis.
cf.city_df.plot(x='MI', y='TOTPOP', kind='scatter', logy=True)









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x1040f89e8>



In [33]:

    
# Export Neighborhood dataframes to a csv
# cf.nhood_df.to_csv(DATA_DIR + 'USA_dkl_edu.csv')



In [ ]:

	DKL(n\|y)_Edu_Level_0_bin	DKL(n\|y)_Edu_Level_10_bin	DKL(n\|y)_Edu_Level_11_bin	DKL(n\|y)_Edu_Level_12_bin	DKL(n\|y)_Edu_Level_13_bin	DKL(n\|y)_Edu_Level_14_bin	DKL(n\|y)_Edu_Level_15_bin	DKL(n\|y)_Edu_Level_1_bin	DKL(n\|y)_Edu_Level_2_bin	DKL(n\|y)_Edu_Level_3_bin	...	H(n\|y)_Edu_Level_3_bin	H(n\|y)_Edu_Level_4_bin	H(n\|y)_Edu_Level_5_bin	H(n\|y)_Edu_Level_6_bin	H(n\|y)_Edu_Level_7_bin	H(n\|y)_Edu_Level_8_bin	H(n\|y)_Edu_Level_9_bin	H(y)	TOTPOP	MI
CITY_NAME
Abbeville, LA Micro Area	1.037812	0.217105	0.564744	0.301556	0.973252	1.525835	2.771401	0.138149	1.213060	0.494106	...	4.927879	4.952261	4.745133	4.993620	4.562248	5.266463	4.926075	2.970333	57280	0.207153
Aberdeen, SD Micro Area	2.606836	0.075037	0.095657	0.140041	0.464672	1.015468	2.154919	0.030405	2.802169	0.696954	...	4.429377	4.292800	3.751787	3.367319	3.538777	5.030396	4.923653	2.802520	40058	0.170816
Aberdeen, WA Micro Area	1.695948	0.133783	0.179883	0.292184	0.677531	1.553511	2.329494	0.074598	1.970248	0.629900	...	5.162775	4.749903	5.254503	5.356209	5.191609	5.706772	5.650933	3.021013	72092	0.211551
Abilene, TX Metro Area	1.995904	0.173822	0.433036	0.463082	0.824350	2.083184	2.348232	0.157157	1.754856	1.252149	...	5.794527	5.606002	5.708467	5.719960	6.007474	6.865055	6.779138	3.051275	163092	0.350779
Ada, OK Micro Area	1.894614	0.073756	0.355089	0.181776	0.285433	1.089228	1.464415	0.056870	2.121542	0.766861	...	4.127014	3.932562	4.453291	4.298811	4.139683	4.774106	4.574255	3.002102	36644	0.212329