In [1]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
import sys
# Point this to the folder that contains information.py.
# NOTE(review): this is an absolute, machine-specific path — edit it before running elsewhere.
sys.path.append('/Users/joe/Sites/SFI/stats_nhood/statistics_neighborhoods')
from information import CensusFrame
# If you are editing the information.py module, uncomment the two lines below
# (IPython autoreload) so your edits are picked up without restarting the kernel:
# %load_ext autoreload
# %autoreload 2
In [4]:
# Folder and file name of the census extract to load.
DATA_DIR = '/Users/joe/Dropbox/SFI_CensusData/UnitedStates/'
CENSUS_FILE = '2010acs_edu.csv'
# Read the ID column as text, then index the frame by ID while also keeping
# ID available as an ordinary column (drop=False).
df = (
    pd.read_csv(DATA_DIR + CENSUS_FILE, dtype={'ID': 'str'})
    .set_index('ID', drop=False)
)
In [5]:
# Select the columns to use as bins with a regular expression.
# DataFrame.filter(regex=...) matches with re.search, so the pattern is a regex and not a
# shell glob: the previous 'Edu_Level*' meant "Edu_Leve" followed by zero or more "l".
# A plain substring pattern expresses the intent directly.
edu_regex = 'Edu_Level'
edu_cols = df.filter(regex=edu_regex).columns
# It's helpful to add '_bin' to the bin column names so later regexes include only those.
# Columns that already carry the suffix are skipped, so re-running this cell does not
# produce names ending in '_bin_bin'.
df = df.rename(
    columns={col: col + '_bin' for col in edu_cols if not col.endswith('_bin')}
)
In [6]:
# Build the CensusFrame. Keyword arguments:
#   data      - the dataframe holding the data
#   bin_regex - the regex that will match the bin columns
#   group_col - the column to group by (e.g. city id)
#   tot_col   - the column to use as the total for each row
#               (create this from the bins, if necessary)
bin_regex = 'Edu.*_bin$'
cf = CensusFrame(data=df, bin_regex=bin_regex, group_col='CITY_NAME', tot_col='TOTPOP')
# Sanity check: display the matched columns to confirm only the intended bins are included.
cf.filter(regex=bin_regex).columns
Out[6]:
In [ ]:
# NOTE(review): this cell has no execution count and calls CensusFrame with no arguments,
# unlike the configured call earlier in the notebook — looks like leftover scratch work;
# confirm and remove it so Restart & Run All is not affected.
CensusFrame()
In [7]:
# This helper runs most of the calculations you will need
def run_information_calculations(censusframe, **kwargs):
    """Run the standard sequence of information calculations on ``censusframe``.

    Calls, in this order: ``calculate_group_sums(**kwargs)``, ``nhood_weights()``,
    ``dkl_y()``, ``entropy_y()``, ``entropy_y(conditional=False)``, ``entropy_n()``
    and ``mutual_info()``. Every return value is discarded.

    NOTE(review): presumably each method stores its result on the object itself
    (later cells read ``cf.city_df`` / ``cf.nhood_df``) — confirm in information.py.

    Parameters
    ----------
    censusframe
        Object exposing the methods listed above (e.g. a CensusFrame).
    **kwargs
        Forwarded unchanged to ``calculate_group_sums`` only.

    Returns
    -------
    None
    """
    # The call order is kept exactly as in the original cell; later steps may
    # depend on earlier ones.
    censusframe.calculate_group_sums(**kwargs)
    censusframe.nhood_weights()
    censusframe.dkl_y()
    censusframe.entropy_y()
    censusframe.entropy_y(conditional=False)
    censusframe.entropy_n()
    censusframe.mutual_info()
# Run the full calculation sequence on the CensusFrame; both keyword arguments
# are passed through to calculate_group_sums.
run_information_calculations(cf, var_regex=bin_regex, var_list=['TOTPOP'])
In [8]:
# Now you can access either neighborhood- or city-level data, with the
# information-theory values in the columns:
#   cf.nhood_df - neighborhood-level data
#   cf.city_df  - city-level data (the index is whatever you passed to group_col)
# For example:
cf.city_df.head()
Out[8]:
In [9]:
# These attributes can be plotted like ordinary pandas dataframes (because they are):
# scatter of 'MI' (x) against 'TOTPOP' (y), with a log-scaled y axis.
cf.city_df.plot(x='MI', y='TOTPOP', kind='scatter', logy=True)
Out[9]:
In [33]:
# Export the neighborhood-level dataframe to a CSV file (uncomment the line below to run)
# cf.nhood_df.to_csv(DATA_DIR + 'USA_dkl_edu.csv')
In [ ]: