In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
from os.path import join
cwd = os.getcwd()
data_directory = join(cwd, '..', 'Data storage')
The sex, origin, race, and age columns are ALL when they have values of 0
Not clear if these are beginning or end of year values.
https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/state/
In [3]:
path = os.path.join(data_directory, 'Population data',
'st-est00int-alldata.csv')
pop1 = pd.read_csv(path)
In [4]:
pop1.head()
Out[4]:
In [5]:
pop1 = pop1.loc[(pop1['SEX'] == 0) &
(pop1['ORIGIN'] == 0) &
(pop1['RACE'] == 0) &
(pop1['AGEGRP'] == 0), :]
In [6]:
# Column names for population estimate
est_cols = ['POPESTIMATE{}'.format(x) for x in range(2000, 2011)]
In [7]:
# Melt the wide-form data into a tidy dataframe
pop1_tidy = pd.melt(pop1, id_vars='NAME',
value_vars=est_cols, var_name='Year',
value_name='Population')
In [8]:
pop1_tidy.head()
Out[8]:
In [9]:
def map_year(x):
'Return last 4 characters (the year)'
year = x[-4:]
return int(year)
pop1_tidy['Year'] = pop1_tidy['Year'].map(map_year)
The values shown below are ever slightly different than those listed in the later dataset.
In [10]:
pop1_tidy.loc[pop1_tidy['Year'] == 2010].head()
Out[10]:
In [11]:
pop1_tidy.head()
Out[11]:
In [12]:
pop1_tidy.tail()
Out[12]:
In [13]:
pop1_tidy.columns = ['State', 'Year', 'Population']
https://www.census.gov/data/tables/2016/demo/popest/state-total.html
In [14]:
path = os.path.join(data_directory, 'Population data', 'nst-est2016-01.xlsx')
pop2 = pd.read_excel(path, header=3, parse_cols='A, D:J', skip_footer=7)
In [15]:
pop2.head()
Out[15]:
In [16]:
pop2.tail()
Out[16]:
In [17]:
drop_rows = ['Northeast', 'Midwest', 'South', 'West']
pop2.drop(drop_rows, inplace=True)
In [19]:
pop2.index = pop2.index.str.strip('.')
In [20]:
pop2.head()
Out[20]:
In [21]:
pop2.columns
Out[21]:
In [22]:
pop2_tidy = pd.melt(pop2.reset_index(), id_vars='index',
value_vars=range(2010, 2017), value_name='Population',
var_name='Year')
pop2_tidy.columns = ['State', 'Year', 'Population']
In [23]:
pop_total = pd.concat([pop1_tidy, pop2_tidy])
The overlapping 2010 values are different, but just barely. I'm going to re-combine the datasets and keep values from the second dataset.
In [24]:
pop_total.loc[pop_total['Year']==2010].sort_values('State')
Out[24]:
In [25]:
pop_total = pd.concat([pop1_tidy.loc[pop1_tidy['Year'] < 2010], pop2_tidy])
In [26]:
pop_total.head()
Out[26]:
In [27]:
pop_total.tail()
Out[27]:
In [81]:
path = os.path.join('Data storage', 'Derived data', 'State population.csv')
pop_total.to_csv(path, index=False)