Goal: practice pandas groupby operations on state populations from the 2010 Census.
References: http://pandas.pydata.org/pandas-docs/dev/groupby.html
Note the split-apply-combine framework of thinking that groupby embodies.
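As a minimal sketch of that framework (a toy example, not one of the Census cells below): split the rows on a key column, apply an aggregation to each group, and combine the results into a Series.

import pandas as pd
toy = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})
toy.groupby('key')['val'].sum()   # split on 'key', apply sum per group, combine: a -> 3, b -> 3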
In [1]:
# note version of pandas
import pandas
pandas.__version__
Out[1]:
In [2]:
import us
import census
import settings
import pandas as pd
import numpy as np
from pandas import DataFrame, Series
from itertools import islice
c = census.Census(settings.CENSUS_KEY)
def states(variables='NAME'):
    geo = {'for': 'state:*'}
    states_fips = set([state.fips for state in us.states.STATES])
    # need to filter out non-states
    for r in c.sf1.get(variables, geo=geo, year=2010):
        if r['state'] in states_fips:
            yield r
In [3]:
# make a dataframe from the total populations of states in the 2010 Census
df = DataFrame(states('NAME,P0010001'))
df.P0010001 = df.P0010001.astype('int')
df.head()
Out[3]:
In [4]:
# check that we have the right total population
df.P0010001.sum() == 308745538
Out[4]:
In [5]:
# add a column with the first letter
# we'll be grouping states based on the first letter of the state NAME
df['first_letter'] = df.NAME.apply(lambda s:s[0])
df.head()
Out[5]:
Grouping can be done in many ways -- as the pandas documentation puts it, "The mapping can be specified many different ways": by a column name, by a Series or array of labels, by a dict, or by a function applied to the axis labels.
We can also group by columns (axis=1), although that isn't demonstrated in the cells below; a small aside with a couple of alternative mappings follows.
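An aside, assuming the df built above (with its default integer index): the grouping key can also be a function applied to the axis labels, including the column labels when axis=1 (column-wise grouping was deprecated in much later pandas versions).

df.groupby(lambda i: i % 2).size()   # group rows by a function of the integer index labels
df.groupby(len, axis=1).groups       # group the columns by the length of the column name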
In [6]:
# we can explicitly pass the Series df.first_letter as the grouping key
grouped = df.groupby(df.first_letter)
grouped
Out[6]:
In [7]:
# passing the column name 'first_letter' is shorthand for df.first_letter
# same thing as df.groupby(df.first_letter)
grouped = df.groupby('first_letter')
grouped
Out[7]:
In [8]:
# count the number of states with each first letter -- look at top of the resulting DataFrame
grouped.count().head()
Out[8]:
In [9]:
# count the number of states with each first letter -- look at bottom of the resulting DataFrame
grouped.count().tail()
Out[9]:
In [10]:
# we didn't have to explicitly create a new column -- we could groupby on a dynamically generated Series
# note the use of str operations on df.NAME:
# http://pandas.pydata.org/pandas-docs/stable/basics.html#vectorized-string-methods
df.groupby(df.NAME.str.slice(0,1)).sum()
Out[10]:
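A small aside: the same first-letter key can be built with other vectorized string accessors, for example:

df.groupby(df.NAME.str[0])['P0010001'].sum()   # .str[0] takes the first character, like .str.slice(0, 1)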
In [11]:
# we can get groups of indexes
df.groupby('first_letter').groups
Out[11]:
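An aside: .groups is just a dict mapping each group key to the row labels in that group, so the number of states per letter, say, can be computed straight from it:

groups = df.groupby('first_letter').groups
{letter: len(labels) for letter, labels in groups.items()}   # states per first letter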
In [12]:
# loop through all groups --> here just the first one
for name, group in islice(df.groupby('first_letter'), 1):
    print name
    print type(group)  # yes -- a DataFrame
    print group.index
    print group
In [13]:
# how about accessing group 'C'?
# http://stackoverflow.com/a/14734627/7782
# http://stackoverflow.com/questions/19804282/in-pandas-is-there-something-like-a-groupby-get-group-but-with-an-optional-defa
# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.core.groupby.GroupBy.get_group.html
grouped = df.groupby('first_letter')
grouped.get_group('C')
Out[13]:
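The second Stack Overflow link above asks whether get_group can take a default for a missing key; it can't, but a small wrapper sketches the idea (get_group_default is a made-up name, not a pandas method):

def get_group_default(grouped, key, default=None):
    # return the sub-DataFrame for key, or default if there is no such group
    try:
        return grouped.get_group(key)
    except KeyError:
        return default

get_group_default(grouped, 'Z')   # no state starts with 'Z', so this returns None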
In [14]:
# total population of states starting with 'C'
grouped.get_group('C').P0010001.sum()
Out[14]:
In [15]:
# generate a Series of total populations by first letter
grouped = df.groupby('first_letter')
s = grouped['P0010001'].sum()
s
Out[15]:
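An aside: an equivalent formulation passes an explicit aggregation function to agg:

grouped['P0010001'].agg(np.sum)   # same per-letter totals as .sum()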
In [16]:
# sort the Series to find the most populous groups
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.sort.html
# note sorting in place
s.sort(ascending=False)
s
Out[16]:
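Note that Series.sort was later removed from pandas; assuming a version that has sort_values (roughly 0.17 onward), the non-in-place equivalent would be:

s = s.sort_values(ascending=False)   # returns a sorted copy rather than sorting in place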
In [17]:
s.index
Out[17]:
In [18]:
# first pass at pulling together the letter, the total population, and the list of corresponding states
for k in s.index:
    print k, s[k], list(grouped.get_group(k).NAME)
In [19]:
# let's try this again in a more idiomatic pandas fashion
# ideally, generate a DataFrame with a NAME column that is a list of all states with the same first letter
# it turns out that apply can come to the rescue
# here we calculate the list of NAMEs
# http://pandas.pydata.org/pandas-docs/dev/groupby.html#flexible-apply
# http://stackoverflow.com/questions/19530568/can-pandas-groupby-aggregate-into-a-list-rather-than-sum-mean-etc
df.groupby("first_letter").apply(lambda x: list(x['NAME']))
Out[19]:
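An equivalent way to get the same Series of lists (an aside) is to select the NAME column first and apply list to each group:

df.groupby("first_letter")['NAME'].apply(list)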
In [20]:
# apply can be used to add up the populations by group
df.groupby("first_letter").apply(lambda x: np.sum(x['P0010001']))
Out[20]:
In [21]:
# make a tuple out of the list of names and the population
df.groupby("first_letter").apply(lambda x:( list(x['NAME']), np.sum(x['P0010001'])))
Out[21]:
In [22]:
# remind ourselves how to turn a tuple into a Series, using a small example
Series(([1,2],2), index=['one','two'])
Out[22]:
In [23]:
# we're ready to make a new DataFrame
df.groupby("first_letter").apply(lambda x:Series((list(x['NAME']), np.sum(x['P0010001'])),
index=['states','total_pop']))
Out[23]:
In [24]:
df2 = df.groupby("first_letter").apply(lambda x: Series((list(x['NAME']), np.sum(x['P0010001'])),
                                                        index=['states','total_pop'])).sort_index(by='total_pop', ascending=False)
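An aside: sort_index(by=...) was later replaced by sort_values; assuming a newer pandas, the final step would instead read:

df2 = df2.sort_values(by='total_pop', ascending=False)   # newer-pandas equivalent of .sort_index(by=...)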
In [25]:
# make sure you understand the syntax here:
# .ix: http://pandas.pydata.org/pandas-docs/dev/indexing.html#advanced-indexing-with-ix
# ability to grab columns by name to return a new DataFrame
df2.ix['C'][['states','total_pop']]
Out[25]:
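An aside: .ix has since been removed from pandas; with label-based .loc the same lookup would be:

df2.loc['C', ['states', 'total_pop']]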
In [26]:
print "states that start with 'C'", df2.ix['C']['states']
In [27]:
print "total population of states that start with 'C'", df2.ix['C']['total_pop']