In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Pandas has lots of great documentation, tutorials and walkthroughs.
This tutorial is based largely on an SWC-inspired lesson by Nancy Soontiens, found at: https://nsoontie.github.io/2015-03-05-ubc/novice/python/Pandas-Lesson.html
I adapted other parts from a great tutorial by Greg Reda: http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/
More can be found in the pandas documentation: http://pandas.pydata.org/pandas-docs/stable/
A great YouTube walkthrough from PyCon 2015: https://www.youtube.com/watch?v=5JnMutdy6Fw
Lastly, a set of recent, helpful blog posts for intermediate and advanced users can be found at: https://tomaugspurger.github.io/modern-1.html
pandas introduces two new data structures to Python - Series and DataFrame, both of which are built on top of NumPy.
We can load in a tabular data set as a dataframe in a number of different ways.
In [2]:
# the gapminder file is tab-separated; spell the delimiter out explicitly
df = pd.read_csv('./gapminderDataFiveYear.txt', sep='\t')
In [3]:
df
Out[3]:
In [4]:
type(df)
Out[4]:
In [5]:
df.shape
Out[5]:
In [6]:
df.columns
Out[6]:
In [7]:
df.head()
Out[7]:
In [8]:
df.head(6)
Out[8]:
In [9]:
df.tail()
Out[9]:
In [10]:
df.info()
In [11]:
df.dtypes
Out[11]:
In [12]:
df.describe()
Out[12]:
In [13]:
#select multiple columns with a list of column names
df[['year','lifeExp']]
Out[13]:
In [14]:
#alternative selection with dot notation won't work if column names have spaces, uncommon characters or leading numbers
df.lifeExp
Out[14]:
In [15]:
#by index location
df.iloc[[0]]
Out[15]:
In [16]:
#you can provide a list of index values to select
df.iloc[[0,5,10]]
Out[16]:
In [17]:
#or select with the slice notation
df[0:5]
Out[17]:
In [18]:
#select by index label
#would require named index
country_index = df.set_index('country')
In [19]:
country_index.loc['Canada']
Out[19]:
In [20]:
#boolean indexing
large_pop = df[df['pop'] > 300000000]
In [21]:
large_pop
Out[21]:
In [22]:
large_pop['country'].unique()
Out[22]:
You can also chain together multiple criteria for boolean indexing:
In [23]:
multi_criteria = df[(df['country']=='Canada') & (df['year'] > 1990)]
In [24]:
multi_criteria
Out[24]:
In [ ]:
In [25]:
def print_stats(df, country, year):
    """Print the life expectancy, GDP per capita and population of country in year.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'country', 'year', 'lifeExp', 'gdpPercap' and 'pop' columns.
    country : str
        Value to match in the 'country' column.
    year : int
        Value to match in the 'year' column.

    Returns
    -------
    None
        The values are printed; a short notice is printed instead when no
        row matches the requested country and year.
    """
    selected = df[(df['country'] == country) & (df['year'] == year)]
    if selected.empty:
        print('No data for {} in {}'.format(country, year))
        return
    # a (country, year) pair identifies a single row; take the first match
    row = selected.iloc[0]
    print('{} in {}'.format(country, year))
    print('Life expectancy: {}'.format(row['lifeExp']))
    print('GDP per capita: {}'.format(row['gdpPercap']))
    print('Population: {}'.format(row['pop']))
In [26]:
print_stats(df, 'Canada', 2007)
In [27]:
continents = df.groupby('continent')
continents
Out[27]:
In [28]:
len(continents)
Out[28]:
In [29]:
#helpful way to visualize the groupby object: gives first row of each group
continents.first()
Out[29]:
In [ ]:
In [ ]:
You can use an aggregate function to get the mean life expectancy in the different continents
In [30]:
continents.lifeExp.mean()
Out[30]:
The previous cell showed mean life expectancy values aggregated over all the years.
Alternatively, we can groupby multiple columns and use an aggregate function to get the mean life expectancy/population/gdpPercap in a specific continent in a specific year of interest:
In [31]:
# average every numeric column per (continent, year);
# numeric_only=True leaves out the text 'country' column, which has no mean
df.groupby(['continent', 'year']).mean(numeric_only=True)
Out[31]:
You can also retrieve a particular group with the get_group() command.
In [32]:
continents.get_group('Africa').describe()
Out[32]:
In [ ]:
What country is this? When was the measurement taken? We can figure this out in a few different ways:
In [33]:
continents.get_group('Asia').lifeExp.idxmax()
Out[33]:
In [34]:
#idxmax returns the index label of the row holding the maximum value;
#a single .loc call does the row filter and the column pick in one step
df.loc[df['continent'] == 'Asia', 'lifeExp'].idxmax()
Out[34]:
In [35]:
#look the row up by the computed label rather than a hard-coded index,
#so the cell stays correct if the data or its ordering changes
df.loc[df.loc[df['continent'] == 'Asia', 'lifeExp'].idxmax()]
Out[35]:
How can we rank each country based on their lifeExp?
Let's create a new column 'lifeExp_rank' that creates an ordered ranking based on the longest life expectancy.
In [36]:
sorted_by_lifeExp = df.sort_values('lifeExp', ascending=False)
In [37]:
sorted_by_lifeExp['lifeExp_rank'] = np.arange(len(sorted_by_lifeExp)) + 1
In [38]:
#lists all rows in order of lifeExp
sorted_by_lifeExp.head()
Out[38]:
In [39]:
def ranker(df):
    """Return a copy of ``df`` with a 1-based ``lifeExp_rank`` column added.

    Ranks are assigned by row position: the first row gets 1, the second 2,
    and so on. The frame must therefore already be sorted by lifeExp in
    descending order for rank 1 to mean the highest lifeExp.

    The input frame is not modified. This function is used with
    ``groupby(...).apply``, where mutating the group that is passed in is
    not supported by pandas.
    """
    return df.assign(lifeExp_rank=np.arange(1, len(df) + 1))
In [40]:
#apply the ranking function on a per year basis:
#group_keys=False keeps the original row index instead of adding 'year' as an
#extra index level (which would duplicate the existing 'year' column)
sorted_by_lifeExp = sorted_by_lifeExp.groupby('year', group_keys=False).apply(ranker)
We can now subset the new dataframe by year to view the lifeExp ranks for each year.
In [41]:
sorted_by_lifeExp[sorted_by_lifeExp.year == 2002].head()
Out[41]:
We can also subset by country=='Canada' to see how Canada's ranking has changed over the years:
In [42]:
sorted_by_lifeExp[(sorted_by_lifeExp['country']=='Canada')]
Out[42]:
Make sure you use the following %magic command to allow for inline plotting
In [43]:
%matplotlib inline
We can specify the type of plot with the kind argument. Also, choose the independent and dependent variables with x and y arguments.
In [44]:
df.plot(x='year',y='lifeExp',kind='scatter')
Out[44]:
In [45]:
df.plot(x='gdpPercap',y='lifeExp',kind='scatter', alpha = 0.2, s=50, marker='o')
Out[45]:
What's going on with those points on the right?
High gdp per capita, yet not particularly high lifeExp. We can use boolean selection to rapidly subset and check them out.
In [46]:
df[df['gdpPercap'] > 55000]
Out[46]:
In [47]:
df.hist(column='lifeExp')
Out[47]:
In [48]:
df.lifeExp.plot.hist(bins=200)
Out[48]:
In [49]:
df['lifeExp'].plot(kind='kde')
Out[49]:
In [50]:
def compare_lifeExp(country1, country2):
    """Plot life expectancy vs year for country1 and country2.

    Draws one line per country on a shared set of axes so the two
    trajectories can be compared directly.

    Reads the notebook-level ``df`` (uses its 'country', 'year' and
    'lifeExp' columns) and ``plt`` (matplotlib.pyplot). Returns None.
    """
    fig, ax = plt.subplots()
    for name in (country1, country2):
        rows = df[df['country'] == name]
        ax.plot(rows['year'], rows['lifeExp'], marker='o', label=name)
    ax.set_xlabel('year')
    ax.set_ylabel('lifeExp')
    ax.set_title('Life expectancy: {} vs {}'.format(country1, country2))
    ax.legend(loc='best')
In [51]:
compare_lifeExp('Canada', 'Mexico')
In [52]:
spec=['country','lifeExp']
df[df['year']==1982][spec].min()
Out[52]:
We can do a quick check to look up Afghanistan's life expectancy in 1982.
In [53]:
df[(df['year']==1982) & (df['country']=='Afghanistan')]
Out[53]:
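This doesn't match the answer above because the min() function was applied to each column (country and lifeExp) independently, so the country name and the life expectancy it returned come from different rows.
The lookup should have been done like this: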
In [ ]:
In [54]:
# One scatter plot of 2007 GDP per capita vs life expectancy per continent.
# Grouping by the bare column name (not a one-element list) keeps the group
# keys plain strings, and iterating the groupby yields (key, frame) pairs.
continents = df.groupby('continent')
for continent, group in continents:
    ax = group[group['year'] == 2007].plot(kind='scatter', x='gdpPercap', y='lifeExp', title=continent)
    # identical axis limits on every figure so the continents are comparable
    ax.axis([-10000, 60000, 30, 90])
In [55]:
#Example: every continent's 2007 data overlaid on a single set of axes
fig, ax = plt.subplots(1, 1)
colours = ['m', 'b', 'r', 'g', 'y']
for continent, colour in zip(continents.groups, colours):
    group = continents.get_group(continent)
    # keep only the 2007 rows, then draw them in this continent's colour
    snapshot = group[group['year'] == 2007]
    snapshot.plot.scatter(x='gdpPercap', y='lifeExp', label=continent, ax=ax, color=colour, alpha=0.5)
ax.set_title(2007)
plt.legend(loc='lower right')
Out[55]:
In [56]:
def compare_gdp_lifeExp(df, country):
    """Plot GDP per capita against life expectancy for a given country and
    print the years of the min/max GDP per capita and life expectancy.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'country', 'year', 'gdpPercap' and 'lifeExp' columns.
    country : str
        Value to match in the 'country' column.

    Returns
    -------
    None
        A scatter plot is drawn and the years are printed; a short notice
        is printed instead when the country has no rows.
    """
    rows = df[df['country'] == country]
    if rows.empty:
        print('No data for {}'.format(country))
        return
    rows.plot(kind='scatter', x='gdpPercap', y='lifeExp', title=country)
    for column in ('gdpPercap', 'lifeExp'):
        # idxmin/idxmax give the row label; read that row's year
        year_of_min = rows.loc[rows[column].idxmin(), 'year']
        year_of_max = rows.loc[rows[column].idxmax(), 'year']
        print('{}: min in {}, max in {}'.format(column, year_of_min, year_of_max))
In [57]:
compare_gdp_lifeExp(df,'Afghanistan')
In [58]:
compare_gdp_lifeExp(df,'Canada')
In [59]:
import seaborn as sns
In [60]:
df.head()
Out[60]:
In [61]:
sns.set_context("talk")
# factorplot was renamed to catplot and its `size` argument to `height`;
# kind='point' reproduces factorplot's default point plot
sns.catplot(data=df, x='year', y='lifeExp', hue='continent', kind='point', height=8)
Out[61]:
In [62]:
sns.regplot(data=df, x='year', y='gdpPercap', fit_reg=True)
Out[62]:
In [63]:
sns.lmplot(data=df, x='year', y='gdpPercap', row='continent')
Out[63]:
In [ ]:
# factorplot was renamed to catplot; kind='bar' draws the same bar plot
sns.catplot(data=df, x='continent', y='gdpPercap', kind='bar')
Out[ ]:
In [ ]:
g = sns.FacetGrid(df, col='continent', row='year')
g.map(plt.hist, 'lifeExp')
Out[ ]: