In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import wget
import os
# Dependencies: wget, xlrd
In this post I will be downloading data from the HMRC on income in the UK. The data provides the mean and median income before and after tax split by age and gender. You can see the original data sources:
and
The data is several xls files. These have no standard naming convention, so we will need to download them on a somewhat adhoc bases. Let's do this first, saving them to a local dir with names year-range.xls.
In [1]:
data_dir = "./DataIncomeInvestigation"
if not os.path.isdir(data_dir):
os.makedirs(data_dir)
In [2]:
file_names_1999_2009 = [
"table3_2_september04.xls",
"table3-2-2000-01.xls",
"table-32-2001-02.xls",
"table3_2.xls",
"3_2_apr06.xls",
"table3-2-2004-05.xls",
"table3-2-jan08.xls",
"3-2tabledec08.xls",
"3-2table-jan2010.xls",
"",
"3-2table-feb2012.xls"]
years_1999_2009 = ["{}-{}".format(i, i+1) for i in range(1999, 2010)]
url_base = "http://webarchive.nationalarchives.gov.uk/20120405152450/http://hmrc.gov.uk/stats/income_distribution/{}"
for file, year in zip(file_names_1999_2009, years_1999_2009):
fname = wget.download(url_base.format(file),
out="{}/{}.xls".format(data_dir, year))
print "Downloaded {} for year {} to {}".format(file, year, fname)
In [ ]:
url_list_2010_2013 = ['https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/267112/table3-2-1.xls',
'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/276222/table3-2-12.xls',
'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/399053/Table_3_2_13.xls']
years_2010_2013 = ['2010-2011', '2011-2012', '2012-2013']
for url, year in zip(url_list_2010_2013, years_2010_2013):
fname = wget.download(url, out="{}/{}.xls".format(data_dir, year))
print "Downloaded {} \nfor year {} to {}".format(url, year, fname)
Okay, now that we have downloaded the data we will import it. Unfortunately although all
the spreadsheets are in a similar format, they are not exactly the same. To simplify the
process then we will first construct some helper functions. The general principle will be
to read each spreadsheet as a dataframe, sanitise it, then add it to a total dataframe df.
Each spreadsheet will have the data in a slightly different place, we we will, by hand,
set the row and columns of interest and check that the correct data is imported:
Note: For now we restrict our focus to the total data and ignore the gender split.
In [2]:
years = ["{}-{}".format(i, i+1) for i in range(1999, 2013)]
def ReadData(file_name, **kwargs):
""" Read in the data and print it for checking """
print("Reading in data from {}".format(file_name))
df = pd.read_excel(file_name,
index_col=None,
header=None,
names=['age',
'Number',
'MedianIncomeBeforeTax',
'MeanIncomeBeforetax'],
**kwargs)
return df
def RecordData(year, df=None, **kwargs):
""" Helper function to add the data for year to a data frame df """
if type(df) != pd.core.frame.DataFrame:
df = pd.DataFrame()
file_name = "{}/{}.xls".format(data_dir, year)
dfNew = ReadData(file_name, **kwargs)
dfNew.dropna(inplace=True)
dfNew['year'] = year
print "Adding the following data to the frame:"
print(dfNew)
return df.append(dfNew, ignore_index=True)
In [3]:
i = 0
df = RecordData(years[i], skiprows=68, skip_footer=12, parse_cols=[0, 2, 3, 5])
In [4]:
i = 1
df = RecordData(years[i], df=df, skiprows=68, skip_footer=10, parse_cols=[0, 2, 4, 9])
In [5]:
df = RecordData(years[2], df=df, skiprows=68, skip_footer=11, parse_cols=[0, 2, 4, 9])
In [6]:
df = RecordData(years[3], df=df, skiprows=68, skip_footer=12, parse_cols=[0, 1, 2, 4])
In [7]:
df = RecordData(years[4], df=df, skiprows=14, skip_footer=65, parse_cols=[0, 2, 3, 5])
In [8]:
df = RecordData(years[5], df=df, skiprows=14, skip_footer=65, parse_cols=[0, 2, 3, 5])
In [9]:
df = RecordData(years[6], df=df, skiprows=14, skip_footer=65, parse_cols=[0, 2, 3, 5])
In [10]:
df = RecordData(years[7], df=df, skiprows=14, skip_footer=65, parse_cols=[0, 2, 3, 5])
In [11]:
df = RecordData(years[8], df=df, skiprows=14, skip_footer=66, parse_cols=[0, 2, 3, 5])
In [12]:
# THE DATA FOR 2008-2009 DOES NOT EXIST: So we create Nans
df_2008_2009 = df[df.year=='2007-2008'].copy()
df_2008_2009.set_value(df_2008_2009.index,
['Number', 'MedianIncomeBeforeTax', 'MeanIncomeBeforetax'],
np.nan)
df_2008_2009['year'] = years[9]
df = df.append(df_2008_2009, ignore_index=True)
In [13]:
df = RecordData(years[10], df=df, skiprows=14, skip_footer=109, parse_cols=[0, 2, 3, 5],
sheetname="3.2")
In [14]:
df = RecordData(years[11], df=df, skiprows=14, skip_footer=106, parse_cols=[0, 2, 3, 5])
In [15]:
df = RecordData(years[12], df=df, skiprows=14, skip_footer=101, parse_cols=[0, 2, 3, 5])
In [16]:
df = RecordData(years[13], df=df, skiprows=14, skip_footer=101, parse_cols=[0, 2, 3, 5])
In [17]:
df = df.drop_duplicates()
df
Out[17]:
In [22]:
years_total_val = [int(s.split("-")[0]) for s in years]
age_ranges = df[df.year == '1999-2000'].age.values
fig, ax = plt.subplots(figsize=(10, 5))
NUM_COLORS = len(age_ranges)
cm = plt.get_cmap('gist_rainbow')
ax.set_color_cycle([cm(1.*i/NUM_COLORS) for i in range(NUM_COLORS)])
for age in age_ranges:
ax.plot(years_total_val, df[df.age == age]['MeanIncomeBeforetax'], "-o",
lw=2, label=age)
ax.set_xticks(years_total_val)
ax.set_xticklabels(years, rotation=45)
plt.legend(bbox_to_anchor=(1.3, 1.0))
plt.show()
In [27]:
age_ranges_values = range(len(age_ranges))
fig, ax = plt.subplots(figsize=(10, 5))
NUM_COLORS = len(years)
cm = plt.get_cmap('gist_rainbow')
ax.set_color_cycle([cm(1.*i/NUM_COLORS) for i in range(NUM_COLORS)])
for yr in years:
ax.plot(age_ranges_values, df[df.year == yr]['MeanIncomeBeforetax'], "-o",
label=yr)
ax.set_xticks(age_ranges_values)
ax.set_xticklabels(age_ranges, rotation=45)
plt.legend(bbox_to_anchor=(1.3, 1.0))
plt.show()
In [ ]: