You are going to recreate the first plot in the following article about tall Dutch people:
http://www.randalolson.com/2014/06/23/why-the-dutch-are-so-tall/
To help you on your way, we first download the dataset and clean it by interpolating the missing data; see also http://pandas.pydata.org/pandas-docs/stable/missing_data.html.
The color series used in the plot is taken from the default Tableau color scheme. The following blog post explains how to use it in your own plots: http://www.randalolson.com/2014/06/28/how-to-make-beautiful-data-visualizations-in-python-with-matplotlib/. It also shows how to add the country annotations.
Try to reproduce the plot as closely as possible. Good luck!!
Do you agree with the conclusions from the article?
In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
# Download the world-heights table (one "Year" column plus one column per
# country, as used by the plotting code in this notebook).
heights_url = 'http://files.figshare.com/1545826/world_heights.csv'
heights = pd.read_csv(heights_url)

# Fill the gaps in the table by interpolation (pandas' default method) so
# every country can be drawn as a continuous line.
heights_cleaned = heights.interpolate()
In [135]:
# Bare expression kept from the original cell; it has no effect on the plot.
heights_cleaned

# A plot usually looks best when it is ~1.33x wider than tall.
# Common sizes: (10, 7.5) and (12, 9); this figure uses the latter.
plt.figure(figsize=(12, 9))

# Remove the plot frame lines. They are unnecessary chartjunk.
ax = plt.subplot(111)
for side in ("top", "bottom", "right", "left"):
    ax.spines[side].set_visible(False)

# Label the x axis every 25 years from 1800 up to 2000.
tick_years = range(1800, 2025, 25)
plt.xticks(tick_years, [str(year) for year in tick_years], fontsize=14)
plt.yticks(fontsize=14)

# Keep tick positions on the bottom and left axes only.
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
# These are the "Tableau 20" colors as 0-255 RGB triples.
tableau20 = [
    (31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229),
]

# Scale the RGB values to the [0, 1] range, which is the format matplotlib
# accepts. The result is still a list of 3-tuples, indexable by rank.
tableau20 = [(r / 255., g / 255., b / 255.) for r, g, b in tableau20]
# Countries to draw, in alphabetical order; the position in this list also
# selects each country's color from the palette.
countries = sorted([
    "The Netherlands",
    "Italy",
    "France",
    "Sweden",
    "U.S.A.",
    "Germany",
    "Denmark",
])
# Draw faint dashed horizontal guide lines every 5 cm so values can be read
# off the chart without a full grid. The x range is the same for every line,
# so it is built once outside the loop.
guide_years = range(1800, 2013)
for y in range(155, 186, 5):
    plt.plot(guide_years, [y] * len(guide_years), "--", lw=0.5, color="black", alpha=0.3)

# Hide every tick mark but keep the bottom and left tick labels.
# Booleans are used on purpose: the legacy "on"/"off" strings are not
# understood by current Matplotlib, where a non-empty string such as "off"
# is simply truthy and would leave the ticks visible.
plt.tick_params(axis="both", which="both", bottom=False, top=False,
                labelbottom=True, left=False, right=False, labelleft=True)
# Vertical nudges (in cm) applied to individual labels, on top of the common
# -0.1 shift, so that neighbouring country names do not overlap.
label_offsets = {"U.S.A.": 0.3, "Italy": -0.2, "France": -0.5}

for rank, column in enumerate(countries):
    # Look the country's column up once and reuse it for line and label.
    series = heights_cleaned[column].values

    # Plot each line separately with its own color, using the Tableau 20
    # color set in order.
    plt.plot(heights_cleaned.Year.values, series, lw=2.5, color=tableau20[rank])

    # Place the country name just right of the plotted range, at the height
    # of the last data point, in the same color as its line.
    y_pos = series[-1] - 0.1 + label_offsets.get(column, 0.0)
    plt.text(2014.5, y_pos, column, fontsize=12, color=tableau20[rank])
# Restrict the visible data window; the title, country labels and source
# line below are deliberately positioned outside of it.
plt.ylim(160, 185)
plt.xlim(1818, 2013)

# Chart title, centered above the axes.
plt.text(1925, 186, "Median male height (cm) in various countries, 1820-2013", fontsize=20, ha="center")

# Always include your data source(s) and copyright notice! And for your
# data sources, tell your viewers exactly where the data came from,
# preferably with a direct link to the data. Just telling your viewers
# that you used data from the "U.S. Census Bureau" is completely useless:
# the U.S. Census Bureau provides all kinds of data, so how are your
# viewers supposed to know which data set you used?
plt.text(1810, 158, "Sources: dx.doi.org/10.6084/m9.figshare.1066523 | Author: Randy Olson (randalolson.com / @randal_olson)", fontsize=10)

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
output_path = "generatedCharts/median_male_height.png"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# bbox_inches="tight" crops the saved image to the drawn content, which
# includes the text placed outside the axes.
plt.savefig(output_path, bbox_inches="tight")
In [ ]: