In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pyl
# This is an example of an IPython magic command.
# It asks matplotlib to render plots inline, directly below the cell that draws them.
%matplotlib inline
Data looks better naked
What in the world does that mean?
Discussions of slide and data presentation often refer back to Edward Tufte and his book "The Visual Display of Quantitative Information."
We can define naked data in terms of data-ink:
Data-ink is the non-erasable core of the graphic, the non-redundant ink arranged in response to variation in the numbers represented.
“Perfection is achieved not when there is nothing more to add, but when there is nothing left to take away”
– Antoine de Saint-Exupery
In [2]:
# Load the letter-frequency table. The file is read without a header row,
# its first column becomes the index, and the data column is named "Frequency".
dfLetterFrequency = pd.read_csv(
    '../data/letter_frequency.csv',
    header=None,
    index_col=0,
    names=['Frequency'],
)
In [29]:
# Baseline chart: pandas' built-in bar plot of the frequency table, 10 x 6 inches.
dfLetterFrequency.plot.bar(figsize=(10, 6))
Out[29]:
In [ ]:
# Twenty colours given as (red, green, blue) triples with integer channels
# on a 0-255 scale, two colours per row.
tableau20 = [
    (31, 119, 180), (174, 199, 232),
    (255, 127, 14), (255, 187, 120),
    (44, 160, 44), (152, 223, 138),
    (214, 39, 40), (255, 152, 150),
    (148, 103, 189), (197, 176, 213),
    (140, 86, 75), (196, 156, 148),
    (227, 119, 194), (247, 182, 210),
    (127, 127, 127), (199, 199, 199),
    (188, 189, 34), (219, 219, 141),
    (23, 190, 207), (158, 218, 229),
]
In [ ]:
# Rescale every colour in place from 0-255 channel values to 0-1 floats,
# the RGB form later passed to matplotlib as a bar colour.
# Re-running this cell divides the values by 255 again.
tableau20[:] = [
    (red / 255., green / 255., blue / 255.)
    for red, green, blue in tableau20
]
In [ ]:
# Bar layout constants shared by the plotting cells below.
# `np` is NumPy; it must be imported in the notebook's import cell,
# otherwise this cell raises NameError on a fresh kernel.
N = 26              # number of bars (one per letter)
ind = np.arange(N)  # the x locations for the groups: 0, 1, ..., N - 1
width = 0.8         # the width of the bars, in x-axis units
In [ ]:
# Open a new 12 x 9 inch figure for the hand-styled chart.
pyl.figure(
    figsize=(12, 9),
)
In [ ]:
# Take the single subplot of the current figure and hide all four
# border lines (spines) around the plotting area.
ax = pyl.subplot(111)
for side in ("top", "bottom", "right", "left"):
    ax.spines[side].set_visible(False)
In [ ]:
# Restrict ticks to the bottom edge (x axis) and the left edge (y axis).
ax.xaxis.tick_bottom()
ax.yaxis.tick_left()
In [ ]:
# Shift each tick half a bar-width to the right of its x location and
# label the ticks with the index values of the frequency table.
ax.set_xticks(ind + width / 2)
ax.set_xticklabels(dfLetterFrequency.index.values)
In [ ]:
# Fix the visible range: 0-14 on the y axis, 0-26 on the x axis.
pyl.ylim((0, 14))
pyl.xlim((0, 26))
In [ ]:
# Label the y axis every 2 units (0 through 12) as percentages, and use
# the same 14-point font for the tick labels on both axes.
pyl.yticks(
    range(0, 14, 2),
    ["{}%".format(tick) for tick in range(0, 14, 2)],
    fontsize=14,
)
pyl.xticks(fontsize=14)
In [ ]:
# Draw a faint dotted horizontal guide line at every labelled y value,
# running from x = 0 to x = 25.
for level in range(0, 14, 2):
    plt.plot(
        range(0, 26),
        [level] * 26,
        ":",
        lw=0.5,
        color="black",
        alpha=0.3,
    )
In [ ]:
# Hide every tick mark (major and minor) on all four sides while keeping
# the bottom and left tick labels.
# Booleans are used because the legacy "on"/"off" strings were deprecated
# and later dropped by matplotlib; a non-empty string such as "off" is
# truthy, so it would leave the ticks visible.
plt.tick_params(
    axis="both",
    which="both",
    bottom=False,
    top=False,
    labelbottom=True,
    left=False,
    right=False,
    labelleft=True,
)
In [ ]:
# Draw one semi-transparent bar per letter in the first palette colour.
# align="edge" pins each bar's left edge to its x location. The tick offset
# (ind + 0.5 * width) and the x-limits starting at 0 both assume this.
# matplotlib >= 2.0 centres bars on `ind` by default, which would shift the
# bars off their labels and clip the first bar at x = 0.
plt.bar(
    ind,
    dfLetterFrequency.Frequency.values,
    width,
    color=tableau20[0],
    alpha=0.5,
    align="edge",
)
Make the title big enough so it spans the entire plot, but don't make it so big that it requires two lines to show.
Note that if the title is descriptive enough, it is unnecessary to include axis labels.
In [ ]:
# Title drawn as plain text in data coordinates, horizontally centred
# on x = 6 at y = 13.5.
pyl.text(
    6, 13.5,
    "Letter Frequency in English Writing",
    fontsize=17,
    ha="center",
)
In [ ]:
# Small data-source credit placed at x = 0, y = -1 (below the y-range of the plot).
pyl.text(
    0, -1,
    "Data source: Cryptological Mathematics, Robert Lewand.",
    fontsize=10,
)
In [28]:
# Keep a handle on the current figure and write it to disk with tight
# bounding-box cropping. `savefig` is called on the figure because no bare
# `savefig` name is in scope with the imports this notebook uses.
# NOTE(review): with the inline backend, figures are typically closed when
# the cell that drew them finishes, so this probably has to run in the same
# cell as the plotting code to save the populated figure. Confirm.
dataviz = plt.gcf()
dataviz.savefig("../outputs/letter_frequency.png", bbox_inches="tight")