In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib 
import matplotlib.pyplot as plt

In [5]:
# Fetch the butterfat measurements (a single-column CSV) into a DataFrame.
BUTTERFAT_URL = "http://roybatty.org/butterfat.csv"
data = pd.read_csv(BUTTERFAT_URL)

In [6]:
# Show the frame's dimensions as (rows, columns).
data.shape


Out[6]:
(120, 1)

In [7]:
# Peek at the first three rows.
data.head(3)


Out[7]:
butterfat
0 4.32
1 4.00
2 3.89

In [8]:
# Peek at the last three rows.
data.tail(3)


Out[8]:
butterfat
117 4.28
118 3.92
119 4.09

In [9]:
# Histogram of the butterfat percentages, drawn with the ggplot style sheet.
plt.style.use("ggplot")
hist_ax = data["butterfat"].hist(bins=13)
hist_ax.set_xlabel("Percentage butter fat")
hist_ax.set_ylabel("Frequency")
# To write the figure to disk, uncomment the next line:
# plt.savefig("butterfat-hist.png")


Out[9]:
<matplotlib.text.Text at 0x10a73f898>

In [10]:
# Same histogram, with the sample mean marked by a dashed vertical line.
plt.style.use("ggplot")
hist_ax = data["butterfat"].hist(bins=13)
hist_ax.set_xlabel("Percentage butter fat")
hist_ax.set_ylabel("Frequency")

# the line runs from y=0 to y=20 at the mean
mean_value = data["butterfat"].mean()
hist_ax.vlines(mean_value, 0, 20, linestyle="dashed")


Out[10]:
<matplotlib.collections.LineCollection at 0x108da9668>

In [11]:
# Histogram with both the mean (dashed) and the median (dotted) marked.
plt.style.use("ggplot")
hist_ax = data["butterfat"].hist(bins=13)
hist_ax.set_xlabel("Percentage butter fat")
hist_ax.set_ylabel("Frequency")

# vertical lines from y=0 to y=20 at the mean and at the median
mean_value = data["butterfat"].mean()
median_value = data["butterfat"].median()
hist_ax.vlines(mean_value, 0, 20, linestyle="dashed")
hist_ax.vlines(median_value, 0, 20, linestyle="dotted")


Out[11]:
<matplotlib.collections.LineCollection at 0x10aace5c0>

In [12]:
# Compare mean/median of the original data against a copy that has three
# artificial outliers appended. The histogram itself shows the original data.
newbutterfat = list(data.butterfat) + [10, 15, 8]

hist_ax = data.butterfat.hist(bins=13)
hist_ax.set_xlabel("Percentage butter fat")
hist_ax.set_ylabel("Frequency")

# (x position, extra style) for each marker: dashed = mean, dotted = median;
# the blue pair belongs to the data with outliers added
marker_specs = [
    (data.butterfat.mean(), dict(linestyle="dashed")),
    (data.butterfat.median(), dict(linestyle="dotted")),
    (np.mean(newbutterfat), dict(linestyle="dashed", color="blue")),
    (np.median(newbutterfat), dict(linestyle="dotted", color="blue")),
]
for x_pos, extra_style in marker_specs:
    hist_ax.vlines(x_pos, 0, 20, linewidth=2, **extra_style)

plt.savefig('butterfat-hist-outliers.png')



In [13]:
# Distribution plus standard deviations: histogram with dashed lines one
# standard deviation either side of the mean, each feature labelled by an arrow.
m = data.butterfat.mean()
s = data.butterfat.std()
data.butterfat.hist(bins=13) 
plt.xlabel("Percentage butter fat")
plt.ylabel("Frequency")

# dashed verticals (y=0..20) at mean + s and mean - s
plt.vlines(m + s, 0, 20, linestyle="dashed", linewidth=2)
plt.vlines(m - s, 0, 20, linestyle="dashed", linewidth=2)

# draw arrow to mean 
plt.annotate("Mean", xy=(m,0),
             xytext=(0,20),
             textcoords='offset points', # text offset from xy, measured in points
             horizontalalignment="center",
             verticalalignment="bottom",
             arrowprops=dict(arrowstyle="->",color='black'))

# draw arrow to +1 std dev
# The original call had no arrowprops, so only the text was drawn. The arrow
# mirrors the -1 std dev one, hence the opposite sign on the arc radius.
plt.annotate("+1 std dev", xy=(m+s,15),
             xytext=(50,20),
             textcoords='offset points',
             horizontalalignment="center",
             verticalalignment="bottom",
             arrowprops=dict(arrowstyle="->",color='black',
                             connectionstyle='arc3,rad=-0.5',
                             linewidth=1))

# draw arrow to -1 std dev
plt.annotate("-1 std dev", xy=(m-s,15),
             xytext=(-50,20),
             textcoords='offset points',
             horizontalalignment="center",
             verticalalignment="bottom",
             arrowprops=dict(arrowstyle="->",color='black',
                             connectionstyle='arc3,rad=0.5',
                             linewidth=1))


# plt.savefig('butterfat-hist-stddev.pdf')


Out[13]:
<matplotlib.text.Annotation at 0x10ae2ec88>

In [15]:
# Histogram of butterfat with mean +/- one standard deviation marked, plus a
# horizontal box plot of the same data drawn on the same axes.
m = data.butterfat.mean()
s = data.butterfat.std()

fig, ax = plt.subplots(1, 1)
ax.hist(data.butterfat, bins=13)
ax.set_xlabel("Percentage butter fat")
ax.set_ylabel("Frequency")

# arrow pointing at the mean on the x axis; label sits 20 points above the tip
mean_arrow = dict(arrowstyle="->", color='black')
ax.annotate("Mean", xy=(m, 0), xytext=(0, 20), textcoords='offset points',
            horizontalalignment="center", verticalalignment="bottom",
            arrowprops=mean_arrow)

# dashed verticals one standard deviation either side of the mean
for x_pos in (m + s, m - s):
    ax.vlines(x_pos, 0, 25, linestyle="dashed", linewidth=2)

# box plot centred at y=22 with width 3, i.e. near the top of the 0-25 y range
ax.boxplot(data.butterfat, vert=False, positions=[22], widths=[3])
ax.set_ylim(0, 25)
#plt.savefig('butterfat-hist-boxplot.png')
pass  # final no-op statement so the cell does not echo a return value



In [11]:
# Print selected quantiles of iris petal length.
#
# Load the data here so the cell works on a fresh kernel: the original cell
# read `iris` before any cell in the notebook had defined it.
iris = pd.read_csv("http://roybatty.org/iris.csv")

# The recorded output came from a "Petal_Length" column, while the histogram
# cell reads "Sepal.Width" from the same CSV, so accept either spelling.
# NOTE(review): confirm the actual column name in iris.csv and drop the fallback.
petal_col = "Petal_Length" if "Petal_Length" in iris.columns else "Petal.Length"
petal_length = iris[petal_col]

print("Petal Length, lowest decile: ", petal_length.quantile(0.1))
print("Petal length, lower quartile:", petal_length.quantile(0.25))
print("Petal Length, median: ", petal_length.quantile(0.5))
print("Petal length, upper quartile:", petal_length.quantile(0.75))
print("Petal Length, upper decile: ", petal_length.quantile(0.9))


Petal Length, lowest decile:  1.4
Petal length, lower quartile: 1.6
Petal Length, median:  4.35
Petal length, upper quartile: 5.1
Petal Length, upper decile:  5.8

In [168]:
# Load the iris CSV from roybatty.org into a DataFrame.
# NOTE(review): a cell earlier in the notebook also uses `iris`; make sure the
# data is loaded before its first use when running the notebook top to bottom.
iris = pd.read_csv("http://roybatty.org/iris.csv")

In [196]:
# Effect of bin count on a histogram: the same sepal-width data drawn with
# 5, 10, 15 and 20 bins in four side-by-side panels sharing both axes.
sw = iris["Sepal.Width"]
bin_counts = (5, 10, 15, 20)
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, sharex=True, sharey=True,
                                         figsize=(10, 4))
for panel, n_bins in zip((ax1, ax2, ax3, ax4), bin_counts):
    panel.hist(sw, bins=n_bins)
    panel.set_title("{} bins".format(n_bins))

# the y axis is shared, so only the left-most panel carries the label
ax1.set_ylabel("Frequency")

# One x label centred under all four panels. The iris measurements are
# recorded in centimetres, not millimetres as the original label claimed.
fig.text(0.5, 0.02, 'Sepal Width (cm)', ha='center')
fig.savefig("iris-hist-binsize.pdf")