In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
In [5]:
# Fetch the butterfat measurements over HTTP into a DataFrame.
BUTTERFAT_URL = "http://roybatty.org/butterfat.csv"
data = pd.read_csv(BUTTERFAT_URL)
In [6]:
# Dimensions of the loaded table, as (rows, columns).
data.shape
Out[6]:
In [7]:
# Peek at the first three rows.
data.head(3)
Out[7]:
In [8]:
# Peek at the last three rows.
data.tail(3)
Out[8]:
In [9]:
# Histogram of the butterfat percentages, drawn with the ggplot look.
plt.style.use("ggplot")
ax = data.butterfat.hist(bins=13)
ax.set_xlabel("Percentage butter fat")
ax.set_ylabel("Frequency")
## plt.savefig("butterfat-hist.png") # uncomment to save to a file
Out[9]:
In [10]:
# Same histogram, with the sample mean marked by a dashed vertical line.
plt.style.use("ggplot")
ax = data.butterfat.hist(bins=13)
ax.set_xlabel("Percentage butter fat")
ax.set_ylabel("Frequency")
mean_fat = data.butterfat.mean()
ax.vlines(mean_fat, 0, 20, linestyle="dashed")
Out[10]:
In [11]:
# Histogram with both location estimates marked:
# mean as a dashed line, median as a dotted line.
plt.style.use("ggplot")
ax = data.butterfat.hist(bins=13)
ax.set_xlabel("Percentage butter fat")
ax.set_ylabel("Frequency")
centre_mean = data.butterfat.mean()
centre_median = data.butterfat.median()
ax.vlines(centre_mean, 0, 20, linestyle="dashed")
ax.vlines(centre_median, 0, 20, linestyle="dotted")
Out[11]:
In [12]:
# Append three artificial outliers to a copy of the measurements.
# The histogram itself still shows the original data; the blue lines show
# where the mean (dashed) and median (dotted) land once the outliers are
# included, next to the default-coloured lines for the original data.
newbutterfat = list(data.butterfat) + [10, 15, 8]
ax = data.butterfat.hist(bins=13)
ax.set_xlabel("Percentage butter fat")
ax.set_ylabel("Frequency")
line_opts = dict(ymin=0, ymax=20, linewidth=2)
ax.vlines(data.butterfat.mean(), linestyle="dashed", **line_opts)
ax.vlines(data.butterfat.median(), linestyle="dotted", **line_opts)
ax.vlines(np.mean(newbutterfat), linestyle="dashed", color="blue", **line_opts)
ax.vlines(np.median(newbutterfat), linestyle="dotted", color="blue", **line_opts)
plt.savefig('butterfat-hist-outliers.png')
In [13]:
# Distribution with dashed lines one standard deviation either side of the
# mean, plus labelled arrows pointing at the mean and at each line.
m = data.butterfat.mean()
s = data.butterfat.std()
ax = data.butterfat.hist(bins=13)
ax.set_xlabel("Percentage butter fat")
ax.set_ylabel("Frequency")
ax.vlines(m + s, 0, 20, linestyle="dashed", linewidth=2)
ax.vlines(m - s, 0, 20, linestyle="dashed", linewidth=2)
# draw arrow to mean
ax.annotate("Mean", xy=(m, 0),
            xytext=(0, 20),
            textcoords='offset points',  # text offset is measured in points
            horizontalalignment="center",
            verticalalignment="bottom",
            arrowprops=dict(arrowstyle="->", color='black'))
# draw arrow to +1 std dev; the negative curvature mirrors the -1 arrow
ax.annotate("+1 std dev", xy=(m + s, 15),
            xytext=(50, 20),
            textcoords='offset points',
            horizontalalignment="center",
            verticalalignment="bottom",
            arrowprops=dict(arrowstyle="->", color='black',
                            connectionstyle='arc3,rad=-0.5',
                            linewidth=1))
# draw arrow to -1 std dev
ax.annotate("-1 std dev", xy=(m - s, 15),
            xytext=(-50, 20),
            textcoords='offset points',
            horizontalalignment="center",
            verticalalignment="bottom",
            arrowprops=dict(arrowstyle="->", color='black',
                            connectionstyle='arc3,rad=0.5',
                            linewidth=1))
# plt.savefig('butterfat-hist-stddev.pdf')
Out[13]:
In [15]:
# Histogram with +/-1 standard deviation lines and a horizontal boxplot of
# the same data drawn above the bars (centred at y=22, 3 units tall).
m = data.butterfat.mean()
s = data.butterfat.std()
fig, ax = plt.subplots(1, 1)
ax.hist(data.butterfat, bins=13)
ax.set_xlabel("Percentage butter fat")
ax.set_ylabel("Frequency")
# draw arrow to mean
ax.annotate("Mean", xy=(m, 0),
            xytext=(0, 20),
            textcoords='offset points',
            horizontalalignment="center",
            verticalalignment="bottom",
            arrowprops=dict(arrowstyle="->", color='black'))
ax.vlines(m + s, 0, 25, linestyle="dashed", linewidth=2)
ax.vlines(m - s, 0, 25, linestyle="dashed", linewidth=2)
# manage_ticks=False keeps the numeric Frequency ticks; by default boxplot
# replaces the y ticks with a single tick at the box position labelled "1".
ax.boxplot(data.butterfat, vert=False,
           positions=[22], widths=[3],
           manage_ticks=False)
ax.set_ylim(0, 25)
#plt.savefig('butterfat-hist-boxplot.png')
pass  # suppress the echoed return value of the last call
In [11]:
print("Petal Length, lowest decile: ", iris.Petal_Length.quantile(0.1))
print("Petal length, lower quartile:", iris.Petal_Length.quantile(0.25))
print("Petal Length, median: ", iris.Petal_Length.quantile(0.5))
print("Petal length, upper quartile:", iris.Petal_Length.quantile(0.75))
print("Petal Length, upper decile: ", iris.Petal_Length.quantile(0.9))
In [168]:
iris = pd.read_csv("http://roybatty.org/iris.csv")
In [196]:
# Effect of bin count on the histogram of sepal width: four panels sharing
# both axes, one per bin count, saved to a PDF.
sw = iris["Sepal.Width"]
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, sharex=True, sharey=True)
fig.set_size_inches(10, 4)
for panel, n_bins in zip((ax1, ax2, ax3, ax4), (5, 10, 15, 20)):
    panel.hist(sw, bins=n_bins)
    panel.set_title("{} bins".format(n_bins))
# The y axis is shared, so only the left-most panel carries the label.
ax1.set_ylabel("Frequency")
# One x label centred under all panels. The standard iris measurements are
# in centimetres — NOTE(review): confirm this CSV uses the same units.
fig.text(0.5, 0.02, 'Sepal Width (cm)', ha='center')
fig.savefig("iris-hist-binsize.pdf")