back to the matplotlib-gallery
at https://github.com/rasbt/matplotlib-gallery
In [1]:
%load_ext watermark
In [2]:
%watermark -u -v -d -p matplotlib,numpy
[More info](http://nbviewer.ipython.org/github/rasbt/python_reference/blob/master/ipython_magic/watermark.ipynb) about the `%watermark` extension
In [3]:
%matplotlib inline
In [26]:
import numpy as np
import random
from matplotlib import pyplot as plt
data = np.random.normal(0, 20, 1000)
# fixed bin size
bins = np.arange(-100, 100, 5) # fixed bin size
plt.xlim([min(data)-5, max(data)+5])
plt.hist(data, bins=bins, alpha=0.5)
plt.title('Random Gaussian data (fixed bin size)')
plt.xlabel('variable X (bin size = 5)')
plt.ylabel('count')
plt.show()
In [30]:
import numpy as np
import random
import math
from matplotlib import pyplot as plt
data = np.random.normal(0, 20, 1000)
bins = np.linspace(math.ceil(min(data)),
math.floor(max(data)),
20) # fixed number of bins
plt.xlim([min(data)-5, max(data)+5])
plt.hist(data, bins=bins, alpha=0.5)
plt.title('Random Gaussian data (fixed number of bins)')
plt.xlabel('variable X (20 evenly spaced bins)')
plt.ylabel('count')
plt.show()
In [4]:
import numpy as np
import random
from matplotlib import pyplot as plt
data1 = [random.gauss(15,10) for i in range(500)]
data2 = [random.gauss(5,5) for i in range(500)]
bins = np.arange(-60, 60, 2.5)
plt.xlim([min(data1+data2)-5, max(data1+data2)+5])
plt.hist(data1, bins=bins, alpha=0.3, label='class 1')
plt.hist(data2, bins=bins, alpha=0.3, label='class 2')
plt.title('Random Gaussian data')
plt.xlabel('variable X')
plt.ylabel('count')
plt.legend(loc='upper right')
plt.show()
The line plot below is using bins of a histogram and is particularly useful if you are working with many different overlapping data sets.
In [33]:
# Generate a random Gaussian dataset with different means
# 5 rows with 30 columns, where every row represents 1 sample.
import numpy as np
data = np.ones((5,30))
for i in range(5):
data[i,:] = np.random.normal(loc=i/2, scale=1.0, size=30)
Via the numpy.histogram
function, we can categorize our data into distinct bins.
In [34]:
from math import floor, ceil # for rounding up and down
data_min = floor(data.min()) # minimum val. of the dataset rounded down
data_max = floor(data.max()) # maximum val. of the dataset rounded up
bins_size = 0.5
bins = np.arange(floor(data_min), ceil(data_max), bin_size)
np.histogram(data[0,:], bins=bins)
Out[34]:
The numpy.histogram
function returns a tuple, where the first value is an array of how many samples fall into the first bin, the second bin, and so forth.
The second value is another NumPy array; it contains the specified bins. Note that all bins but the last one are half open intervals, e.g., the first bin would be [-2, -1.5)
(including -2, but not including -1.5), and the second bin would be [-1.5, -1.)
(including -1.5, but not including 1.0). But the last bin is defined as [2., 2.5]
(including 2 and including 2.5).
In [57]:
from matplotlib import pyplot as plt
markers = ['^', 'v', 'o', 'p', 'x', 's', 'p', ',']
plt.figure(figsize=(13,8))
for row in range(data.shape[0]):
hist = np.histogram(data[row,:], bins=bins)
plt.errorbar(hist[1][:-1] + bin_size/2,
hist[0],
alpha=0.3,
xerr=bin_size/2,
capsize=0,
fmt=None,
linewidth=8,
)
plt.legend(['sample %s'%i for i in range(1, 6)])
plt.grid()
plt.title('Histogram showing bar heights but without area under the bars', fontsize=18)
plt.ylabel('count', fontsize=14)
plt.xlabel('X value (bin size = %s)'%bin_size, fontsize=14)
plt.xticks(bins + bin_size)
plt.show()
In [ ]: