In [1]:

    
%load_ext watermark



In [2]:

    
%watermark -a 'Sebastian Raschka' -v -d -p numpy,scipy,matplotlib,scikit-learn









    



Sebastian Raschka 24/08/2014 

CPython 3.4.1
IPython 2.1.0

numpy 1.8.1
scipy 0.14.0
matplotlib 1.3.1
scikit-learn 0.15.0b1

Matplotlib examples - visualization techniques for exploratory data analysis

There are numerous visualization techniques that are useful for exploratory data analysis. In practice, the choice depends heavily on the kind of data and the question at hand.

This IPython notebook is a small gallery for visualizing the Iris flower dataset. The gallery is meant primarily as a matplotlib code reference, so individual plots may be more or less useful in the context of this particular dataset.

Reading in the dataset

[back to top]



In [4]:

    
import pandas as pd

# Map column position -> feature name (including the unit).
feature_dict = dict(enumerate((
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
)))

# Read the CSV file directly from the UCI machine learning repository.
# The file has no header row, so the column names are passed in via `names`
# (the four features in index order, followed by the class label).
df = pd.read_csv(
    filepath_or_buffer='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    header=None,
    sep=',',
    names=[feature_dict[i] for i in sorted(feature_dict)] + ['class label'],
)

# Drop rows in which every field is missing (the empty line at the end of the
# file). Reassigning instead of using inplace=True keeps the cell re-runnable.
df = df.dropna(how="all")

df.tail()









    Out[4]:






  
    
      
      sepal length in cm
      sepal width in cm
      petal length in cm
      petal width in cm
      class label
    
  
  
    
      145
       6.7
       3.0
       5.2
       2.3
       Iris-virginica
    
    
      146
       6.3
       2.5
       5.0
       1.9
       Iris-virginica
    
    
      147
       6.5
       3.0
       5.2
       2.0
       Iris-virginica
    
    
      148
       6.2
       3.4
       5.4
       2.3
       Iris-virginica
    
    
      149
       5.9
       3.0
       5.1
       1.8
       Iris-virginica



In [5]:

    
from sklearn.preprocessing import LabelEncoder

# Convert the pandas DataFrame to plain numpy arrays.
# The columns carry string names at this point, so the four feature columns
# are selected by position; label-based `df[[0, 1, 2, 3]]` would look for
# columns literally named 0..3.
X = df.iloc[:, :4].values
y = df['class label'].values

# Convert the class labels from strings to integers.
enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y)

# Display names for the integer class labels.
# NOTE(review): assumes the encoder assigns 0/1/2 to setosa/versicolor/virginica
# (alphabetical order of the label strings) - check `label_encoder.classes_`.
label_dict = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}

# Short feature names (without unit), keyed by column index of X.
attr_dict = {0: 'sepal length',
             1: 'sepal width',
             2: 'petal length',
             3: 'petal width'}

Pie chart

[back to top]



In [6]:

    
%matplotlib inline



In [7]:

    
from matplotlib import pyplot as plt
import numpy as np

# Sample count and display name for each of the three classes, in class order.
species_counts = [np.count_nonzero(y == i) for i in range(3)]
species_names = [label_dict[i] for i in range(3)]

plt.pie(
    species_counts,
    labels=species_names,
    colors=('yellowgreen', 'lightskyblue', 'gold'),
    autopct='%1.1f%%',  # print each wedge's share as a percentage
    startangle=90,      # rotate the start of the pie counter-clockwise by 90 degrees
    shadow=True,
    )

plt.title('Class distribution of the 3 different flower species')
plt.legend(loc='lower right', fancybox=True)

# equal axis scaling so that the pie is drawn as a circle
plt.axis('equal')
plt.tight_layout()

plt.show()

Bar plot

[back to top]



In [8]:

    
import matplotlib.pyplot as plt

# Per-class feature means: mean_vals[i] holds the mean of each of the four
# features over the samples of class i.
mean_vals = [X[y==i,:].mean(axis=0) for i in range(3)]

# Feature names, used as x tick labels (one group of bars per feature).
labels = [attr_dict[i] for i in range(4)]

# Left edge of every bar group and the width of a single bar.
pos = np.arange(4)
width = 0.2

fig, ax = plt.subplots(figsize=(8,6))

# One bar series per class, each shifted right by one bar width.
# align='edge' anchors every bar at its left edge, so a group spans
# [p, p + 3*width] and its centre is p + 1.5*width (used for the ticks below)
# regardless of the matplotlib default alignment.
for i, color in zip(range(3), ('g', 'b', 'r')):
    ax.bar(pos + i * width, mean_vals[i], width,
           align='edge',
           alpha=0.5,
           color=color,
           label=label_dict[i])   # a series is a species, not a feature

# Axis labels and ticks (ticks centred below each group of three bars)
ax.set_ylabel('cm')
ax.set_title('Average values for the flower dimensions')
ax.set_xticks(pos + 1.5 * width)
ax.set_xticklabels(labels)

# x-axis limits: a little padding left and right of the outermost groups
ax.set_xlim(pos.min() - width, pos.max() + width*4)

# Legend entries come from the `label` given to each bar series
ax.legend(loc='upper right')

# horizontal grid lines
ax.yaxis.grid(True)

plt.show()

Box plot

[back to top]



In [10]:

    
fig = plt.figure(figsize=(8,6))
ax = plt.subplot(111)

# One box per class, showing the petal width (feature column 3).
bplot = ax.boxplot([X[y==i,3] for i in range(3)],
        notch=True,          # notch shape
        vert=True,           # vertical box alignment
        sym='ko',            # black circle for outliers
        patch_artist=True)   # boxes are patches, so they can be filled with color

# choosing custom colors to fill the boxes
colors = ['pink', 'lightblue', 'lightgreen']
for patch, color in zip(bplot['boxes'], colors):
    patch.set_facecolor(color)

# modifying the whiskers: straight lines, black, wider
for whisker in bplot['whiskers']:
    whisker.set(color='black', linewidth=1.2, linestyle='-')

# making the caps a little bit wider
for cap in bplot['caps']:
    cap.set(linewidth=1.2)

# Hide the tick marks but keep the tick labels. Booleans are used because
# the strings "on"/"off" are both truthy and are not interpreted as flags by
# current matplotlib releases.
ax.tick_params(axis="both", which="both", bottom=False, top=False,
        labelbottom=True, left=False, right=False, labelleft=True)

# adding horizontal grid lines
ax.yaxis.grid(True)

# The boxes sit at x = 1, 2, 3: exactly one tick position per class label.
# (The loop variable is deliberately not called `y`, which is the label array.)
ax.set_xticks([i + 1 for i in range(3)])
ax.set_xticklabels([label_dict[i] for i in range(3)])
ax.set_ylim([0,3])

ax.set_title('Petal widths of the three different flower species')
ax.set_ylabel('cm')
plt.show()

1D Histogram

[back to top]



In [11]:

    
import math

# One panel per feature, arranged in a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12,6))

for ax,cnt in zip(axes.ravel(), range(4)):

    # Shared bin edges for all classes: 25 evenly spaced edges between the
    # feature's minimum (rounded down) and maximum (rounded up).
    min_b = math.floor(np.min(X[:,cnt]))
    max_b = math.ceil(np.max(X[:,cnt]))
    bins = np.linspace(min_b, max_b, 25)

    # plotting the histograms, one semi-transparent layer per class
    for lab,col in zip(range(3), ('blue', 'red', 'green')):
        ax.hist(X[y==lab, cnt],
                   color=col,
                   label=label_dict[lab],
                   bins=bins,
                   alpha=0.5,)
    # read the auto-scaled y range after all three layers have been drawn
    ylims = ax.get_ylim()

    # plot annotation; the y range is extended by 2 counts to leave head room
    ax.legend(loc='upper right', fancybox=True, fontsize=8)
    ax.set_ylim([0, max(ylims)+2])
    ax.set_xlabel(feature_dict[cnt])
    ax.set_title('Iris histogram #%s' %str(cnt+1))

    # adding horizontal grid lines
    ax.yaxis.grid(True)

    # Hide the tick marks but keep the tick labels. Booleans are used because
    # the strings "on"/"off" are both truthy and are not interpreted as flags
    # by current matplotlib releases.
    ax.tick_params(axis="both", which="both", bottom=False, top=False,
            labelbottom=True, left=False, right=False, labelleft=True)

    # remove axis spines
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)

# only the left column of panels gets a y-axis label
axes[0][0].set_ylabel('count')
axes[1][0].set_ylabel('count')

fig.tight_layout()

plt.show()

2D Histogram

[back to top]



In [12]:

    
fig = plt.figure(figsize=(6,6))

# 2D histogram of petal length (column 2) vs. petal width (column 3) for the
# samples of class 0.
hist, xedges, yedges = np.histogram2d(X[y==0,2], X[y==0,3])

plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.title('Petal dimensions of Setosa')

# np.histogram2d puts the first variable along axis 0 (rows), whereas imshow
# draws rows along the vertical axis starting at the top. Transposing and
# using origin='lower' puts petal length on the x-axis with values increasing
# upwards/rightwards; `extent` maps the image onto the bin edges so that the
# axes show cm, matching the axis labels, instead of bin indices.
plt.imshow(hist.T,
           origin='lower',
           extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]],
           aspect='auto',            # the two value ranges differ, so do not force square pixels
           interpolation='nearest')  # show the bins as crisp cells
plt.grid(True)
plt.colorbar()

plt.show()

3D Histogram

[back to top]



In [13]:

    
# Imported for its side effect: older matplotlib releases need this import
# to make projection='3d' available.
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')

# 4x4 histogram of petal length (column 2) vs. petal width (column 3) over all
# samples; hist[i, j] is the count for x-bin i and y-bin j.
hist, xedges, yedges = np.histogram2d(X[:,2], X[:,3], bins=4)

# Actual bin widths along each axis.
x_width = np.diff(xedges)
y_width = np.diff(yedges)

# Every bar covers the central half of its bin: it starts a quarter of a bin
# width after the lower bin edge and is half a bin width wide.
# indexing='ij' makes the flattened grids follow the same [i, j] order as
# hist.ravel(), so each bar height is paired with the correct bin.
xpos, ypos = np.meshgrid(xedges[:-1] + 0.25 * x_width,
                         yedges[:-1] + 0.25 * y_width,
                         indexing='ij')
dx, dy = np.meshgrid(0.5 * x_width, 0.5 * y_width, indexing='ij')

xpos = xpos.ravel()
ypos = ypos.ravel()
zpos = np.zeros_like(xpos)   # all bars start at z = 0
dx = dx.ravel()
dy = dy.ravel()
dz = hist.ravel()            # bar height = bin count

ax.set_xlabel('petal length (cm)')
ax.set_ylabel('petal width (cm)')
ax.set_zlabel('count')
plt.title('Petal length and width distributions for all samples')

ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color='b', zsort='average')

plt.show()

Scatter plot

[back to top]



In [14]:

    
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(111)

# One color and marker symbol per class.
colors = ['blue', 'red', 'green']
markers = ['s', 'o', '^']

# Petal length (column 2) against petal width (column 3), one series per class.
for lab, c, m in zip(range(3), colors, markers):
    ax.scatter(X[y==lab, 2], X[y==lab, 3],
               c=c,       # color
               marker=m,  # marker symbol
               s=40,      # marker size
               alpha=0.4, # transparency
               label=label_dict[lab]
               )

# Column 2 is the petal length, so the x-axis is labelled accordingly.
ax.set_xlabel('petal length (cm)')
ax.set_ylabel('petal width (cm)')
ax.legend(loc='upper left')
ax.grid()

plt.show()

3D Scatter plot

[back to top]



In [15]:

    
# Imported for its side effect: older matplotlib releases need this import
# to make projection='3d' available.
from mpl_toolkits.mplot3d import Axes3D

### plot scatterplot

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')

# One color and marker symbol per class.
colors = ['blue', 'red', 'green']
markers = ['s', 'o', '^']

# Sepal width (column 1), petal length (column 2) and petal width (column 3),
# one series per class.
for lab, c, m in zip(range(3), colors, markers):
    ax.scatter(X[y==lab, 1], X[y==lab, 2], X[y==lab, 3],
               c=c,      # color
               marker=m, # marker symbol
               s=40,     # marker size
               alpha=0.7 # transparency
               )

ax.set_xlabel('sepal width (cm)')
ax.set_ylabel('petal length (cm)')   # column 2 is the petal length
ax.set_zlabel('petal width (cm)')
plt.show()


### plot legend
# The legend is drawn as a separate, otherwise empty figure.
fig,ax = plt.subplots(1,1)

# Empty dummy series: they draw nothing but provide one legend handle per
# class with the same color and marker as in the scatter plot above.
# (Empty lists are used because current matplotlib rejects None as plot data,
# and the former `ax.hold(True)` call is gone because that method no longer
# exists; adding to an existing axes is the default behavior.)
for lab, c, m in zip(range(3), colors, markers):
    ax.plot([], [],
            marker=m, ls='',
            c=c, label=label_dict[lab])   # legend entries are the species names

ax.legend(loc='upper left',
          numpoints=1) # show marker symbol in legend only once

# Hide tick marks and tick labels. Booleans are used because the strings
# "on"/"off" are both truthy and are not interpreted as flags by current
# matplotlib releases.
ax.tick_params(axis="both", which="both", bottom=False, top=False,
       labelbottom=False, left=False, right=False, labelleft=False)

# remove axis spines
for side in ("top", "right", "bottom", "left"):
    ax.spines[side].set_visible(False)

plt.show()

	sepal length in cm	sepal width in cm	petal length in cm	petal width in cm	class label
145	6.7	3.0	5.2	2.3	Iris-virginica
146	6.3	2.5	5.0	1.9	Iris-virginica
147	6.5	3.0	5.2	2.0	Iris-virginica
148	6.2	3.4	5.4	2.3	Iris-virginica
149	5.9	3.0	5.1	1.8	Iris-virginica