Visualizing and Representing Bivariate Relationships



In [112]:

    
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbn
import pandas as pd
import scipy.stats as stats



In [17]:

    
# Switch seaborn to its plain "white" theme, which draws no grid lines.
sbn.set_style(style="white")



In [18]:

    
# Read the iris measurements straight from the course's GitHub repository.
irisurl = "https://raw.githubusercontent.com/Bio204-class/bio204-datasets/master/iris.csv"
iris = pd.read_csv(filepath_or_buffer=irisurl)



In [19]:

    
# Peek at the first three rows to check the columns that were loaded.
iris.head(n=3)









    Out[19]:






  
    
      
      Sepal.Length
      Sepal.Width
      Petal.Length
      Petal.Width
      Species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa



In [20]:

    
# Rename columns for convenience: drop the literal "." from each name
# (e.g. "Sepal.Length" -> "SepalLength") so columns can be reached with
# attribute access such as `iris.SepalLength`.
# regex=False is explicit on purpose: interpreted as a regular expression,
# "." matches *every* character, and the default for `regex` differs
# between pandas versions.
iris.columns = iris.columns.str.replace('.', '', regex=False)
iris.head(3)









    Out[20]:






  
    
      
      SepalLength
      SepalWidth
      PetalLength
      PetalWidth
      Species
    
  
  
    
      0
      5.1
      3.5
      1.4
      0.2
      setosa
    
    
      1
      4.9
      3.0
      1.4
      0.2
      setosa
    
    
      2
      4.7
      3.2
      1.3
      0.2
      setosa



In [21]:

    
# One sub-frame per species, selected with a boolean mask on the Species column.
setosa = iris.loc[iris["Species"] == 'setosa']
versicolor = iris.loc[iris["Species"] == 'versicolor']
virginica = iris.loc[iris["Species"] == 'virginica']

Scatter plot



In [38]:

    
# Scatter plot of setosa sepal width against sepal length using the
# matplotlib Axes.scatter method.
fig, ax = plt.subplots()
ax.scatter(setosa.SepalLength, setosa.SepalWidth, marker='o', s=30, color='steelblue')
# The iris measurements are recorded in centimetres, not millimetres.
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Sepal Width (cm)')
sbn.despine()  # remove the top and right axis spines
pass  # trailing no-op so the cell displays only the figure



In [72]:

    
# Same scatter plot as before, but with an equal aspect ratio so that one
# unit on the x axis is drawn the same length as one unit on the y axis.
fig, ax = plt.subplots()
ax.scatter(setosa.SepalLength, setosa.SepalWidth, marker='o', s=30, color='steelblue')
# The iris measurements are recorded in centimetres, not millimetres.
ax.set_xlabel('Sepal Length (cm)')
ax.set_ylabel('Sepal Width (cm)')
ax.set_aspect('equal') # <-- aspect ratio specified here
sbn.despine()  # remove the top and right axis spines
pass  # trailing no-op so the cell displays only the figure



In [71]:

    
# Scatter plot with marginal histograms using seaborn.jointplot.
# Note that jointplot returns an object of type JointGrid.
# Since there are three sets of axes, the JointGrid
# object provides a method `set_axis_labels`
# to properly set labels.
# https://seaborn.pydata.org/generated/seaborn.JointGrid.html

# x and y are passed by keyword: current seaborn releases treat the first
# positional argument as `data` and reject further positional arguments.
g = sbn.jointplot(x=setosa.SepalLength, y=setosa.SepalWidth)
g.set_axis_labels("Sepal Length", "Sepal Width")
pass  # trailing no-op so the cell displays only the figure

Density plots



In [68]:

    
# Bivariate kernel density estimate drawn as contour lines.
# x and y are passed by keyword: current seaborn releases treat the first
# positional argument as `data` and reject further positional arguments.
sbn.kdeplot(x=setosa.SepalLength, y=setosa.SepalWidth)
sbn.despine()  # remove the top and right axis spines
pass  # trailing no-op so the cell displays only the figure



In [103]:

    
# Bivariate kernel density estimate drawn as filled contours.
# x and y are passed by keyword (positional vectors are rejected by current
# seaborn), and `fill=True` replaces the deprecated `shade=True`.
sbn.kdeplot(x=setosa.SepalLength, y=setosa.SepalWidth, fill=True)
sbn.despine()  # remove the top and right axis spines
pass  # trailing no-op so the cell displays only the figure



In [83]:

    
# Density contours with the raw observations overlaid.
# x and y are passed by keyword: current seaborn releases treat the first
# positional argument as `data` and reject further positional arguments.
sbn.kdeplot(x=setosa.SepalLength, y=setosa.SepalWidth)
# the zorder argument makes sure the points are drawn on top of the
# density contours
plt.scatter(setosa.SepalLength, setosa.SepalWidth, color='forestgreen', s=30, zorder=10)
sbn.despine()  # remove the top and right axis spines
pass  # trailing no-op so the cell displays only the figure



In [122]:

    
# Joint plot whose central panel is a bivariate kernel density estimate.
# x and y are passed by keyword: current seaborn releases treat the first
# positional argument as `data` and reject further positional arguments.
sbn.jointplot(x=setosa.SepalLength, y=setosa.SepalWidth, kind='kde')
pass  # trailing no-op so the cell displays only the figure









    Out[122]:





<seaborn.axisgrid.JointGrid at 0x7fac8f0f9828>



In [123]:

    
# Joint plot whose central panel is a hexagonally binned 2-D histogram.
# x and y are passed by keyword: current seaborn releases treat the first
# positional argument as `data` and reject further positional arguments.
sbn.jointplot(x=setosa.SepalLength, y=setosa.SepalWidth, kind='hex')
pass  # trailing no-op so the cell displays only the figure



In [120]:

    
# Grid of pairwise plots for the variables in the setosa sub-frame.
sbn.pairplot(data=setosa)
pass



In [ ]:

Linear vs Non-linear relationships

In the plot below there is a clear relationship between $x$ and $y$, but it is non-linear: $y$ follows $x^3$ (plus noise) rather than a straight line.



In [119]:

    
# Simulate a non-linear (cubic) relationship: both x and y are built from the
# same evenly spaced grid on [-1, 1], with independent Gaussian noise
# (mean 0, sd 0.1) added to each.
# Fixed, distinct seeds make the figure reproducible on re-run while keeping
# the two noise vectors different from each other.
grid = np.linspace(-1, 1, 100)
x = grid + stats.norm.rvs(0, 0.1, size=100, random_state=204)
y = grid**3 + stats.norm.rvs(0, 0.1, size=100, random_state=205)
plt.scatter(x, y)
plt.xlabel('x')
plt.ylabel('y')
pass  # trailing no-op so the cell displays only the figure









    Out[119]:





<matplotlib.collections.PathCollection at 0x7fac8ff92a58>

Statistical measures of association

covariance



In [100]:

    
# Sample covariance matrix of sepal length and width; ddof=1 divides by
# (n - 1) rather than n.
np.cov(m=setosa["SepalLength"], y=setosa["SepalWidth"], ddof=1)









    Out[100]:





array([[ 0.12424898,  0.09921633],
       [ 0.09921633,  0.1436898 ]])



In [98]:

    
# Same covariance matrix, computed with the DataFrame.cov method on the
# two sepal columns.
setosa.loc[:, ["SepalWidth", "SepalLength"]].cov()









    Out[98]:






  
    
      
      SepalWidth
      SepalLength
    
  
  
    
      SepalWidth
      0.143690
      0.099216
    
    
      SepalLength
      0.099216
      0.124249



In [102]:

    
# Covariance matrix for all pairs of measurement variables.
# The numeric columns are selected explicitly: `setosa` still carries the
# string-valued Species column, and recent pandas releases raise an error
# instead of silently dropping non-numeric columns.
setosa[["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]].cov()









    Out[102]:






  
    
      
      SepalLength
      SepalWidth
      PetalLength
      PetalWidth
    
  
  
    
      SepalLength
      0.124249
      0.099216
      0.016355
      0.010331
    
    
      SepalWidth
      0.099216
      0.143690
      0.011698
      0.009298
    
    
      PetalLength
      0.016355
      0.011698
      0.030159
      0.006069
    
    
      PetalWidth
      0.010331
      0.009298
      0.006069
      0.011106

correlation



In [125]:

    
# Pearson correlation matrix of sepal length and width.
# No `ddof` is passed: numpy documents that argument as deprecated and
# without effect for corrcoef (the normalisation cancels out in a correlation).
np.corrcoef(setosa.SepalLength, setosa.SepalWidth)









    Out[125]:





array([[ 1.        ,  0.74254669],
       [ 0.74254669,  1.        ]])



In [128]:

    
# Correlation matrix of the two sepal measurements via DataFrame.corr.
setosa.loc[:, ['SepalLength', 'SepalWidth']].corr()









    Out[128]:






  
    
      
      SepalLength
      SepalWidth
    
  
  
    
      SepalLength
      1.000000
      0.742547
    
    
      SepalWidth
      0.742547
      1.000000



In [129]:

    
# Correlation matrix for all pairs of measurement variables.
# The numeric columns are selected explicitly: `setosa` still carries the
# string-valued Species column, and recent pandas releases raise an error
# instead of silently dropping non-numeric columns.
setosa[["SepalLength", "SepalWidth", "PetalLength", "PetalWidth"]].corr()









    Out[129]:






  
    
      
      SepalLength
      SepalWidth
      PetalLength
      PetalWidth
    
  
  
    
      SepalLength
      1.000000
      0.742547
      0.267176
      0.278098
    
    
      SepalWidth
      0.742547
      1.000000
      0.177700
      0.232752
    
    
      PetalLength
      0.267176
      0.177700
      1.000000
      0.331630
    
    
      PetalWidth
      0.278098
      0.232752
      0.331630
      1.000000

Anscombe's quartet -- a cautionary tale



In [130]:

    
# Anscombe's synthetic data set is available through seaborn's
# load_dataset helper; show the first five rows.
anscombe = sbn.load_dataset(name='anscombe')
anscombe.head(n=5)

The basic descriptive statistics for $x$ and $y$ are quite similar across the four groups.



In [131]:

    
# Summary statistics of x and y, computed separately for each data set.
anscombe.groupby(by='dataset').describe()



In [132]:

    
# Split the quartet into one sub-frame per data set label.
groupI, groupII, groupIII, groupIV = (
    anscombe[anscombe.dataset == label] for label in ("I", "II", "III", "IV")
)

The pairwise correlations within the groups also look very similar.



In [133]:

    
# Correlation between x and y in group I. The columns are selected
# explicitly because the frame also holds the string-valued `dataset`
# column, which recent pandas releases refuse to correlate.
groupI[["x", "y"]].corr()



In [134]:

    
# Correlation between x and y in group II. The columns are selected
# explicitly because the frame also holds the string-valued `dataset`
# column, which recent pandas releases refuse to correlate.
groupII[["x", "y"]].corr()



In [135]:

    
# Correlation between x and y in group III. The columns are selected
# explicitly because the frame also holds the string-valued `dataset`
# column, which recent pandas releases refuse to correlate.
groupIII[["x", "y"]].corr()



In [136]:

    
# Correlation between x and y in group IV. The columns are selected
# explicitly because the frame also holds the string-valued `dataset`
# column, which recent pandas releases refuse to correlate.
groupIV[["x", "y"]].corr()



In [147]:

    
# Draw the four Anscombe groups side by side on a 2x2 grid, one scatter
# plot per group, each with its own colour and title.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))
(ax1, ax2), (ax3, ax4) = axes

panels = [
    (ax1, groupI, 'steelblue', "Group I"),
    (ax2, groupII, 'forestgreen', "Group II"),
    (ax3, groupIII, 'firebrick', "Group III"),
    (ax4, groupIV, 'purple', "Group IV"),
]

for ax, group, colour, heading in panels:
    ax.scatter(group.x, group.y, color=colour, s=30)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_title(heading)

# Adjust subplot spacing so labels and titles do not overlap.
fig.tight_layout()

pass



In [ ]:

	dataset	x	y
0	I	10	8.04
1	I	8	6.95
2	I	13	7.58
3	I	9	8.81
4	I	11	8.33

		x	y
dataset
I	count	11.000000	11.000000
	mean	9.000000	7.500909
	std	3.316625	2.031568
	min	4.000000	4.260000
	25%	6.500000	6.315000
	50%	9.000000	7.580000
	75%	11.500000	8.570000
	max	14.000000	10.840000
II	count	11.000000	11.000000
	mean	9.000000	7.500909
	std	3.316625	2.031657
	min	4.000000	3.100000
	25%	6.500000	6.695000
	50%	9.000000	8.140000
	75%	11.500000	8.950000
	max	14.000000	9.260000
III	count	11.000000	11.000000
	mean	9.000000	7.500000
	std	3.316625	2.030424
	min	4.000000	5.390000
	25%	6.500000	6.250000
	50%	9.000000	7.110000
	75%	11.500000	7.980000
	max	14.000000	12.740000
IV	count	11.000000	11.000000
	mean	9.000000	7.500909
	std	3.316625	2.030579
	min	8.000000	5.250000
	25%	8.000000	6.170000
	50%	8.000000	7.040000
	75%	8.000000	8.190000
	max	19.000000	12.500000

	SepalLength	SepalWidth	PetalLength	PetalWidth
SepalLength	0.124249	0.099216	0.016355	0.010331
SepalWidth	0.099216	0.143690	0.011698	0.009298
PetalLength	0.016355	0.011698	0.030159	0.006069
PetalWidth	0.010331	0.009298	0.006069	0.011106

	SepalLength	SepalWidth	PetalLength	PetalWidth
SepalLength	1.000000	0.742547	0.267176	0.278098
SepalWidth	0.742547	1.000000	0.177700	0.232752
PetalLength	0.267176	0.177700	1.000000	0.331630
PetalWidth	0.278098	0.232752	0.331630	1.000000