In [1]:
#!conda install -y numpy pandas matplotlib seaborn statsmodels
In [2]:
%matplotlib inline
import seaborn as sns
import pandas as pd
In [3]:
# Use seaborn's "ticks" style (plain axes with tick marks, no background grid).
sns.set(style="ticks")
Load Anscombe's quartet dataset — four x–y datasets with nearly identical summary statistics but very different distributions.
In [4]:
# Load the built-in "anscombe" example dataset (fetched from the seaborn-data
# repository on first use, then cached locally).
df = sns.load_dataset("anscombe")
And `df` is... a pandas DataFrame,
In [5]:
# Confirm the loaded object's type — a pandas DataFrame, per the note above.
type(df)
Out[5]:
that we can print, plot, ...
In [6]:
# Peek at the first five rows (columns: dataset label, x, y).
df.head()
Out[6]:
Print just the first dataset
In [7]:
# Select only the rows belonging to dataset 'I'.
df.loc[df["dataset"] == 'I']
Out[7]:
Let's compare the basic statistical parameters of each dataset
In [8]:
# Summary statistics (count, mean, std, quartiles) for each of the four
# datasets — they come out almost identical, which is the point of the quartet.
# NOTE: `groups` is reused by the correlation cell below, so keep it defined.
groups = ['I', 'II', 'III', 'IV']
for group in groups:
    subset = df.loc[df["dataset"] == group]
    print(group)
    print(subset.describe())
    print()
Let's compare the correlation coefficient for each dataset
In [9]:
# Pearson correlation between x and y within each dataset — again nearly
# identical across all four groups.
for g in groups:
    subset = df.loc[df["dataset"] == g]
    print(subset["x"].corr(subset["y"]))
Plot datasets
In [10]:
# Scatter plot of each dataset in its own facet, no regression line
# (fit_reg=False) — the raw points already look very different per group.
# FIX: `size` was renamed to `height` in seaborn 0.9 and removed in 0.11;
# the old keyword raises a TypeError on modern seaborn.
sns.lmplot(x="x", y="y", col="dataset", hue="dataset", data=df,
           col_wrap=2, ci=None, palette="muted", height=4,
           scatter_kws={"s": 50, "alpha": 1}, fit_reg=False)
Out[10]:
Show the results of a linear regression within each dataset
In [11]:
# Same faceted plot, now with the default linear regression fit drawn
# (ci=None suppresses the confidence band).
# FIX: `size` -> `height` (renamed in seaborn 0.9, removed in 0.11).
sns.lmplot(x="x", y="y", col="dataset", hue="dataset", data=df,
           col_wrap=2, ci=None, palette="muted", height=4)
Out[11]:
The fitted regression line is essentially the same for all four datasets.
Let's plot each fit together with its 95% confidence interval region.
In [12]:
# Regression fits again, this time showing the 95% confidence interval band
# around each line (ci=95).
# FIX: `size` -> `height` (renamed in seaborn 0.9, removed in 0.11).
sns.lmplot(x="x", y="y", col="dataset", hue="dataset", data=df,
           col_wrap=2, ci=95, palette="muted", height=4)
Out[12]:
Visualize your data beforehand
One can fit a polynomial regression model to explore simple kinds of nonlinear trends in the dataset
In [13]:
# Dataset 'II' is curved: a second-order polynomial regression (order=2)
# captures the nonlinear trend that a straight line misses.
subset_ii = df.loc[df["dataset"] == 'II']
sns.lmplot(x="x", y="y", data=subset_ii,
           ci=95, order=2, scatter_kws={"s": 80});
In the presence of outliers, it can be useful to fit a robust regression, which uses a different loss function to downweight relatively large residuals:
In [14]:
# Dataset 'III' has a single outlier: robust regression (robust=True)
# downweights large residuals so the fit follows the bulk of the points.
subset_iii = df.loc[df["dataset"] == 'III']
sns.lmplot(x="x", y="y", data=subset_iii,
           ci=None, robust=True, scatter_kws={"s": 80});