Seaborn

Official documentation: https://seaborn.pydata.org/index.html


In [ ]:
import math

import numpy as np
import pandas as pd
import seaborn as sns

In [ ]:
# Display the installed seaborn version.
sns.__version__

Aspect


In [ ]:
# Use the "talk" plotting context for the figures below.
# The other preset tried here is "poster": sns.set_context(context="poster")
sns.set_context(context="talk")

Figsize


In [ ]:
# Load seaborn's "fmri" example dataset and preview its first rows.
df = sns.load_dataset(name="fmri")
df.head(n=5)

In [ ]:
# Line plot of signal against timepoint at the default figure size.
sns.relplot(data=df, x="timepoint", y="signal", kind="line");

In [ ]:
# Same plot, resized: `height` is the facet height in inches and `aspect`
# the width / height ratio, so this facet is 12 x 6 inches.
sns.relplot(
    data=df,
    x="timepoint",
    y="signal",
    kind="line",
    height=6,
    aspect=2,
);

Legend


In [ ]:
# Simulate 100 noisy runs of the line y = a * x for slopes a = 1 and a = 3
# at x = 0..9; each sample gets Gaussian noise with standard deviation 10.
# A fixed seed makes "Restart & Run All" reproduce the same data.
rng = np.random.default_rng(0)

# One record per (run, slope, x), in the column order of the DataFrame.
records = [
    (x, a * x + 10. * rng.normal(), a, run)
    for run in range(100)
    for a in (1., 3.)
    for x in range(10)
]

df = pd.DataFrame(records, columns=["x", "y", "a", "run"])
df.head()

In [ ]:
# Point plot of y against x with one series per slope `a`
# (the legend title defaults to the hue column name).
sns.catplot(
    data=df,
    x="x", y="y", hue="a",
    kind="point",
    height=6, aspect=2,
);

In [ ]:
# Keep a handle on the returned grid so its legend can be edited afterwards.
g = sns.catplot(
    data=df,
    x="x", y="y", hue="a",
    kind="point",
    height=6, aspect=2,
)

# Give the legend a more descriptive title.
g._legend.set_title("Slope")

Relplot

Scatter plot


In [ ]:
# Load seaborn's "tips" example dataset and preview its first rows.
tips = sns.load_dataset(name="tips")
tips.head(n=5)

In [ ]:
# Figure-level scatter plot (relplot draws a scatter plot by default).
sns.relplot(data=tips, x="total_bill", y="tip");

In [ ]:
# The same scatter plot drawn with the axes-level function.
sns.scatterplot(data=tips, x="total_bill", y="tip");

In [ ]:
# Scatter plot using every semantic at once: colour, marker size and marker
# style encode three columns, and two more columns split the grid into facets.
sns.relplot(
    data=tips,
    x="total_bill",
    y="tip",
    hue="size",
    size="day",
    style="time",
    row="sex",
    col="smoker",
);

Line plot

Official documentation: https://seaborn.pydata.org/tutorial/relational.html#aggregation-and-representing-uncertainty

"The default behavior in seaborn is to aggregate the multiple measurements at each x value by plotting the mean and the 95% confidence interval around the mean."

First example


In [ ]:
# Simulate 1000 runs of pure Gaussian noise (mean 0, standard deviation
# `sigma`) sampled at 100 evenly spaced x positions in [-10, 10].
# `sigma` is reused by a later cell to draw the +/- 2 sigma lines.
sigma = 1.

n_runs = 1000
xs = np.linspace(-10, 10, 100)

# A fixed seed makes "Restart & Run All" reproduce the same data.
rng = np.random.default_rng(0)

# Built column-wise in one shot instead of appending 100 000 rows in Python
# loops; row order is unchanged (run is the outer index, x the inner one).
df = pd.DataFrame({
    "x": np.tile(xs, n_runs),
    "y": rng.normal(loc=0., scale=sigma, size=n_runs * xs.size),
    "run": np.repeat(np.arange(n_runs), xs.size),
})
df.head()

In [ ]:
# Aggregated line plot: the mean of y over all runs at each x, with
# seaborn's default confidence band.
g = sns.relplot(data=df, x="x", y="y", kind="line",
                height=6, aspect=2)

# Draw on the grid's own axes rather than through pyplot's implicit
# "current axes", so no matplotlib.pyplot import is required.
g.ax.axhline(0, color="r", linestyle=":", label="Actual mean")

g.ax.legend();

In [ ]:
# No aggregation: estimator=None with units="run" draws one faint line per
# run instead of the mean and confidence band.
g = sns.relplot(data=df, x="x", y="y", kind="line",
                height=6, aspect=2,
                units="run", estimator=None, alpha=0.1)

# Draw on the grid's own axes rather than through pyplot's implicit
# "current axes", so no matplotlib.pyplot import is required.
g.ax.axhline(0, color="r", linestyle=":", label="Actual mean")

g.ax.legend();

In [ ]:
# Scatter of every individual sample (relplot's default kind).
# `estimator` is not passed: it has no effect on a scatter plot.
g = sns.relplot(data=df, x="x", y="y",
                height=6, aspect=2, marker=".",
                alpha=0.15)

# Reference lines drawn on the grid's own axes (no matplotlib.pyplot import
# required). The two bounds get distinct labels so the legend is unambiguous.
g.ax.axhline(2. * sigma, color="k", linestyle=":", label=r"$+2 \sigma$")
g.ax.axhline(0, color="r", linestyle=":", label="Actual mean")
g.ax.axhline(-2. * sigma, color="k", linestyle=":", label=r"$-2 \sigma$")

g.ax.legend();

In [ ]:
# Aggregate with the median instead of the default mean.
g = sns.relplot(data=df, x="x", y="y", kind="line",
                height=6, aspect=2,
                estimator=np.median)

# Draw on the grid's own axes rather than through pyplot's implicit
# "current axes", so no matplotlib.pyplot import is required.
g.ax.axhline(0, color="r", linestyle=":", label="Actual median")

g.ax.legend();

Second example


In [ ]:
# Simulate 100 noisy runs of sin(x) and cos(x) sampled at 100 evenly spaced
# points in [-10, 10]; each sample gets standard Gaussian noise.
# A fixed seed makes "Restart & Run All" reproduce the same data.
rng = np.random.default_rng(0)

# (label, function) pairs; the label is what ends up in the "func" column.
functions = (("sin", math.sin), ("cos", math.cos))

# The sampling grid does not depend on the run or the function, so it is
# computed once instead of once per inner loop.
xs = np.linspace(-10, 10, 100)

records = [
    (x, function(x) + rng.normal(), name, run)
    for run in range(100)
    for name, function in functions
    for x in xs
]

df = pd.DataFrame(records, columns=["x", "y", "func", "run"])
df.head()

In [ ]:
# One aggregated line per function, coloured by the "func" column.
sns.relplot(
    data=df,
    x="x", y="y", hue="func",
    kind="line",
    height=6, aspect=2,
);

Third example


In [ ]:
# Load seaborn's "fmri" example dataset and preview its first rows.
fmri = sns.load_dataset(name="fmri")
fmri.head(n=5)

In [ ]:
# Raw observations as a scatter plot (relplot's default kind).
sns.relplot(
    data=fmri,
    x="timepoint", y="signal",
    height=6, aspect=2,
);

In [ ]:
# Same observations with timepoint treated as a categorical axis.
sns.catplot(data=fmri, x="timepoint", y="signal", aspect=3);

In [ ]:
# Aggregated line plot of signal against timepoint.
sns.relplot(
    data=fmri,
    x="timepoint", y="signal",
    kind="line",
    height=6, aspect=2,
);

Fourth example


In [ ]:
# Simulate 100 noisy runs of the line y = a * x for slopes a = 1 and a = 3
# at x = 0..9; each sample gets Gaussian noise with standard deviation 10.
# A fixed seed makes "Restart & Run All" reproduce the same data.
rng = np.random.default_rng(0)

# One record per (run, slope, x), in the column order of the DataFrame.
records = [
    (x, a * x + 10. * rng.normal(), a, run)
    for run in range(100)
    for a in (1., 3.)
    for x in range(10)
]

df = pd.DataFrame(records, columns=["x", "y", "a", "run"])
df.head()

In [ ]:
# One aggregated line per slope, with the numeric column `a` used as hue.
sns.relplot(
    data=df,
    x="x", y="y", hue="a",
    kind="line",
    height=6, aspect=2,
);

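The legend is misleading because relplot() treats a numeric "hue" variable as a continuous quantity, so it shows intermediate values that are not in the data. Here, catplot() is more appropriate because it treats "hue" as categorical.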

Catplot


In [ ]:
# Simulate 100 noisy runs of the line y = a * x for slopes a = 1 and a = 3
# at x = 0..9; each sample gets Gaussian noise with standard deviation 10.
# A fixed seed makes "Restart & Run All" reproduce the same data.
rng = np.random.default_rng(0)

# One record per (run, slope, x), in the column order of the DataFrame.
records = [
    (x, a * x + 10. * rng.normal(), a, run)
    for run in range(100)
    for a in (1., 3.)
    for x in range(10)
]

df = pd.DataFrame(records, columns=["x", "y", "a", "run"])
df.head()

In [ ]:
# Point plot with the default styling, one series per slope `a`.
sns.catplot(
    data=df,
    x="x", y="y", hue="a",
    kind="point",
    height=6, aspect=2,
);

In [ ]:
# Same point plot with customised styling: small dot markers, a reduced
# scale, dotted connecting lines and caps on the error bars.
sns.catplot(
    data=df,
    x="x", y="y", hue="a",
    kind="point",
    height=6, aspect=2,
    markers=".",
    linestyles=":",
    scale=0.7,
    capsize=0.1,
);

Pairplot


In [ ]:
# Tutorial reference:
# https://seaborn.pydata.org/tutorial/distributions.html#visualizing-pairwise-relationships-in-a-dataset

# Load seaborn's "iris" example dataset and preview its first rows.
iris = sns.load_dataset(name="iris")
iris.head(n=5)

In [ ]:
# Pairwise relationships between the iris columns, coloured by species.
sns.pairplot(data=iris, hue="species");

In [ ]:
# Tutorial reference:
# https://seaborn.pydata.org/tutorial/distributions.html#visualizing-pairwise-relationships-in-a-dataset

# Load seaborn's "titanic" example dataset and preview its first rows.
titanic = sns.load_dataset(name="titanic")
titanic.head(n=5)

In [ ]:
# Pairwise relationships restricted to three columns, coloured by survival.
sns.pairplot(
    data=titanic,
    hue="survived",
    vars=["survived", "pclass", "fare"],
);