When given a set of numerical features, it is often useful to plot all of them, for example as violin plots, to get a sense of their respective distributions. Seaborn can do this automatically with its `violinplot`
function, but this does not work well when the features have widely different ranges.
In [ ]:
import string
import pandas as pd
import numpy as np
import seaborn as sns
In [ ]:
def get_random_numerical_data(size, *amplitudes):
    """Build a DataFrame of scaled uniform random samples.

    One column is created per value in ``amplitudes``; column ``k`` holds
    ``size`` draws from ``np.random.random`` multiplied by ``amplitudes[k]``.
    Columns are labelled with consecutive uppercase ASCII letters
    ("A", "B", ...) and the column index is named "feature".
    """
    n_features = len(amplitudes)
    samples = np.random.random((size, n_features))
    # Row vector of scale factors, broadcast over every sample row.
    scale = np.array(amplitudes).reshape(1, n_features)
    labels = pd.Series(list(string.ascii_uppercase[:n_features]), name="feature")
    return pd.DataFrame(data=samples * scale, columns=labels)
In [ ]:
# Preview a small sample: 5 rows and two features scaled by 1 and 2.
get_random_numerical_data(5, 1, 2)
In [ ]:
# Summarise 500 samples of four features, keeping only the count, std and max rows.
get_random_numerical_data(500, 1, 2, 3, 4).describe().loc[['count', 'std', 'max']]
In [ ]:
# Four features whose amplitudes (1 to 4) are of the same order of magnitude.
df_small_range = get_random_numerical_data(500, 1, 2, 3, 4)
# Pass the frame through the explicit `data` keyword: before seaborn 0.12 the
# first positional parameter of violinplot was `x`, not `data`.
sns.violinplot(data=df_small_range)
In [ ]:
# Four features whose amplitudes span three orders of magnitude (1 to 1000).
df_big_range = get_random_numerical_data(500, 1, 10, 100, 1000)
# Pass the frame through the explicit `data` keyword: before seaborn 0.12 the
# first positional parameter of violinplot was `x`, not `data`.
sns.violinplot(data=df_big_range)
Changing the y-scale to logarithmic does not help much either:
In [ ]:
# Regenerate the wide-range data and draw it on a logarithmic y-axis.
df_big_range = get_random_numerical_data(500, 1, 10, 100, 1000)
# Pass the frame through the explicit `data` keyword: before seaborn 0.12 the
# first positional parameter of violinplot was `x`, not `data`.
h = sns.violinplot(data=df_big_range)
h.set_yscale('log')
In [ ]:
import matplotlib.pyplot as plt
def featureplot(df, nrows=1, ncols=1, figsize=(12,8), plotfunc=sns.violinplot):
    """Plot the features (columns) of a dataframe on a grid of subplots.

    The columns of ``df`` are split into consecutive groups, one group per
    axes of an ``nrows`` x ``ncols`` grid, so that each group gets its own
    y-axis.

    Parameters
    ----------
    df : DataFrame whose columns are the features to plot.
    nrows, ncols : shape of the subplot grid.
    figsize : ``(width, height)``; the height is multiplied by ``nrows``.
    plotfunc : callable invoked as ``plotfunc(sub_df, ax=ax)`` for every
        group of columns.
    """
    width, height = figsize
    # squeeze=False always returns a 2-D array of axes, so a single ravel()
    # flattens every grid shape (1x1, 1xN, Nx1 and NxM) the same way.
    _, axes = plt.subplots(nrows, ncols, figsize=(width, height * nrows), squeeze=False)
    axes = axes.ravel()
    n_features = df.shape[1]
    # Ceiling division: when the features do not divide evenly among the axes,
    # the last used axes gets fewer features instead of features being dropped.
    plots_per_figure = max(-(-n_features // len(axes)), 1)
    for k, ax in enumerate(axes):
        start = k * plots_per_figure
        subset = df.iloc[:, start:start + plots_per_figure]
        if subset.shape[1] == 0:
            # More axes than groups of features: hide the unused axes.
            ax.set_visible(False)
            continue
        plotfunc(subset, ax=ax)
    plt.tight_layout()
In [ ]:
# Lay the wide-range features out on a single row of four subplots.
featureplot(df_big_range, ncols=4)
In [ ]:
# Reshape to long form: one row per (sample, feature) pair with the measurement
# in a "value" column. The original row number (level_0) is not kept.
df_big_range_lf = (
    df_big_range
    .stack()
    .reset_index(name="value")
    .drop(columns="level_0")
)
df_big_range_lf.head()
In [ ]:
# `height` is the height of each facet and `aspect` is the width/height aspect ratio of each facet.
# (`height` is the current name of the FacetGrid argument formerly called `size`.)
# map_dataframe hands each facet's rows to violinplot as `data`, so "value" is bound
# explicitly to the y-axis instead of relying on positional argument order.
sns.FacetGrid(df_big_range_lf, col="feature", hue="feature",
              sharey=False, height=7, aspect=8/12.0/2.0).map_dataframe(sns.violinplot, y="value", orient="v")
In [ ]:
# Small long-form frame: two "foo" groups, each with one "baz" value per "bar" label.
test = pd.DataFrame({
    'foo': ["one", "one", "one", "two", "two", "two"],
    'bar': ["A", "B", "C", "A", "B", "C"],
    'baz': [0, 1, 2, 3, 4, 5],
})
In [ ]:
# Display the long-form frame before reshaping it.
test
In [ ]:
# Long to wide: one row per "foo", one column per "bar", cells filled from "baz".
# Keyword arguments are used because DataFrame.pivot no longer accepts positional ones.
test.pivot(index='foo', columns='bar', values='baz')
In [ ]:
# Same wide table built by hand: index on (foo, bar), keep the "baz" values,
# then move the "bar" level into the columns.
test.set_index(['foo', 'bar'])['baz'].unstack('bar')