In [1]:
# leave this line at the top http://stackoverflow.com/questions/23550056/figurecanvasagg-object-has-no-attribute-invalidate-python-plotting
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
In [2]:
# Show the names of the style sheets known to the installed Matplotlib.
plt.style.available
Out[2]:
In [3]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later releases dropped the old names. Pick whichever spelling the
# installed version offers, and fall back to the default style otherwise.
_notebook_style = next(
    (name for name in ('seaborn-v0_8-notebook', 'seaborn-notebook')
     if name in plt.style.available),
    'default',
)
plt.style.use(_notebook_style)
Adapted from the fantastic blog post https://dansaber.wordpress.com/2016/10/02/a-dramatic-tour-through-pythons-data-visualization-landscape-including-ggplot-and-altair/
Start with 4 time series and "unpivot" the table to get a long-format dataframe with one row per observation and the columns `dt`, `kind` and `value`.
In [4]:
# Four random walks (cumulative sums of Gaussian steps), one value per day of 2015.
dt = pd.date_range('2015-01-01', '2015-12-31')
# A locally seeded generator makes Restart & Run All reproduce the same
# figures without touching NumPy's global random state.
rng = np.random.RandomState(0)
# Size the walks from the date index instead of repeating the literal 365.
data = rng.randn(4, len(dt)).cumsum(axis=1)
timeseries_df = pd.DataFrame(data=dict(A=data[0], B=data[1], C=data[2], D=data[3]), index=dt)
timeseries_df.head()
Out[4]:
In [5]:
def alternative_dataset(n_periods=100):
    """Return a random time-series frame usable in place of ``timeseries_df``.

    The frame has ``n_periods`` rows indexed by consecutive business days
    starting at the first business day on or after 2000-01-01, and four
    columns ``A``-``D`` of standard-normal noise.

    It is built from public numpy/pandas APIs only. The previous version
    imported ``pandas.util.testing`` (absent from current pandas releases)
    and overwrote that module's global ``N`` as a side effect.

    Parameters
    ----------
    n_periods : int, default 100
        Number of rows to generate.

    Returns
    -------
    pandas.DataFrame
    """
    index = pd.bdate_range('2000-01-01', periods=n_periods)
    values = np.random.randn(n_periods, 4)
    return pd.DataFrame(values, index=index, columns=list('ABCD'))
In [6]:
def unpivot(frame):
    """Reshape a wide frame into long form.

    Each cell of ``frame`` becomes one output row holding its index label
    (``dt``), its column label (``kind``) and the cell itself (``value``).
    Rows are emitted column by column.
    """
    n_rows, n_cols = frame.shape
    # The index labels repeat once for every column...
    stamps = np.tile(np.asarray(frame.index), n_cols)
    # ...while each column label is repeated for every row.
    labels = np.asarray(frame.columns).repeat(n_rows)
    # Column-major flattening matches the ordering of the two arrays above.
    flat_values = frame.values.ravel('F')
    long_form = {'dt': stamps, 'kind': labels, 'value': flat_values}
    return pd.DataFrame(long_form, columns=['dt', 'kind', 'value'])
# Long-format version of the wide table: columns dt / kind / value.
df = unpivot(timeseries_df)
df.head()
Out[6]:
In [7]:
# Peek at the last rows of the long-format table.
df.tail()
Out[7]:
In [8]:
# MATPLOTLIB: one line per `kind`, drawn from the long-format table.
fig = plt.figure(1, figsize=(10, 8))
ax = fig.add_subplot(111)
kinds = df.kind.unique()
for k in kinds:
    # Use a dedicated name here: `data` already holds the raw random-walk
    # array, and rebinding it to a DataFrame would change its meaning for
    # any cell that is re-run afterwards.
    series = df[df.kind == k]
    # Labelling each line ties the legend entry to the line it describes.
    ax.plot(series.dt, series.value, lw=2, label=k)
fig.autofmt_xdate()
ax.set(xlabel='Date', ylabel='Value')
ax.legend();
Out[8]:
In [9]:
# Back to wide form: one row per `dt`, one column per `kind`.
dfp = df.pivot(index='dt', columns='kind', values='value')
dfp.head()
Out[9]:
In [10]:
# Plot every column of the wide table with a single call.
fig = plt.figure(2, figsize=(10, 8))
axis = fig.gca()
axis.plot(dfp)
axis.legend(dfp.columns)
axis.set_xlabel('Date')
axis.set_ylabel('Value')
fig.autofmt_xdate();
In [11]:
# PANDAS: let DataFrame.plot draw the lines onto an axes we control.
fig, ax = plt.subplots(figsize=(10, 8))
dfp.plot(ax=ax)
ax.set(xlabel='Date', ylabel='Value')
fig.autofmt_xdate();
In [12]:
# SEABORN: a single facet with one line per `kind` (via hue).
g = sns.FacetGrid(df, hue='kind', aspect=10/8)
g.map(plt.plot, 'dt', 'value')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
g.fig.autofmt_xdate()
# Resize the finished figure to 10 x 8 inches in one call.
g.fig.set_size_inches(10, 8)
In [13]:
# Import the ggplot names this notebook uses explicitly instead of `*`, so
# the wildcard cannot silently shadow other notebook names.
from ggplot import (ggplot, aes, geom_line, geom_point, geom_boxplot,
                    geom_histogram, facet_grid, ggtitle, xlab, ylab)
# GGPLOT
# No plt.subplots() call here: ggplot renders its own figure, so creating one
# beforehand only adds an empty plot to the cell output.
g = (ggplot(df, aes(x='dt', y='value', color='kind'))
     + geom_line(size=2.0)
     + xlab('Date')
     + ylab('Value'))
g
Out[13]:
In [14]:
from altair import Chart, Color, Scale

# ALTAIR: reuse seaborn's current palette so colours match the other plots.
cp = sns.color_palette()
kind_colour = Color('kind', scale=Scale(range=cp.as_hex()))
Chart(df).mark_line().encode(x='dt', y='value', color=kind_colour)
More complex plots using the Iris dataset https://raw.githubusercontent.com/pydata/pandas/master/pandas/tests/data/iris.csv
In [15]:
# Pinned to a release tag rather than `master`: the data file has since been
# moved inside the pandas repository, so the branch-relative path is not a
# stable address. NOTE(review): confirm the tag/path still resolves.
IRIS_URL = 'https://raw.githubusercontent.com/pandas-dev/pandas/v0.19.2/pandas/tests/data/iris.csv'
iris = pd.read_csv(IRIS_URL)
In [16]:
# First rows of the iris table.
iris.head()
Out[16]:
In [17]:
# Distinct values of the `Name` column (used below to split the plots).
iris['Name'].unique()
Out[17]:
In [18]:
# MATPLOTLIB: one marker series per species, selected with a boolean mask.
plt.figure(figsize=(10, 6))
for name in iris.Name.unique():
    rows = iris.Name == name
    plt.plot(iris.loc[rows, 'PetalLength'], iris.loc[rows, 'PetalWidth'],
             'o', label=name)
plt.legend(loc=0)
Out[18]:
We can also combine Matplotlib with Pandas's groupby method.
In [19]:
def scatter(group, name=None):
    """Plot petal width against petal length for one species.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of ``iris`` belonging to a single species.
    name : str, optional
        Legend label. When omitted, ``group.name`` is used, which is only
        set when the function is invoked through ``groupby(...).apply``.
    """
    label = group.name if name is None else name
    print('group = {!r}, {!r}'.format(group.Name.iloc[0], group.shape))
    plt.plot(group.PetalLength, group.PetalWidth, 'o', label=label)


plt.figure(figsize=(10, 6))
# Iterate over the groups explicitly rather than through `.apply`: `apply` is
# meant for functions that return a value, and some pandas versions may
# evaluate the function more than once for the first group, which would draw
# that species twice.
for species_name, species_rows in iris.groupby('Name'):
    scatter(species_rows, name=species_name)
plt.legend(loc=0);
In [20]:
# SEABORN: scatter coloured by species on a single facet.
g = sns.FacetGrid(iris, hue='Name', aspect=10/6)
g.map(plt.scatter, 'PetalLength', 'PetalWidth')
g.set(xlim=(1, 7), ylim=(0, 2.5))
# Resize the finished figure to 10 x 6 inches in one call.
g.fig.set_size_inches(10, 6)
In [21]:
# GGPLOT
petal_aes = aes(x='PetalLength', y='PetalWidth', color='Name')
g = (ggplot(iris, petal_aes)
     + geom_point(size=40.0)
     + ggtitle('Petal Width v. Length -- by Species'))
g
Out[21]:
In [26]:
# ALTAIR
# No plt.figure() here: Altair does not draw through Matplotlib, so creating
# a figure only adds an empty Matplotlib output to the cell.
Chart(iris).mark_point(filled=True).encode(
    x='PetalLength',
    y='PetalWidth',
    color=Color('Name', scale=Scale(range=cp.as_hex()))
)
In [30]:
# MATPLOTLIB
# One panel per species; the panel count follows the data instead of a
# hard-coded 3.
species_names = iris.Name.unique()
fig, ax = plt.subplots(1, len(species_names), figsize=(15, 5),
                       sharex=True, sharey=True, squeeze=False)
ax = ax[0]  # squeeze=False always returns a 2-D array; keep its single row
for i, s in enumerate(species_names):
    species_rows = iris[iris.Name == s]
    # `color=` (not `c=`) for a single RGB tuple: `c` may also be read as
    # per-point values to colour-map, which makes a bare tuple ambiguous.
    ax[i].scatter(species_rows.PetalLength,
                  species_rows.PetalWidth,
                  color=cp[i % len(cp)])
    ax[i].set(xlabel='Petal Length',
              ylabel='Petal Width',
              title=s)
fig.tight_layout()
In [39]:
# Seaborn
# `col='Name'` gives each species its own facet; `hue='Name'` colours it.
g = sns.FacetGrid(iris, hue='Name', col='Name')
g.map(plt.scatter, 'PetalLength', 'PetalWidth')
Out[39]:
In [40]:
## ggplot
petal_aes = aes(x='PetalLength', y='PetalWidth', color='Name')
g = (ggplot(iris, petal_aes)
     + geom_point(size=40.0)
     + facet_grid(y='Name')
     + ggtitle('Petal Width v. Length -- by Species'))
g
Out[40]:
In [50]:
from altair import Column
# Altair
# No plt.figure() here: Altair does not draw through Matplotlib, so creating
# a figure only adds an empty Matplotlib output to the cell.
c = Chart(iris).mark_point(filled=True).encode(
    x='PetalLength',
    y='PetalWidth',
    column=Column('Name', title='Petal Width v. Length by Species'),
    color=Color('Name', scale=Scale(range=cp.as_hex()))
)
c.configure_cell(width=300, height=300)
In [66]:
# MATPLOTLIB: one box per species, built from per-species value arrays.
fig, ax = plt.subplots(figsize=(8, 8))
width_by_species = []
for s in iris.Name.unique():
    width_by_species.append(iris[iris.Name == s]['PetalWidth'].values)
ax.boxplot(width_by_species)
ax.set_xticklabels(iris.Name.unique())
ax.set_xlabel('Species')
ax.set_ylabel('Petal Width')
ax.set_title('Distribution of Petal Width by Species');
In [71]:
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
# Shared bin edges (10 equal bins over the full range) keep the bars of the
# different species comparable; otherwise each hist() call chooses its own.
bins = np.linspace(iris['PetalWidth'].min(), iris['PetalWidth'].max(), 11)
for name in iris.Name.unique():
    species = iris[iris.Name == name]
    # Semi-transparent bars so overlapping species stay visible.
    ax.hist(species['PetalWidth'], bins=bins, alpha=0.6, label=name)
# The labels passed above are only shown once a legend is requested.
ax.legend()
ax.set(xlabel='Petal Width',
       ylabel='Frequency',
       title='Distribution of Petal Width by Species');
Out[71]:
In [73]:
# PANDAS: box plot of PetalWidth with one box per value of `Name`.
iris.boxplot(column='PetalWidth', by='Name')
Out[73]:
The histograms are plotted on separate subfigures, but we can use a pivot
with plot.hist
to get them all on the same figure
In [107]:
# `hist(..., by=...)` lays out its own figure, so request the size through
# `figsize` instead of creating (and clearing) a separate, unused figure.
iris.hist(column='PetalWidth', by='Name', sharex=True, figsize=(10, 10));
In [108]:
# Spread PetalWidth into one column per species, then histogram all columns
# on a single axes.
iris.pivot(columns='Name', values='PetalWidth').plot.hist(bins=30)
Out[108]:
In [116]:
# SEABORN
fig, ax = plt.subplots(1, 1, figsize=(8, 8))
# Pass x and y by keyword: newer seaborn releases no longer accept them
# positionally, while the keyword form works across versions.
g = sns.boxplot(x='Name', y='PetalWidth', data=iris, ax=ax)
g.set(title='Distribution of Petal Width by Species')
Out[116]:
In [142]:
# SEABORN: overlaid histograms (with rug marks) of PetalWidth, one per species.
# NOTE(review): newer seaborn releases renamed FacetGrid's `size` argument to
# `height` and deprecated `distplot` -- confirm against the installed version.
g = sns.FacetGrid(iris, hue='Name', size=5, aspect=1)
g.map(sns.distplot, 'PetalWidth', kde=False, rug=True, bins=10)
plt.title('Distribution of Petal Width by name')
plt.legend()
plt.ylabel('Frequency')
Out[142]:
GGPlot
In [150]:
# GGPLOT (color doesn't quite work)
box_aes = aes(x='Name', y='PetalWidth', fill='Name')
g = (ggplot(iris, box_aes)
     + geom_boxplot(color='Name')
     + ggtitle('Distribution of Petal Width by Species'))
g
Out[150]:
In [154]:
# GGPLOT histogram of PetalWidth, filled by species.
# No plt.figure()/plt.clf() here: ggplot renders its own figure, so the
# pre-created one only shows up as an extra empty output.
g = (ggplot(iris, aes(x='PetalWidth', fill='Name'))
     + geom_histogram()
     + ylab('Frequency')
     + ggtitle('Distribution of Petal Width by Species'))
g
Out[154]:
In [162]:
from altair import X, Bin

# ALTAIR: bar chart of counts over binned PetalWidth, coloured by species.
width_bins = X('PetalWidth', bin=Bin(maxbins=30))
species_colour = Color('Name', scale=Scale(range=cp.as_hex()))
Chart(iris).mark_bar(opacity=.75).encode(x=width_bins, y='count(*)', color=species_colour)
In [208]:
# Pass `hue` by keyword: newer seaborn releases accept only the data
# positionally, while the keyword form works across versions.
sns.pairplot(iris, hue='Name', diag_kind='kde')
Out[208]:
In [166]:
# Pairwise grid with kdeplot on the diagonal and on every off-diagonal panel.
# NOTE(review): `n_levels` may have been superseded by `levels` in newer
# seaborn releases -- confirm against the installed version.
g = sns.PairGrid(iris)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, cmap="Blues_d", n_levels=6);
In [172]:
# The plotting helpers moved from `pandas.tools.plotting` to `pandas.plotting`
# in pandas 0.20; try the current location first and fall back to the old one.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(iris, figsize=(10,10), diagonal='hist');
Looking at the Andrews curves, it can be seen that the setosa
is very different from the two others (versicolor
and virginica
). Each curve represents all the features of one data point as a sine wave, using the value of each feature for that data point as the coefficients of a Fourier series. Similar categories will exhibit a similar structure.
In [190]:
# The plotting helpers moved from `pandas.tools.plotting` to `pandas.plotting`
# in pandas 0.20; try the current location first and fall back to the old one.
try:
    from pandas.plotting import andrews_curves
except ImportError:
    from pandas.tools.plotting import andrews_curves
andrews_curves(iris, 'Name')
Out[190]:
In [192]:
# The plotting helpers moved from `pandas.tools.plotting` to `pandas.plotting`
# in pandas 0.20; try the current location first and fall back to the old one.
try:
    from pandas.plotting import parallel_coordinates
except ImportError:
    from pandas.tools.plotting import parallel_coordinates
parallel_coordinates(iris, 'Name')
Out[192]:
The RadViz plot below (http://pandas.pydata.org/pandas-docs/stable/visualization.html#radviz) also suggests that the petal width and petal lengths are different for setosa, but it is
SepalWidth
that most accounts for the difference between setosa
and the two other irises.
In [194]:
# The plotting helpers moved from `pandas.tools.plotting` to `pandas.plotting`
# in pandas 0.20; try the current location first and fall back to the old one.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz
radviz(iris, 'Name')
Out[194]:
It is interesting to compare the RadViz
plot with the distributions. According to the distributions below, all the lengths are smaller for setosa
than the other 2 irises, except for the sepal width, which is substantially bigger. This is consistent with the radviz plot, where the setosa
points are clustered near the SepalWidth
point.
In [274]:
from itertools import chain

# 2 x 2 grid of per-species density plots, one panel per feature column.
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
# The FacetGrid is used only to split the rows by `Name`; the curves are
# drawn on our own axes through the `ax=` keyword.
g = sns.FacetGrid(iris, hue='Name')
feature_columns = [col for col in iris.columns if col != 'Name']
for ax, feature in zip(chain.from_iterable(axes), feature_columns):
    g.map(sns.kdeplot, feature, ax=ax)
    panel_title = 'Distribution of {} per name'.format(feature)
    ax.set(xlabel=feature, title=panel_title)
# Discard the grid's own figure so only the 2 x 2 figure remains.
plt.close(g.fig)
plt.tight_layout()
In [384]:
from sklearn.decomposition import PCA

# Project the feature columns onto the first two principal directions.
pca = PCA(n_components=2)
# Named `iris_features` rather than `X` so that the `X` imported from altair
# in the histogram cell is not overwritten (re-running that cell would fail).
iris_features = iris.drop('Name', axis=1)
x1, x2 = pca.fit(iris_features).transform(iris_features).T
df12 = pd.DataFrame(data={'PCA1': x1, 'PCA2': x2, 'Name': iris.Name})
# NOTE(review): newer seaborn releases renamed `size` to `height` -- confirm
# against the installed version.
sns.FacetGrid(df12, hue='Name', size=5).map(plt.scatter, 'PCA1', 'PCA2').add_legend()
Out[384]:
In [400]:
# Stem plot of the variance explained by each of the two fitted directions.
plt.stem(pca.explained_variance_, basefmt='')
variance_ax = plt.gca()
variance_ax.set_xlim([-0.5, 1.5])
variance_ax.set_xticks([0, 1])
variance_ax.set_xticklabels(['PCA1', 'PCA2'])
variance_ax.set_xlabel('PCA directions')
variance_ax.set_ylabel('PCA explained variance')
Out[400]:
In [401]:
# Table of the fitted components: one row per principal direction, one column
# per feature. `iris.columns[:4]` is used for the column labels -- this
# assumes the four feature columns come first and `Name` last (TODO confirm).
components = pd.DataFrame(data=pca.components_, columns=iris.columns[:4], index=['PCA1', 'PCA2'])
components
Out[401]:
In [410]:
# Side-by-side bar charts of the feature weights in each principal direction.
fig, axes = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
for position, direction in enumerate(['PCA1', 'PCA2']):
    panel = axes[position]
    components.loc[direction].plot(kind='bar', color=cp[position], ax=panel)
    # Keep the feature names horizontal.
    panel.set_xticklabels(components.columns, rotation=0)
The figure above confirms that the sepal width is what separates the setosa from the two others. The sepal width is negative in PCA1
while the petal length and width are positive. The PCA analysis indicates that setosa is very negative along the PCA1
axis, due to its large sepal width compared to the two other flowers.
In [330]:
from sklearn.decomposition import PCA


def pca_scatter(group, name=None):
    """Fit a 2-component PCA on one species and scatter its projection.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of ``iris`` for a single species; the ``Name`` column is
        dropped before fitting.
    name : str, optional
        Legend label. When omitted, ``group.name`` is used, which is only
        set when the function is invoked through ``groupby(...).apply``.
    """
    label = group.name if name is None else name
    data = group.drop('Name', axis=1).values
    # A fresh PCA per species: each group is projected onto its own axes.
    pca = PCA(n_components=2)
    pca.fit(data)
    species = pca.transform(data).T
    plt.plot(species[0], species[1], 'o', label=label)


# Iterate over the groups explicitly rather than through `.apply`: the
# function only draws and returns nothing, and some pandas versions may
# evaluate an applied function more than once for the first group.
for species_name, species_rows in iris.groupby('Name'):
    pca_scatter(species_rows, name=species_name)
# The labels set above are only shown once a legend is requested.
plt.legend(loc=0);
Out[330]:
In [ ]: