In [1]:
## INIT: usual way to display plots inline and import the matplotlib library
%matplotlib inline
# The inline backend's default for bbox_inches is 'tight'; passing None makes
# it more relaxed about spacing around the saved figure.
%config InlineBackend.print_figure_kwargs = {'bbox_inches': None}
import numpy as np
import matplotlib.pyplot as plt
# Set the formats of the embedded figures (specific to the 'inline' backend).
# NOTE(review): recent IPython releases deprecate importing this helper from
# IPython.display — confirm against the installed version.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'png','svg')
In [2]:
## Dummy data: a sine wave sampled every 0.1 on [-5, 5)
x = np.arange(-5, 5, .1)
y = np.sin(x)

## Two plots stacked vertically (2 rows x 1 column) in one figure
fig, (ax_top, ax_bottom) = plt.subplots(2, 1)

# upper panel: the plain sine curve drawn as a line
ax_top.plot(x, y)
ax_top.set_xlabel("X values")
ax_top.set_ylabel("sin(x)")
ax_top.set_title("Plot simple sin(x)")

# lower panel: doubled amplitude, drawn as small borderless red dots
ax_bottom.plot(x, 2*y, "ro", markersize=3, markeredgecolor='none')
ax_bottom.set_xlabel("X values")
# optional label styling left over from experimenting:
#   bbox=dict(facecolor='red', alpha=0.3), x=0, y=-.2
ax_bottom.set_ylabel("2*sin(x)");
Source: http://matplotlib.org/1.5.1/faq/usage_faq.html#parts-of-a-figure
Figure (matplotlib.figure.Figure): the top-level container for all plot elements.
Axes (matplotlib.axes._axes.Axes): the Axes contains most of the figure elements (matplotlib.axis.Axis, matplotlib.axis.Tick, matplotlib.lines.Line2D, matplotlib.text.Text, matplotlib.patches.Polygon, etc.) and sets the coordinate system.
Axis and Tick (matplotlib.axis.Axis and matplotlib.axis.Tick): they represent an axis within the plot, set its limits and control how the ticks are displayed (formatting, minor and major ticks).
In [3]:
# Leave inline rendering and switch to a GUI backend. The author's machine
# selected 'osx' (Mac); other operating systems will pick a different one.
%matplotlib
In [4]:
import pandas as pd

# Read the flat-count table from the CSV file in the working directory
# and preview its first ten rows.
houses = pd.read_csv('bp_flats.csv')
houses.head(10)
Out[4]:
In [30]:
# Keep only the rows whose 'Flatcount' label is '1' or '2' (the labels are
# strings), then total the remaining columns per district.
# numeric_only=True keeps the string 'Flatcount' column out of the sum:
# recent pandas would otherwise concatenate the labels into an extra object
# column, which breaks the later min/max/histogram calls on `.values`.
is_small = houses['Flatcount'].isin(['1', '2'])
small_flats = houses[is_small].groupby('District').sum(numeric_only=True)
small_flats.head()
Out[30]:
In [7]:
# Select the GUI backend again, then report whether pyplot is in interactive
# mode (the boolean is this cell's output).
%matplotlib
plt.isinteractive()
Out[7]:
In [8]:
# Create a new, empty figure; it becomes pyplot's current figure.
plt.figure()
Out[8]:
In [11]:
# Display the open figure(s).
plt.show()
In [10]:
## Compare the number of small houses across years, one figure per year.
## Things to show: use figure(), hist() and show().
# Year 2000: 15-bin histogram of the per-district totals.
plt.hist(small_flats['2000'].values, bins=15)
plt.title("# small houses in 2000")
plt.show();
In [12]:
# Year 2005 goes into a fresh figure, drawn in green with the same bin count.
plt.figure()
plt.hist(small_flats['2005'].values, bins=15, color="g")
plt.title("# small houses in 2005")
plt.show()
Okay, it is far from ideal to compare distributions on separate figures... Try subplots...
In [13]:
# Start a 10x8 inch figure laid out as 2 rows x 1 column and make the first
# (top) cell the current subplot.
plt.figure(figsize=(10,8))
plt.subplot(2, 1, 1)
Out[13]:
In [14]:
# 15-bin green histogram of the 2005 totals, drawn into the current subplot.
plt.hist(small_flats['2005'].values, color='g', bins=15)
Out[14]:
In [15]:
# Title and x-axis label for the current subplot.
plt.title("# of houses in 2005")
plt.xlabel("# of houses with 1 or 2 flats")
Out[15]:
In [ ]:
# Render the figure as built so far.
plt.show()
In [16]:
# Make the second (bottom) cell of the 2x1 grid current and give it a title.
plt.subplot(2,1,2)
plt.title("# of houses in 2011");
Well, that's a bit clumsy with the overlaps...
In [17]:
# Increase the vertical gap between the subplot rows, then redraw.
plt.subplots_adjust(hspace=.4)
plt.show()
In [18]:
# 15-bin light-green histogram of the 2011 totals in the current subplot.
plt.hist(small_flats['2011'].values, color='lightgreen', bins=15)
plt.show()
In [21]:
# Read the x-axis range of each subplot; plt.xlim() without arguments
# returns the (min, max) of whichever axes is current.
plt.subplot(2, 1, 1)
xmin1, xmax1 = plt.xlim()
print("x limits on first plot", (xmin1, xmax1))

plt.subplot(2, 1, 2)
# These are the second subplot's limits; the label previously repeated
# "first plot" (copy-paste slip), which made the two lines indistinguishable.
print("x limits on second plot", plt.xlim())
In [22]:
# Apply the x-range captured from the first subplot to the current axes.
plt.xlim(xmin1,xmax1)
plt.show()
Well, well, good idea but it just does not work out (same number of bins but different width)... try something different.
In [25]:
# Fetch the current figure (the return value is not used here), then redraw.
plt.gcf()
plt.show()
In [26]:
# close current figure
plt.close()
# switch to inline plotting
%matplotlib inline
# set the embedded figure formats again now that the inline backend is active
set_matplotlib_formats('pdf', 'png', 'svg')
In [38]:
# Bin edges of width 150 covering the combined range of the 2005 and 2011
# columns. They are built here because this cell comes before the one that
# otherwise creates `common_bins`; on a fresh kernel the name would be
# undefined and the histogram call would raise NameError.
both_years = small_flats[['2005', '2011']].values
common_bins = np.arange(np.min(both_years), np.max(both_years) + 150, 150)
plt.hist(small_flats['2005'], bins=common_bins)
Out[38]:
In [40]:
# Bin edges of width 150 spanning the combined range of both years, so the
# two histograms use identical bins.
both_years = small_flats[['2005', '2011']].values
common_bins = np.arange(np.min(both_years), np.max(both_years) + 150, 150)

# Two stacked panels sharing both the x and the y axis.
fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,
                         figsize=(6, 6))
ax0, ax1 = axes

# top panel: 2005 in green
ax0.hist(small_flats['2005'], bins=common_bins, color="g")
ax0.set_xlabel("2005")

# bottom panel: 2011 in semi-transparent red
ax1.hist(small_flats['2011'], bins=common_bins, color="r", alpha=.6)
ax1.set_xlabel("2011")

fig.suptitle("Comparison of 2005 and 2011");
In [45]:
## Overlay two per-district distributions for one year: a subset of the
## flat-count categories (red) against all categories together (green).
fig = plt.figure(figsize=(6,6))

# NOTE(review): the filter keeps categories '2' and '21-50', while the
# variable is called `large_houses` and the original comment spoke of
# "1 or 2 flat houses" — confirm which categories are intended.
# numeric_only=True keeps the string 'Flatcount' column out of the sums;
# recent pandas would otherwise add it as a concatenated object column.
large_houses = (
    houses[houses['Flatcount'].isin(['2', '21-50'])]
    .groupby('District')
    .sum(numeric_only=True)
)
bydistrict = houses.groupby("District").sum(numeric_only=True)
year_of_interest = '2011'

# Bin edges of width 50 covering the range of the all-category totals, shared
# by both histograms so the bars line up.
year_totals = bydistrict[year_of_interest].values
common_bins = np.arange(year_totals.min(), year_totals.max() + 50, 50)

plt.hist(large_houses[year_of_interest].values, bins=common_bins,
         color="red", edgecolor='none')
plt.hist(year_totals, bins=common_bins,
         color="green", alpha=.6, edgecolor='none');
In [46]:
# Preview the first rows of the raw table again.
houses.head()
Out[46]:
In [78]:
# Collect the "1946" values per 'Flatcount' category: one array (one box)
# per category, with the category key as its label.
vals = []
labels = []
for k, group in houses.groupby('Flatcount'):
    labels.append(k)
    vals.append(group["1946"].values)

fig = plt.figure(figsize=(6,6))
ax = fig.gca()
ax.boxplot(vals, labels=labels,
           boxprops=dict(color="darkgreen", linewidth=.8))
ax.set_title("# of houses with various flatcounts, 1946")
ax.set_xlabel("Flat count ranges")
ax.set_ylabel("Count")

# Place a y tick every 250 up to the largest value. The maximum is taken per
# group because np.max() on a list of arrays only works when every group has
# the same length; ragged groups raise (or compare whole arrays) instead.
y_top = max(group_vals.max() for group_vals in vals)
ax.set_yticks(np.arange(0, y_top + 250, 250))

# Rotate each x label 10 degrees more than the previous one.
for i, xtick in enumerate(ax.get_xticklabels()):
    xtick.set_rotation(i*10)

## account for the layout changes caused by rotating labels on x axis
fig.tight_layout()
In [51]:
# Total every numeric column per district. numeric_only=True keeps the string
# 'Flatcount' column out of the result: recent pandas would otherwise
# concatenate its labels into an object column, and later cells that call
# np.max()/pcolormesh on `bydistrict.values` need a purely numeric table.
bydistrict = houses.groupby("District").sum(numeric_only=True)
bydistrict.head()
Out[51]:
In [77]:
# One scatter series per selected column, all on the same axes, with the
# district index on the x axis.
scatter_fig, scatter_ax = plt.subplots(figsize=(6, 6))
mk = "o"
districts = bydistrict.index

scatter_ax.scatter(x=districts, y=bydistrict['2011'],
                   edgecolor='none', marker=mk)
scatter_ax.scatter(districts, bydistrict['2005'], c="g",
                   edgecolor='none', marker=mk)
## Note: the two series above have no label, so the legend does not list
## them; the next one is passed as a plain array through `.values`.
scatter_ax.scatter(districts, bydistrict['2000'].values, c="r",
                   edgecolor='none', marker=mk, label="Series from 2000")
scatter_ax.scatter(districts, bydistrict['1990'], c="lightblue",
                   edgecolor='none', label="1980-1990", marker=mk)

# Fixed x range and a y range from zero up to the largest value in the table.
scatter_ax.set_xlim(0, 24)
scatter_ax.set_ylim(0, np.max(bydistrict.values))

# One tick per position 1..23, labelled with the district index.
scatter_ax.set_xticks(np.arange(1, 24))
scatter_ax.set_xticklabels(["District %2s" % i for i in districts])
plt.setp(scatter_ax.get_xticklabels(),
         rotation=60, horizontalalignment='right')

scatter_ax.legend(fontsize='medium', frameon=True)
# re-fit the layout so the rotated labels stay inside the figure
scatter_fig.tight_layout()
In [61]:
# Show the per-district table with the notebook's rich (HTML) display rather
# than print(), which only gives plain, possibly wrapped text.
bydistrict
In [76]:
# Heat map of the per-district table: one row per district, one column per
# table column, coloured with the 'Reds' colormap.
fig = plt.figure(figsize=(6,6))
ax = plt.gca()
plt.pcolormesh(bydistrict.values, cmap=plt.cm.Reds)
ax.xaxis.tick_top()
plt.colorbar(orientation='vertical')

# we don't need the frame just now (this used to pass True, which drew it)
ax.set_frame_on(False)

n_rows, n_cols = bydistrict.shape
# set the y limit explicitly (otherwise it's gonna be longer than needed).
# The mesh spans exactly n_rows cells, so use the row count rather than the
# largest index label, which only matches when labels run 1..n_rows.
ax.set_ylim(0, n_rows)
# shift the ticks by .5 unit so they sit at the centre of each cell
ax.set_yticks(np.arange(n_rows) + .5)
ax.set_xticks(np.arange(n_cols) + .5)
# invert the otherwise sorted y axis so the first row is at the top
ax.invert_yaxis()

# create the labels for the major ticks
xlabels = bydistrict.columns.values
ylabels = ["District %2s" % i for i in bydistrict.index]
# set up labels on the two axes
ax.set_xticklabels(xlabels, minor=False)
ax.set_yticklabels(ylabels, family="monospace",
                   minor=False, fontstretch='extra-condensed')
# rotate the x tick labels
plt.xticks(rotation=45)

# just to be sure, switch the grid off
ax.grid(False)
# Hide the tick marks on all four sides but keep the labels. The per-tick
# tick1On/tick2On attributes used before no longer exist in current
# matplotlib, so assigning them silently did nothing.
ax.tick_params(axis='both', which='major',
               bottom=False, top=False, left=False, right=False)

fig.tight_layout()
In [73]:
# Shell out to list every matplotlib style sheet (*.mplstyle) under ~/anaconda/.
# NOTE(review): the search root is specific to the author's machine.
!find ~/anaconda/ -name "*.mplstyle"
In [74]:
# Names of the style sheets that can be passed to plt.style.use().
plt.style.available
Out[74]:
In [79]:
# To try another style, uncomment the next line and re-run an earlier plot cell.
# plt.style.use('ggplot')
## re-plot something from the previous cells
# Switch back to the 'default' style sheet.
plt.style.use('default')
In [80]:
# Under IPython with the inline backend, going back to the 'default' style is
# not enough: the inline backend uses its own rcParams, so activate it again.
%matplotlib inline
# see matplotlib.rcParams
In [81]:
import pandas as pd # was already imported
In [83]:
# Bar chart of the '2011' column, one bar per district, through the pandas
# plotting accessor.
bydistrict['2011'].plot.bar()
Out[83]:
In [86]:
# what happens when multiple columns are passed at the same time?
# The call below plots '2005' and '2011' together on a 6x6 inch figure with a legend.
bydistrict[['2005','2011']].plot.bar(figsize=(6,6), legend=True)
Out[86]:
In [ ]:
# there are various types of plots supported; print the built-in
# documentation of DataFrame.plot to see them
help(pd.DataFrame.plot)
In [87]:
# let's check out a histogram of two columns
# `density=True` normalises each histogram to unit area. It replaces the
# former `normed=True` keyword, which current matplotlib rejects.
bydistrict[['2005','2011']].plot.hist(density=True, alpha=.6)
Out[87]:
In [88]:
## would be nice to see the density plot as well
# Draw the normalised histograms first and keep the Axes they return.
# `density=True` replaces the former `normed=True`, which current matplotlib
# rejects; the histograms must be normalised to share a scale with the KDE.
hist_ax = bydistrict[['2005','2011']].plot.hist(
    density=True, alpha=.6, color=["blue", 'green'])
# reuse the axes object by passing it in as 'ax', so the density curves are
# drawn on top of the histograms
bydistrict[['2005','2011']].plot.kde(ax=hist_ax,
                                     color=["blue", "orange"],
                                     linewidth=3)
Out[88]:
In [90]:
# more complex use of the mpl library: a 2x2 grid with shared axes, the
# histograms in the top row and the matching density curves in the bottom row
fig, axes = plt.subplots(2, 2, figsize=(6,6), sharex=True, sharey=True)
# `density=True` replaces the former `normed=True`, which current matplotlib
# rejects.
bydistrict[['2005','2011']].plot.hist(subplots=True, density=True,
                                      alpha=.6, color=["blue", 'green'],
                                      ax=[axes[0,0], axes[0,1]])
bydistrict[['2005','2011']].plot.kde(subplots=True,
                                     color=["blue", "green"], linewidth=3,
                                     ax=[axes[1,0], axes[1,1]]);
In [91]:
import seaborn as sns
In [92]:
# create a facet grid based on one variable ('Flatcount'): one panel per
# distinct value, wrapped after 3 columns, with shared x and y axes
grid = sns.FacetGrid(data=houses,col="Flatcount", col_wrap=3,
sharex=True, sharey=True)
# and for each slice draw a barchart of the '2005' column against the
# District number
# NOTE(review): this comment used to say 2011, but the column plotted is
# '2005' — confirm which year is intended.
grid.map(plt.bar,"District",'2005');
In [93]:
# One violin per column of the per-district table (wide-form input). The
# frame is passed as `data=` explicitly because the meaning of the first
# positional argument has differed between seaborn releases.
sns.violinplot(data=bydistrict)
Out[93]: