For hints on improving on matplotlib's default style, see this alternate notebook
In [24]:
%matplotlib inline
from urllib import urlopen
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
# Notebook-wide figure defaults: 10 x 6 inch canvas at 150 dots per inch.
rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 150,
})
In [25]:
# Remote CSV files used by the examples below (fetched over the network).
DIAMONDS_URL = 'https://raw.github.com/cpcloud/pydatasets/master/datasets/ggplot2/diamonds.csv'
TITANIC_URL = 'http://www.columbia.edu/~cjd11/charles_dimaggio/DIRE/resources/R/titanic.csv'

# pd.read_csv accepts a URL directly, so the connection does not have to be
# opened by hand. This avoids leaving the HTTP responses unclosed and avoids
# rebinding the name `file`, which is a builtin on Python 2.
diamonds = pd.read_csv(DIAMONDS_URL)
titanic = pd.read_csv(TITANIC_URL)
In [26]:
# Change in high-school graduation rate per city, listed from the largest
# gain to the largest drop. `change` and `city` are parallel lists: entry i of
# one belongs to entry i of the other (50 entries each).
change = [23.2, 22.7, 19.7, 13.9, 13.1, 12.8, 12.7,
          12.6, 12.0, 11.5, 10.8, 10.4, 10.4, 9.8, 9.2,
          9.2, 8.8, 7.7, 6.9, 6.9, 6.4, 5.6, 5.3, 5.3, 5.2, 4.9,
          4.8, 4.6, 3.6, 3.1, 0.7, -.3, -.7, -1.2, -1.5, -1.7,
          -1.7, -1.8, -2, -2.3, -2.4, -3.6, -3.7,
          -4.9, -6.5, -6.6, -11.6, -14.8, -17.6, -23.1]

# These strings are shown verbatim as tick labels in the charts below.
city = ['Philadelphia', 'Tucson', 'Kansas City, MO',
        'El Paso', 'Portland, Ore.', 'New York', 'Dallas',
        'Columbus', 'Mesa', 'Austin', 'Atlanta', 'Fort Worth',
        'Miami', 'Houston', 'Chicago', 'Oakland', 'Virginia Beach',
        'Baltimore', 'Denver', 'Detroit', 'San Antonio', 'Phoenix',
        'Oklahoma City', 'Indianapolis', 'Milwaukee', 'Sacramento',
        'Washington, D.C.', 'Colorado Springs', 'Honolulu', 'Nashville',
        'Jacksonville', 'Louisville', 'Seattle',
        'Memphis', 'Fresno', 'Boston', 'Minneapolis',
        'San Jose', 'Tulsa', 'Charlotte', 'San Diego', 'Los Angeles',
        'Long Beach', 'Cleveland', 'San Francisco', 'Albuquerque',
        'Arlington, TX', 'Omaha', 'Wichita', 'Las Vegas']

# One row per city, with columns 'change' and 'city'.
grad = pd.DataFrame({'change': change, 'city': city})
In [27]:
# Horizontal bar chart of the cities whose graduation rate went up.
plt.figure(figsize=(3, 8))

gained = grad.change > 0
change = grad.change[gained].values
city = grad.city[gained].values
pos = np.arange(len(change))

plt.title('1995-2005 Change in HS graduation rate')
# align='edge' is spelled out because the `+ .5` offsets below assume every
# bar starts at its integer position; matplotlib >= 2.0 centres bars by default.
plt.barh(pos, change, align='edge')
# label every bar with its city (same placement as the negative-change chart)
plt.yticks(pos + .5, city)

#add the numbers to the side of each bar
for p, ch in zip(pos, change):
    plt.annotate(str(ch), xy=(ch + 1, p + .5), va='center')

#set plot limits
# The y axis is inverted (largest gain on top). The lower bound must reach past
# the last bar, which spans pos.max() .. pos.max() + .8; stopping at pos.max()
# would clip that bar out of the figure.
plt.ylim(pos.max() + 1, pos.min() - 1)
plt.xlim(0, 30)
Out[27]:
In [28]:
# Horizontal bar chart of the cities whose graduation rate went down.
dropped = grad.change < 0
change = grad.change[dropped].values
city = grad.city[dropped].values
pos = np.arange(len(change))

red = (0.78, 0.22, 0.18) # RGB triplet

plt.figure(figsize=(3, 6), dpi=200)
# align='edge' is spelled out because the `+ .5` offsets used for the ticks and
# annotations assume every bar starts at its integer position; matplotlib >= 2.0
# centres bars by default.
plt.barh(pos, change, align='edge', color=red)
plt.yticks(pos + .5, city)

#add the numbers to the side of each bar
for p, ch in zip(pos, change):
    # bars extend to the left of zero, so the text is right-aligned just past the bar end
    plt.annotate(str(ch), xy=(ch - 1, p + .5), va='center', ha='right')

# inverted y axis: the first entry is drawn at the top
plt.ylim(pos.max() + 1, pos.min() - .5)
plt.xlim(-30, 0)
plt.title('1995-2005 Change in HS graduation rate')
Out[28]:
In [29]:
# Vertical bars for made-up yearly values, each annotated with its height.
years = np.arange(2004, 2009)

# Seeded generator so the made-up heights (in [3000, 10000)) are identical on
# every run of the notebook.
rng = np.random.RandomState(0)
heights = rng.random_sample(years.shape) * 7000 + 3000

box_colors = ['r', 'g', 'b', 'c', 'm']
# Centre each bar on its year explicitly. Shifting the positions by -.4 only
# centres the bars when matplotlib aligns them by their edge, which is no longer
# the default since matplotlib 2.0.
plt.bar(years, heights, width=.8, align='center', color=box_colors)
plt.yticks([2000, 4000, 6000, 8000])

# write the (truncated) value 200 units above the top of each bar
for x, y in zip(years, heights):
    plt.annotate("%i" % y, (x, y + 200), ha='center')
In [30]:
def _carat_vs_price(panel, **scatter_kw):
    """Scatter diamond price against carat in subplot `panel`.

    Extra keyword arguments are forwarded to plt.scatter. Returns whatever
    plt.ylabel returns, so the cell's final value matches a bare ylabel call.
    """
    plt.subplot(panel)
    plt.scatter(diamonds.carat, diamonds.price, color='k', **scatter_kw)
    plt.ylim(0, diamonds.price.max())
    plt.xlim(0, 5)
    plt.xlabel("Carat")
    return plt.ylabel("Price")


# Side-by-side comparison of the same data with and without transparency.
plt.figure(tight_layout=True, figsize=(6, 4))
_carat_vs_price(121)             # left: default marker opacity
_carat_vs_price(122, alpha=.01)  # right: nearly transparent markers
Out[30]:
In [31]:
# Keep only the diamonds under two carats: x is carat, y is price.
under_two = diamonds.carat < 2
x = diamonds.carat[under_two]
y = diamonds.price[under_two]

# Second-order polynomial fit, evaluated at 20 evenly spaced carat values
# running from the smallest observed carat up to 2.
params = np.polyfit(x, y, 2)
xp = np.linspace(x.min(), 2, 20)
yp = np.polyval(params, xp)

# Standard deviation of the fit residuals; used as the half-width of the band.
residuals = y - np.polyval(params, x)
sig = np.std(residuals)

# Draw order: raw points first, then the fitted curve, then the shaded band.
plt.plot(x, y, 'o', mec='none', alpha=.05)
plt.plot(xp, yp, 'k')
plt.fill_between(xp, yp - sig, yp + sig, color='k', alpha=0.2)

plt.xlabel("Carat")
plt.ylabel("Price")
plt.xlim(0, 2)
Out[31]:
In [32]:
# Number of rows per passenger class (indexed by the `pclass` value).
t = titanic.groupby(['pclass']).size()
# Parenthesised print: valid on Python 3 and prints the same on Python 2.
print(t)

# aspect='equal' gives both axes the same scale so the pie is drawn round
# (matplotlib documents 'equal' as the same as aspect=1).
plt.subplot(aspect='equal')
plt.pie(t, labels=t.index.values, autopct='%i%%')
plt.title("Passenger Class on the Titanic")
Out[32]:
In [33]:
def _stacked_survival_bars(table, ylabel, colors, with_legend=False):
    """Draw one stacked bar per passenger class in the current subplot.

    table       -- frame with one row per class and columns 0 and 1; column 0
                   forms the lower segment ('Died'), column 1 is stacked on top
                   of it ('Survived')
    ylabel      -- text for the y axis
    colors      -- (lower, upper) colours for the two segments
    with_legend -- when true, add a legend in the upper left corner
    """
    lower_color, upper_color = colors
    lefts = [0, 1, 2]
    # align='edge' is spelled out because the tick positions below assume each
    # bar starts at its integer position; matplotlib >= 2.0 centres bars by default.
    plt.bar(lefts, table[0], align='edge', color=lower_color, label='Died')
    plt.bar(lefts, table[1], bottom=table[0], align='edge', color=upper_color, label='Survived')
    plt.xticks([0.5, 1.5, 2.5], ['1st Class', '2nd Class', '3rd Class'], rotation='horizontal')
    plt.ylabel(ylabel)
    plt.xlabel("")
    if with_legend:
        plt.legend(loc='upper left')


# Rows: passenger class; columns: the values of `survived` (0 and 1).
tclass = titanic.groupby(['pclass', 'survived']).size().unstack()
# Parenthesised print: valid on Python 3 and prints the same on Python 2.
print(tclass)

red, blue = '#B2182B', '#2166AC'

# left panel: absolute counts
plt.subplot(121)
_stacked_survival_bars(tclass, "Number", (red, blue), with_legend=True)

#normalize each row so its two entries sum to 1 (divide every row by its own total)
tclass = tclass.astype(float).div(tclass.sum(axis=1), axis=0)

# right panel: per-class fractions
plt.subplot(122)
_stacked_survival_bars(tclass, "Fraction", (red, blue))

plt.show()
In [34]:
# The same depth histogram drawn twice: first with 200 bin edges between
# 50 and 70, then with 40. Each pass is shown as its own figure.
for n_edges in (200, 40):
    plt.hist(diamonds.depth, bins=np.linspace(50, 70, n_edges))
    plt.xlabel("Depth")
    plt.xlim(55, 70)
    plt.show()
In [35]:
#KernelDensity objects estimate the (log of the) density of points
#see http://scikit-learn.org/stable/modules/density.html
# Import from the public package path: the `sklearn.neighbors.kde` submodule is
# a private module that was deprecated and later removed from scikit-learn.
from sklearn.neighbors import KernelDensity

age = titanic.age.dropna().values # drop missing values, turn to normal numpy array
age = age.reshape(-1, 1) # scikit-learn expects data matrices of shape [ndata, ndim]

kde = KernelDensity(bandwidth=2).fit(age)

# evaluate the estimate at 100 evenly spaced ages across the observed range;
# score_samples is exponentiated to turn the log-density into a density
x = np.linspace(age.min(), age.max(), 100).reshape(-1, 1)
density = np.exp(kde.score_samples(x))

plt.plot(x, density)
# mark every observed age along y = 0 with a faint black dot
plt.plot(age, age * 0, 'ok', alpha=.03)
plt.ylim(-.001, .035)
plt.xlabel("Age")
plt.ylabel("Density")
Out[35]:
In [36]:
# Box plot of passenger age, split by sex.
# The age column has missing values; they are dropped here so that they do not
# take part in the quartile computation behind the boxes.
male_age = titanic.age[titanic.sex == 'male'].dropna()
female_age = titanic.age[titanic.sex == 'female'].dropna()

# hand over plain arrays so the (now gappy) Series index plays no role
plt.boxplot([male_age.values, female_age.values])
plt.ylabel("Titanic Passenger Age")
plt.xticks([1, 2], ["Male", "Female"])  # boxes sit at x = 1 and x = 2
plt.ylim(0, 85)
Out[36]:
In [37]:
from sklearn.datasets import make_blobs
from matplotlib.colors import LogNorm

# 20000 synthetic points around 3 centres; the second value returned by
# make_blobs is not used.
X, _ = make_blobs(n_samples=20000, centers=3, random_state=42, cluster_std=2)
blob_x, blob_y = X[:, 0], X[:, 1]
bounds = (-15, 15)  # shared x and y limits for both figures

# Figure 1: every point drawn as a small black marker (marker size 2).
plt.scatter(blob_x, blob_y, s=2, color='k')
plt.title("Points")
plt.xlim(*bounds)
plt.ylim(*bounds)
# NOTE(review): presumably pins the axes rectangle so this plot lines up with
# the heatmap below, whose axes shrink to make room for the colorbar - confirm.
plt.gca().set_position([.125, .125, .62, .775])
plt.show()

# Figure 2: the same points binned on a 40 x 40 grid, counts on a log colour scale.
plt.hist2d(blob_x, blob_y, bins=40, norm=LogNorm())
ax = plt.gca()
plt.title("Heatmap")
plt.colorbar()
plt.xlim(*bounds)
plt.ylim(*bounds)
plt.show()