In [1]:
"""
IPython Notebook v4.0 para python 2.7
Librerías adicionales: numpy, matplotlib
# Contenido bajo licencia CC-BY 4.0. Código bajo licencia MIT. (c) Sebastian Flores.
"""
# Configuracion para recargar módulos y librerías
%reload_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.core.display import HTML
HTML(open("style/mat281.css", "r").read())
# Imágenes: Copyright a autores respectivos.
# Gráficos: Tomados de http://matplotlib.org/gallery.html y modificados.
Out[1]:
Seguramente está exagerando...
¿Por qué utilizamos gráficos para presentar datos?
El cerebro ha sido entrenado evolutivamente para interpretar la información visual.
“The eye and the visual cortex of the brain form a massively parallel processor that provides the highest bandwidth channel into human cognitive centers” — Colin Ware, Information Visualization, 2004.
In [2]:
%%bash
# Print the raw contents of the Anscombe data file to the cell output.
cat data/anscombe.txt
In [3]:
import numpy as np
from scipy import stats

# Columns (2*i, 2*i+1) of the file are read as the (x, y) sample of group i+1.
data = np.loadtxt("data/anscombe.txt", delimiter=",")
for i in range(4):
    x = data[:, 2*i]
    y = data[:, 2*i + 1]
    # Ordinary least-squares fit of y against x for this group.
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Grupo %d:" % (i + 1))
    print("\tTiene pendiente m=%.2f e intercepto b=%.2f" % (slope, intercept))
    # linregress returns the correlation coefficient r; square it so the
    # printed number actually matches its "R^2" label.
    print("\tTiene R^2=%.4f p value=%.4f y std_err=%.4f" % (r_value**2, p_value, std_err))
In [6]:
from matplotlib import pyplot as plt
import numpy as np

# Anscombe's quartet: one scatter panel per group with its fitted line on top.
# NOTE: `stats` is not imported here; this cell relies on the earlier
# `from scipy import stats` having been executed in the same kernel.
data = np.loadtxt("data/anscombe.txt", delimiter=",")
fig = plt.figure(figsize=(16,8))
for group in range(1, 5):
    # Group k uses columns 2k-2 (x) and 2k-1 (y) of the file.
    x = data[:, 2*group - 2]
    y = data[:, 2*group - 1]
    plt.subplot(2, 2, group)
    plt.plot(x, y, 'o')
    # Identical axis limits in every panel so the groups can be compared directly.
    plt.xlim([2,20])
    plt.ylim([2,20])
    plt.title("Grupo %d" % group)
    # Least-squares line m*x + b evaluated on a fixed grid of abscissas.
    m, b, r_value, p_value, std_err = stats.linregress(x, y)
    x_aux = np.linspace(2,16,20)
    plt.plot(x_aux, m*x_aux + b, 'r', lw=2.0)
plt.suptitle("Cuarteto de Anscombe")
plt.show()
El principio básico a respetar es que a partir del gráfico uno debe poder reobtener fácilmente los datos originales.
El ojo humano no tiene la misma precisión al estimar distintos atributos:
In [ ]:
from matplotlib import pyplot as plt

# Square canvas holding one manually positioned axes.
plt.figure(figsize=(6,6))
ax = plt.axes([0.1, 0.1, 0.8, 0.8])

# Slice data; the slices are ordered and plotted counter-clockwise.
my_labels = ('Frogs', 'Hogs', 'Dogs', 'Logs')
my_fracs = [15, 30, 45, 10]
my_explode = (0, 0.10, 0.10, 0)  # only the 2nd and 3rd slices get a non-zero offset

# Decorations: percentage annotation on each slice, drop shadow, start angle of 90.
pie_options = dict(explode=my_explode, labels=my_labels, autopct='%1.1f%%',
                   shadow=True, startangle=90)
plt.pie(my_fracs, **pie_options)
plt.title('Raining Hogs and Dogs', bbox=dict(facecolor='0.8', pad=5))
plt.show()
In [ ]:
import numpy as np
from matplotlib import pyplot as plt

N = 31
x = np.arange(N)
# Two noisy linear trends. The float literals force true division: under
# Python 2, `20*x/N` on an integer array is floor division, which would turn
# the trend into integer steps.
y1 = 80 + 20.0*x/N + 5*np.random.rand(N)
y2 = 75 + 25.0*x/N + 5*np.random.rand(N)

fig = plt.figure(figsize=(16,8))
# Format strings (y1, y2) used in each of the four panels; the data are the
# same everywhere, only the marker/colour encoding changes.
panel_formats = [('ok', 'sk'), ('ob', 'or'), ('ob', '*r'), ('sr', 'ob')]
for panel, (fmt1, fmt2) in enumerate(panel_formats, 1):
    plt.subplot(2, 2, panel)
    plt.plot(x, y1, fmt1)
    plt.plot(x, y2, fmt2)
plt.show()
Puesto que la percepción del color tiene muy baja precisión, resulta inadecuado tratar de representar un valor numérico con colores.
In [ ]:
import matplotlib
import numpy as np
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Regular sampling grid over [-3, 3) x [-2, 2).
delta = 0.025
x = np.arange(-3.0, 3.0, delta)
y = np.arange(-2.0, 2.0, delta)
X, Y = np.meshgrid(x, y)

# Surface to display: scaled difference of two bivariate normal surfaces.
# NOTE(review): mlab.bivariate_normal is not available in recent matplotlib
# releases - confirm against the installed version.
Z1 = mlab.bivariate_normal(X, Y, 1.0, 1.0, 0.0, 0.0)
Z2 = mlab.bivariate_normal(X, Y, 1.5, 0.5, 1, 1)
Z = 10.0 * (Z2 - Z1)

plt.figure(figsize=(16,8))
# The same image is drawn in every panel; only the colormap differs.
for panel, colormap in enumerate([cm.rainbow, cm.autumn, cm.coolwarm, cm.gray], 1):
    plt.subplot(2,2,panel)
    im = plt.imshow(Z, interpolation='bilinear', origin='lower',
                    cmap=colormap, extent=(-3, 3, -2, 2))
    plt.colorbar(im, shrink=0.8)
plt.show()
In [ ]:
import matplotlib
import numpy as np
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Regular sampling grid over [-3, 3) x [-2, 2).
delta = 0.025
x = np.arange(-3.0, 3.0, delta)
y = np.arange(-2.0, 2.0, delta)
X, Y = np.meshgrid(x, y)

# Surface to contour: scaled difference of two bivariate normal surfaces.
Z1 = mlab.bivariate_normal(X, Y, 1.0, 1.0, 0.0, 0.0)
Z2 = mlab.bivariate_normal(X, Y, 1.5, 0.5, 1, 1)
Z = 10.0 * (Z2 - Z1)

plt.figure(figsize=(16,8))

# Panel 1: colour-mapped contour lines without labels.
plt.subplot(2,2,1)
CS = plt.contour(X, Y, Z, 9, cmap=cm.rainbow)

# Panels 2-4: labelled contours. Each entry gives the linestyle written to
# rcParams for negative contours and the colouring arguments for that panel.
labelled_panels = [
    ('solid', dict(cmap=cm.rainbow)),
    ('solid', dict(colors='k')),
    ('dashed', dict(colors='k')),
]
for panel, (negative_style, coloring) in enumerate(labelled_panels, 2):
    matplotlib.rcParams['contour.negative_linestyle'] = negative_style
    plt.subplot(2,2,panel)
    CS = plt.contour(X, Y, Z, 9, **coloring)
    plt.clabel(CS, fontsize=9, inline=1)

# The grid is switched on for the current axes, i.e. the last panel drawn.
plt.grid('on')
plt.show()
Clasificación de datos:
In [ ]:
import matplotlib.pyplot as plt
import numpy as np

# Brand -> country associated with it.
brands = {"MSI":"Taiwan", "Asus":"Taiwan", "Acer":"Taiwan",
          "HP":"EEUU", "Dell":"EEUU", "Apple":"EEUU",
          "Sony":"Japon", "Toshiba":"Japon",
          "Lenovo":"Hong Kong",
          "Samsung":"Corea del Sur"}
# Country -> numeric code used as the bar height.
C2N = {"Taiwan":1,"EEUU":2,"Japon":3,"Hong Kong":4,"Corea del Sur":7}

# Materialise the key order once so bar positions, heights and tick labels
# are guaranteed to line up.
names = list(brands.keys())
x = np.arange(len(names))
y = np.array([C2N[brands[name]] for name in names])
width = 0.35  # the width of the bars
fig, ax = plt.subplots(figsize=(16,8))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(x)
print(y)
# Left-edge alignment is requested explicitly: the tick offset below assumes
# it, and the default alignment of `bar` differs between matplotlib versions.
rects1 = ax.bar(x, y, width, color='r', align='edge')
# Ticks at the centre of each bar, labelled with the brand name.
ax.set_xticks(x + 0.5*width)
# Rotation is passed as a number rather than the string "90".
ax.set_xticklabels(names, rotation=90)
# Lists instead of dict views, so the tick setters receive plain sequences.
ax.set_yticks(list(C2N.values()))
ax.set_yticklabels(list(C2N.keys()))
plt.xlim([-1,len(x)+1])
plt.ylim([-1,y.max()+1])
plt.show()
Clasificación de datos:
In [7]:
import numpy as np
from matplotlib import pyplot as plt

# Twelve random values in [80, 100), one per x position 1..12.
x = range(1,13)
y = 80 + 20*np.random.rand(12)
tick_labels = ["E","F","M","A","M","J","J","A","S","O","N","D"]

fig = plt.figure(figsize=(16,8))
# The same series is drawn twice: the left panel keeps the automatic y-range,
# the right panel forces the y-axis to [0, 100].
for panel, y_limits in ((1, None), (2, [0,100])):
    plt.subplot(1, 2, panel)
    plt.plot(x, y, 'o-')
    plt.xticks(x, tick_labels)
    plt.xlim([-1,13])
    if y_limits is not None:
        plt.ylim(y_limits)
plt.show()
In [8]:
import numpy as np
from matplotlib import pyplot as plt

x = np.linspace(0, 1, 50)
# Four noisy curves; the random draws happen in the order f1, g1, f2, g2.
f1 = x**2 + .2*np.random.rand(50)
g1 = x + .2*np.random.rand(50)
f2 = 0.5 - 0.2*x + .2*np.random.rand(50)
g2 = x**3 + .2*np.random.rand(50)

fig = plt.figure(figsize=(16,8))
# One row per panel: (row, title, (Chile data, format), (OECD data, format),
# legend kwargs). Note that the line formats of the two labels are swapped in
# the second panel and that its legend uses the default location.
panels = [
    (1, "Antes de mi trabajo", (f1, 'b'), (g1, 'g:'), dict(loc="upper left")),
    (2, "Despues de mi trabajo", (f2, 'g:'), (g2, 'b'), dict()),
]
for row, heading, (chile, chile_fmt), (oecd, oecd_fmt), legend_options in panels:
    plt.subplot(2, 1, row)
    plt.title(heading)
    plt.plot(x, chile, chile_fmt, label='Chile', lw=2.0)
    plt.plot(x, oecd, oecd_fmt, label='OECD', lw=2.0)
    plt.legend(**legend_options)
plt.show()
Elementos para la creación de una buena visualización
El principio básico a respetar es que a partir del gráfico uno debe poder reobtener fácilmente los datos originales.
In [9]:
from matplotlib import pyplot as plt
import numpy as np

people = ('Tom', 'Dick', 'Harry', 'Slim', 'Jim')
n_people = len(people)
y_pos = np.arange(n_people)
# Random bar lengths in [3, 13) and random error magnitudes in [0, 1).
performance = 3 + 10 * np.random.rand(n_people)
error = np.random.rand(n_people)

# Styling shared by both panels.
shared_style = dict(align='center', color="g")

fig = plt.figure(figsize=(16,8))

# Left panel: horizontal bars, names on the y-axis.
plt.subplot(1,2,1)
plt.barh(y_pos, performance, xerr=error, alpha=0.4, **shared_style)
plt.yticks(y_pos, people)
plt.xlabel('Performance')

# Right panel: the same data as vertical bars, names on the x-axis.
plt.subplot(1,2,2)
plt.bar(y_pos, performance, yerr=error, alpha=0.6, **shared_style)
plt.xticks(y_pos, people)
plt.xlabel('People')
plt.ylabel('Performance')
plt.show()
Evitar: gráfico de nominal vs nominal.
In [ ]:
from matplotlib import pyplot as plt

# Slice data shared by both pies.
my_labels = ('Frogs', 'Hogs', 'Dogs', 'Logs')
my_fracs = [15, 30, 45, 10]
my_explode = (0, 0.10, 0.10, 0)  # only the 2nd and 3rd slices get a non-zero offset

fig = plt.figure(figsize=(16,8))

# Left: plain pie with labels only.
plt.subplot(1,2,1)
plt.pie(my_fracs, labels=my_labels)

# Right: same data with exploded slices, percentage annotations, shadow,
# a start angle of 90 and a boxed title.
plt.subplot(1,2,2)
decorations = dict(explode=my_explode, autopct='%1.1f%%', shadow=True, startangle=90)
plt.pie(my_fracs, labels=my_labels, **decorations)
plt.title('Raining Hogs and Dogs', bbox=dict(facecolor='0.8', pad=5))
plt.show()
Evitar: gráfico de nominal vs nominal.
In [ ]:
import matplotlib.pyplot as plt
import numpy as np
from numpy import ma

# Vector field (cos x, sin y) sampled on a square grid over [0, 2*pi) with step 0.2.
ticks = np.arange(0, 2 * np.pi, .2)
X, Y = np.meshgrid(ticks, ticks)
U = np.cos(X)
V = np.sin(Y)

fig = plt.figure(figsize=(16,8))

# Left panel: every arrow, positioned by array index (no X/Y passed).
plt.subplot(1,2,1)
Q = plt.quiver(U, V)
qk = plt.quiverkey(Q, 0.5, 0.92, 2, r'$2 \frac{m}{s}$', labelpos='W',
                   fontproperties={'weight': 'bold'})
# Widen the current axis limits by 5% of their span on every side.
left, right, bottom, top = plt.axis()
pad_x = 0.05*(right - left)
pad_y = 0.05*(top - bottom)
plt.axis([left - pad_x, right + pad_x, bottom - pad_y, top + pad_y])

# Right panel: every third arrow in each direction, centred on its grid
# point, with the grid points themselves marked as black dots.
plt.subplot(1,2,2)
every_third = (slice(None, None, 3), slice(None, None, 3))
Q = plt.quiver(X[every_third], Y[every_third], U[every_third], V[every_third],
               pivot='mid', color='r', units='inches')
qk = plt.quiverkey(Q, 0.5, 0.03, 1, r'$1 \frac{m}{s}$',
                   fontproperties={'weight': 'bold'})
plt.plot(X[every_third], Y[every_third], 'k.')
plt.axis([-1, 7, -1, 7])
plt.title("pivot='mid'; every third arrow; units='inches'")
plt.show()
Evitar: gráfico de campo de vectores si no es posible la interpretación correspondiente.
In [ ]:
import matplotlib
import numpy as np
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

# Regular sampling grid over [-3, 3) x [-2, 2).
delta = 0.025
x = np.arange(-3.0, 3.0, delta)
y = np.arange(-2.0, 2.0, delta)
X, Y = np.meshgrid(x, y)

# Surface to contour: scaled difference of two bivariate normal surfaces.
Z1 = mlab.bivariate_normal(X, Y, 1.0, 1.0, 0.0, 0.0)
Z2 = mlab.bivariate_normal(X, Y, 1.5, 0.5, 1, 1)
Z = 10.0 * (Z2 - Z1)

plt.figure(figsize=(16,8))
# Same black, labelled contours in both panels; only the rcParams linestyle
# for negative contours changes (solid on the left, dashed on the right).
for panel, negative_style in enumerate(('solid', 'dashed'), 1):
    matplotlib.rcParams['contour.negative_linestyle'] = negative_style
    plt.subplot(1,2,panel)
    CS = plt.contour(X, Y, Z, 9, colors='k')
    plt.clabel(CS, fontsize=9, inline=1)

# The grid is switched on for the current axes, i.e. the right-hand panel.
plt.grid('on')
plt.show()
OBSERVACIÓN: Se debe tener suficiente densidad/regularidad de puntos como para poder obtener superficies de nivel.
In [ ]:
import matplotlib.pyplot as plt
import numpy as np

N = 100
r0 = 0.6  # radius of the boundary between the two marker regions

# Random positions in [0, 0.9)^2 and random marker areas (0 to 10 point radii).
x = 0.9*np.random.rand(N)
y = 0.9*np.random.rand(N)
area = np.pi*(10 * np.random.rand(N))**2
c = np.sqrt(area)            # colour value of each point
r = np.sqrt(x*x + y*y)       # distance of each point from the origin

cm1 = plt.cm.get_cmap('RdYlBu')
cm2 = plt.cm.get_cmap('Greys')

plt.figure(figsize=(16,8))
# Two scatter layers over the same points. In each layer the marker sizes
# are masked where the condition holds, and each layer gets its own colorbar.
layers = [
    (r < r0, '^', cm1),
    (r >= r0, 'o', cm2),
]
for masked_out, marker, colormap in layers:
    sizes = np.ma.masked_where(masked_out, area)
    layer = plt.scatter(x, y, s=sizes, marker=marker, c=c, cmap=colormap)
    plt.colorbar(layer)

# Show the boundary between the regions: quarter circle of radius r0.
theta = np.arange(0, np.pi/2, 0.01)
plt.plot(r0*np.cos(theta), r0*np.sin(theta), "k:", lw=2.0)
plt.show()
OBSERVACIÓN: Si hay pocos puntos, también puede usarse para z datos de tipo posicional o cuantitativo.
In [ ]:
import numpy as np
import matplotlib.pyplot as plt

# Samples of exp(-x) at x = 0.1, 0.6, ..., 3.6.
x = np.arange(0.1, 4, 0.5)
y = np.exp(-x)

plt.figure(figsize=(16,8))
# Left panel: error bars along x; right panel: error bars along y.
# Each panel draws its own random error magnitudes in [0.1, 0.3).
for panel, error_keyword in enumerate(('xerr', 'yerr'), 1):
    plt.subplot(1,2,panel)
    magnitudes = 0.1 + 0.2*np.random.rand(len(x))
    plt.errorbar(x, y, **{error_keyword: magnitudes})
plt.show()