In [1]:
%pylab --no-import-all inline
from scipy.stats import linregress, pearsonr
In [2]:
# Read the data file once (the first row is skipped as a header) and pair up
# adjacent columns (0, 1), (2, 3), (4, 5), (6, 7) into (x, y) datasets.
# Loading the eight columns in a single call avoids re-opening and re-parsing
# the file for every pair.
columns = np.loadtxt("anscombe.dat", usecols=tuple(range(8)), skiprows=1, unpack=True)
all_sets = [(columns[i], columns[i + 1]) for i in range(0, 8, 2)]
In [3]:
# Quick look at the first dataset: its x values, then its y values.
print(all_sets[0][0], all_sets[0][1], sep="\n")
In [4]:
def show_stat(data):
    """Print summary statistics for one (x, y) dataset.

    Writes to standard output, in order: the mean and variance of x, the
    mean and variance of y, the Pearson correlation coefficient, and the
    least-squares line together with its r^2.

    Parameters
    ----------
    data : pair of arrays
        Unpacked as ``(x, y)``. Both items must provide a ``.mean()``
        method and be accepted by ``np.var``, ``pearsonr`` and
        ``linregress``.

    Returns
    -------
    None
    """
    xs, ys = data

    # Mean and variance (np.var with its default arguments) for each axis.
    per_axis = (
        (xs, "moyenne x : %4.2f", "variance x : %4.2f"),
        (ys, "moyenne y : %4.2f", "variance y : %4.2f"),
    )
    for values, mean_fmt, var_fmt in per_axis:
        print(mean_fmt % values.mean())
        print(var_fmt % np.var(values))

    # Only the coefficient is reported; the accompanying p-value is ignored.
    correlation = pearsonr(xs, ys)[0]
    print("corrélation : %5.3f" % correlation)

    # linregress yields (slope, intercept, r, p-value, stderr); keep the first three.
    slope, intercept, r_value = linregress(xs, ys)[:3]
    print("regression linéaire : %3.1f x + %3.1f (r^2 = %4.2f)" % (slope, intercept, r_value**2))
In [5]:
# Print the summary statistics of every dataset. Sets are numbered from 1 so
# the headings match the "set N" labels used in the figure legend.
for i, data in enumerate(all_sets, start=1):
    print("\nset %d" % i)
    print("------")
    show_stat(data)
La représentation graphique de ces jeux de données a deux objectifs. Elle montre
In [8]:
# Draw every dataset on a 2x2 grid, each panel with its least-squares line.
fig = plt.figure(figsize=(10, 8))
fig.suptitle("Quartet d'Anscombe", size=20)

for panel, (x, y) in enumerate(all_sets, start=1):
    ax = fig.add_subplot(2, 2, panel)
    ax.grid(True)
    ax.set_xlabel("x%d" % panel, size=14)
    ax.set_ylabel("y%d" % panel, size=14)

    # Raw observations: markers only, no connecting line.
    ax.plot(x, y, marker="o", color="C3", linestyle="", label="set %d" % panel)

    # Fitted line evaluated at the two ends of the x window (0 and 20).
    slope, intercept = linregress(x, y)[:2]
    ax.plot([0, 20], [intercept, slope * 20 + intercept], color="C0")

    # Same axis window on every panel so the four plots compare directly.
    ax.set_xlim(0, 20)
    ax.set_ylim(0, 15)
    ax.legend(loc="lower right", fontsize=18)