In [68]:
import pandas as pd  
import matplotlib.pyplot as plt  
import seaborn as sns  
from scipy import stats  
import numpy as np

In [69]:
# Load seaborn's "anscombe" example dataset into a DataFrame; every later cell reads `anscombe`.
anscombe = sns.load_dataset("anscombe")

In [70]:
# Show the whole frame: columns `dataset` (I-IV), `x` and `y`, 44 rows in total.
anscombe


Out[70]:
dataset x y
0 I 10.0 8.04
1 I 8.0 6.95
2 I 13.0 7.58
3 I 9.0 8.81
4 I 11.0 8.33
5 I 14.0 9.96
6 I 6.0 7.24
7 I 4.0 4.26
8 I 12.0 10.84
9 I 7.0 4.82
10 I 5.0 5.68
11 II 10.0 9.14
12 II 8.0 8.14
13 II 13.0 8.74
14 II 9.0 8.77
15 II 11.0 9.26
16 II 14.0 8.10
17 II 6.0 6.13
18 II 4.0 3.10
19 II 12.0 9.13
20 II 7.0 7.26
21 II 5.0 4.74
22 III 10.0 7.46
23 III 8.0 6.77
24 III 13.0 12.74
25 III 9.0 7.11
26 III 11.0 7.81
27 III 14.0 8.84
28 III 6.0 6.08
29 III 4.0 5.39
30 III 12.0 8.15
31 III 7.0 6.42
32 III 5.0 5.73
33 IV 8.0 6.58
34 IV 8.0 5.76
35 IV 8.0 7.71
36 IV 8.0 8.84
37 IV 8.0 8.47
38 IV 8.0 7.04
39 IV 8.0 5.25
40 IV 19.0 12.50
41 IV 8.0 5.56
42 IV 8.0 7.91
43 IV 8.0 6.89

In [71]:
# Summary statistics of x and y for each dataset separately (count, mean, std, min, quartiles, max).
anscombe.groupby("dataset").describe()


Out[71]:
x y
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
dataset
I 11.0 9.0 3.316625 4.0 6.5 9.0 11.5 14.0 11.0 7.500909 2.031568 4.26 6.315 7.58 8.57 10.84
II 11.0 9.0 3.316625 4.0 6.5 9.0 11.5 14.0 11.0 7.500909 2.031657 3.10 6.695 8.14 8.95 9.26
III 11.0 9.0 3.316625 4.0 6.5 9.0 11.5 14.0 11.0 7.500000 2.030424 5.39 6.250 7.11 7.98 12.74
IV 11.0 9.0 3.316625 8.0 8.0 8.0 8.0 19.0 11.0 7.500909 2.030579 5.25 6.170 7.04 8.19 12.50

In [72]:
# Pearson correlation between x and y over all 44 rows (the four datasets pooled together).
# Only the numeric columns are selected: the string column `dataset` cannot be correlated,
# and recent pandas versions raise on it instead of silently dropping it.
anscombe[["x", "y"]].corr()


Out[72]:
x y
x 1.000000 0.816366
y 0.816366 1.000000

In [73]:
# Fit a least-squares line to every dataset in turn and print its equation.
# slope, intercept, r_val and p_val are deliberately kept as notebook-level names:
# after the loop they hold the fit of the last dataset, and later cells may read them.
for data_set in anscombe["dataset"].unique():
    in_group = anscombe["dataset"] == data_set
    df = anscombe.loc[in_group]
    fit = stats.linregress(df["x"], df["y"])
    # The result unpacks to (slope, intercept, r, p, standard error of the slope).
    slope, intercept, r_val, p_val, slope_std_error = fit
    print('y={:.2f}x+{:.2f}'.format(slope, intercept))


y=0.50x+3.00
y=0.50x+3.00
y=0.50x+3.00
y=0.50x+3.00

In [74]:
# Draw one scatter plot with a fitted regression line per dataset and put that
# dataset's own fit statistics in the title.
for data_set in anscombe.dataset.unique():
    df = anscombe[anscombe.dataset == data_set]
    # Refit inside the loop so the title describes THIS dataset; relying on
    # variables left over from another cell would label every plot with the
    # values of whichever dataset was fitted last.
    slope, intercept, r_val, p_val, slope_std_error = stats.linregress(x=df.x, y=df.y)
    sns.lmplot(x="x", y="y", data=df)
    # linregress returns the correlation coefficient r; square it to report R^2.
    plt.title("Data set {}: y={:.2f}x+{:.2f} (p: {:.2f}, R^2: {:.2f})".format(data_set, slope, intercept, p_val, r_val ** 2))



In [ ]: