1次元データの分析

scikit-learnのiris(あやめ)のデータを用います。


In [30]:
import pandas as pd
from sklearn import datasets
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Feature values of the dataset (the output below shows 150 rows x 4 columns).
arr = iris.data
# Wrap the features in a DataFrame, labelling the columns with the dataset's feature names.
df = pd.DataFrame(arr, columns=iris.feature_names)
%matplotlib inline

In [2]:
# Display the DataFrame built from the iris features.
df


Out[2]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
5 5.4 3.9 1.7 0.4
6 4.6 3.4 1.4 0.3
7 5.0 3.4 1.5 0.2
8 4.4 2.9 1.4 0.2
9 4.9 3.1 1.5 0.1
10 5.4 3.7 1.5 0.2
11 4.8 3.4 1.6 0.2
12 4.8 3.0 1.4 0.1
13 4.3 3.0 1.1 0.1
14 5.8 4.0 1.2 0.2
15 5.7 4.4 1.5 0.4
16 5.4 3.9 1.3 0.4
17 5.1 3.5 1.4 0.3
18 5.7 3.8 1.7 0.3
19 5.1 3.8 1.5 0.3
20 5.4 3.4 1.7 0.2
21 5.1 3.7 1.5 0.4
22 4.6 3.6 1.0 0.2
23 5.1 3.3 1.7 0.5
24 4.8 3.4 1.9 0.2
25 5.0 3.0 1.6 0.2
26 5.0 3.4 1.6 0.4
27 5.2 3.5 1.5 0.2
28 5.2 3.4 1.4 0.2
29 4.7 3.2 1.6 0.2
... ... ... ... ...
120 6.9 3.2 5.7 2.3
121 5.6 2.8 4.9 2.0
122 7.7 2.8 6.7 2.0
123 6.3 2.7 4.9 1.8
124 6.7 3.3 5.7 2.1
125 7.2 3.2 6.0 1.8
126 6.2 2.8 4.8 1.8
127 6.1 3.0 4.9 1.8
128 6.4 2.8 5.6 2.1
129 7.2 3.0 5.8 1.6
130 7.4 2.8 6.1 1.9
131 7.9 3.8 6.4 2.0
132 6.4 2.8 5.6 2.2
133 6.3 2.8 5.1 1.5
134 6.1 2.6 5.6 1.4
135 7.7 3.0 6.1 2.3
136 6.3 3.4 5.6 2.4
137 6.4 3.1 5.5 1.8
138 6.0 3.0 4.8 1.8
139 6.9 3.1 5.4 2.1
140 6.7 3.1 5.6 2.4
141 6.9 3.1 5.1 2.3
142 5.8 2.7 5.1 1.9
143 6.8 3.2 5.9 2.3
144 6.7 3.3 5.7 2.5
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

150 rows × 4 columns


In [3]:
# List the column labels of the DataFrame.
df.columns


Out[3]:
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

'sepal length (cm)'を用います。


In [5]:
# Select the sepal-length column as a Series; the following cells analyse this one variable.
s = df['sepal length (cm)']

データ数をカウントする。


In [6]:
# Count the observations in the Series.
s.count()


Out[6]:
150

データの最大と最小とレンジ(範囲)


In [9]:
# Largest and smallest observations, followed by their difference (the range).
s_max = s.max()
s_min = s.min()
print(s_max)
print(s_min)
print('レンジ=', s_max - s_min)


7.9
4.3
レンジ= 3.6

等間隔離散化によりビンを作成


In [12]:
# Equal-width discretization: split the observed range of s into 10 bins
# and label every observation with the bin it falls into.
cut = pd.cut(s, bins=10)
cut


Out[12]:
0       (5.02, 5.38]
1       (4.66, 5.02]
2       (4.66, 5.02]
3      (4.296, 4.66]
4       (4.66, 5.02]
5       (5.38, 5.74]
6      (4.296, 4.66]
7       (4.66, 5.02]
8      (4.296, 4.66]
9       (4.66, 5.02]
10      (5.38, 5.74]
11      (4.66, 5.02]
12      (4.66, 5.02]
13     (4.296, 4.66]
14       (5.74, 6.1]
15      (5.38, 5.74]
16      (5.38, 5.74]
17      (5.02, 5.38]
18      (5.38, 5.74]
19      (5.02, 5.38]
20      (5.38, 5.74]
21      (5.02, 5.38]
22     (4.296, 4.66]
23      (5.02, 5.38]
24      (4.66, 5.02]
25      (4.66, 5.02]
26      (4.66, 5.02]
27      (5.02, 5.38]
28      (5.02, 5.38]
29      (4.66, 5.02]
           ...      
120     (6.82, 7.18]
121     (5.38, 5.74]
122      (7.54, 7.9]
123      (6.1, 6.46]
124     (6.46, 6.82]
125     (7.18, 7.54]
126      (6.1, 6.46]
127      (5.74, 6.1]
128      (6.1, 6.46]
129     (7.18, 7.54]
130     (7.18, 7.54]
131      (7.54, 7.9]
132      (6.1, 6.46]
133      (6.1, 6.46]
134      (5.74, 6.1]
135      (7.54, 7.9]
136      (6.1, 6.46]
137      (6.1, 6.46]
138      (5.74, 6.1]
139     (6.82, 7.18]
140     (6.46, 6.82]
141     (6.82, 7.18]
142      (5.74, 6.1]
143     (6.46, 6.82]
144     (6.46, 6.82]
145     (6.46, 6.82]
146      (6.1, 6.46]
147     (6.46, 6.82]
148      (6.1, 6.46]
149      (5.74, 6.1]
Name: sepal length (cm), dtype: category
Categories (10, object): [(4.296, 4.66] < (4.66, 5.02] < (5.02, 5.38] < (5.38, 5.74] ... (6.46, 6.82] < (6.82, 7.18] < (7.18, 7.54] < (7.54, 7.9]]

In [19]:
# Inspect the container type of the binned result and the type of the element at label 0.
# NOTE(review): the recorded output shows str labels; newer pandas releases may
# return Interval objects here instead — confirm against the installed version.
print(type(cut))
print(type(cut[0]))


<class 'pandas.core.series.Series'>
<class 'str'>

出力末尾のCategoriesに並ぶ「<」がビン(区間)の大小順序を表している。そのため、sort_valuesでビンの順に並べ替えることができる。


In [18]:
# Sort the bin labels; the categorical ordering places the bins in ascending order.
cut_sort = cut.sort_values()
cut_sort


Out[18]:
41     (4.296, 4.66]
22     (4.296, 4.66]
13     (4.296, 4.66]
47     (4.296, 4.66]
8      (4.296, 4.66]
6      (4.296, 4.66]
38     (4.296, 4.66]
3      (4.296, 4.66]
42     (4.296, 4.66]
30      (4.66, 5.02]
40      (4.66, 5.02]
43      (4.66, 5.02]
26      (4.66, 5.02]
25      (4.66, 5.02]
34      (4.66, 5.02]
24      (4.66, 5.02]
45      (4.66, 5.02]
35      (4.66, 5.02]
29      (4.66, 5.02]
49      (4.66, 5.02]
57      (4.66, 5.02]
1       (4.66, 5.02]
2       (4.66, 5.02]
4       (4.66, 5.02]
93      (4.66, 5.02]
7       (4.66, 5.02]
9       (4.66, 5.02]
37      (4.66, 5.02]
11      (4.66, 5.02]
12      (4.66, 5.02]
           ...      
140     (6.46, 6.82]
77      (6.46, 6.82]
76      (6.46, 6.82]
75      (6.46, 6.82]
108     (6.46, 6.82]
116     (6.46, 6.82]
65      (6.46, 6.82]
110     (6.46, 6.82]
112     (6.46, 6.82]
104     (6.46, 6.82]
58      (6.46, 6.82]
54      (6.46, 6.82]
124     (6.46, 6.82]
52      (6.82, 7.18]
120     (6.82, 7.18]
141     (6.82, 7.18]
102     (6.82, 7.18]
50      (6.82, 7.18]
139     (6.82, 7.18]
109     (7.18, 7.54]
125     (7.18, 7.54]
130     (7.18, 7.54]
129     (7.18, 7.54]
107     (7.18, 7.54]
135      (7.54, 7.9]
122      (7.54, 7.9]
131      (7.54, 7.9]
105      (7.54, 7.9]
118      (7.54, 7.9]
117      (7.54, 7.9]
Name: sepal length (cm), dtype: category
Categories (10, object): [(4.296, 4.66] < (4.66, 5.02] < (5.02, 5.38] < (5.38, 5.74] ... (6.46, 6.82] < (6.82, 7.18] < (7.18, 7.54] < (7.54, 7.9]]

等間隔離散化の説明は、離散化の学習の際に詳しく行うので飛ばします。


In [24]:
# Collect the distinct bins of the sorted labels; in the output they appear in ascending order.
cut_point = cut_sort.unique()
cut_point


Out[24]:
[(4.296, 4.66], (4.66, 5.02], (5.02, 5.38], (5.38, 5.74], (5.74, 6.1], (6.1, 6.46], (6.46, 6.82], (6.82, 7.18], (7.18, 7.54], (7.54, 7.9]]
Categories (10, object): [(4.296, 4.66] < (4.66, 5.02] < (5.02, 5.38] < (5.38, 5.74] ... (6.46, 6.82] < (6.82, 7.18] < (7.18, 7.54] < (7.54, 7.9]]

度数分布


In [48]:
# Frequency table: group the bin labels by themselves and count the members of each bin.
cut.groupby(cut).count()


Out[48]:
sepal length (cm)
(4.296, 4.66]     9
(4.66, 5.02]     23
(5.02, 5.38]     14
(5.38, 5.74]     27
(5.74, 6.1]      22
(6.1, 6.46]      20
(6.46, 6.82]     18
(6.82, 7.18]      6
(7.18, 7.54]      5
(7.54, 7.9]       6
Name: sepal length (cm), dtype: int64

In [43]:
# Same grouping done through a one-column DataFrame. The original statement
# ended in a dangling "." (a syntax error); it is completed with count() as in
# the previous cell. Because the only column is also the grouping key, count()
# has no remaining columns to count and the result is just the bin index —
# use .size() instead when the per-bin frequencies are wanted.
pd.DataFrame(cut,columns=[cut.name]).groupby(cut.name).count()


Out[43]:
sepal length (cm)
(4.296, 4.66]
(4.66, 5.02]
(5.02, 5.38]
(5.38, 5.74]
(5.74, 6.1]
(6.1, 6.46]
(6.46, 6.82]
(6.82, 7.18]
(7.18, 7.54]
(7.54, 7.9]

In [37]:
# The binned Series keeps the name of the source column.
cut.name


Out[37]:
'sepal length (cm)'

ヒストグラム

ヒストグラムは簡単に作れる。


In [32]:
# Plot a histogram of sepal length using 10 bins.
s.hist(bins=10)


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x11856fe10>

データの代表値:平均(算術平均)、メディアン(中央値)、モード(最頻値)


In [8]:
# Measures of central tendency: mean, median and mode, printed one after another.
for central_value in (s.mean(), s.median(), s.mode()):
    print(central_value)


5.843333333333335
5.8
0    5.0
dtype: float64

分位点(パーセンタイル)


In [49]:
# First quartile, median and third quartile of sepal length.
s.quantile([.25,.5,.75])


Out[49]:
0.25    5.1
0.50    5.8
0.75    6.4
Name: sepal length (cm), dtype: float64

データのばらつき

偏差


In [50]:
# Deviation of each observation from the mean.
s - s.mean()


Out[50]:
0     -0.743333
1     -0.943333
2     -1.143333
3     -1.243333
4     -0.843333
5     -0.443333
6     -1.243333
7     -0.843333
8     -1.443333
9     -0.943333
10    -0.443333
11    -1.043333
12    -1.043333
13    -1.543333
14    -0.043333
15    -0.143333
16    -0.443333
17    -0.743333
18    -0.143333
19    -0.743333
20    -0.443333
21    -0.743333
22    -1.243333
23    -0.743333
24    -1.043333
25    -0.843333
26    -0.843333
27    -0.643333
28    -0.643333
29    -1.143333
         ...   
120    1.056667
121   -0.243333
122    1.856667
123    0.456667
124    0.856667
125    1.356667
126    0.356667
127    0.256667
128    0.556667
129    1.356667
130    1.556667
131    2.056667
132    0.556667
133    0.456667
134    0.256667
135    1.856667
136    0.456667
137    0.556667
138    0.156667
139    1.056667
140    0.856667
141    1.056667
142   -0.043333
143    0.956667
144    0.856667
145    0.856667
146    0.456667
147    0.656667
148    0.356667
149    0.056667
Name: sepal length (cm), dtype: float64

偏差の平方


In [ ]:
# Squared deviation of each observation from the mean.
(s - s.mean())**2

偏差平方和


In [53]:
# Sum of squared deviations from the mean.
deviations = s - s.mean()
(deviations ** 2).sum()


Out[53]:
102.16833333333332

分散


In [54]:
# Variance by hand: the mean of the squared deviations (divides by N).
squared_dev = (s - s.mean()) ** 2
squared_dev.mean()


Out[54]:
0.6811222222222222

In [55]:
# pandas' default variance; the value equals s.var(ddof=1) computed further down (N-1 divisor).
s.var()


Out[55]:
0.6856935123042505

varのデフォルトはddof=1、つまりN-1で割る不偏分散である。そのため、上でNで割って求めた値(0.6811…)とは一致しない。

標本分散(Nで割る)


In [56]:
# Variance with ddof=0: the sum of squared deviations divided by N.
s.var(ddof=0)


Out[56]:
0.6811222222222222

不偏分散(N-1で割る)


In [57]:
# Variance with ddof=1: the sum of squared deviations divided by N-1 (unbiased variance).
s.var(ddof=1)


Out[57]:
0.6856935123042505

標準偏差(Nで割った標本分散の平方根)


In [59]:
# Standard deviation by hand: square root of the mean squared deviation (divides by N).
mean_sq_dev = ((s - s.mean()) ** 2).mean()
mean_sq_dev ** (1 / 2)


Out[59]:
0.8253012917851409

不偏標準偏差(不偏分散の平方根。stdのデフォルトはddof=1)


In [60]:
# Default std(): square root of the N-1 (ddof=1) variance shown above.
s.std()


Out[60]:
0.8280661279778629

標本標準偏差


In [61]:
# Standard deviation with ddof=0 (divide by N); equals the manual computation above.
s.std(ddof=0)


Out[61]:
0.8253012917851409

要約統計量


In [62]:
# Summary statistics for every column: count, mean, std, min, quartiles and max.
df.describe()


Out[62]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000

In [64]:
# Same summary with the 5th and 95th percentiles reported alongside the quartiles.
df.describe(percentiles=[.05,.25,.5,.75,.95])


Out[64]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.054000 3.758667 1.198667
std 0.828066 0.433594 1.764420 0.763161
min 4.300000 2.000000 1.000000 0.100000
5% 4.600000 2.345000 1.300000 0.200000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
95% 7.255000 3.800000 6.100000 2.300000
max 7.900000 4.400000 6.900000 2.500000

変動係数(CV)


In [65]:
# Coefficient of variation: standard deviation relative to the mean.
# Uses the default std(), i.e. the N-1 divisor.
s.std()/s.mean()


Out[65]:
0.1417112597794403

標準化


In [66]:
# Standardize: subtract the mean and divide by the standard deviation
# (default std(), i.e. the N-1 divisor).
(s - s.mean())/s.std()


Out[66]:
0     -0.897674
1     -1.139200
2     -1.380727
3     -1.501490
4     -1.018437
5     -0.535384
6     -1.501490
7     -1.018437
8     -1.743017
9     -1.139200
10    -0.535384
11    -1.259964
12    -1.259964
13    -1.863780
14    -0.052331
15    -0.173094
16    -0.535384
17    -0.897674
18    -0.173094
19    -0.897674
20    -0.535384
21    -0.897674
22    -1.501490
23    -0.897674
24    -1.259964
25    -1.018437
26    -1.018437
27    -0.776911
28    -0.776911
29    -1.380727
         ...   
120    1.276066
121   -0.293857
122    2.242172
123    0.551486
124    1.034539
125    1.638355
126    0.430722
127    0.309959
128    0.672249
129    1.638355
130    1.879882
131    2.483699
132    0.672249
133    0.551486
134    0.309959
135    2.242172
136    0.551486
137    0.672249
138    0.189196
139    1.276066
140    1.034539
141    1.276066
142   -0.052331
143    1.155302
144    1.034539
145    1.034539
146    0.551486
147    0.793012
148    0.430722
149    0.068433
Name: sepal length (cm), dtype: float64

In [ ]: