In [2]:
from types import SimpleNamespace

import numpy as np
import pandas as pd

# `sklearn.datasets.load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so importing it fails on current installs. The data is read from the
# original StatLib file instead, following the replacement snippet given in
# scikit-learn's deprecation notice.
BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
BOSTON_FEATURE_NAMES = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
]

# The raw file has a 22-line text header. Each record then spans two physical
# lines: 11 feature values on the first, and the remaining 2 features plus the
# target value on the second.
raw_df = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)

# Keep a `boston` object exposing the same attributes the notebook used from
# the old loader (`data`, `target`, `feature_names`), so any cell that reads
# them keeps working.
boston = SimpleNamespace(
    data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
    target=raw_df.values[1::2, 2],
    feature_names=np.array(BOSTON_FEATURE_NAMES),
)

df = pd.DataFrame(boston.data, columns=boston.feature_names)
df
Out[2]:
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
0
0.00632
18.0
2.31
0.0
0.538
6.575
65.2
4.0900
1.0
296.0
15.3
396.90
4.98
1
0.02731
0.0
7.07
0.0
0.469
6.421
78.9
4.9671
2.0
242.0
17.8
396.90
9.14
2
0.02729
0.0
7.07
0.0
0.469
7.185
61.1
4.9671
2.0
242.0
17.8
392.83
4.03
3
0.03237
0.0
2.18
0.0
0.458
6.998
45.8
6.0622
3.0
222.0
18.7
394.63
2.94
4
0.06905
0.0
2.18
0.0
0.458
7.147
54.2
6.0622
3.0
222.0
18.7
396.90
5.33
5
0.02985
0.0
2.18
0.0
0.458
6.430
58.7
6.0622
3.0
222.0
18.7
394.12
5.21
6
0.08829
12.5
7.87
0.0
0.524
6.012
66.6
5.5605
5.0
311.0
15.2
395.60
12.43
7
0.14455
12.5
7.87
0.0
0.524
6.172
96.1
5.9505
5.0
311.0
15.2
396.90
19.15
8
0.21124
12.5
7.87
0.0
0.524
5.631
100.0
6.0821
5.0
311.0
15.2
386.63
29.93
9
0.17004
12.5
7.87
0.0
0.524
6.004
85.9
6.5921
5.0
311.0
15.2
386.71
17.10
10
0.22489
12.5
7.87
0.0
0.524
6.377
94.3
6.3467
5.0
311.0
15.2
392.52
20.45
11
0.11747
12.5
7.87
0.0
0.524
6.009
82.9
6.2267
5.0
311.0
15.2
396.90
13.27
12
0.09378
12.5
7.87
0.0
0.524
5.889
39.0
5.4509
5.0
311.0
15.2
390.50
15.71
13
0.62976
0.0
8.14
0.0
0.538
5.949
61.8
4.7075
4.0
307.0
21.0
396.90
8.26
14
0.63796
0.0
8.14
0.0
0.538
6.096
84.5
4.4619
4.0
307.0
21.0
380.02
10.26
15
0.62739
0.0
8.14
0.0
0.538
5.834
56.5
4.4986
4.0
307.0
21.0
395.62
8.47
16
1.05393
0.0
8.14
0.0
0.538
5.935
29.3
4.4986
4.0
307.0
21.0
386.85
6.58
17
0.78420
0.0
8.14
0.0
0.538
5.990
81.7
4.2579
4.0
307.0
21.0
386.75
14.67
18
0.80271
0.0
8.14
0.0
0.538
5.456
36.6
3.7965
4.0
307.0
21.0
288.99
11.69
19
0.72580
0.0
8.14
0.0
0.538
5.727
69.5
3.7965
4.0
307.0
21.0
390.95
11.28
20
1.25179
0.0
8.14
0.0
0.538
5.570
98.1
3.7979
4.0
307.0
21.0
376.57
21.02
21
0.85204
0.0
8.14
0.0
0.538
5.965
89.2
4.0123
4.0
307.0
21.0
392.53
13.83
22
1.23247
0.0
8.14
0.0
0.538
6.142
91.7
3.9769
4.0
307.0
21.0
396.90
18.72
23
0.98843
0.0
8.14
0.0
0.538
5.813
100.0
4.0952
4.0
307.0
21.0
394.54
19.88
24
0.75026
0.0
8.14
0.0
0.538
5.924
94.1
4.3996
4.0
307.0
21.0
394.33
16.30
25
0.84054
0.0
8.14
0.0
0.538
5.599
85.7
4.4546
4.0
307.0
21.0
303.42
16.51
26
0.67191
0.0
8.14
0.0
0.538
5.813
90.3
4.6820
4.0
307.0
21.0
376.88
14.81
27
0.95577
0.0
8.14
0.0
0.538
6.047
88.8
4.4534
4.0
307.0
21.0
306.38
17.28
28
0.77299
0.0
8.14
0.0
0.538
6.495
94.4
4.4547
4.0
307.0
21.0
387.94
12.80
29
1.00245
0.0
8.14
0.0
0.538
6.674
87.3
4.2390
4.0
307.0
21.0
380.23
11.98
...
...
...
...
...
...
...
...
...
...
...
...
...
...
476
4.87141
0.0
18.10
0.0
0.614
6.484
93.6
2.3053
24.0
666.0
20.2
396.21
18.68
477
15.02340
0.0
18.10
0.0
0.614
5.304
97.3
2.1007
24.0
666.0
20.2
349.48
24.91
478
10.23300
0.0
18.10
0.0
0.614
6.185
96.7
2.1705
24.0
666.0
20.2
379.70
18.03
479
14.33370
0.0
18.10
0.0
0.614
6.229
88.0
1.9512
24.0
666.0
20.2
383.32
13.11
480
5.82401
0.0
18.10
0.0
0.532
6.242
64.7
3.4242
24.0
666.0
20.2
396.90
10.74
481
5.70818
0.0
18.10
0.0
0.532
6.750
74.9
3.3317
24.0
666.0
20.2
393.07
7.74
482
5.73116
0.0
18.10
0.0
0.532
7.061
77.0
3.4106
24.0
666.0
20.2
395.28
7.01
483
2.81838
0.0
18.10
0.0
0.532
5.762
40.3
4.0983
24.0
666.0
20.2
392.92
10.42
484
2.37857
0.0
18.10
0.0
0.583
5.871
41.9
3.7240
24.0
666.0
20.2
370.73
13.34
485
3.67367
0.0
18.10
0.0
0.583
6.312
51.9
3.9917
24.0
666.0
20.2
388.62
10.58
486
5.69175
0.0
18.10
0.0
0.583
6.114
79.8
3.5459
24.0
666.0
20.2
392.68
14.98
487
4.83567
0.0
18.10
0.0
0.583
5.905
53.2
3.1523
24.0
666.0
20.2
388.22
11.45
488
0.15086
0.0
27.74
0.0
0.609
5.454
92.7
1.8209
4.0
711.0
20.1
395.09
18.06
489
0.18337
0.0
27.74
0.0
0.609
5.414
98.3
1.7554
4.0
711.0
20.1
344.05
23.97
490
0.20746
0.0
27.74
0.0
0.609
5.093
98.0
1.8226
4.0
711.0
20.1
318.43
29.68
491
0.10574
0.0
27.74
0.0
0.609
5.983
98.8
1.8681
4.0
711.0
20.1
390.11
18.07
492
0.11132
0.0
27.74
0.0
0.609
5.983
83.5
2.1099
4.0
711.0
20.1
396.90
13.35
493
0.17331
0.0
9.69
0.0
0.585
5.707
54.0
2.3817
6.0
391.0
19.2
396.90
12.01
494
0.27957
0.0
9.69
0.0
0.585
5.926
42.6
2.3817
6.0
391.0
19.2
396.90
13.59
495
0.17899
0.0
9.69
0.0
0.585
5.670
28.8
2.7986
6.0
391.0
19.2
393.29
17.60
496
0.28960
0.0
9.69
0.0
0.585
5.390
72.9
2.7986
6.0
391.0
19.2
396.90
21.14
497
0.26838
0.0
9.69
0.0
0.585
5.794
70.6
2.8927
6.0
391.0
19.2
396.90
14.10
498
0.23912
0.0
9.69
0.0
0.585
6.019
65.3
2.4091
6.0
391.0
19.2
396.90
12.92
499
0.17783
0.0
9.69
0.0
0.585
5.569
73.5
2.3999
6.0
391.0
19.2
395.77
15.10
500
0.22438
0.0
9.69
0.0
0.585
6.027
79.7
2.4982
6.0
391.0
19.2
396.90
14.33
501
0.06263
0.0
11.93
0.0
0.573
6.593
69.1
2.4786
1.0
273.0
21.0
391.99
9.67
502
0.04527
0.0
11.93
0.0
0.573
6.120
76.7
2.2875
1.0
273.0
21.0
396.90
9.08
503
0.06076
0.0
11.93
0.0
0.573
6.976
91.0
2.1675
1.0
273.0
21.0
396.90
5.64
504
0.10959
0.0
11.93
0.0
0.573
6.794
89.3
2.3889
1.0
273.0
21.0
393.45
6.48
505
0.04741
0.0
11.93
0.0
0.573
6.030
80.8
2.5050
1.0
273.0
21.0
396.90
7.88
506 rows × 13 columns
In [11]:
# Summary statistics with the extreme tails (0.1% / 99.9%) next to the quartiles.
df.describe(percentiles=[0.001, 0.25, 0.5, 0.75, 0.999])
Out[11]:
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
count
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
506.000000
mean
3.593761
11.363636
11.136779
0.069170
0.554695
6.284634
68.574901
3.795043
9.549407
408.237154
18.455534
356.674032
12.653063
std
8.596783
23.322453
6.860353
0.253994
0.115878
0.702617
28.148861
2.105710
8.707259
168.537116
2.164946
91.294864
7.141062
min
0.006320
0.000000
0.460000
0.000000
0.385000
3.561000
2.900000
1.129600
1.000000
187.000000
12.600000
0.320000
1.730000
0.1%
0.007704
0.000000
0.601400
0.000000
0.387020
3.713510
4.465500
1.133337
1.000000
187.505000
12.600000
1.431000
1.825950
25%
0.082045
0.000000
5.190000
0.000000
0.449000
5.885500
45.025000
2.100175
4.000000
279.000000
17.400000
375.377500
6.950000
50%
0.256510
0.000000
9.690000
0.000000
0.538000
6.208500
77.500000
3.207450
5.000000
330.000000
19.050000
391.440000
11.360000
75%
3.647423
12.500000
18.100000
0.000000
0.624000
6.623500
94.075000
5.188425
24.000000
666.000000
20.200000
396.225000
16.955000
99.9%
81.177940
97.475000
27.740000
1.000000
0.871000
8.752225
100.000000
11.411319
24.000000
711.000000
22.000000
396.900000
37.470050
max
88.976200
100.000000
27.740000
1.000000
0.871000
8.780000
100.000000
12.126500
24.000000
711.000000
22.000000
396.900000
37.970000
In [5]:
# Split TAX into 5 equal-width intervals and keep the binned Series as `cut`.
cut = pd.cut(x=df["TAX"], bins=5)
cut
Out[5]:
0 (291.8, 396.6]
1 (186.476, 291.8]
2 (186.476, 291.8]
3 (186.476, 291.8]
4 (186.476, 291.8]
5 (186.476, 291.8]
6 (291.8, 396.6]
7 (291.8, 396.6]
8 (291.8, 396.6]
9 (291.8, 396.6]
10 (291.8, 396.6]
11 (291.8, 396.6]
12 (291.8, 396.6]
13 (291.8, 396.6]
14 (291.8, 396.6]
15 (291.8, 396.6]
16 (291.8, 396.6]
17 (291.8, 396.6]
18 (291.8, 396.6]
19 (291.8, 396.6]
20 (291.8, 396.6]
21 (291.8, 396.6]
22 (291.8, 396.6]
23 (291.8, 396.6]
24 (291.8, 396.6]
25 (291.8, 396.6]
26 (291.8, 396.6]
27 (291.8, 396.6]
28 (291.8, 396.6]
29 (291.8, 396.6]
...
476 (606.2, 711.0]
477 (606.2, 711.0]
478 (606.2, 711.0]
479 (606.2, 711.0]
480 (606.2, 711.0]
481 (606.2, 711.0]
482 (606.2, 711.0]
483 (606.2, 711.0]
484 (606.2, 711.0]
485 (606.2, 711.0]
486 (606.2, 711.0]
487 (606.2, 711.0]
488 (606.2, 711.0]
489 (606.2, 711.0]
490 (606.2, 711.0]
491 (606.2, 711.0]
492 (606.2, 711.0]
493 (291.8, 396.6]
494 (291.8, 396.6]
495 (291.8, 396.6]
496 (291.8, 396.6]
497 (291.8, 396.6]
498 (291.8, 396.6]
499 (291.8, 396.6]
500 (291.8, 396.6]
501 (186.476, 291.8]
502 (186.476, 291.8]
503 (186.476, 291.8]
504 (186.476, 291.8]
505 (186.476, 291.8]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.476, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]]
binsでbinの数を変更
In [12]:
# Same column, coarser binning: only 2 intervals.
pd.cut(x=df["TAX"], bins=2)
Out[12]:
0 (186.476, 449.0]
1 (186.476, 449.0]
2 (186.476, 449.0]
3 (186.476, 449.0]
4 (186.476, 449.0]
5 (186.476, 449.0]
6 (186.476, 449.0]
7 (186.476, 449.0]
8 (186.476, 449.0]
9 (186.476, 449.0]
10 (186.476, 449.0]
11 (186.476, 449.0]
12 (186.476, 449.0]
13 (186.476, 449.0]
14 (186.476, 449.0]
15 (186.476, 449.0]
16 (186.476, 449.0]
17 (186.476, 449.0]
18 (186.476, 449.0]
19 (186.476, 449.0]
20 (186.476, 449.0]
21 (186.476, 449.0]
22 (186.476, 449.0]
23 (186.476, 449.0]
24 (186.476, 449.0]
25 (186.476, 449.0]
26 (186.476, 449.0]
27 (186.476, 449.0]
28 (186.476, 449.0]
29 (186.476, 449.0]
...
476 (449.0, 711.0]
477 (449.0, 711.0]
478 (449.0, 711.0]
479 (449.0, 711.0]
480 (449.0, 711.0]
481 (449.0, 711.0]
482 (449.0, 711.0]
483 (449.0, 711.0]
484 (449.0, 711.0]
485 (449.0, 711.0]
486 (449.0, 711.0]
487 (449.0, 711.0]
488 (449.0, 711.0]
489 (449.0, 711.0]
490 (449.0, 711.0]
491 (449.0, 711.0]
492 (449.0, 711.0]
493 (186.476, 449.0]
494 (186.476, 449.0]
495 (186.476, 449.0]
496 (186.476, 449.0]
497 (186.476, 449.0]
498 (186.476, 449.0]
499 (186.476, 449.0]
500 (186.476, 449.0]
501 (186.476, 449.0]
502 (186.476, 449.0]
503 (186.476, 449.0]
504 (186.476, 449.0]
505 (186.476, 449.0]
Name: TAX, Length: 506, dtype: category
Categories (2, interval[float64]): [(186.476, 449.0] < (449.0, 711.0]]
In [13]:
# Same column, much finer binning: 50 intervals.
pd.cut(x=df["TAX"], bins=50)
Out[13]:
0 (291.8, 302.28]
1 (239.4, 249.88]
2 (239.4, 249.88]
3 (218.44, 228.92]
4 (218.44, 228.92]
5 (218.44, 228.92]
6 (302.28, 312.76]
7 (302.28, 312.76]
8 (302.28, 312.76]
9 (302.28, 312.76]
10 (302.28, 312.76]
11 (302.28, 312.76]
12 (302.28, 312.76]
13 (302.28, 312.76]
14 (302.28, 312.76]
15 (302.28, 312.76]
16 (302.28, 312.76]
17 (302.28, 312.76]
18 (302.28, 312.76]
19 (302.28, 312.76]
20 (302.28, 312.76]
21 (302.28, 312.76]
22 (302.28, 312.76]
23 (302.28, 312.76]
24 (302.28, 312.76]
25 (302.28, 312.76]
26 (302.28, 312.76]
27 (302.28, 312.76]
28 (302.28, 312.76]
29 (302.28, 312.76]
...
476 (658.6, 669.08]
477 (658.6, 669.08]
478 (658.6, 669.08]
479 (658.6, 669.08]
480 (658.6, 669.08]
481 (658.6, 669.08]
482 (658.6, 669.08]
483 (658.6, 669.08]
484 (658.6, 669.08]
485 (658.6, 669.08]
486 (658.6, 669.08]
487 (658.6, 669.08]
488 (700.52, 711.0]
489 (700.52, 711.0]
490 (700.52, 711.0]
491 (700.52, 711.0]
492 (700.52, 711.0]
493 (386.12, 396.6]
494 (386.12, 396.6]
495 (386.12, 396.6]
496 (386.12, 396.6]
497 (386.12, 396.6]
498 (386.12, 396.6]
499 (386.12, 396.6]
500 (386.12, 396.6]
501 (270.84, 281.32]
502 (270.84, 281.32]
503 (270.84, 281.32]
504 (270.84, 281.32]
505 (270.84, 281.32]
Name: TAX, Length: 506, dtype: category
Categories (50, interval[float64]): [(186.476, 197.48] < (197.48, 207.96] < (207.96, 218.44] < (218.44, 228.92] ... (669.08, 679.56] < (679.56, 690.04] < (690.04, 700.52] < (700.52, 711.0]]
rightで区間のどちら側を閉じるかを設定する。right=True(デフォルト)は右閉区間 (a, b]、right=Falseは左閉区間 [a, b) になる。
In [6]:
# right=False flips the closed side: intervals become [low, high) instead of (low, high].
pd.cut(x=df["TAX"], bins=5, right=False)
Out[6]:
0 [291.8, 396.6)
1 [187.0, 291.8)
2 [187.0, 291.8)
3 [187.0, 291.8)
4 [187.0, 291.8)
5 [187.0, 291.8)
6 [291.8, 396.6)
7 [291.8, 396.6)
8 [291.8, 396.6)
9 [291.8, 396.6)
10 [291.8, 396.6)
11 [291.8, 396.6)
12 [291.8, 396.6)
13 [291.8, 396.6)
14 [291.8, 396.6)
15 [291.8, 396.6)
16 [291.8, 396.6)
17 [291.8, 396.6)
18 [291.8, 396.6)
19 [291.8, 396.6)
20 [291.8, 396.6)
21 [291.8, 396.6)
22 [291.8, 396.6)
23 [291.8, 396.6)
24 [291.8, 396.6)
25 [291.8, 396.6)
26 [291.8, 396.6)
27 [291.8, 396.6)
28 [291.8, 396.6)
29 [291.8, 396.6)
...
476 [606.2, 711.524)
477 [606.2, 711.524)
478 [606.2, 711.524)
479 [606.2, 711.524)
480 [606.2, 711.524)
481 [606.2, 711.524)
482 [606.2, 711.524)
483 [606.2, 711.524)
484 [606.2, 711.524)
485 [606.2, 711.524)
486 [606.2, 711.524)
487 [606.2, 711.524)
488 [606.2, 711.524)
489 [606.2, 711.524)
490 [606.2, 711.524)
491 [606.2, 711.524)
492 [606.2, 711.524)
493 [291.8, 396.6)
494 [291.8, 396.6)
495 [291.8, 396.6)
496 [291.8, 396.6)
497 [291.8, 396.6)
498 [291.8, 396.6)
499 [291.8, 396.6)
500 [291.8, 396.6)
501 [187.0, 291.8)
502 [187.0, 291.8)
503 [187.0, 291.8)
504 [187.0, 291.8)
505 [187.0, 291.8)
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [[187.0, 291.8) < [291.8, 396.6) < [396.6, 501.4) < [501.4, 606.2) < [606.2, 711.524)]
retbinsでbinのエッジも返す。
In [16]:
# retbins=True makes pd.cut return a pair: (binned Series, array of bin edges).
cut_tuple = pd.cut(x=df["TAX"], bins=5, retbins=True)
cut_tuple
Out[16]:
(0 (291.8, 396.6]
1 (186.476, 291.8]
2 (186.476, 291.8]
3 (186.476, 291.8]
4 (186.476, 291.8]
5 (186.476, 291.8]
6 (291.8, 396.6]
7 (291.8, 396.6]
8 (291.8, 396.6]
9 (291.8, 396.6]
10 (291.8, 396.6]
11 (291.8, 396.6]
12 (291.8, 396.6]
13 (291.8, 396.6]
14 (291.8, 396.6]
15 (291.8, 396.6]
16 (291.8, 396.6]
17 (291.8, 396.6]
18 (291.8, 396.6]
19 (291.8, 396.6]
20 (291.8, 396.6]
21 (291.8, 396.6]
22 (291.8, 396.6]
23 (291.8, 396.6]
24 (291.8, 396.6]
25 (291.8, 396.6]
26 (291.8, 396.6]
27 (291.8, 396.6]
28 (291.8, 396.6]
29 (291.8, 396.6]
...
476 (606.2, 711.0]
477 (606.2, 711.0]
478 (606.2, 711.0]
479 (606.2, 711.0]
480 (606.2, 711.0]
481 (606.2, 711.0]
482 (606.2, 711.0]
483 (606.2, 711.0]
484 (606.2, 711.0]
485 (606.2, 711.0]
486 (606.2, 711.0]
487 (606.2, 711.0]
488 (606.2, 711.0]
489 (606.2, 711.0]
490 (606.2, 711.0]
491 (606.2, 711.0]
492 (606.2, 711.0]
493 (291.8, 396.6]
494 (291.8, 396.6]
495 (291.8, 396.6]
496 (291.8, 396.6]
497 (291.8, 396.6]
498 (291.8, 396.6]
499 (291.8, 396.6]
500 (291.8, 396.6]
501 (186.476, 291.8]
502 (186.476, 291.8]
503 (186.476, 291.8]
504 (186.476, 291.8]
505 (186.476, 291.8]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.476, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]],
array([ 186.476, 291.8 , 396.6 , 501.4 , 606.2 , 711. ]))
In [19]:
# The pair's last element is the array of bin edges.
bin_edges = cut_tuple[-1]
bin_edges
Out[19]:
array([ 186.476, 291.8 , 396.6 , 501.4 , 606.2 , 711. ])
include_lowestで最初の区間が下限値を含むかどうかを設定する。ここではデフォルトと同じFalseを指定しているため、結果は上のbins=5の場合と同じになる。
In [29]:
# include_lowest controls whether the first interval is closed on its left edge.
# NOTE(review): with False the output is identical to the plain bins=5 call,
# so this cell does not show the effect of the option; True would.
pd.cut(x=df["TAX"], bins=5, include_lowest=False)
Out[29]:
0 (291.8, 396.6]
1 (186.476, 291.8]
2 (186.476, 291.8]
3 (186.476, 291.8]
4 (186.476, 291.8]
5 (186.476, 291.8]
6 (291.8, 396.6]
7 (291.8, 396.6]
8 (291.8, 396.6]
9 (291.8, 396.6]
10 (291.8, 396.6]
11 (291.8, 396.6]
12 (291.8, 396.6]
13 (291.8, 396.6]
14 (291.8, 396.6]
15 (291.8, 396.6]
16 (291.8, 396.6]
17 (291.8, 396.6]
18 (291.8, 396.6]
19 (291.8, 396.6]
20 (291.8, 396.6]
21 (291.8, 396.6]
22 (291.8, 396.6]
23 (291.8, 396.6]
24 (291.8, 396.6]
25 (291.8, 396.6]
26 (291.8, 396.6]
27 (291.8, 396.6]
28 (291.8, 396.6]
29 (291.8, 396.6]
...
476 (606.2, 711.0]
477 (606.2, 711.0]
478 (606.2, 711.0]
479 (606.2, 711.0]
480 (606.2, 711.0]
481 (606.2, 711.0]
482 (606.2, 711.0]
483 (606.2, 711.0]
484 (606.2, 711.0]
485 (606.2, 711.0]
486 (606.2, 711.0]
487 (606.2, 711.0]
488 (606.2, 711.0]
489 (606.2, 711.0]
490 (606.2, 711.0]
491 (606.2, 711.0]
492 (606.2, 711.0]
493 (291.8, 396.6]
494 (291.8, 396.6]
495 (291.8, 396.6]
496 (291.8, 396.6]
497 (291.8, 396.6]
498 (291.8, 396.6]
499 (291.8, 396.6]
500 (291.8, 396.6]
501 (186.476, 291.8]
502 (186.476, 291.8]
503 (186.476, 291.8]
504 (186.476, 291.8]
505 (186.476, 291.8]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.476, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]]
precisionで丸め込みの桁数を決める。
In [30]:
# precision sets how many digits are kept when the bin labels are rounded.
pd.cut(x=df["TAX"], bins=5, precision=0)
Out[30]:
0 (291.8, 396.6]
1 (186.5, 291.8]
2 (186.5, 291.8]
3 (186.5, 291.8]
4 (186.5, 291.8]
5 (186.5, 291.8]
6 (291.8, 396.6]
7 (291.8, 396.6]
8 (291.8, 396.6]
9 (291.8, 396.6]
10 (291.8, 396.6]
11 (291.8, 396.6]
12 (291.8, 396.6]
13 (291.8, 396.6]
14 (291.8, 396.6]
15 (291.8, 396.6]
16 (291.8, 396.6]
17 (291.8, 396.6]
18 (291.8, 396.6]
19 (291.8, 396.6]
20 (291.8, 396.6]
21 (291.8, 396.6]
22 (291.8, 396.6]
23 (291.8, 396.6]
24 (291.8, 396.6]
25 (291.8, 396.6]
26 (291.8, 396.6]
27 (291.8, 396.6]
28 (291.8, 396.6]
29 (291.8, 396.6]
...
476 (606.2, 711.0]
477 (606.2, 711.0]
478 (606.2, 711.0]
479 (606.2, 711.0]
480 (606.2, 711.0]
481 (606.2, 711.0]
482 (606.2, 711.0]
483 (606.2, 711.0]
484 (606.2, 711.0]
485 (606.2, 711.0]
486 (606.2, 711.0]
487 (606.2, 711.0]
488 (606.2, 711.0]
489 (606.2, 711.0]
490 (606.2, 711.0]
491 (606.2, 711.0]
492 (606.2, 711.0]
493 (291.8, 396.6]
494 (291.8, 396.6]
495 (291.8, 396.6]
496 (291.8, 396.6]
497 (291.8, 396.6]
498 (291.8, 396.6]
499 (291.8, 396.6]
500 (291.8, 396.6]
501 (186.5, 291.8]
502 (186.5, 291.8]
503 (186.5, 291.8]
504 (186.5, 291.8]
505 (186.5, 291.8]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.5, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]]
In [31]:
# Same binning with precision=1, to compare the labels against precision=0.
pd.cut(x=df["TAX"], bins=5, precision=1)
Out[31]:
0 (292.0, 397.0]
1 (186.0, 292.0]
2 (186.0, 292.0]
3 (186.0, 292.0]
4 (186.0, 292.0]
5 (186.0, 292.0]
6 (292.0, 397.0]
7 (292.0, 397.0]
8 (292.0, 397.0]
9 (292.0, 397.0]
10 (292.0, 397.0]
11 (292.0, 397.0]
12 (292.0, 397.0]
13 (292.0, 397.0]
14 (292.0, 397.0]
15 (292.0, 397.0]
16 (292.0, 397.0]
17 (292.0, 397.0]
18 (292.0, 397.0]
19 (292.0, 397.0]
20 (292.0, 397.0]
21 (292.0, 397.0]
22 (292.0, 397.0]
23 (292.0, 397.0]
24 (292.0, 397.0]
25 (292.0, 397.0]
26 (292.0, 397.0]
27 (292.0, 397.0]
28 (292.0, 397.0]
29 (292.0, 397.0]
...
476 (606.0, 711.0]
477 (606.0, 711.0]
478 (606.0, 711.0]
479 (606.0, 711.0]
480 (606.0, 711.0]
481 (606.0, 711.0]
482 (606.0, 711.0]
483 (606.0, 711.0]
484 (606.0, 711.0]
485 (606.0, 711.0]
486 (606.0, 711.0]
487 (606.0, 711.0]
488 (606.0, 711.0]
489 (606.0, 711.0]
490 (606.0, 711.0]
491 (606.0, 711.0]
492 (606.0, 711.0]
493 (292.0, 397.0]
494 (292.0, 397.0]
495 (292.0, 397.0]
496 (292.0, 397.0]
497 (292.0, 397.0]
498 (292.0, 397.0]
499 (292.0, 397.0]
500 (292.0, 397.0]
501 (186.0, 292.0]
502 (186.0, 292.0]
503 (186.0, 292.0]
504 (186.0, 292.0]
505 (186.0, 292.0]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.0, 292.0] < (292.0, 397.0] < (397.0, 501.0] < (501.0, 606.0] < (606.0, 711.0]]
In [34]:
# A list passed as `bins` is used as explicit bin edges: (300, 400], (400, 500], (500, 800].
# Values outside those edges (e.g. TAX = 296 in row 0) fall in no interval and become NaN.
pd.cut(x=df["TAX"], bins=[300, 400, 500, 800])
Out[34]:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
5 NaN
6 (300, 400]
7 (300, 400]
8 (300, 400]
9 (300, 400]
10 (300, 400]
11 (300, 400]
12 (300, 400]
13 (300, 400]
14 (300, 400]
15 (300, 400]
16 (300, 400]
17 (300, 400]
18 (300, 400]
19 (300, 400]
20 (300, 400]
21 (300, 400]
22 (300, 400]
23 (300, 400]
24 (300, 400]
25 (300, 400]
26 (300, 400]
27 (300, 400]
28 (300, 400]
29 (300, 400]
...
476 (500, 800]
477 (500, 800]
478 (500, 800]
479 (500, 800]
480 (500, 800]
481 (500, 800]
482 (500, 800]
483 (500, 800]
484 (500, 800]
485 (500, 800]
486 (500, 800]
487 (500, 800]
488 (500, 800]
489 (500, 800]
490 (500, 800]
491 (500, 800]
492 (500, 800]
493 (300, 400]
494 (300, 400]
495 (300, 400]
496 (300, 400]
497 (300, 400]
498 (300, 400]
499 (300, 400]
500 (300, 400]
501 NaN
502 NaN
503 NaN
504 NaN
505 NaN
Name: TAX, Length: 506, dtype: category
Categories (3, interval[int64]): [(300, 400] < (400, 500] < (500, 800]]
In [35]:
# Equal-width binning of the heavily skewed CRIM column: the displayed rows all
# land in the first interval.
pd.cut(x=df["CRIM"], bins=5)
Out[35]:
0 (-0.0826, 17.8]
1 (-0.0826, 17.8]
2 (-0.0826, 17.8]
3 (-0.0826, 17.8]
4 (-0.0826, 17.8]
5 (-0.0826, 17.8]
6 (-0.0826, 17.8]
7 (-0.0826, 17.8]
8 (-0.0826, 17.8]
9 (-0.0826, 17.8]
10 (-0.0826, 17.8]
11 (-0.0826, 17.8]
12 (-0.0826, 17.8]
13 (-0.0826, 17.8]
14 (-0.0826, 17.8]
15 (-0.0826, 17.8]
16 (-0.0826, 17.8]
17 (-0.0826, 17.8]
18 (-0.0826, 17.8]
19 (-0.0826, 17.8]
20 (-0.0826, 17.8]
21 (-0.0826, 17.8]
22 (-0.0826, 17.8]
23 (-0.0826, 17.8]
24 (-0.0826, 17.8]
25 (-0.0826, 17.8]
26 (-0.0826, 17.8]
27 (-0.0826, 17.8]
28 (-0.0826, 17.8]
29 (-0.0826, 17.8]
...
476 (-0.0826, 17.8]
477 (-0.0826, 17.8]
478 (-0.0826, 17.8]
479 (-0.0826, 17.8]
480 (-0.0826, 17.8]
481 (-0.0826, 17.8]
482 (-0.0826, 17.8]
483 (-0.0826, 17.8]
484 (-0.0826, 17.8]
485 (-0.0826, 17.8]
486 (-0.0826, 17.8]
487 (-0.0826, 17.8]
488 (-0.0826, 17.8]
489 (-0.0826, 17.8]
490 (-0.0826, 17.8]
491 (-0.0826, 17.8]
492 (-0.0826, 17.8]
493 (-0.0826, 17.8]
494 (-0.0826, 17.8]
495 (-0.0826, 17.8]
496 (-0.0826, 17.8]
497 (-0.0826, 17.8]
498 (-0.0826, 17.8]
499 (-0.0826, 17.8]
500 (-0.0826, 17.8]
501 (-0.0826, 17.8]
502 (-0.0826, 17.8]
503 (-0.0826, 17.8]
504 (-0.0826, 17.8]
505 (-0.0826, 17.8]
Name: CRIM, Length: 506, dtype: category
Categories (5, interval[float64]): [(-0.0826, 17.8] < (17.8, 35.594] < (35.594, 53.388] < (53.388, 71.182] < (71.182, 88.976]]
qで区分数を指定(q=4:[0,.25,.5,.75,1])
In [38]:
# Quantile-based binning: q=4 places the bin edges at the quartiles of CRIM.
pd.qcut(x=df["CRIM"], q=4)
Out[38]:
0 (0.00532, 0.082]
1 (0.00532, 0.082]
2 (0.00532, 0.082]
3 (0.00532, 0.082]
4 (0.00532, 0.082]
5 (0.00532, 0.082]
6 (0.082, 0.257]
7 (0.082, 0.257]
8 (0.082, 0.257]
9 (0.082, 0.257]
10 (0.082, 0.257]
11 (0.082, 0.257]
12 (0.082, 0.257]
13 (0.257, 3.647]
14 (0.257, 3.647]
15 (0.257, 3.647]
16 (0.257, 3.647]
17 (0.257, 3.647]
18 (0.257, 3.647]
19 (0.257, 3.647]
20 (0.257, 3.647]
21 (0.257, 3.647]
22 (0.257, 3.647]
23 (0.257, 3.647]
24 (0.257, 3.647]
25 (0.257, 3.647]
26 (0.257, 3.647]
27 (0.257, 3.647]
28 (0.257, 3.647]
29 (0.257, 3.647]
...
476 (3.647, 88.976]
477 (3.647, 88.976]
478 (3.647, 88.976]
479 (3.647, 88.976]
480 (3.647, 88.976]
481 (3.647, 88.976]
482 (3.647, 88.976]
483 (0.257, 3.647]
484 (0.257, 3.647]
485 (3.647, 88.976]
486 (3.647, 88.976]
487 (3.647, 88.976]
488 (0.082, 0.257]
489 (0.082, 0.257]
490 (0.082, 0.257]
491 (0.082, 0.257]
492 (0.082, 0.257]
493 (0.082, 0.257]
494 (0.257, 3.647]
495 (0.082, 0.257]
496 (0.257, 3.647]
497 (0.257, 3.647]
498 (0.082, 0.257]
499 (0.082, 0.257]
500 (0.082, 0.257]
501 (0.00532, 0.082]
502 (0.00532, 0.082]
503 (0.00532, 0.082]
504 (0.082, 0.257]
505 (0.00532, 0.082]
Name: CRIM, Length: 506, dtype: category
Categories (4, interval[float64]): [(0.00532, 0.082] < (0.082, 0.257] < (0.257, 3.647] < (3.647, 88.976]]
In [39]:
# Quantile-based binning into deciles (10 bins).
pd.qcut(x=df["CRIM"], q=10)
Out[39]:
0 (0.00532, 0.0382]
1 (0.00532, 0.0382]
2 (0.00532, 0.0382]
3 (0.00532, 0.0382]
4 (0.0642, 0.0992]
5 (0.00532, 0.0382]
6 (0.0642, 0.0992]
7 (0.0992, 0.15]
8 (0.15, 0.257]
9 (0.15, 0.257]
10 (0.15, 0.257]
11 (0.0992, 0.15]
12 (0.0642, 0.0992]
13 (0.55, 1.643]
14 (0.55, 1.643]
15 (0.55, 1.643]
16 (0.55, 1.643]
17 (0.55, 1.643]
18 (0.55, 1.643]
19 (0.55, 1.643]
20 (0.55, 1.643]
21 (0.55, 1.643]
22 (0.55, 1.643]
23 (0.55, 1.643]
24 (0.55, 1.643]
25 (0.55, 1.643]
26 (0.55, 1.643]
27 (0.55, 1.643]
28 (0.55, 1.643]
29 (0.55, 1.643]
...
476 (1.643, 5.441]
477 (10.534, 88.976]
478 (5.441, 10.534]
479 (10.534, 88.976]
480 (5.441, 10.534]
481 (5.441, 10.534]
482 (5.441, 10.534]
483 (1.643, 5.441]
484 (1.643, 5.441]
485 (1.643, 5.441]
486 (5.441, 10.534]
487 (1.643, 5.441]
488 (0.15, 0.257]
489 (0.15, 0.257]
490 (0.15, 0.257]
491 (0.0992, 0.15]
492 (0.0992, 0.15]
493 (0.15, 0.257]
494 (0.257, 0.55]
495 (0.15, 0.257]
496 (0.257, 0.55]
497 (0.257, 0.55]
498 (0.15, 0.257]
499 (0.15, 0.257]
500 (0.15, 0.257]
501 (0.0382, 0.0642]
502 (0.0382, 0.0642]
503 (0.0382, 0.0642]
504 (0.0992, 0.15]
505 (0.0382, 0.0642]
Name: CRIM, Length: 506, dtype: category
Categories (10, interval[float64]): [(0.00532, 0.0382] < (0.0382, 0.0642] < (0.0642, 0.0992] < (0.0992, 0.15] ... (0.55, 1.643] < (1.643, 5.441] < (5.441, 10.534] < (10.534, 88.976]]
retbinsでビンのエッジを返す。
In [43]:
# retbins=True makes pd.qcut return a pair: (binned Series, array of bin edges).
qcut_tuple = pd.qcut(x=df["CRIM"], q=10, retbins=True)
qcut_tuple
In [44]:
# The pair's last element is the array of decile bin edges.
qcut_tuple[-1]
Out[44]:
array([ 6.32000000e-03, 3.81950000e-02, 6.41700000e-02,
9.92450000e-02, 1.50380000e-01, 2.56510000e-01,
5.50070000e-01, 1.64262000e+00, 5.44114000e+00,
1.05336000e+01, 8.89762000e+01])
precisionで丸め込み
In [45]:
# Decile binning again, with the interval labels rounded using precision=1.
pd.qcut(x=df["CRIM"], q=10, precision=1)
Out[45]:
0 (-0.094, 0.04]
1 (-0.094, 0.04]
2 (-0.094, 0.04]
3 (-0.094, 0.04]
4 (0.06, 0.1]
5 (-0.094, 0.04]
6 (0.06, 0.1]
7 (0.1, 0.2]
8 (0.2, 0.3]
9 (0.2, 0.3]
10 (0.2, 0.3]
11 (0.1, 0.2]
12 (0.06, 0.1]
13 (0.6, 1.6]
14 (0.6, 1.6]
15 (0.6, 1.6]
16 (0.6, 1.6]
17 (0.6, 1.6]
18 (0.6, 1.6]
19 (0.6, 1.6]
20 (0.6, 1.6]
21 (0.6, 1.6]
22 (0.6, 1.6]
23 (0.6, 1.6]
24 (0.6, 1.6]
25 (0.6, 1.6]
26 (0.6, 1.6]
27 (0.6, 1.6]
28 (0.6, 1.6]
29 (0.6, 1.6]
...
476 (1.6, 5.4]
477 (10.5, 89.0]
478 (5.4, 10.5]
479 (10.5, 89.0]
480 (5.4, 10.5]
481 (5.4, 10.5]
482 (5.4, 10.5]
483 (1.6, 5.4]
484 (1.6, 5.4]
485 (1.6, 5.4]
486 (5.4, 10.5]
487 (1.6, 5.4]
488 (0.2, 0.3]
489 (0.2, 0.3]
490 (0.2, 0.3]
491 (0.1, 0.2]
492 (0.1, 0.2]
493 (0.2, 0.3]
494 (0.3, 0.6]
495 (0.2, 0.3]
496 (0.3, 0.6]
497 (0.3, 0.6]
498 (0.2, 0.3]
499 (0.2, 0.3]
500 (0.2, 0.3]
501 (0.04, 0.06]
502 (0.04, 0.06]
503 (0.04, 0.06]
504 (0.1, 0.2]
505 (0.04, 0.06]
Name: CRIM, Length: 506, dtype: category
Categories (10, interval[float64]): [(-0.094, 0.04] < (0.04, 0.06] < (0.06, 0.1] < (0.1, 0.2] ... (0.6, 1.6] < (1.6, 5.4] < (5.4, 10.5] < (10.5, 89.0]]
qには区分数の代わりに分位点のリストを渡して、パーセンタイルを自由に設定できる。ただしリストが0から始まらない場合、最小の分位点より小さい値はどの区間にも入らずNaNになる(下の出力のインデックス0)。
In [47]:
# A list passed as `q` gives the quantile edges directly.
# This list starts at 0.01 rather than 0, so values below the 1st percentile
# fall outside every interval and come back as NaN (see row 0 in the output).
pd.qcut(x=df["CRIM"], q=[0.01, 0.05, 0.95, 0.99, 1.0])
Out[47]:
0 NaN
1 (0.0126, 0.0279]
2 (0.0126, 0.0279]
3 (0.0279, 15.789]
4 (0.0279, 15.789]
5 (0.0279, 15.789]
6 (0.0279, 15.789]
7 (0.0279, 15.789]
8 (0.0279, 15.789]
9 (0.0279, 15.789]
10 (0.0279, 15.789]
11 (0.0279, 15.789]
12 (0.0279, 15.789]
13 (0.0279, 15.789]
14 (0.0279, 15.789]
15 (0.0279, 15.789]
16 (0.0279, 15.789]
17 (0.0279, 15.789]
18 (0.0279, 15.789]
19 (0.0279, 15.789]
20 (0.0279, 15.789]
21 (0.0279, 15.789]
22 (0.0279, 15.789]
23 (0.0279, 15.789]
24 (0.0279, 15.789]
25 (0.0279, 15.789]
26 (0.0279, 15.789]
27 (0.0279, 15.789]
28 (0.0279, 15.789]
29 (0.0279, 15.789]
...
476 (0.0279, 15.789]
477 (0.0279, 15.789]
478 (0.0279, 15.789]
479 (0.0279, 15.789]
480 (0.0279, 15.789]
481 (0.0279, 15.789]
482 (0.0279, 15.789]
483 (0.0279, 15.789]
484 (0.0279, 15.789]
485 (0.0279, 15.789]
486 (0.0279, 15.789]
487 (0.0279, 15.789]
488 (0.0279, 15.789]
489 (0.0279, 15.789]
490 (0.0279, 15.789]
491 (0.0279, 15.789]
492 (0.0279, 15.789]
493 (0.0279, 15.789]
494 (0.0279, 15.789]
495 (0.0279, 15.789]
496 (0.0279, 15.789]
497 (0.0279, 15.789]
498 (0.0279, 15.789]
499 (0.0279, 15.789]
500 (0.0279, 15.789]
501 (0.0279, 15.789]
502 (0.0279, 15.789]
503 (0.0279, 15.789]
504 (0.0279, 15.789]
505 (0.0279, 15.789]
Name: CRIM, Length: 506, dtype: category
Categories (4, interval[float64]): [(0.0126, 0.0279] < (0.0279, 15.789] < (15.789, 41.37] < (41.37, 88.976]]
In [ ]: