離散化


In [2]:
import numpy as np
import pandas as pd

# Names of the 13 explanatory variables, in the dataset's column order.
BOSTON_FEATURE_NAMES = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
]

try:
    # The bundled loader only exists in scikit-learn < 1.2.
    from sklearn.datasets import load_boston
except ImportError:
    # load_boston was removed in scikit-learn 1.2. Rebuild the same feature
    # table from the original StatLib file, following the recipe given in
    # scikit-learn's deprecation notice. NOTE: this path needs network access.
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    raw = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    # Each record spans two physical lines: 11 values on the first line and
    # 3 on the second (the last of those is the target, which is not used here).
    features = np.hstack([raw.values[::2, :], raw.values[1::2, :2]])
    df = pd.DataFrame(features, columns=BOSTON_FEATURE_NAMES)
else:
    boston = load_boston()
    df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Show the feature table (pandas truncates the display of long frames).
df


Out[2]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3.0 222.0 18.7 396.90 5.33
5 0.02985 0.0 2.18 0.0 0.458 6.430 58.7 6.0622 3.0 222.0 18.7 394.12 5.21
6 0.08829 12.5 7.87 0.0 0.524 6.012 66.6 5.5605 5.0 311.0 15.2 395.60 12.43
7 0.14455 12.5 7.87 0.0 0.524 6.172 96.1 5.9505 5.0 311.0 15.2 396.90 19.15
8 0.21124 12.5 7.87 0.0 0.524 5.631 100.0 6.0821 5.0 311.0 15.2 386.63 29.93
9 0.17004 12.5 7.87 0.0 0.524 6.004 85.9 6.5921 5.0 311.0 15.2 386.71 17.10
10 0.22489 12.5 7.87 0.0 0.524 6.377 94.3 6.3467 5.0 311.0 15.2 392.52 20.45
11 0.11747 12.5 7.87 0.0 0.524 6.009 82.9 6.2267 5.0 311.0 15.2 396.90 13.27
12 0.09378 12.5 7.87 0.0 0.524 5.889 39.0 5.4509 5.0 311.0 15.2 390.50 15.71
13 0.62976 0.0 8.14 0.0 0.538 5.949 61.8 4.7075 4.0 307.0 21.0 396.90 8.26
14 0.63796 0.0 8.14 0.0 0.538 6.096 84.5 4.4619 4.0 307.0 21.0 380.02 10.26
15 0.62739 0.0 8.14 0.0 0.538 5.834 56.5 4.4986 4.0 307.0 21.0 395.62 8.47
16 1.05393 0.0 8.14 0.0 0.538 5.935 29.3 4.4986 4.0 307.0 21.0 386.85 6.58
17 0.78420 0.0 8.14 0.0 0.538 5.990 81.7 4.2579 4.0 307.0 21.0 386.75 14.67
18 0.80271 0.0 8.14 0.0 0.538 5.456 36.6 3.7965 4.0 307.0 21.0 288.99 11.69
19 0.72580 0.0 8.14 0.0 0.538 5.727 69.5 3.7965 4.0 307.0 21.0 390.95 11.28
20 1.25179 0.0 8.14 0.0 0.538 5.570 98.1 3.7979 4.0 307.0 21.0 376.57 21.02
21 0.85204 0.0 8.14 0.0 0.538 5.965 89.2 4.0123 4.0 307.0 21.0 392.53 13.83
22 1.23247 0.0 8.14 0.0 0.538 6.142 91.7 3.9769 4.0 307.0 21.0 396.90 18.72
23 0.98843 0.0 8.14 0.0 0.538 5.813 100.0 4.0952 4.0 307.0 21.0 394.54 19.88
24 0.75026 0.0 8.14 0.0 0.538 5.924 94.1 4.3996 4.0 307.0 21.0 394.33 16.30
25 0.84054 0.0 8.14 0.0 0.538 5.599 85.7 4.4546 4.0 307.0 21.0 303.42 16.51
26 0.67191 0.0 8.14 0.0 0.538 5.813 90.3 4.6820 4.0 307.0 21.0 376.88 14.81
27 0.95577 0.0 8.14 0.0 0.538 6.047 88.8 4.4534 4.0 307.0 21.0 306.38 17.28
28 0.77299 0.0 8.14 0.0 0.538 6.495 94.4 4.4547 4.0 307.0 21.0 387.94 12.80
29 1.00245 0.0 8.14 0.0 0.538 6.674 87.3 4.2390 4.0 307.0 21.0 380.23 11.98
... ... ... ... ... ... ... ... ... ... ... ... ... ...
476 4.87141 0.0 18.10 0.0 0.614 6.484 93.6 2.3053 24.0 666.0 20.2 396.21 18.68
477 15.02340 0.0 18.10 0.0 0.614 5.304 97.3 2.1007 24.0 666.0 20.2 349.48 24.91
478 10.23300 0.0 18.10 0.0 0.614 6.185 96.7 2.1705 24.0 666.0 20.2 379.70 18.03
479 14.33370 0.0 18.10 0.0 0.614 6.229 88.0 1.9512 24.0 666.0 20.2 383.32 13.11
480 5.82401 0.0 18.10 0.0 0.532 6.242 64.7 3.4242 24.0 666.0 20.2 396.90 10.74
481 5.70818 0.0 18.10 0.0 0.532 6.750 74.9 3.3317 24.0 666.0 20.2 393.07 7.74
482 5.73116 0.0 18.10 0.0 0.532 7.061 77.0 3.4106 24.0 666.0 20.2 395.28 7.01
483 2.81838 0.0 18.10 0.0 0.532 5.762 40.3 4.0983 24.0 666.0 20.2 392.92 10.42
484 2.37857 0.0 18.10 0.0 0.583 5.871 41.9 3.7240 24.0 666.0 20.2 370.73 13.34
485 3.67367 0.0 18.10 0.0 0.583 6.312 51.9 3.9917 24.0 666.0 20.2 388.62 10.58
486 5.69175 0.0 18.10 0.0 0.583 6.114 79.8 3.5459 24.0 666.0 20.2 392.68 14.98
487 4.83567 0.0 18.10 0.0 0.583 5.905 53.2 3.1523 24.0 666.0 20.2 388.22 11.45
488 0.15086 0.0 27.74 0.0 0.609 5.454 92.7 1.8209 4.0 711.0 20.1 395.09 18.06
489 0.18337 0.0 27.74 0.0 0.609 5.414 98.3 1.7554 4.0 711.0 20.1 344.05 23.97
490 0.20746 0.0 27.74 0.0 0.609 5.093 98.0 1.8226 4.0 711.0 20.1 318.43 29.68
491 0.10574 0.0 27.74 0.0 0.609 5.983 98.8 1.8681 4.0 711.0 20.1 390.11 18.07
492 0.11132 0.0 27.74 0.0 0.609 5.983 83.5 2.1099 4.0 711.0 20.1 396.90 13.35
493 0.17331 0.0 9.69 0.0 0.585 5.707 54.0 2.3817 6.0 391.0 19.2 396.90 12.01
494 0.27957 0.0 9.69 0.0 0.585 5.926 42.6 2.3817 6.0 391.0 19.2 396.90 13.59
495 0.17899 0.0 9.69 0.0 0.585 5.670 28.8 2.7986 6.0 391.0 19.2 393.29 17.60
496 0.28960 0.0 9.69 0.0 0.585 5.390 72.9 2.7986 6.0 391.0 19.2 396.90 21.14
497 0.26838 0.0 9.69 0.0 0.585 5.794 70.6 2.8927 6.0 391.0 19.2 396.90 14.10
498 0.23912 0.0 9.69 0.0 0.585 6.019 65.3 2.4091 6.0 391.0 19.2 396.90 12.92
499 0.17783 0.0 9.69 0.0 0.585 5.569 73.5 2.3999 6.0 391.0 19.2 395.77 15.10
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6.0 391.0 19.2 396.90 14.33
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1.0 273.0 21.0 391.99 9.67
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1.0 273.0 21.0 396.90 9.08
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1.0 273.0 21.0 396.90 5.64
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1.0 273.0 21.0 393.45 6.48
505 0.04741 0.0 11.93 0.0 0.573 6.030 80.8 2.5050 1.0 273.0 21.0 396.90 7.88

506 rows × 13 columns


In [11]:
# Summary statistics, including the extreme 0.1% / 99.9% quantiles so that
# the tails of each column are visible next to the quartiles.
df.describe(percentiles=[0.001, 0.25, 0.5, 0.75, 0.999])


Out[11]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.593761 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063
std 8.596783 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000
0.1% 0.007704 0.000000 0.601400 0.000000 0.387020 3.713510 4.465500 1.133337 1.000000 187.505000 12.600000 1.431000 1.825950
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000
75% 3.647423 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000
99.9% 81.177940 97.475000 27.740000 1.000000 0.871000 8.752225 100.000000 11.411319 24.000000 711.000000 22.000000 396.900000 37.470050
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000

間隔による離散化と等間隔離散化


In [5]:
# Equal-width binning: split the observed TAX range into 5 intervals of the
# same width and label every row with the interval it falls into.
cut = pd.cut(x=df["TAX"], bins=5)
cut


Out[5]:
0        (291.8, 396.6]
1      (186.476, 291.8]
2      (186.476, 291.8]
3      (186.476, 291.8]
4      (186.476, 291.8]
5      (186.476, 291.8]
6        (291.8, 396.6]
7        (291.8, 396.6]
8        (291.8, 396.6]
9        (291.8, 396.6]
10       (291.8, 396.6]
11       (291.8, 396.6]
12       (291.8, 396.6]
13       (291.8, 396.6]
14       (291.8, 396.6]
15       (291.8, 396.6]
16       (291.8, 396.6]
17       (291.8, 396.6]
18       (291.8, 396.6]
19       (291.8, 396.6]
20       (291.8, 396.6]
21       (291.8, 396.6]
22       (291.8, 396.6]
23       (291.8, 396.6]
24       (291.8, 396.6]
25       (291.8, 396.6]
26       (291.8, 396.6]
27       (291.8, 396.6]
28       (291.8, 396.6]
29       (291.8, 396.6]
             ...       
476      (606.2, 711.0]
477      (606.2, 711.0]
478      (606.2, 711.0]
479      (606.2, 711.0]
480      (606.2, 711.0]
481      (606.2, 711.0]
482      (606.2, 711.0]
483      (606.2, 711.0]
484      (606.2, 711.0]
485      (606.2, 711.0]
486      (606.2, 711.0]
487      (606.2, 711.0]
488      (606.2, 711.0]
489      (606.2, 711.0]
490      (606.2, 711.0]
491      (606.2, 711.0]
492      (606.2, 711.0]
493      (291.8, 396.6]
494      (291.8, 396.6]
495      (291.8, 396.6]
496      (291.8, 396.6]
497      (291.8, 396.6]
498      (291.8, 396.6]
499      (291.8, 396.6]
500      (291.8, 396.6]
501    (186.476, 291.8]
502    (186.476, 291.8]
503    (186.476, 291.8]
504    (186.476, 291.8]
505    (186.476, 291.8]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.476, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]]

binsでbinの数を変更


In [12]:
# Same equal-width binning, but with only 2 bins.
pd.cut(x=df["TAX"], bins=2)


Out[12]:
0      (186.476, 449.0]
1      (186.476, 449.0]
2      (186.476, 449.0]
3      (186.476, 449.0]
4      (186.476, 449.0]
5      (186.476, 449.0]
6      (186.476, 449.0]
7      (186.476, 449.0]
8      (186.476, 449.0]
9      (186.476, 449.0]
10     (186.476, 449.0]
11     (186.476, 449.0]
12     (186.476, 449.0]
13     (186.476, 449.0]
14     (186.476, 449.0]
15     (186.476, 449.0]
16     (186.476, 449.0]
17     (186.476, 449.0]
18     (186.476, 449.0]
19     (186.476, 449.0]
20     (186.476, 449.0]
21     (186.476, 449.0]
22     (186.476, 449.0]
23     (186.476, 449.0]
24     (186.476, 449.0]
25     (186.476, 449.0]
26     (186.476, 449.0]
27     (186.476, 449.0]
28     (186.476, 449.0]
29     (186.476, 449.0]
             ...       
476      (449.0, 711.0]
477      (449.0, 711.0]
478      (449.0, 711.0]
479      (449.0, 711.0]
480      (449.0, 711.0]
481      (449.0, 711.0]
482      (449.0, 711.0]
483      (449.0, 711.0]
484      (449.0, 711.0]
485      (449.0, 711.0]
486      (449.0, 711.0]
487      (449.0, 711.0]
488      (449.0, 711.0]
489      (449.0, 711.0]
490      (449.0, 711.0]
491      (449.0, 711.0]
492      (449.0, 711.0]
493    (186.476, 449.0]
494    (186.476, 449.0]
495    (186.476, 449.0]
496    (186.476, 449.0]
497    (186.476, 449.0]
498    (186.476, 449.0]
499    (186.476, 449.0]
500    (186.476, 449.0]
501    (186.476, 449.0]
502    (186.476, 449.0]
503    (186.476, 449.0]
504    (186.476, 449.0]
505    (186.476, 449.0]
Name: TAX, Length: 506, dtype: category
Categories (2, interval[float64]): [(186.476, 449.0] < (449.0, 711.0]]

In [13]:
# Same equal-width binning, but with 50 much narrower bins.
pd.cut(x=df["TAX"], bins=50)


Out[13]:
0       (291.8, 302.28]
1       (239.4, 249.88]
2       (239.4, 249.88]
3      (218.44, 228.92]
4      (218.44, 228.92]
5      (218.44, 228.92]
6      (302.28, 312.76]
7      (302.28, 312.76]
8      (302.28, 312.76]
9      (302.28, 312.76]
10     (302.28, 312.76]
11     (302.28, 312.76]
12     (302.28, 312.76]
13     (302.28, 312.76]
14     (302.28, 312.76]
15     (302.28, 312.76]
16     (302.28, 312.76]
17     (302.28, 312.76]
18     (302.28, 312.76]
19     (302.28, 312.76]
20     (302.28, 312.76]
21     (302.28, 312.76]
22     (302.28, 312.76]
23     (302.28, 312.76]
24     (302.28, 312.76]
25     (302.28, 312.76]
26     (302.28, 312.76]
27     (302.28, 312.76]
28     (302.28, 312.76]
29     (302.28, 312.76]
             ...       
476     (658.6, 669.08]
477     (658.6, 669.08]
478     (658.6, 669.08]
479     (658.6, 669.08]
480     (658.6, 669.08]
481     (658.6, 669.08]
482     (658.6, 669.08]
483     (658.6, 669.08]
484     (658.6, 669.08]
485     (658.6, 669.08]
486     (658.6, 669.08]
487     (658.6, 669.08]
488     (700.52, 711.0]
489     (700.52, 711.0]
490     (700.52, 711.0]
491     (700.52, 711.0]
492     (700.52, 711.0]
493     (386.12, 396.6]
494     (386.12, 396.6]
495     (386.12, 396.6]
496     (386.12, 396.6]
497     (386.12, 396.6]
498     (386.12, 396.6]
499     (386.12, 396.6]
500     (386.12, 396.6]
501    (270.84, 281.32]
502    (270.84, 281.32]
503    (270.84, 281.32]
504    (270.84, 281.32]
505    (270.84, 281.32]
Name: TAX, Length: 506, dtype: category
Categories (50, interval[float64]): [(186.476, 197.48] < (197.48, 207.96] < (207.96, 218.44] < (218.44, 228.92] ... (669.08, 679.56] < (679.56, 690.04] < (690.04, 700.52] < (700.52, 711.0]]

rightで区間の右端(上側)を閉じるかどうかを設定(right=Falseにすると左閉・右開区間になる)


In [6]:
# right=False closes each interval on the left and opens it on the right,
# i.e. [a, b) instead of the default (a, b].
pd.cut(x=df["TAX"], bins=5, right=False)


Out[6]:
0        [291.8, 396.6)
1        [187.0, 291.8)
2        [187.0, 291.8)
3        [187.0, 291.8)
4        [187.0, 291.8)
5        [187.0, 291.8)
6        [291.8, 396.6)
7        [291.8, 396.6)
8        [291.8, 396.6)
9        [291.8, 396.6)
10       [291.8, 396.6)
11       [291.8, 396.6)
12       [291.8, 396.6)
13       [291.8, 396.6)
14       [291.8, 396.6)
15       [291.8, 396.6)
16       [291.8, 396.6)
17       [291.8, 396.6)
18       [291.8, 396.6)
19       [291.8, 396.6)
20       [291.8, 396.6)
21       [291.8, 396.6)
22       [291.8, 396.6)
23       [291.8, 396.6)
24       [291.8, 396.6)
25       [291.8, 396.6)
26       [291.8, 396.6)
27       [291.8, 396.6)
28       [291.8, 396.6)
29       [291.8, 396.6)
             ...       
476    [606.2, 711.524)
477    [606.2, 711.524)
478    [606.2, 711.524)
479    [606.2, 711.524)
480    [606.2, 711.524)
481    [606.2, 711.524)
482    [606.2, 711.524)
483    [606.2, 711.524)
484    [606.2, 711.524)
485    [606.2, 711.524)
486    [606.2, 711.524)
487    [606.2, 711.524)
488    [606.2, 711.524)
489    [606.2, 711.524)
490    [606.2, 711.524)
491    [606.2, 711.524)
492    [606.2, 711.524)
493      [291.8, 396.6)
494      [291.8, 396.6)
495      [291.8, 396.6)
496      [291.8, 396.6)
497      [291.8, 396.6)
498      [291.8, 396.6)
499      [291.8, 396.6)
500      [291.8, 396.6)
501      [187.0, 291.8)
502      [187.0, 291.8)
503      [187.0, 291.8)
504      [187.0, 291.8)
505      [187.0, 291.8)
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [[187.0, 291.8) < [291.8, 396.6) < [396.6, 501.4) < [501.4, 606.2) < [606.2, 711.524)]

retbinsでbinのエッジも返す。


In [16]:
# retbins=True makes pd.cut return a pair: (binned Series, array of bin edges).
cut_tuple = pd.cut(x=df["TAX"], bins=5, retbins=True)
cut_tuple


Out[16]:
(0        (291.8, 396.6]
 1      (186.476, 291.8]
 2      (186.476, 291.8]
 3      (186.476, 291.8]
 4      (186.476, 291.8]
 5      (186.476, 291.8]
 6        (291.8, 396.6]
 7        (291.8, 396.6]
 8        (291.8, 396.6]
 9        (291.8, 396.6]
 10       (291.8, 396.6]
 11       (291.8, 396.6]
 12       (291.8, 396.6]
 13       (291.8, 396.6]
 14       (291.8, 396.6]
 15       (291.8, 396.6]
 16       (291.8, 396.6]
 17       (291.8, 396.6]
 18       (291.8, 396.6]
 19       (291.8, 396.6]
 20       (291.8, 396.6]
 21       (291.8, 396.6]
 22       (291.8, 396.6]
 23       (291.8, 396.6]
 24       (291.8, 396.6]
 25       (291.8, 396.6]
 26       (291.8, 396.6]
 27       (291.8, 396.6]
 28       (291.8, 396.6]
 29       (291.8, 396.6]
              ...       
 476      (606.2, 711.0]
 477      (606.2, 711.0]
 478      (606.2, 711.0]
 479      (606.2, 711.0]
 480      (606.2, 711.0]
 481      (606.2, 711.0]
 482      (606.2, 711.0]
 483      (606.2, 711.0]
 484      (606.2, 711.0]
 485      (606.2, 711.0]
 486      (606.2, 711.0]
 487      (606.2, 711.0]
 488      (606.2, 711.0]
 489      (606.2, 711.0]
 490      (606.2, 711.0]
 491      (606.2, 711.0]
 492      (606.2, 711.0]
 493      (291.8, 396.6]
 494      (291.8, 396.6]
 495      (291.8, 396.6]
 496      (291.8, 396.6]
 497      (291.8, 396.6]
 498      (291.8, 396.6]
 499      (291.8, 396.6]
 500      (291.8, 396.6]
 501    (186.476, 291.8]
 502    (186.476, 291.8]
 503    (186.476, 291.8]
 504    (186.476, 291.8]
 505    (186.476, 291.8]
 Name: TAX, Length: 506, dtype: category
 Categories (5, interval[float64]): [(186.476, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]],
 array([ 186.476,  291.8  ,  396.6  ,  501.4  ,  606.2  ,  711.   ]))

In [19]:
# The bin edges are the last element of the (binned Series, edges) pair.
bin_edges = cut_tuple[-1]
bin_edges


Out[19]:
array([ 186.476,  291.8  ,  396.6  ,  501.4  ,  606.2  ,  711.   ])

include_lowestで最初の区間が下限を含むかどうかを設定(ここで指定しているFalseはデフォルト値)


In [29]:
# include_lowest controls whether the first interval is left-inclusive.
# False is the default, so this result matches the plain bins=5 call.
pd.cut(x=df["TAX"], bins=5, include_lowest=False)


Out[29]:
0        (291.8, 396.6]
1      (186.476, 291.8]
2      (186.476, 291.8]
3      (186.476, 291.8]
4      (186.476, 291.8]
5      (186.476, 291.8]
6        (291.8, 396.6]
7        (291.8, 396.6]
8        (291.8, 396.6]
9        (291.8, 396.6]
10       (291.8, 396.6]
11       (291.8, 396.6]
12       (291.8, 396.6]
13       (291.8, 396.6]
14       (291.8, 396.6]
15       (291.8, 396.6]
16       (291.8, 396.6]
17       (291.8, 396.6]
18       (291.8, 396.6]
19       (291.8, 396.6]
20       (291.8, 396.6]
21       (291.8, 396.6]
22       (291.8, 396.6]
23       (291.8, 396.6]
24       (291.8, 396.6]
25       (291.8, 396.6]
26       (291.8, 396.6]
27       (291.8, 396.6]
28       (291.8, 396.6]
29       (291.8, 396.6]
             ...       
476      (606.2, 711.0]
477      (606.2, 711.0]
478      (606.2, 711.0]
479      (606.2, 711.0]
480      (606.2, 711.0]
481      (606.2, 711.0]
482      (606.2, 711.0]
483      (606.2, 711.0]
484      (606.2, 711.0]
485      (606.2, 711.0]
486      (606.2, 711.0]
487      (606.2, 711.0]
488      (606.2, 711.0]
489      (606.2, 711.0]
490      (606.2, 711.0]
491      (606.2, 711.0]
492      (606.2, 711.0]
493      (291.8, 396.6]
494      (291.8, 396.6]
495      (291.8, 396.6]
496      (291.8, 396.6]
497      (291.8, 396.6]
498      (291.8, 396.6]
499      (291.8, 396.6]
500      (291.8, 396.6]
501    (186.476, 291.8]
502    (186.476, 291.8]
503    (186.476, 291.8]
504    (186.476, 291.8]
505    (186.476, 291.8]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.476, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]]

precisionで丸め込みの桁数を決める。


In [30]:
# precision sets the precision used when storing and displaying the bin
# labels; here it is 0.
pd.cut(x=df["TAX"], bins=5, precision=0)


Out[30]:
0      (291.8, 396.6]
1      (186.5, 291.8]
2      (186.5, 291.8]
3      (186.5, 291.8]
4      (186.5, 291.8]
5      (186.5, 291.8]
6      (291.8, 396.6]
7      (291.8, 396.6]
8      (291.8, 396.6]
9      (291.8, 396.6]
10     (291.8, 396.6]
11     (291.8, 396.6]
12     (291.8, 396.6]
13     (291.8, 396.6]
14     (291.8, 396.6]
15     (291.8, 396.6]
16     (291.8, 396.6]
17     (291.8, 396.6]
18     (291.8, 396.6]
19     (291.8, 396.6]
20     (291.8, 396.6]
21     (291.8, 396.6]
22     (291.8, 396.6]
23     (291.8, 396.6]
24     (291.8, 396.6]
25     (291.8, 396.6]
26     (291.8, 396.6]
27     (291.8, 396.6]
28     (291.8, 396.6]
29     (291.8, 396.6]
            ...      
476    (606.2, 711.0]
477    (606.2, 711.0]
478    (606.2, 711.0]
479    (606.2, 711.0]
480    (606.2, 711.0]
481    (606.2, 711.0]
482    (606.2, 711.0]
483    (606.2, 711.0]
484    (606.2, 711.0]
485    (606.2, 711.0]
486    (606.2, 711.0]
487    (606.2, 711.0]
488    (606.2, 711.0]
489    (606.2, 711.0]
490    (606.2, 711.0]
491    (606.2, 711.0]
492    (606.2, 711.0]
493    (291.8, 396.6]
494    (291.8, 396.6]
495    (291.8, 396.6]
496    (291.8, 396.6]
497    (291.8, 396.6]
498    (291.8, 396.6]
499    (291.8, 396.6]
500    (291.8, 396.6]
501    (186.5, 291.8]
502    (186.5, 291.8]
503    (186.5, 291.8]
504    (186.5, 291.8]
505    (186.5, 291.8]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.5, 291.8] < (291.8, 396.6] < (396.6, 501.4] < (501.4, 606.2] < (606.2, 711.0]]

In [31]:
# Same call with precision=1, to compare the bin labels against precision=0.
pd.cut(x=df["TAX"], bins=5, precision=1)


Out[31]:
0      (292.0, 397.0]
1      (186.0, 292.0]
2      (186.0, 292.0]
3      (186.0, 292.0]
4      (186.0, 292.0]
5      (186.0, 292.0]
6      (292.0, 397.0]
7      (292.0, 397.0]
8      (292.0, 397.0]
9      (292.0, 397.0]
10     (292.0, 397.0]
11     (292.0, 397.0]
12     (292.0, 397.0]
13     (292.0, 397.0]
14     (292.0, 397.0]
15     (292.0, 397.0]
16     (292.0, 397.0]
17     (292.0, 397.0]
18     (292.0, 397.0]
19     (292.0, 397.0]
20     (292.0, 397.0]
21     (292.0, 397.0]
22     (292.0, 397.0]
23     (292.0, 397.0]
24     (292.0, 397.0]
25     (292.0, 397.0]
26     (292.0, 397.0]
27     (292.0, 397.0]
28     (292.0, 397.0]
29     (292.0, 397.0]
            ...      
476    (606.0, 711.0]
477    (606.0, 711.0]
478    (606.0, 711.0]
479    (606.0, 711.0]
480    (606.0, 711.0]
481    (606.0, 711.0]
482    (606.0, 711.0]
483    (606.0, 711.0]
484    (606.0, 711.0]
485    (606.0, 711.0]
486    (606.0, 711.0]
487    (606.0, 711.0]
488    (606.0, 711.0]
489    (606.0, 711.0]
490    (606.0, 711.0]
491    (606.0, 711.0]
492    (606.0, 711.0]
493    (292.0, 397.0]
494    (292.0, 397.0]
495    (292.0, 397.0]
496    (292.0, 397.0]
497    (292.0, 397.0]
498    (292.0, 397.0]
499    (292.0, 397.0]
500    (292.0, 397.0]
501    (186.0, 292.0]
502    (186.0, 292.0]
503    (186.0, 292.0]
504    (186.0, 292.0]
505    (186.0, 292.0]
Name: TAX, Length: 506, dtype: category
Categories (5, interval[float64]): [(186.0, 292.0] < (292.0, 397.0] < (397.0, 501.0] < (501.0, 606.0] < (606.0, 711.0]]

binsにエッジのリストを渡すと、区間のエッジを自由に設定できる。エッジの範囲外の値はNaNになる。

In [34]:
# Passing a list as bins uses those values as explicit interval edges.
# Rows whose TAX lies outside (300, 800] get NaN instead of a bin.
pd.cut(x=df["TAX"], bins=[300, 400, 500, 800])


Out[34]:
0             NaN
1             NaN
2             NaN
3             NaN
4             NaN
5             NaN
6      (300, 400]
7      (300, 400]
8      (300, 400]
9      (300, 400]
10     (300, 400]
11     (300, 400]
12     (300, 400]
13     (300, 400]
14     (300, 400]
15     (300, 400]
16     (300, 400]
17     (300, 400]
18     (300, 400]
19     (300, 400]
20     (300, 400]
21     (300, 400]
22     (300, 400]
23     (300, 400]
24     (300, 400]
25     (300, 400]
26     (300, 400]
27     (300, 400]
28     (300, 400]
29     (300, 400]
          ...    
476    (500, 800]
477    (500, 800]
478    (500, 800]
479    (500, 800]
480    (500, 800]
481    (500, 800]
482    (500, 800]
483    (500, 800]
484    (500, 800]
485    (500, 800]
486    (500, 800]
487    (500, 800]
488    (500, 800]
489    (500, 800]
490    (500, 800]
491    (500, 800]
492    (500, 800]
493    (300, 400]
494    (300, 400]
495    (300, 400]
496    (300, 400]
497    (300, 400]
498    (300, 400]
499    (300, 400]
500    (300, 400]
501           NaN
502           NaN
503           NaN
504           NaN
505           NaN
Name: TAX, Length: 506, dtype: category
Categories (3, interval[int64]): [(300, 400] < (400, 500] < (500, 800]]

In [35]:
# Equal-width binning on CRIM. The column is strongly right-skewed
# (75th percentile about 3.6, maximum about 89), so most rows end up in
# the first bin.
pd.cut(x=df["CRIM"], bins=5)


Out[35]:
0      (-0.0826, 17.8]
1      (-0.0826, 17.8]
2      (-0.0826, 17.8]
3      (-0.0826, 17.8]
4      (-0.0826, 17.8]
5      (-0.0826, 17.8]
6      (-0.0826, 17.8]
7      (-0.0826, 17.8]
8      (-0.0826, 17.8]
9      (-0.0826, 17.8]
10     (-0.0826, 17.8]
11     (-0.0826, 17.8]
12     (-0.0826, 17.8]
13     (-0.0826, 17.8]
14     (-0.0826, 17.8]
15     (-0.0826, 17.8]
16     (-0.0826, 17.8]
17     (-0.0826, 17.8]
18     (-0.0826, 17.8]
19     (-0.0826, 17.8]
20     (-0.0826, 17.8]
21     (-0.0826, 17.8]
22     (-0.0826, 17.8]
23     (-0.0826, 17.8]
24     (-0.0826, 17.8]
25     (-0.0826, 17.8]
26     (-0.0826, 17.8]
27     (-0.0826, 17.8]
28     (-0.0826, 17.8]
29     (-0.0826, 17.8]
            ...       
476    (-0.0826, 17.8]
477    (-0.0826, 17.8]
478    (-0.0826, 17.8]
479    (-0.0826, 17.8]
480    (-0.0826, 17.8]
481    (-0.0826, 17.8]
482    (-0.0826, 17.8]
483    (-0.0826, 17.8]
484    (-0.0826, 17.8]
485    (-0.0826, 17.8]
486    (-0.0826, 17.8]
487    (-0.0826, 17.8]
488    (-0.0826, 17.8]
489    (-0.0826, 17.8]
490    (-0.0826, 17.8]
491    (-0.0826, 17.8]
492    (-0.0826, 17.8]
493    (-0.0826, 17.8]
494    (-0.0826, 17.8]
495    (-0.0826, 17.8]
496    (-0.0826, 17.8]
497    (-0.0826, 17.8]
498    (-0.0826, 17.8]
499    (-0.0826, 17.8]
500    (-0.0826, 17.8]
501    (-0.0826, 17.8]
502    (-0.0826, 17.8]
503    (-0.0826, 17.8]
504    (-0.0826, 17.8]
505    (-0.0826, 17.8]
Name: CRIM, Length: 506, dtype: category
Categories (5, interval[float64]): [(-0.0826, 17.8] < (17.8, 35.594] < (35.594, 53.388] < (53.388, 71.182] < (71.182, 88.976]]

頻度による離散化と等頻度離散化

qで区分数を指定(q=4:[0,.25,.5,.75,1])


In [38]:
# Equal-frequency binning: q=4 places the edges at the quartiles of CRIM.
pd.qcut(x=df["CRIM"], q=4)


Out[38]:
0      (0.00532, 0.082]
1      (0.00532, 0.082]
2      (0.00532, 0.082]
3      (0.00532, 0.082]
4      (0.00532, 0.082]
5      (0.00532, 0.082]
6        (0.082, 0.257]
7        (0.082, 0.257]
8        (0.082, 0.257]
9        (0.082, 0.257]
10       (0.082, 0.257]
11       (0.082, 0.257]
12       (0.082, 0.257]
13       (0.257, 3.647]
14       (0.257, 3.647]
15       (0.257, 3.647]
16       (0.257, 3.647]
17       (0.257, 3.647]
18       (0.257, 3.647]
19       (0.257, 3.647]
20       (0.257, 3.647]
21       (0.257, 3.647]
22       (0.257, 3.647]
23       (0.257, 3.647]
24       (0.257, 3.647]
25       (0.257, 3.647]
26       (0.257, 3.647]
27       (0.257, 3.647]
28       (0.257, 3.647]
29       (0.257, 3.647]
             ...       
476     (3.647, 88.976]
477     (3.647, 88.976]
478     (3.647, 88.976]
479     (3.647, 88.976]
480     (3.647, 88.976]
481     (3.647, 88.976]
482     (3.647, 88.976]
483      (0.257, 3.647]
484      (0.257, 3.647]
485     (3.647, 88.976]
486     (3.647, 88.976]
487     (3.647, 88.976]
488      (0.082, 0.257]
489      (0.082, 0.257]
490      (0.082, 0.257]
491      (0.082, 0.257]
492      (0.082, 0.257]
493      (0.082, 0.257]
494      (0.257, 3.647]
495      (0.082, 0.257]
496      (0.257, 3.647]
497      (0.257, 3.647]
498      (0.082, 0.257]
499      (0.082, 0.257]
500      (0.082, 0.257]
501    (0.00532, 0.082]
502    (0.00532, 0.082]
503    (0.00532, 0.082]
504      (0.082, 0.257]
505    (0.00532, 0.082]
Name: CRIM, Length: 506, dtype: category
Categories (4, interval[float64]): [(0.00532, 0.082] < (0.082, 0.257] < (0.257, 3.647] < (3.647, 88.976]]

In [39]:
# Equal-frequency binning into deciles (q=10).
pd.qcut(x=df["CRIM"], q=10)


Out[39]:
0      (0.00532, 0.0382]
1      (0.00532, 0.0382]
2      (0.00532, 0.0382]
3      (0.00532, 0.0382]
4       (0.0642, 0.0992]
5      (0.00532, 0.0382]
6       (0.0642, 0.0992]
7         (0.0992, 0.15]
8          (0.15, 0.257]
9          (0.15, 0.257]
10         (0.15, 0.257]
11        (0.0992, 0.15]
12      (0.0642, 0.0992]
13         (0.55, 1.643]
14         (0.55, 1.643]
15         (0.55, 1.643]
16         (0.55, 1.643]
17         (0.55, 1.643]
18         (0.55, 1.643]
19         (0.55, 1.643]
20         (0.55, 1.643]
21         (0.55, 1.643]
22         (0.55, 1.643]
23         (0.55, 1.643]
24         (0.55, 1.643]
25         (0.55, 1.643]
26         (0.55, 1.643]
27         (0.55, 1.643]
28         (0.55, 1.643]
29         (0.55, 1.643]
             ...        
476       (1.643, 5.441]
477     (10.534, 88.976]
478      (5.441, 10.534]
479     (10.534, 88.976]
480      (5.441, 10.534]
481      (5.441, 10.534]
482      (5.441, 10.534]
483       (1.643, 5.441]
484       (1.643, 5.441]
485       (1.643, 5.441]
486      (5.441, 10.534]
487       (1.643, 5.441]
488        (0.15, 0.257]
489        (0.15, 0.257]
490        (0.15, 0.257]
491       (0.0992, 0.15]
492       (0.0992, 0.15]
493        (0.15, 0.257]
494        (0.257, 0.55]
495        (0.15, 0.257]
496        (0.257, 0.55]
497        (0.257, 0.55]
498        (0.15, 0.257]
499        (0.15, 0.257]
500        (0.15, 0.257]
501     (0.0382, 0.0642]
502     (0.0382, 0.0642]
503     (0.0382, 0.0642]
504       (0.0992, 0.15]
505     (0.0382, 0.0642]
Name: CRIM, Length: 506, dtype: category
Categories (10, interval[float64]): [(0.00532, 0.0382] < (0.0382, 0.0642] < (0.0642, 0.0992] < (0.0992, 0.15] ... (0.55, 1.643] < (1.643, 5.441] < (5.441, 10.534] < (10.534, 88.976]]

retbinsでビンのエッジを返す。


In [43]:
# retbins=True makes pd.qcut return a pair: (binned Series, array of bin edges).
qcut_tuple = pd.qcut(x=df["CRIM"], q=10, retbins=True)
qcut_tuple

In [44]:
# The decile edges are the last element of the (binned Series, edges) pair.
qcut_tuple[-1]


Out[44]:
array([  6.32000000e-03,   3.81950000e-02,   6.41700000e-02,
         9.92450000e-02,   1.50380000e-01,   2.56510000e-01,
         5.50070000e-01,   1.64262000e+00,   5.44114000e+00,
         1.05336000e+01,   8.89762000e+01])

precisionで丸め込み


In [45]:
# Decile binning with precision=1 to shorten the displayed bin labels.
pd.qcut(x=df["CRIM"], q=10, precision=1)


Out[45]:
0      (-0.094, 0.04]
1      (-0.094, 0.04]
2      (-0.094, 0.04]
3      (-0.094, 0.04]
4         (0.06, 0.1]
5      (-0.094, 0.04]
6         (0.06, 0.1]
7          (0.1, 0.2]
8          (0.2, 0.3]
9          (0.2, 0.3]
10         (0.2, 0.3]
11         (0.1, 0.2]
12        (0.06, 0.1]
13         (0.6, 1.6]
14         (0.6, 1.6]
15         (0.6, 1.6]
16         (0.6, 1.6]
17         (0.6, 1.6]
18         (0.6, 1.6]
19         (0.6, 1.6]
20         (0.6, 1.6]
21         (0.6, 1.6]
22         (0.6, 1.6]
23         (0.6, 1.6]
24         (0.6, 1.6]
25         (0.6, 1.6]
26         (0.6, 1.6]
27         (0.6, 1.6]
28         (0.6, 1.6]
29         (0.6, 1.6]
            ...      
476        (1.6, 5.4]
477      (10.5, 89.0]
478       (5.4, 10.5]
479      (10.5, 89.0]
480       (5.4, 10.5]
481       (5.4, 10.5]
482       (5.4, 10.5]
483        (1.6, 5.4]
484        (1.6, 5.4]
485        (1.6, 5.4]
486       (5.4, 10.5]
487        (1.6, 5.4]
488        (0.2, 0.3]
489        (0.2, 0.3]
490        (0.2, 0.3]
491        (0.1, 0.2]
492        (0.1, 0.2]
493        (0.2, 0.3]
494        (0.3, 0.6]
495        (0.2, 0.3]
496        (0.3, 0.6]
497        (0.3, 0.6]
498        (0.2, 0.3]
499        (0.2, 0.3]
500        (0.2, 0.3]
501      (0.04, 0.06]
502      (0.04, 0.06]
503      (0.04, 0.06]
504        (0.1, 0.2]
505      (0.04, 0.06]
Name: CRIM, Length: 506, dtype: category
Categories (10, interval[float64]): [(-0.094, 0.04] < (0.04, 0.06] < (0.06, 0.1] < (0.1, 0.2] ... (0.6, 1.6] < (1.6, 5.4] < (5.4, 10.5] < (10.5, 89.0]]

qに分位点(0〜1)のリストを渡すと、区切り位置を自由に設定できる。指定した分位点の範囲外(ここでは下位1%未満)の値はNaNになる。


In [47]:
# Passing a list as q uses those quantiles as the bin edges. The list starts
# at the 1% quantile, so rows below it get NaN instead of a bin.
pd.qcut(x=df["CRIM"], q=[0.01, 0.05, 0.95, 0.99, 1.0])


Out[47]:
0                   NaN
1      (0.0126, 0.0279]
2      (0.0126, 0.0279]
3      (0.0279, 15.789]
4      (0.0279, 15.789]
5      (0.0279, 15.789]
6      (0.0279, 15.789]
7      (0.0279, 15.789]
8      (0.0279, 15.789]
9      (0.0279, 15.789]
10     (0.0279, 15.789]
11     (0.0279, 15.789]
12     (0.0279, 15.789]
13     (0.0279, 15.789]
14     (0.0279, 15.789]
15     (0.0279, 15.789]
16     (0.0279, 15.789]
17     (0.0279, 15.789]
18     (0.0279, 15.789]
19     (0.0279, 15.789]
20     (0.0279, 15.789]
21     (0.0279, 15.789]
22     (0.0279, 15.789]
23     (0.0279, 15.789]
24     (0.0279, 15.789]
25     (0.0279, 15.789]
26     (0.0279, 15.789]
27     (0.0279, 15.789]
28     (0.0279, 15.789]
29     (0.0279, 15.789]
             ...       
476    (0.0279, 15.789]
477    (0.0279, 15.789]
478    (0.0279, 15.789]
479    (0.0279, 15.789]
480    (0.0279, 15.789]
481    (0.0279, 15.789]
482    (0.0279, 15.789]
483    (0.0279, 15.789]
484    (0.0279, 15.789]
485    (0.0279, 15.789]
486    (0.0279, 15.789]
487    (0.0279, 15.789]
488    (0.0279, 15.789]
489    (0.0279, 15.789]
490    (0.0279, 15.789]
491    (0.0279, 15.789]
492    (0.0279, 15.789]
493    (0.0279, 15.789]
494    (0.0279, 15.789]
495    (0.0279, 15.789]
496    (0.0279, 15.789]
497    (0.0279, 15.789]
498    (0.0279, 15.789]
499    (0.0279, 15.789]
500    (0.0279, 15.789]
501    (0.0279, 15.789]
502    (0.0279, 15.789]
503    (0.0279, 15.789]
504    (0.0279, 15.789]
505    (0.0279, 15.789]
Name: CRIM, Length: 506, dtype: category
Categories (4, interval[float64]): [(0.0126, 0.0279] < (0.0279, 15.789] < (15.789, 41.37] < (41.37, 88.976]]

In [ ]: