資料預處理


In [2]:
import os
import numpy as np
import pandas as pd

filepath = '/Users/mac/Desktop/Kaggle_datasets/WSC_breast_cancer_FNA/'
filename = 'data.csv'

dfFull = pd.read_csv(os.path.join(filepath+filename))

dfFull


Out[2]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
0 842302 M 17.990 10.38 122.80 1001.0 0.11840 0.27760 0.300100 0.147100 ... 17.33 184.60 2019.0 0.16220 0.66560 0.71190 0.26540 0.4601 0.11890 NaN
1 842517 M 20.570 17.77 132.90 1326.0 0.08474 0.07864 0.086900 0.070170 ... 23.41 158.80 1956.0 0.12380 0.18660 0.24160 0.18600 0.2750 0.08902 NaN
2 84300903 M 19.690 21.25 130.00 1203.0 0.10960 0.15990 0.197400 0.127900 ... 25.53 152.50 1709.0 0.14440 0.42450 0.45040 0.24300 0.3613 0.08758 NaN
3 84348301 M 11.420 20.38 77.58 386.1 0.14250 0.28390 0.241400 0.105200 ... 26.50 98.87 567.7 0.20980 0.86630 0.68690 0.25750 0.6638 0.17300 NaN
4 84358402 M 20.290 14.34 135.10 1297.0 0.10030 0.13280 0.198000 0.104300 ... 16.67 152.20 1575.0 0.13740 0.20500 0.40000 0.16250 0.2364 0.07678 NaN
5 843786 M 12.450 15.70 82.57 477.1 0.12780 0.17000 0.157800 0.080890 ... 23.75 103.40 741.6 0.17910 0.52490 0.53550 0.17410 0.3985 0.12440 NaN
6 844359 M 18.250 19.98 119.60 1040.0 0.09463 0.10900 0.112700 0.074000 ... 27.66 153.20 1606.0 0.14420 0.25760 0.37840 0.19320 0.3063 0.08368 NaN
7 84458202 M 13.710 20.83 90.20 577.9 0.11890 0.16450 0.093660 0.059850 ... 28.14 110.60 897.0 0.16540 0.36820 0.26780 0.15560 0.3196 0.11510 NaN
8 844981 M 13.000 21.82 87.50 519.8 0.12730 0.19320 0.185900 0.093530 ... 30.73 106.20 739.3 0.17030 0.54010 0.53900 0.20600 0.4378 0.10720 NaN
9 84501001 M 12.460 24.04 83.97 475.9 0.11860 0.23960 0.227300 0.085430 ... 40.68 97.65 711.4 0.18530 1.05800 1.10500 0.22100 0.4366 0.20750 NaN
10 845636 M 16.020 23.24 102.70 797.8 0.08206 0.06669 0.032990 0.033230 ... 33.88 123.80 1150.0 0.11810 0.15510 0.14590 0.09975 0.2948 0.08452 NaN
11 84610002 M 15.780 17.89 103.60 781.0 0.09710 0.12920 0.099540 0.066060 ... 27.28 136.50 1299.0 0.13960 0.56090 0.39650 0.18100 0.3792 0.10480 NaN
12 846226 M 19.170 24.80 132.40 1123.0 0.09740 0.24580 0.206500 0.111800 ... 29.94 151.70 1332.0 0.10370 0.39030 0.36390 0.17670 0.3176 0.10230 NaN
13 846381 M 15.850 23.95 103.70 782.7 0.08401 0.10020 0.099380 0.053640 ... 27.66 112.00 876.5 0.11310 0.19240 0.23220 0.11190 0.2809 0.06287 NaN
14 84667401 M 13.730 22.61 93.60 578.3 0.11310 0.22930 0.212800 0.080250 ... 32.01 108.80 697.7 0.16510 0.77250 0.69430 0.22080 0.3596 0.14310 NaN
15 84799002 M 14.540 27.54 96.73 658.8 0.11390 0.15950 0.163900 0.073640 ... 37.13 124.10 943.2 0.16780 0.65770 0.70260 0.17120 0.4218 0.13410 NaN
16 848406 M 14.680 20.13 94.74 684.5 0.09867 0.07200 0.073950 0.052590 ... 30.88 123.40 1138.0 0.14640 0.18710 0.29140 0.16090 0.3029 0.08216 NaN
17 84862001 M 16.130 20.68 108.10 798.8 0.11700 0.20220 0.172200 0.102800 ... 31.48 136.80 1315.0 0.17890 0.42330 0.47840 0.20730 0.3706 0.11420 NaN
18 849014 M 19.810 22.15 130.00 1260.0 0.09831 0.10270 0.147900 0.094980 ... 30.88 186.80 2398.0 0.15120 0.31500 0.53720 0.23880 0.2768 0.07615 NaN
19 8510426 B 13.540 14.36 87.46 566.3 0.09779 0.08129 0.066640 0.047810 ... 19.26 99.70 711.2 0.14400 0.17730 0.23900 0.12880 0.2977 0.07259 NaN
20 8510653 B 13.080 15.71 85.63 520.0 0.10750 0.12700 0.045680 0.031100 ... 20.49 96.09 630.5 0.13120 0.27760 0.18900 0.07283 0.3184 0.08183 NaN
21 8510824 B 9.504 12.44 60.34 273.9 0.10240 0.06492 0.029560 0.020760 ... 15.66 65.13 314.9 0.13240 0.11480 0.08867 0.06227 0.2450 0.07773 NaN
22 8511133 M 15.340 14.26 102.50 704.4 0.10730 0.21350 0.207700 0.097560 ... 19.08 125.10 980.9 0.13900 0.59540 0.63050 0.23930 0.4667 0.09946 NaN
23 851509 M 21.160 23.04 137.20 1404.0 0.09428 0.10220 0.109700 0.086320 ... 35.59 188.00 2615.0 0.14010 0.26000 0.31550 0.20090 0.2822 0.07526 NaN
24 852552 M 16.650 21.38 110.00 904.6 0.11210 0.14570 0.152500 0.091700 ... 31.56 177.00 2215.0 0.18050 0.35780 0.46950 0.20950 0.3613 0.09564 NaN
25 852631 M 17.140 16.40 116.00 912.7 0.11860 0.22760 0.222900 0.140100 ... 21.40 152.40 1461.0 0.15450 0.39490 0.38530 0.25500 0.4066 0.10590 NaN
26 852763 M 14.580 21.53 97.41 644.8 0.10540 0.18680 0.142500 0.087830 ... 33.21 122.40 896.9 0.15250 0.66430 0.55390 0.27010 0.4264 0.12750 NaN
27 852781 M 18.610 20.25 122.10 1094.0 0.09440 0.10660 0.149000 0.077310 ... 27.26 139.90 1403.0 0.13380 0.21170 0.34460 0.14900 0.2341 0.07421 NaN
28 852973 M 15.300 25.27 102.40 732.4 0.10820 0.16970 0.168300 0.087510 ... 36.71 149.30 1269.0 0.16410 0.61100 0.63350 0.20240 0.4027 0.09876 NaN
29 853201 M 17.570 15.05 115.00 955.1 0.09847 0.11570 0.098750 0.079530 ... 19.52 134.90 1227.0 0.12550 0.28120 0.24890 0.14560 0.2756 0.07919 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
539 921362 B 7.691 25.44 48.34 170.4 0.08668 0.11990 0.092520 0.013640 ... 31.89 54.49 223.6 0.15960 0.30640 0.33930 0.05000 0.2790 0.10660 NaN
540 921385 B 11.540 14.44 74.65 402.9 0.09984 0.11200 0.067370 0.025940 ... 19.68 78.78 457.8 0.13450 0.21180 0.17970 0.06918 0.2329 0.08134 NaN
541 921386 B 14.470 24.99 95.81 656.4 0.08837 0.12300 0.100900 0.038900 ... 31.73 113.50 808.9 0.13400 0.42020 0.40400 0.12050 0.3187 0.10230 NaN
542 921644 B 14.740 25.42 94.70 668.6 0.08275 0.07214 0.041050 0.030270 ... 32.29 107.40 826.4 0.10600 0.13760 0.16110 0.10950 0.2722 0.06956 NaN
543 922296 B 13.210 28.06 84.88 538.4 0.08671 0.06877 0.029870 0.032750 ... 37.17 92.48 629.6 0.10720 0.13810 0.10620 0.07958 0.2473 0.06443 NaN
544 922297 B 13.870 20.70 89.77 584.8 0.09578 0.10180 0.036880 0.023690 ... 24.75 99.17 688.6 0.12640 0.20370 0.13770 0.06845 0.2249 0.08492 NaN
545 922576 B 13.620 23.23 87.19 573.2 0.09246 0.06747 0.029740 0.024430 ... 29.09 97.58 729.8 0.12160 0.15170 0.10490 0.07174 0.2642 0.06953 NaN
546 922577 B 10.320 16.35 65.31 324.9 0.09434 0.04994 0.010120 0.005495 ... 21.77 71.12 384.9 0.12850 0.08842 0.04384 0.02381 0.2681 0.07399 NaN
547 922840 B 10.260 16.58 65.85 320.8 0.08877 0.08066 0.043580 0.024380 ... 22.04 71.08 357.4 0.14610 0.22460 0.17830 0.08333 0.2691 0.09479 NaN
548 923169 B 9.683 19.34 61.05 285.7 0.08491 0.05030 0.023370 0.009615 ... 25.59 69.10 364.2 0.11990 0.09546 0.09350 0.03846 0.2552 0.07920 NaN
549 923465 B 10.820 24.21 68.89 361.6 0.08192 0.06602 0.015480 0.008160 ... 31.45 83.90 505.6 0.12040 0.16330 0.06194 0.03264 0.3059 0.07626 NaN
550 923748 B 10.860 21.48 68.51 360.5 0.07431 0.04227 0.000000 0.000000 ... 24.77 74.08 412.3 0.10010 0.07348 0.00000 0.00000 0.2458 0.06592 NaN
551 923780 B 11.130 22.44 71.49 378.4 0.09566 0.08194 0.048240 0.022570 ... 28.26 77.80 436.6 0.10870 0.17820 0.15640 0.06413 0.3169 0.08032 NaN
552 924084 B 12.770 29.43 81.35 507.9 0.08276 0.04234 0.019970 0.014990 ... 36.00 88.10 594.7 0.12340 0.10640 0.08653 0.06498 0.2407 0.06484 NaN
553 924342 B 9.333 21.94 59.01 264.0 0.09240 0.05605 0.039960 0.012820 ... 25.05 62.86 295.8 0.11030 0.08298 0.07993 0.02564 0.2435 0.07393 NaN
554 924632 B 12.880 28.92 82.50 514.3 0.08123 0.05824 0.061950 0.023430 ... 35.74 88.84 595.7 0.12270 0.16200 0.24390 0.06493 0.2372 0.07242 NaN
555 924934 B 10.290 27.61 65.67 321.4 0.09030 0.07658 0.059990 0.027380 ... 34.91 69.57 357.6 0.13840 0.17100 0.20000 0.09127 0.2226 0.08283 NaN
556 924964 B 10.160 19.59 64.73 311.7 0.10030 0.07504 0.005025 0.011160 ... 22.88 67.88 347.3 0.12650 0.12000 0.01005 0.02232 0.2262 0.06742 NaN
557 925236 B 9.423 27.88 59.26 271.3 0.08123 0.04971 0.000000 0.000000 ... 34.24 66.50 330.6 0.10730 0.07158 0.00000 0.00000 0.2475 0.06969 NaN
558 925277 B 14.590 22.68 96.39 657.1 0.08473 0.13300 0.102900 0.037360 ... 27.27 105.90 733.5 0.10260 0.31710 0.36620 0.11050 0.2258 0.08004 NaN
559 925291 B 11.510 23.93 74.52 403.5 0.09261 0.10210 0.111200 0.041050 ... 37.16 82.28 474.2 0.12980 0.25170 0.36300 0.09653 0.2112 0.08732 NaN
560 925292 B 14.050 27.15 91.38 600.4 0.09929 0.11260 0.044620 0.043040 ... 33.17 100.20 706.7 0.12410 0.22640 0.13260 0.10480 0.2250 0.08321 NaN
561 925311 B 11.200 29.37 70.67 386.0 0.07449 0.03558 0.000000 0.000000 ... 38.30 75.19 439.6 0.09267 0.05494 0.00000 0.00000 0.1566 0.05905 NaN
562 925622 M 15.220 30.62 103.40 716.9 0.10480 0.20870 0.255000 0.094290 ... 42.79 128.70 915.0 0.14170 0.79170 1.17000 0.23560 0.4089 0.14090 NaN
563 926125 M 20.920 25.09 143.00 1347.0 0.10990 0.22360 0.317400 0.147400 ... 29.41 179.10 1819.0 0.14070 0.41860 0.65990 0.25420 0.2929 0.09873 NaN
564 926424 M 21.560 22.39 142.00 1479.0 0.11100 0.11590 0.243900 0.138900 ... 26.40 166.10 2027.0 0.14100 0.21130 0.41070 0.22160 0.2060 0.07115 NaN
565 926682 M 20.130 28.25 131.20 1261.0 0.09780 0.10340 0.144000 0.097910 ... 38.25 155.00 1731.0 0.11660 0.19220 0.32150 0.16280 0.2572 0.06637 NaN
566 926954 M 16.600 28.08 108.30 858.1 0.08455 0.10230 0.092510 0.053020 ... 34.12 126.70 1124.0 0.11390 0.30940 0.34030 0.14180 0.2218 0.07820 NaN
567 927241 M 20.600 29.33 140.10 1265.0 0.11780 0.27700 0.351400 0.152000 ... 39.42 184.60 1821.0 0.16500 0.86810 0.93870 0.26500 0.4087 0.12400 NaN
568 92751 B 7.760 24.54 47.92 181.0 0.05263 0.04362 0.000000 0.000000 ... 30.37 59.16 268.6 0.08996 0.06444 0.00000 0.00000 0.2871 0.07039 NaN

569 rows × 33 columns


In [3]:
dfFull.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-null float64
concave points_se          569 non-null float64
symmetry_se                569 non-null float64
fractal_dimension_se       569 non-null float64
radius_worst               569 non-null float64
texture_worst              569 non-null float64
perimeter_worst            569 non-null float64
area_worst                 569 non-null float64
smoothness_worst           569 non-null float64
compactness_worst          569 non-null float64
concavity_worst            569 non-null float64
concave points_worst       569 non-null float64
symmetry_worst             569 non-null float64
fractal_dimension_worst    569 non-null float64
Unnamed: 32                0 non-null float64
dtypes: float64(31), int64(1), object(1)
memory usage: 146.8+ KB

In [4]:
#dfFull.dropna(axis=1, inplace=True) #直的方向丟棄Unamed:32那排廢物

In [5]:
dfFull.isnull().sum() #現在data通通都有了


Out[5]:
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed: 32                569
dtype: int64

In [6]:
dfFull.describe()


Out[6]:
id radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean symmetry_mean ... texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst Unnamed: 32
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 0.0
mean 3.037183e+07 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 0.181162 ... 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946 NaN
std 1.250206e+08 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 0.027414 ... 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061 NaN
min 8.670000e+03 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 0.106000 ... 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040 NaN
25% 8.692180e+05 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 0.161900 ... 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460 NaN
50% 9.060240e+05 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 0.179200 ... 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040 NaN
75% 8.813129e+06 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 0.195700 ... 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080 NaN
max 9.113205e+08 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 0.304000 ... 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500 NaN

8 rows × 32 columns


In [7]:
dfFull['diagnosis'] = dfFull['diagnosis'].map({'M': 1 ,'B': 0 }).astype(int)
dfFull['diagnosis'].head() #出錯了只好one_hot_encoding


Out[7]:
0    1
1    1
2    1
3    1
4    1
Name: diagnosis, dtype: int64

In [8]:
dfFull.dropna(axis=1, inplace=True)

In [9]:
dfFull.head()


Out[9]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 1 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 ... 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 ... 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 1 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 ... 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 1 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 ... 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 1 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 ... 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678

5 rows × 32 columns

先區分Train和Test (4:1)


In [10]:
np.random.seed(42)
dfFull_shuffle = dfFull.iloc[np.random.permutation(len(dfFull))]

dfTrain = dfFull_shuffle.iloc[1:454, :]
dfTest = dfFull_shuffle.iloc[454: , :]

#拿掉id, diagnosis
train_feature = np.array(dfTrain[['radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']])

train_label = np.array(dfTrain['diagnosis'])

#拿掉id, diagnosis
test_feature = np.array(dfTest[['radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']])

test_label = np.array(dfTest['diagnosis'])

In [11]:
len(train_feature)


Out[11]:
453

Normalize feature


In [12]:
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
train_feature = minmax_scale.fit_transform(train_feature)
test_feature = minmax_scale.fit_transform(test_feature)

print(train_feature[0])
print(train_label[0])
print(test_feature[0])
print(test_label[0])


[ 0.59088888  0.37820069  0.57703709  0.46831237  0.40686434  0.2561806
  0.25304592  0.39517893  0.26363636  0.09793597  0.48465116  0.09612578
  0.45779038  0.39455698  0.09283747  0.13699735  0.05729798  0.25951885
  0.08411662  0.02774899  0.6718254   0.40640323  0.64559226  0.51767895
  0.31783662  0.20016299  0.21461661  0.61477663  0.19436231  0.07116621]
1
[ 0.1804294   0.4129979   0.17466036  0.10001324  0.31853496  0.25224889
  0.11518569  0.24316163  0.55362319  0.19348128  0.08634007  0.39009613
  0.07308109  0.03419768  0.30244421  0.11162643  0.12015437  0.32692859
  0.1217981   0.09939934  0.17892576  0.43550107  0.15685963  0.08917775
  0.43471099  0.17410538  0.16220992  0.41142857  0.28993746  0.29647454]
0

In [13]:
dfTrain.columns #方便複製欲選取的欄位


Out[13]:
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [14]:
train_feature.shape #確認餵給的參數多少


Out[14]:
(453, 30)

再從train區分出Newtrain和validation (4:1)

跑Scikit-Learn需要這一步驟(跑Keras不用這一步驟,因為內建啦)


In [57]:
from sklearn.model_selection import train_test_split

Newtrain_feature, Val_feature, Newtrain_label, Val_label = train_test_split(train_feature, 
                                                                            train_label,
                                                                            test_size=0.2, # 0.2 test; 0.8 train
                                                                            random_state=40) # seed,這樣才會都使用同一組train_data

建立、跑模型囉!!


In [15]:
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()


######################### 建立模型
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

model = Sequential() #一層一層到底,按順序

#輸入層(隱藏層1)
model.add(Dense(units=1000, 
                input_dim=30, 
                kernel_initializer='uniform', 
                activation='relu'))

#使用dropout避免overfitting
model.add(Dropout(0.5))

#隱藏層2,不用寫input_dim,因為就是前一層的units
model.add(Dense(units=500,  
                kernel_initializer='uniform', 
                activation='relu'))

#使用dropout避免overfitting
model.add(Dropout(0.5))


model.add(Dense(units=250,  
                kernel_initializer='uniform', 
                activation='relu'))

#使用dropout避免overfitting
model.add(Dropout(0.5))


#輸出層
model.add(Dense(units=1, #輸出一個數字 
                kernel_initializer='uniform', 
                activation='sigmoid'))

print(model.summary()) #可以清楚看到model還有參數數量


######################### 訓練模型
#選擇loss度量,optimizer學習路徑速度、
model.compile(loss='binary_crossentropy',   #二元用binary
              optimizer='adam', metrics=['accuracy'])

#開始train,並且記錄情況(設有val項以免overfitting)
train_history=model.fit(x=train_feature, y=train_label,  #上面多分割一步在keras是內建的
                        validation_split=0.2, epochs=20, batch_size=50, verbose=2) #verbose=2表示顯示訓練過程


######################### 訓練過程視覺化
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')

#儲存訓練結果
model.save_weights("Savemodels/BRCA_FNA(Kaggles)_MLP.h5")
print('model saved to disk')


Using TensorFlow backend.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 1000)              31000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               125250    
_________________________________________________________________
dropout_3 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 251       
=================================================================
Total params: 657,001
Trainable params: 657,001
Non-trainable params: 0
_________________________________________________________________
None
Train on 362 samples, validate on 91 samples
Epoch 1/20
0s - loss: 0.6869 - acc: 0.6602 - val_loss: 0.6634 - val_acc: 0.6923
Epoch 2/20
0s - loss: 0.6246 - acc: 0.8011 - val_loss: 0.4985 - val_acc: 0.8462
Epoch 3/20
0s - loss: 0.3916 - acc: 0.8812 - val_loss: 0.3303 - val_acc: 0.8791
Epoch 4/20
0s - loss: 0.2354 - acc: 0.8950 - val_loss: 0.2157 - val_acc: 0.9121
Epoch 5/20
0s - loss: 0.2086 - acc: 0.9144 - val_loss: 0.1951 - val_acc: 0.9560
Epoch 6/20
0s - loss: 0.1535 - acc: 0.9227 - val_loss: 0.1530 - val_acc: 0.9231
Epoch 7/20
0s - loss: 0.1600 - acc: 0.9254 - val_loss: 0.1596 - val_acc: 0.9670
Epoch 8/20
0s - loss: 0.1350 - acc: 0.9337 - val_loss: 0.2297 - val_acc: 0.9560
Epoch 9/20
0s - loss: 0.1486 - acc: 0.9420 - val_loss: 0.1368 - val_acc: 0.9560
Epoch 10/20
0s - loss: 0.0995 - acc: 0.9641 - val_loss: 0.1407 - val_acc: 0.9670
Epoch 11/20
0s - loss: 0.0876 - acc: 0.9669 - val_loss: 0.1383 - val_acc: 0.9780
Epoch 12/20
0s - loss: 0.0773 - acc: 0.9696 - val_loss: 0.1624 - val_acc: 0.9670
Epoch 13/20
0s - loss: 0.0916 - acc: 0.9613 - val_loss: 0.2289 - val_acc: 0.9560
Epoch 14/20
0s - loss: 0.0964 - acc: 0.9586 - val_loss: 0.1797 - val_acc: 0.9670
Epoch 15/20
0s - loss: 0.0622 - acc: 0.9696 - val_loss: 0.1613 - val_acc: 0.9670
Epoch 16/20
0s - loss: 0.0880 - acc: 0.9696 - val_loss: 0.2094 - val_acc: 0.9560
Epoch 17/20
0s - loss: 0.0780 - acc: 0.9696 - val_loss: 0.1616 - val_acc: 0.9780
Epoch 18/20
0s - loss: 0.0701 - acc: 0.9724 - val_loss: 0.2001 - val_acc: 0.9560
Epoch 19/20
0s - loss: 0.0726 - acc: 0.9669 - val_loss: 0.1587 - val_acc: 0.9890
Epoch 20/20
0s - loss: 0.0801 - acc: 0.9641 - val_loss: 0.1696 - val_acc: 0.9780
model saved to disk

In [16]:
scores = model.evaluate(test_feature, test_label)
print('\n')
print('accuracy=',scores[1])

######################### 紀錄模型預測情形(答案卷)
prediction = model.predict_classes(test_feature)


 32/115 [=======>......................] - ETA: 0s

accuracy= 0.930434786237
 32/115 [=======>......................] - ETA: 0s

解謎分析:Train set和Test set到底發生了什麼事情,有沒有bias


In [43]:
from collections import Counter

Train set


In [17]:
prediction_train = model.predict_classes(train_feature)


453/453 [==============================] - 0s     

In [34]:
df_prediction_train = pd.DataFrame(prediction_train, index=range(len(prediction_train)))
df_prediction_train['prediction'] = df_prediction_train
df_train_label = pd.DataFrame(train_label)

df_prediction_train['diagnosis'] = df_train_label
df_prediction_train = df_prediction_train[['prediction', 'diagnosis']]

In [35]:
df_prediction_train


Out[35]:
prediction diagnosis
0 1 1
1 1 1
2 0 0
3 0 0
4 1 1
5 1 1
6 1 1
7 0 0
8 0 0
9 0 0
10 1 1
11 0 0
12 1 1
13 0 0
14 1 1
15 0 0
16 0 0
17 0 0
18 1 1
19 0 1
20 0 0
21 1 1
22 0 0
23 0 0
24 0 0
25 0 0
26 0 0
27 0 0
28 1 1
29 0 0
... ... ...
423 1 1
424 0 0
425 1 1
426 0 0
427 1 1
428 1 1
429 0 0
430 0 0
431 1 1
432 0 0
433 1 1
434 1 1
435 0 0
436 1 1
437 1 1
438 0 0
439 0 0
440 0 1
441 0 0
442 1 1
443 0 0
444 0 0
445 0 1
446 0 0
447 0 0
448 1 1
449 1 1
450 1 1
451 0 0
452 0 0

453 rows × 2 columns


In [45]:
df_prediction_train[ df_prediction_train['diagnosis'] != df_prediction_train['prediction']]


Out[45]:
prediction diagnosis
19 0 1
111 1 0
247 0 1
306 0 1
440 0 1
445 0 1

In [51]:
df_prediction_train['diagnosis'].value_counts()


Out[51]:
0    288
1    165
Name: diagnosis, dtype: int64

In [48]:
len(df_prediction_train['diagnosis']==1)


Out[48]:
453

Test set


In [18]:
df_prediction = pd.DataFrame(prediction, index=range(len(prediction)))
df_prediction['prediction'] = df_prediction
df_test_label = pd.DataFrame(test_label)

df_prediction['diagnosis'] = df_test_label

In [19]:
df_prediction = df_prediction[['prediction', 'diagnosis']]

In [33]:
df_prediction


Out[33]:
prediction diagnosis
0 0 0
1 1 1
2 1 1
3 0 0
4 1 1
5 1 1
6 0 0
7 0 0
8 0 0
9 1 1
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 0 0
16 0 0
17 1 1
18 0 0
19 0 0
20 1 1
21 1 1
22 1 1
23 1 1
24 0 1
25 1 1
26 0 0
27 0 0
28 0 0
29 0 0
... ... ...
85 0 0
86 0 0
87 0 0
88 1 1
89 0 0
90 0 0
91 0 0
92 0 0
93 0 0
94 0 0
95 0 1
96 0 0
97 0 0
98 1 1
99 0 0
100 0 0
101 0 1
102 1 1
103 1 1
104 0 0
105 1 1
106 1 1
107 0 0
108 1 1
109 0 0
110 0 0
111 0 0
112 0 0
113 1 1
114 0 0

115 rows × 2 columns


In [50]:
df_prediction['diagnosis'].value_counts()


Out[50]:
0    68
1    47
Name: diagnosis, dtype: int64

In [39]:
df_prediction[df_prediction['diagnosis'] != df_prediction['prediction']]


Out[39]:
prediction diagnosis
14 0 1
24 0 1
54 0 1
59 0 1
69 0 1
73 0 1
95 0 1
101 0 1

In [21]:
len(df_prediction)


Out[21]:
115

In [22]:
plt.hist(dfFull['diagnosis'])
plt.show()


原始data相關分佈


In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

dfTrain


In [24]:
dfTrain['diagnosis'].value_counts()


Out[24]:
0    288
1    165
Name: diagnosis, dtype: int64

In [25]:
plt.hist(dfTrain['diagnosis'])


Out[25]:
(array([ 288.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,  165.]),
 array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]),
 <a list of 10 Patch objects>)

In [26]:
corrmat = dfTrain.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True, cmap='rainbow')


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x120a76dd8>

In [27]:
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'diagnosis')['diagnosis'].index
cm = np.corrcoef(dfTrain[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels = cols.values, xticklabels = cols.values)
plt.show()



In [28]:
#scatterplot
sns.set()
cols = ['diagnosis', 'concave points_worst', 'concave points_mean', 'concavity_mean', 'concavity_worst', 'perimeter_worst', 'radius_worst']
sns.pairplot(dfTrain[cols], size = 2.5)
plt.show();


dfTest


In [17]:
dfTest['diagnosis'].value_counts()


Out[17]:
0    68
1    47
Name: diagnosis, dtype: int64

In [55]:
plt.hist(dfTest['diagnosis'])


Out[55]:
(array([ 68.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,  47.]),
 array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9,  1. ]),
 <a list of 10 Patch objects>)

In [47]:
corrmat = dfTest.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True, cmap='rainbow')


Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x127dc22b0>

In [29]:
corrmat = dfTest.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True, cmap='rainbow', mask= corrmat<0.7)


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x12359c9b0>

In [30]:
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'diagnosis')['diagnosis'].index
cm = np.corrcoef(dfTest[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels = cols.values, xticklabels = cols.values)
plt.show()



In [31]:
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'diagnosis')['diagnosis'].index
cm = np.corrcoef(dfTest[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels = cols.values, xticklabels = cols.values, mask=cm<0.75)
plt.show()



In [32]:
#scatterplot
sns.set()
cols = ['diagnosis', 'concave points_worst', 'concave points_mean', 'concavity_mean', 'concavity_worst', 'perimeter_worst', 'radius_worst']
sns.pairplot(dfTest[cols], size = 2.5)
plt.show();



In [ ]: