資料預處理：看清楚datatype，有問號的會變成object，不是numeric



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline



In [2]:

    
import numpy as np 
import pandas as pd
import os 

filepath = '/Users/mac/Desktop/Kaggle_datasets/Cervical_Cancer_Risk_Classification/'
filename = 'kag_risk_factors_cervical_cancer.csv'

df_full = pd.read_csv(os.path.join(filepath+filename))



In [3]:

    
df_full









    Out[3]:






  
    
      
      Age
      Number of sexual partners
      First sexual intercourse
      Num of pregnancies
      Smokes
      Smokes (years)
      Smokes (packs/year)
      Hormonal Contraceptives
      Hormonal Contraceptives (years)
      IUD
      ...
      STDs: Time since first diagnosis
      STDs: Time since last diagnosis
      Dx:Cancer
      Dx:CIN
      Dx:HPV
      Dx
      Hinselmann
      Schiller
      Citology
      Biopsy
    
  
  
    
      0
      18
      4.0
      15.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      15
      1.0
      14.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      34
      1.0
      ?
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      52
      5.0
      16.0
      4.0
      1.0
      37.0
      37.0
      1.0
      3.0
      0.0
      ...
      ?
      ?
      1
      0
      1
      0
      0
      0
      0
      0
    
    
      4
      46
      3.0
      21.0
      4.0
      0.0
      0.0
      0.0
      1.0
      15.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      5
      42
      3.0
      23.0
      2.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      6
      51
      3.0
      17.0
      6.0
      1.0
      34.0
      3.4
      0.0
      0.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      1
      1
      0
      1
    
    
      7
      26
      1.0
      26.0
      3.0
      0.0
      0.0
      0.0
      1.0
      2.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      8
      45
      1.0
      20.0
      5.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      1
      0
      1
      1
      0
      0
      0
      0
    
    
      9
      44
      3.0
      15.0
      ?
      1.0
      1.266972909
      2.8
      0.0
      0.0
      ?
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      10
      44
      3.0
      26.0
      4.0
      0.0
      0.0
      0.0
      1.0
      2.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      11
      27
      1.0
      17.0
      3.0
      0.0
      0.0
      0.0
      1.0
      8.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      12
      45
      4.0
      14.0
      6.0
      0.0
      0.0
      0.0
      1.0
      10.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      13
      44
      2.0
      25.0
      2.0
      0.0
      0.0
      0.0
      1.0
      5.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      14
      43
      2.0
      18.0
      5.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      15
      40
      3.0
      18.0
      2.0
      0.0
      0.0
      0.0
      1.0
      15.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      16
      41
      4.0
      21.0
      3.0
      0.0
      0.0
      0.0
      1.0
      0.25
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      17
      43
      3.0
      15.0
      8.0
      0.0
      0.0
      0.0
      1.0
      3.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      18
      42
      2.0
      20.0
      ?
      0.0
      0.0
      0.0
      1.0
      7.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      19
      40
      2.0
      27.0
      ?
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      20
      43
      2.0
      18.0
      4.0
      0.0
      0.0
      0.0
      1.0
      15.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      21
      41
      3.0
      17.0
      4.0
      0.0
      0.0
      0.0
      1.0
      10.0
      0.0
      ...
      21.0
      21.0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      22
      40
      1.0
      18.0
      1.0
      0.0
      0.0
      0.0
      1.0
      0.25
      0.0
      ...
      2.0
      2.0
      0
      0
      0
      0
      0
      1
      1
      1
    
    
      23
      40
      1.0
      20.0
      2.0
      0.0
      0.0
      0.0
      1.0
      15.0
      0.0
      ...
      ?
      ?
      1
      0
      1
      0
      1
      1
      0
      1
    
    
      24
      40
      3.0
      15.0
      3.0
      0.0
      0.0
      0.0
      1.0
      3.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      25
      44
      3.0
      19.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      26
      39
      5.0
      23.0
      2.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      27
      39
      2.0
      17.0
      4.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      28
      37
      3.0
      24.0
      1.0
      1.0
      3.0
      0.04
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      29
      37
      6.0
      26.0
      1.0
      0.0
      0.0
      0.0
      1.0
      0.25
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      828
      33
      2.0
      21.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      829
      34
      3.0
      14.0
      4.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      830
      35
      4.0
      16.0
      4.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      831
      40
      3.0
      23.0
      2.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      8.0
      8.0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      832
      30
      2.0
      18.0
      0.0
      0.0
      0.0
      0.0
      1.0
      1.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      833
      34
      1.0
      ?
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      834
      30
      3.0
      15.0
      0.0
      1.0
      16.0
      8.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      835
      24
      1.0
      14.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      836
      37
      3.0
      ?
      0.0
      0.0
      0.0
      0.0
      1.0
      0.25
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      837
      31
      9.0
      ?
      1.0
      1.0
      11.0
      5.5
      1.0
      0.25
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      838
      35
      3.0
      18.0
      3.0
      0.0
      0.0
      0.0
      1.0
      5.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      839
      31
      3.0
      19.0
      1.0
      0.0
      0.0
      0.0
      1.0
      0.08
      1.0
      ...
      ?
      ?
      1
      0
      0
      1
      0
      0
      0
      0
    
    
      840
      24
      2.0
      16.0
      3.0
      0.0
      0.0
      0.0
      1.0
      5.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      841
      23
      2.0
      15.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      842
      36
      3.0
      16.0
      3.0
      1.0
      6.0
      0.3
      1.0
      2.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      843
      30
      3.0
      14.0
      3.0
      0.0
      0.0
      0.0
      1.0
      2.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      844
      26
      8.0
      15.0
      1.0
      1.0
      9.0
      1.35
      1.0
      5.0
      1.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      845
      19
      2.0
      15.0
      2.0
      0.0
      0.0
      0.0
      1.0
      0.75
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      846
      35
      2.0
      17.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      847
      30
      3.0
      22.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      848
      31
      3.0
      18.0
      1.0
      0.0
      0.0
      0.0
      1.0
      0.5
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      849
      32
      3.0
      18.0
      1.0
      1.0
      11.0
      0.16
      1.0
      6.0
      0.0
      ...
      ?
      ?
      1
      0
      1
      0
      0
      0
      0
      0
    
    
      850
      19
      1.0
      14.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      851
      23
      2.0
      15.0
      2.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      852
      43
      3.0
      17.0
      3.0
      0.0
      0.0
      0.0
      1.0
      5.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      853
      34
      3.0
      18.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      854
      32
      2.0
      19.0
      1.0
      0.0
      0.0
      0.0
      1.0
      8.0
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      855
      25
      2.0
      17.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.08
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      1
      0
    
    
      856
      33
      2.0
      24.0
      2.0
      0.0
      0.0
      0.0
      1.0
      0.08
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      857
      29
      2.0
      20.0
      1.0
      0.0
      0.0
      0.0
      1.0
      0.5
      0.0
      ...
      ?
      ?
      0
      0
      0
      0
      0
      0
      0
      0
    
  

858 rows × 36 columns



In [4]:

    
df_fullna = df_full.replace('?', np.nan) #把可惡的「？」用np.nan替換掉，待會可以直接轉換成numeric



In [5]:

    
df_fullna.isnull().sum()









    Out[5]:





Age                                     0
Number of sexual partners              26
First sexual intercourse                7
Num of pregnancies                     56
Smokes                                 13
Smokes (years)                         13
Smokes (packs/year)                    13
Hormonal Contraceptives               108
Hormonal Contraceptives (years)       108
IUD                                   117
IUD (years)                           117
STDs                                  105
STDs (number)                         105
STDs:condylomatosis                   105
STDs:cervical condylomatosis          105
STDs:vaginal condylomatosis           105
STDs:vulvo-perineal condylomatosis    105
STDs:syphilis                         105
STDs:pelvic inflammatory disease      105
STDs:genital herpes                   105
STDs:molluscum contagiosum            105
STDs:AIDS                             105
STDs:HIV                              105
STDs:Hepatitis B                      105
STDs:HPV                              105
STDs: Number of diagnosis               0
STDs: Time since first diagnosis      787
STDs: Time since last diagnosis       787
Dx:Cancer                               0
Dx:CIN                                  0
Dx:HPV                                  0
Dx                                      0
Hinselmann                              0
Schiller                                0
Citology                                0
Biopsy                                  0
dtype: int64



In [6]:

    
df = df_fullna #做紀錄XD



In [7]:

    
df = df.convert_objects(convert_numeric=True) #把這些混蛋「？」變成的object項目通通轉成numeric type才能做運算



In [8]:

    
df.info() #已經都變成numeric了









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 36 columns):
Age                                   858 non-null int64
Number of sexual partners             832 non-null float64
First sexual intercourse              851 non-null float64
Num of pregnancies                    802 non-null float64
Smokes                                845 non-null float64
Smokes (years)                        845 non-null float64
Smokes (packs/year)                   845 non-null float64
Hormonal Contraceptives               750 non-null float64
Hormonal Contraceptives (years)       750 non-null float64
IUD                                   741 non-null float64
IUD (years)                           741 non-null float64
STDs                                  753 non-null float64
STDs (number)                         753 non-null float64
STDs:condylomatosis                   753 non-null float64
STDs:cervical condylomatosis          753 non-null float64
STDs:vaginal condylomatosis           753 non-null float64
STDs:vulvo-perineal condylomatosis    753 non-null float64
STDs:syphilis                         753 non-null float64
STDs:pelvic inflammatory disease      753 non-null float64
STDs:genital herpes                   753 non-null float64
STDs:molluscum contagiosum            753 non-null float64
STDs:AIDS                             753 non-null float64
STDs:HIV                              753 non-null float64
STDs:Hepatitis B                      753 non-null float64
STDs:HPV                              753 non-null float64
STDs: Number of diagnosis             858 non-null int64
STDs: Time since first diagnosis      71 non-null float64
STDs: Time since last diagnosis       71 non-null float64
Dx:Cancer                             858 non-null int64
Dx:CIN                                858 non-null int64
Dx:HPV                                858 non-null int64
Dx                                    858 non-null int64
Hinselmann                            858 non-null int64
Schiller                              858 non-null int64
Citology                              858 non-null int64
Biopsy                                858 non-null int64
dtypes: float64(26), int64(10)
memory usage: 241.4 KB



In [9]:

    
# for continuous variable
df['Number of sexual partners'] = df['Number of sexual partners'].fillna(df['Number of sexual partners'].median())
df['First sexual intercourse'] = df['First sexual intercourse'].fillna(df['First sexual intercourse'].median())
df['Num of pregnancies'] = df['Num of pregnancies'].fillna(df['Num of pregnancies'].median())
df['Smokes'] = df['Smokes'].fillna(1)
df['Smokes (years)'] = df['Smokes (years)'].fillna(df['Smokes (years)'].median())
df['Smokes (packs/year)'] = df['Smokes (packs/year)'].fillna(df['Smokes (packs/year)'].median())
df['Hormonal Contraceptives'] = df['Hormonal Contraceptives'].fillna(1)
df['Hormonal Contraceptives (years)'] = df['Hormonal Contraceptives (years)'].fillna(df['Hormonal Contraceptives (years)'].median())
df['IUD'] = df['IUD'].fillna(0) # Under suggestion
df['IUD (years)'] = df['IUD (years)'].fillna(0) #Under suggestion
df['STDs'] = df['STDs'].fillna(1)
df['STDs (number)'] = df['STDs (number)'].fillna(df['STDs (number)'].median())
df['STDs:condylomatosis'] = df['STDs:condylomatosis'].fillna(df['STDs:condylomatosis'].median())
df['STDs:cervical condylomatosis'] = df['STDs:cervical condylomatosis'].fillna(df['STDs:cervical condylomatosis'].median())
df['STDs:vaginal condylomatosis'] = df['STDs:vaginal condylomatosis'].fillna(df['STDs:vaginal condylomatosis'].median())
df['STDs:vulvo-perineal condylomatosis'] = df['STDs:vulvo-perineal condylomatosis'].fillna(df['STDs:vulvo-perineal condylomatosis'].median())
df['STDs:syphilis'] = df['STDs:syphilis'].fillna(df['STDs:syphilis'].median())
df['STDs:pelvic inflammatory disease'] = df['STDs:pelvic inflammatory disease'].fillna(df['STDs:pelvic inflammatory disease'].median())
df['STDs:genital herpes'] = df['STDs:genital herpes'].fillna(df['STDs:genital herpes'].median())
df['STDs:molluscum contagiosum'] = df['STDs:molluscum contagiosum'].fillna(df['STDs:molluscum contagiosum'].median())
df['STDs:AIDS'] = df['STDs:AIDS'].fillna(df['STDs:AIDS'].median())
df['STDs:HIV'] = df['STDs:HIV'].fillna(df['STDs:HIV'].median())
df['STDs:Hepatitis B'] = df['STDs:Hepatitis B'].fillna(df['STDs:Hepatitis B'].median())
df['STDs:HPV'] = df['STDs:HPV'].fillna(df['STDs:HPV'].median())
df['STDs: Time since first diagnosis'] = df['STDs: Time since first diagnosis'].fillna(df['STDs: Time since first diagnosis'].median())
df['STDs: Time since last diagnosis'] = df['STDs: Time since last diagnosis'].fillna(df['STDs: Time since last diagnosis'].median())



In [10]:

    
#處理類別變項
df = pd.get_dummies(data=df, columns=['Smokes','Hormonal Contraceptives','IUD','STDs',
                                      'Dx:Cancer','Dx:CIN','Dx:HPV','Dx','Hinselmann','Citology','Schiller'])



In [11]:

    
df.isnull().sum()









    Out[11]:





Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives (years)       0
IUD (years)                           0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV                              0
STDs: Number of diagnosis             0
STDs: Time since first diagnosis      0
STDs: Time since last diagnosis       0
Biopsy                                0
Smokes_0.0                            0
Smokes_1.0                            0
Hormonal Contraceptives_0.0           0
Hormonal Contraceptives_1.0           0
IUD_0.0                               0
IUD_1.0                               0
STDs_0.0                              0
STDs_1.0                              0
Dx:Cancer_0                           0
Dx:Cancer_1                           0
Dx:CIN_0                              0
Dx:CIN_1                              0
Dx:HPV_0                              0
Dx:HPV_1                              0
Dx_0                                  0
Dx_1                                  0
Hinselmann_0                          0
Hinselmann_1                          0
Citology_0                            0
Citology_1                            0
Schiller_0                            0
Schiller_1                            0
dtype: int64



In [12]:

    
df









    Out[12]:






  
    
      
      Age
      Number of sexual partners
      First sexual intercourse
      Num of pregnancies
      Smokes (years)
      Smokes (packs/year)
      Hormonal Contraceptives (years)
      IUD (years)
      STDs (number)
      STDs:condylomatosis
      ...
      Dx:HPV_0
      Dx:HPV_1
      Dx_0
      Dx_1
      Hinselmann_0
      Hinselmann_1
      Citology_0
      Citology_1
      Schiller_0
      Schiller_1
    
  
  
    
      0
      18
      4.0
      15.0
      1.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      1
      15
      1.0
      14.0
      1.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      2
      34
      1.0
      17.0
      1.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      3
      52
      5.0
      16.0
      4.0
      37.000000
      37.00
      3.00
      0.00
      0.0
      0.0
      ...
      0
      1
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      4
      46
      3.0
      21.0
      4.0
      0.000000
      0.00
      15.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      5
      42
      3.0
      23.0
      2.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      6
      51
      3.0
      17.0
      6.0
      34.000000
      3.40
      0.00
      7.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      0
      1
      1
      0
      0
      1
    
    
      7
      26
      1.0
      26.0
      3.0
      0.000000
      0.00
      2.00
      7.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      8
      45
      1.0
      20.0
      5.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      0
      1
      0
      1
      1
      0
      1
      0
      1
      0
    
    
      9
      44
      3.0
      15.0
      2.0
      1.266973
      2.80
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      10
      44
      3.0
      26.0
      4.0
      0.000000
      0.00
      2.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      11
      27
      1.0
      17.0
      3.0
      0.000000
      0.00
      8.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      12
      45
      4.0
      14.0
      6.0
      0.000000
      0.00
      10.00
      5.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      13
      44
      2.0
      25.0
      2.0
      0.000000
      0.00
      5.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      14
      43
      2.0
      18.0
      5.0
      0.000000
      0.00
      0.00
      8.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      15
      40
      3.0
      18.0
      2.0
      0.000000
      0.00
      15.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      16
      41
      4.0
      21.0
      3.0
      0.000000
      0.00
      0.25
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      17
      43
      3.0
      15.0
      8.0
      0.000000
      0.00
      3.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      18
      42
      2.0
      20.0
      2.0
      0.000000
      0.00
      7.00
      6.00
      2.0
      1.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      19
      40
      2.0
      27.0
      2.0
      0.000000
      0.00
      0.00
      1.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      20
      43
      2.0
      18.0
      4.0
      0.000000
      0.00
      15.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      21
      41
      3.0
      17.0
      4.0
      0.000000
      0.00
      10.00
      0.00
      1.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      22
      40
      1.0
      18.0
      1.0
      0.000000
      0.00
      0.25
      0.00
      2.0
      1.0
      ...
      1
      0
      1
      0
      1
      0
      0
      1
      0
      1
    
    
      23
      40
      1.0
      20.0
      2.0
      0.000000
      0.00
      15.00
      0.00
      0.0
      0.0
      ...
      0
      1
      1
      0
      0
      1
      1
      0
      0
      1
    
    
      24
      40
      3.0
      15.0
      3.0
      0.000000
      0.00
      3.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      25
      44
      3.0
      19.0
      1.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      26
      39
      5.0
      23.0
      2.0
      0.000000
      0.00
      0.00
      1.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      27
      39
      2.0
      17.0
      4.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      28
      37
      3.0
      24.0
      1.0
      3.000000
      0.04
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      29
      37
      6.0
      26.0
      1.0
      0.000000
      0.00
      0.25
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      828
      33
      2.0
      21.0
      0.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      829
      34
      3.0
      14.0
      4.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      830
      35
      4.0
      16.0
      4.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      831
      40
      3.0
      23.0
      2.0
      0.000000
      0.00
      0.00
      0.00
      1.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      832
      30
      2.0
      18.0
      0.0
      0.000000
      0.00
      1.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      833
      34
      1.0
      17.0
      0.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      834
      30
      3.0
      15.0
      0.0
      16.000000
      8.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      835
      24
      1.0
      14.0
      0.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      836
      37
      3.0
      17.0
      0.0
      0.000000
      0.00
      0.25
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      837
      31
      9.0
      17.0
      1.0
      11.000000
      5.50
      0.25
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      838
      35
      3.0
      18.0
      3.0
      0.000000
      0.00
      5.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      839
      31
      3.0
      19.0
      1.0
      0.000000
      0.00
      0.08
      8.00
      0.0
      0.0
      ...
      1
      0
      0
      1
      1
      0
      1
      0
      1
      0
    
    
      840
      24
      2.0
      16.0
      3.0
      0.000000
      0.00
      5.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      841
      23
      2.0
      15.0
      0.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      842
      36
      3.0
      16.0
      3.0
      6.000000
      0.30
      2.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      843
      30
      3.0
      14.0
      3.0
      0.000000
      0.00
      2.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      844
      26
      8.0
      15.0
      1.0
      9.000000
      1.35
      5.00
      0.17
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      845
      19
      2.0
      15.0
      2.0
      0.000000
      0.00
      0.75
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      846
      35
      2.0
      17.0
      1.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      847
      30
      3.0
      22.0
      1.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      848
      31
      3.0
      18.0
      1.0
      0.000000
      0.00
      0.50
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      849
      32
      3.0
      18.0
      1.0
      11.000000
      0.16
      6.00
      0.00
      1.0
      0.0
      ...
      0
      1
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      850
      19
      1.0
      14.0
      0.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      851
      23
      2.0
      15.0
      2.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      852
      43
      3.0
      17.0
      3.0
      0.000000
      0.00
      5.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      853
      34
      3.0
      18.0
      0.0
      0.000000
      0.00
      0.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      854
      32
      2.0
      19.0
      1.0
      0.000000
      0.00
      8.00
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      855
      25
      2.0
      17.0
      0.0
      0.000000
      0.00
      0.08
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      0
      1
      1
      0
    
    
      856
      33
      2.0
      24.0
      2.0
      0.000000
      0.00
      0.08
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
    
      857
      29
      2.0
      20.0
      1.0
      0.000000
      0.00
      0.50
      0.00
      0.0
      0.0
      ...
      1
      0
      1
      0
      1
      0
      1
      0
      1
      0
    
  

858 rows × 47 columns



In [13]:

    
df_data = df #做紀錄

查看Outlier



In [14]:

    
df.describe()









    Out[14]:






  
    
      
      Age
      Number of sexual partners
      First sexual intercourse
      Num of pregnancies
      Smokes (years)
      Smokes (packs/year)
      Hormonal Contraceptives (years)
      IUD (years)
      STDs (number)
      STDs:condylomatosis
      ...
      Dx:HPV_0
      Dx:HPV_1
      Dx_0
      Dx_1
      Hinselmann_0
      Hinselmann_1
      Citology_0
      Citology_1
      Schiller_0
      Schiller_1
    
  
  
    
      count
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      ...
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
      858.000000
    
    
      mean
      26.820513
      2.511655
      16.995338
      2.257576
      1.201241
      0.446278
      2.035331
      0.444604
      0.155012
      0.051282
      ...
      0.979021
      0.020979
      0.972028
      0.027972
      0.959207
      0.040793
      0.948718
      0.051282
      0.913753
      0.086247
    
    
      std
      8.497948
      1.644759
      2.791883
      1.400981
      4.060623
      2.210351
      3.567040
      1.814218
      0.529617
      0.220701
      ...
      0.143398
      0.143398
      0.164989
      0.164989
      0.197925
      0.197925
      0.220701
      0.220701
      0.280892
      0.280892
    
    
      min
      13.000000
      1.000000
      10.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
    
    
      25%
      20.000000
      2.000000
      15.000000
      1.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      ...
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
    
    
      50%
      25.000000
      2.000000
      17.000000
      2.000000
      0.000000
      0.000000
      0.500000
      0.000000
      0.000000
      0.000000
      ...
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
    
    
      75%
      32.000000
      3.000000
      18.000000
      3.000000
      0.000000
      0.000000
      2.000000
      0.000000
      0.000000
      0.000000
      ...
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
      1.000000
      0.000000
    
    
      max
      84.000000
      28.000000
      32.000000
      11.000000
      37.000000
      37.000000
      30.000000
      19.000000
      4.000000
      1.000000
      ...
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
      1.000000
    
  

8 rows × 47 columns



In [15]:

    
fig, (ax1,ax2,ax3,ax4,ax5,ax6,ax7) = plt.subplots(7,1,figsize=(20,40))
sns.countplot(x='Age', data=df, ax=ax1)
sns.countplot(x='Number of sexual partners', data=df, ax=ax2)
sns.countplot(x='Num of pregnancies', data=df, ax=ax3)
sns.countplot(x='Smokes (years)', data=df, ax=ax4)
sns.countplot(x='Hormonal Contraceptives (years)', data=df, ax=ax5)
sns.countplot(x='IUD (years)', data=df, ax=ax6)
sns.countplot(x='STDs (number)', data=df, ax=ax7)









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x112549898>

做shuffle分類train/test data，並且標示feature/labels，資料標準化



In [16]:

    
#Shuffle
np.random.seed(42)
df_data_shuffle = df_data.iloc[np.random.permutation(len(df_data))]

df_train = df_data_shuffle.iloc[1:686, :]
df_test = df_data_shuffle.iloc[686: , :]



In [17]:

    
df_data.columns









    Out[17]:





Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs (number)',
       'STDs:condylomatosis', 'STDs:cervical condylomatosis',
       'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
       'STDs:syphilis', 'STDs:pelvic inflammatory disease',
       'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS',
       'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis',
       'Biopsy', 'Smokes_0.0', 'Smokes_1.0', 'Hormonal Contraceptives_0.0',
       'Hormonal Contraceptives_1.0', 'IUD_0.0', 'IUD_1.0', 'STDs_0.0',
       'STDs_1.0', 'Dx:Cancer_0', 'Dx:Cancer_1', 'Dx:CIN_0', 'Dx:CIN_1',
       'Dx:HPV_0', 'Dx:HPV_1', 'Dx_0', 'Dx_1', 'Hinselmann_0', 'Hinselmann_1',
       'Citology_0', 'Citology_1', 'Schiller_0', 'Schiller_1'],
      dtype='object')



In [18]:

    
#分類feature/label
df_train_feature = df_train[['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs (number)',
       'STDs:condylomatosis', 'STDs:cervical condylomatosis',
       'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
       'STDs:syphilis', 'STDs:pelvic inflammatory disease',
       'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS',
       'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis','Smokes_0.0', 'Smokes_1.0',
       'Hormonal Contraceptives_0.0', 'Hormonal Contraceptives_1.0', 'IUD_0.0',
       'IUD_1.0', 'STDs_0.0', 'STDs_1.0', 'Dx:Cancer_0', 'Dx:Cancer_1',
       'Dx:CIN_0', 'Dx:CIN_1', 'Dx:HPV_0', 'Dx:HPV_1', 'Dx_0', 'Dx_1',
       'Hinselmann_0', 'Hinselmann_1', 'Citology_0', 'Citology_1','Schiller_0','Schiller_1']]

train_label = np.array(df_train['Biopsy'])

df_test_feature = df_test[['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs (number)',
       'STDs:condylomatosis', 'STDs:cervical condylomatosis',
       'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis',
       'STDs:syphilis', 'STDs:pelvic inflammatory disease',
       'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS',
       'STDs:HIV', 'STDs:Hepatitis B', 'STDs:HPV', 'STDs: Number of diagnosis',
       'STDs: Time since first diagnosis', 'STDs: Time since last diagnosis','Smokes_0.0', 'Smokes_1.0',
       'Hormonal Contraceptives_0.0', 'Hormonal Contraceptives_1.0', 'IUD_0.0',
       'IUD_1.0', 'STDs_0.0', 'STDs_1.0', 'Dx:Cancer_0', 'Dx:Cancer_1',
       'Dx:CIN_0', 'Dx:CIN_1', 'Dx:HPV_0', 'Dx:HPV_1', 'Dx_0', 'Dx_1',
       'Hinselmann_0', 'Hinselmann_1', 'Citology_0', 'Citology_1','Schiller_0','Schiller_1']]

test_label = np.array(df_test['Biopsy'])



In [19]:

    
#Normalization
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
train_feature = minmax_scale.fit_transform(df_train_feature)
test_feature = minmax_scale.fit_transform(df_test_feature)



In [20]:

    
#確認資料樣子是否ok
print(train_feature[0])
print(train_label[0])
print(test_feature[0])
print(test_label[0])









    



[ 0.14084507  0.07407407  0.31818182  0.18181818  0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.14285714  0.0952381   1.          0.          1.          0.
  1.          0.          1.          0.          1.          0.          1.
  0.          1.          0.          1.          0.          1.          0.
  1.          0.          1.          0.        ]
0
[ 0.61764706  0.33333333  0.25        0.42857143  0.28125     0.3375
  0.89473684  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.15        0.1         0.          1.          0.
  1.          1.          0.          1.          0.          1.          0.
  1.          0.          1.          0.          1.          0.          1.
  0.          1.          0.          1.          0.        ]
0



In [21]:

    
train_feature.shape









    Out[21]:





(685, 46)

跑模型囉～～



In [22]:

    
import matplotlib.pyplot as plt
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()


######################### 建立模型
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

model = Sequential() #一層一層到底，按順序

#輸入層（隱藏層1）
model.add(Dense(units=1000, 
                input_dim=46, 
                kernel_initializer='uniform', 
                activation='relu'))
model.add(Dropout(0.5))

#隱藏層2，不用寫input_dim，因為就是前一層的units
model.add(Dense(units=500,  
                kernel_initializer='uniform', 
                activation='relu'))
model.add(Dropout(0.5))


model.add(Dense(units=500,  
                kernel_initializer='uniform', 
                activation='relu'))
model.add(Dropout(0.5))


#輸出層
model.add(Dense(units=1, #輸出一個數字 
                kernel_initializer='uniform', 
                activation='sigmoid'))

print(model.summary()) #可以清楚看到model還有參數數量


######################### 訓練模型
#選擇loss度量，optimizer學習路徑速度、
model.compile(loss='binary_crossentropy',   #二元用binary
              optimizer='adam', metrics=['accuracy'])

#開始train，並且記錄情況(設有val項以免overfitting)
train_history = model.fit(x=train_feature, y=train_label,  #上面多分割一步在keras是內建的
                          validation_split=0.2, epochs=40, batch_size=200, verbose=2) #verbose=2表示顯示訓練過程


######################### 訓練過程視覺化
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')

#儲存訓練結果
model.save_weights("Savemodels/Cervical_ca(Kaggles)_MLP.h5")
print('model saved to disk')









    



Using TensorFlow backend.






    



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 1000)              47000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 500)               250500    
_________________________________________________________________
dropout_3 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 501       
=================================================================
Total params: 798,501
Trainable params: 798,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 548 samples, validate on 137 samples
Epoch 1/40
0s - loss: 0.6007 - acc: 0.8212 - val_loss: 0.3811 - val_acc: 0.9124
Epoch 2/40
0s - loss: 0.2767 - acc: 0.9361 - val_loss: 0.3283 - val_acc: 0.9124
Epoch 3/40
0s - loss: 0.2603 - acc: 0.9361 - val_loss: 0.3383 - val_acc: 0.9124
Epoch 4/40
0s - loss: 0.2426 - acc: 0.9361 - val_loss: 0.2180 - val_acc: 0.9124
Epoch 5/40
0s - loss: 0.1700 - acc: 0.9361 - val_loss: 0.1843 - val_acc: 0.9124
Epoch 6/40
0s - loss: 0.1660 - acc: 0.9361 - val_loss: 0.1632 - val_acc: 0.9124
Epoch 7/40
0s - loss: 0.1412 - acc: 0.9361 - val_loss: 0.1347 - val_acc: 0.9124
Epoch 8/40
0s - loss: 0.1253 - acc: 0.9361 - val_loss: 0.1287 - val_acc: 0.9124
Epoch 9/40
0s - loss: 0.1151 - acc: 0.9361 - val_loss: 0.1195 - val_acc: 0.9124
Epoch 10/40
0s - loss: 0.1120 - acc: 0.9361 - val_loss: 0.1110 - val_acc: 0.9124
Epoch 11/40
0s - loss: 0.1112 - acc: 0.9398 - val_loss: 0.1084 - val_acc: 0.9708
Epoch 12/40
0s - loss: 0.1066 - acc: 0.9489 - val_loss: 0.1081 - val_acc: 0.9708
Epoch 13/40
0s - loss: 0.1103 - acc: 0.9617 - val_loss: 0.1093 - val_acc: 0.9708
Epoch 14/40
0s - loss: 0.1032 - acc: 0.9599 - val_loss: 0.1107 - val_acc: 0.9708
Epoch 15/40
0s - loss: 0.0981 - acc: 0.9708 - val_loss: 0.1128 - val_acc: 0.9708
Epoch 16/40
0s - loss: 0.0979 - acc: 0.9672 - val_loss: 0.1141 - val_acc: 0.9708
Epoch 17/40
0s - loss: 0.0987 - acc: 0.9672 - val_loss: 0.1130 - val_acc: 0.9708
Epoch 18/40
0s - loss: 0.0968 - acc: 0.9635 - val_loss: 0.1103 - val_acc: 0.9708
Epoch 19/40
0s - loss: 0.0979 - acc: 0.9708 - val_loss: 0.1094 - val_acc: 0.9562
Epoch 20/40
0s - loss: 0.0916 - acc: 0.9726 - val_loss: 0.1091 - val_acc: 0.9562
Epoch 21/40
0s - loss: 0.0950 - acc: 0.9708 - val_loss: 0.1071 - val_acc: 0.9562
Epoch 22/40
0s - loss: 0.0920 - acc: 0.9690 - val_loss: 0.1041 - val_acc: 0.9635
Epoch 23/40
0s - loss: 0.0906 - acc: 0.9635 - val_loss: 0.1034 - val_acc: 0.9635
Epoch 24/40
0s - loss: 0.0881 - acc: 0.9708 - val_loss: 0.1071 - val_acc: 0.9562
Epoch 25/40
0s - loss: 0.0904 - acc: 0.9708 - val_loss: 0.1092 - val_acc: 0.9562
Epoch 26/40
0s - loss: 0.0900 - acc: 0.9708 - val_loss: 0.1071 - val_acc: 0.9562
Epoch 27/40
0s - loss: 0.0882 - acc: 0.9690 - val_loss: 0.1111 - val_acc: 0.9562
Epoch 28/40
0s - loss: 0.0821 - acc: 0.9763 - val_loss: 0.1079 - val_acc: 0.9562
Epoch 29/40
0s - loss: 0.0801 - acc: 0.9690 - val_loss: 0.1061 - val_acc: 0.9562
Epoch 30/40
0s - loss: 0.0790 - acc: 0.9745 - val_loss: 0.1049 - val_acc: 0.9562
Epoch 31/40
0s - loss: 0.0756 - acc: 0.9745 - val_loss: 0.1070 - val_acc: 0.9562
Epoch 32/40
0s - loss: 0.0739 - acc: 0.9726 - val_loss: 0.1124 - val_acc: 0.9562
Epoch 33/40
0s - loss: 0.0880 - acc: 0.9690 - val_loss: 0.1146 - val_acc: 0.9562
Epoch 34/40
0s - loss: 0.0738 - acc: 0.9708 - val_loss: 0.1087 - val_acc: 0.9562
Epoch 35/40
0s - loss: 0.0857 - acc: 0.9690 - val_loss: 0.1069 - val_acc: 0.9562
Epoch 36/40
0s - loss: 0.0815 - acc: 0.9726 - val_loss: 0.1149 - val_acc: 0.9562
Epoch 37/40
0s - loss: 0.0772 - acc: 0.9708 - val_loss: 0.1235 - val_acc: 0.9708
Epoch 38/40
0s - loss: 0.0764 - acc: 0.9726 - val_loss: 0.1159 - val_acc: 0.9562
Epoch 39/40
0s - loss: 0.0821 - acc: 0.9690 - val_loss: 0.1075 - val_acc: 0.9562
Epoch 40/40
0s - loss: 0.0769 - acc: 0.9745 - val_loss: 0.1072 - val_acc: 0.9562






    












    












    



model saved to disk

Test set 精準率: 94%



In [23]:

    
scores = model.evaluate(test_feature, test_label)
print('\n')
print('accuracy=',scores[1])









    



160/172 [==========================>...] - ETA: 0s

accuracy= 0.93023255814



In [24]:

    
######################### 紀錄模型預測情形（答案卷）
prediction = model.predict_classes(test_feature)









    



 32/172 [====>.........................] - ETA: 0s



In [25]:

    
df_ans = pd.DataFrame({'Biopsy' :test_label})
df_ans['Prediction'] = prediction



In [26]:

    
df_ans









    Out[26]:






  
    
      
      Biopsy
      Prediction
    
  
  
    
      0
      0
      0
    
    
      1
      0
      0
    
    
      2
      0
      0
    
    
      3
      0
      0
    
    
      4
      0
      0
    
    
      5
      0
      0
    
    
      6
      0
      0
    
    
      7
      0
      0
    
    
      8
      0
      0
    
    
      9
      0
      0
    
    
      10
      0
      0
    
    
      11
      0
      0
    
    
      12
      0
      0
    
    
      13
      0
      0
    
    
      14
      0
      0
    
    
      15
      0
      0
    
    
      16
      0
      0
    
    
      17
      0
      0
    
    
      18
      0
      0
    
    
      19
      0
      0
    
    
      20
      0
      0
    
    
      21
      0
      0
    
    
      22
      0
      0
    
    
      23
      0
      0
    
    
      24
      0
      0
    
    
      25
      0
      0
    
    
      26
      0
      0
    
    
      27
      0
      0
    
    
      28
      0
      0
    
    
      29
      0
      0
    
    
      ...
      ...
      ...
    
    
      142
      0
      0
    
    
      143
      0
      0
    
    
      144
      0
      0
    
    
      145
      0
      1
    
    
      146
      0
      1
    
    
      147
      0
      0
    
    
      148
      0
      0
    
    
      149
      0
      0
    
    
      150
      0
      0
    
    
      151
      0
      1
    
    
      152
      0
      0
    
    
      153
      0
      0
    
    
      154
      0
      0
    
    
      155
      0
      0
    
    
      156
      0
      0
    
    
      157
      0
      0
    
    
      158
      0
      0
    
    
      159
      0
      0
    
    
      160
      0
      0
    
    
      161
      1
      1
    
    
      162
      0
      0
    
    
      163
      0
      0
    
    
      164
      1
      1
    
    
      165
      0
      0
    
    
      166
      0
      0
    
    
      167
      0
      0
    
    
      168
      0
      0
    
    
      169
      0
      0
    
    
      170
      0
      0
    
    
      171
      0
      0
    
  

172 rows × 2 columns



In [27]:

    
df_ans[ df_ans['Biopsy'] != df_ans['Prediction'] ]



In [28]:

    
df_ans['Prediction'].value_counts()









    Out[28]:





0    160
1     12
Name: Prediction, dtype: int64



In [29]:

    
df_ans['Biopsy'].value_counts()









    Out[29]:





0    164
1      8
Name: Biopsy, dtype: int64

Train set 精確率：97.81%



In [30]:

    
scores = model.evaluate(train_feature, train_label)
print('\n')
print('accuracy=',scores[1])









    



576/685 [========================>.....] - ETA: 0s

accuracy= 0.972262773723



In [31]:

    
prediction_train = model.predict_classes(train_feature)









    



576/685 [========================>.....] - ETA: 0s



In [32]:

    
df_train_ans = pd.DataFrame({'Biopsy': train_label})
df_train_ans['Prediction'] = prediction_train



In [33]:

    
df_train_ans









    Out[33]:






  
    
      
      Biopsy
      Prediction
    
  
  
    
      0
      0
      0
    
    
      1
      0
      0
    
    
      2
      0
      0
    
    
      3
      0
      0
    
    
      4
      0
      0
    
    
      5
      0
      0
    
    
      6
      0
      0
    
    
      7
      0
      0
    
    
      8
      0
      0
    
    
      9
      0
      0
    
    
      10
      0
      0
    
    
      11
      0
      0
    
    
      12
      0
      0
    
    
      13
      0
      0
    
    
      14
      0
      0
    
    
      15
      0
      0
    
    
      16
      0
      0
    
    
      17
      0
      0
    
    
      18
      0
      0
    
    
      19
      0
      0
    
    
      20
      0
      0
    
    
      21
      0
      0
    
    
      22
      0
      0
    
    
      23
      0
      0
    
    
      24
      0
      0
    
    
      25
      0
      0
    
    
      26
      0
      0
    
    
      27
      0
      0
    
    
      28
      0
      0
    
    
      29
      0
      0
    
    
      ...
      ...
      ...
    
    
      655
      0
      0
    
    
      656
      0
      0
    
    
      657
      1
      1
    
    
      658
      0
      0
    
    
      659
      0
      0
    
    
      660
      0
      0
    
    
      661
      0
      0
    
    
      662
      0
      0
    
    
      663
      0
      0
    
    
      664
      0
      0
    
    
      665
      0
      0
    
    
      666
      0
      0
    
    
      667
      0
      0
    
    
      668
      0
      0
    
    
      669
      0
      0
    
    
      670
      0
      0
    
    
      671
      0
      0
    
    
      672
      0
      0
    
    
      673
      0
      0
    
    
      674
      0
      0
    
    
      675
      0
      0
    
    
      676
      0
      0
    
    
      677
      1
      1
    
    
      678
      0
      0
    
    
      679
      0
      0
    
    
      680
      1
      1
    
    
      681
      0
      0
    
    
      682
      0
      0
    
    
      683
      0
      0
    
    
      684
      0
      0
    
  

685 rows × 2 columns



In [34]:

    
df_train_ans[ df_train_ans['Biopsy'] != df_train_ans['Prediction'] ]



In [35]:

    
df_train_ans['Prediction'].value_counts()









    Out[35]:





0    635
1     50
Name: Prediction, dtype: int64



In [36]:

    
df_train_ans['Biopsy'].value_counts()









    Out[36]:





0    638
1     47
Name: Biopsy, dtype: int64

Confusion matrix



In [37]:

    
cols = ['Biopsy_1','Biopsy_0']  #Gold standard
rows = ['Prediction_1','Prediction_0'] #diagnostic tool (our prediction)

B1P1 = len(df_ans[(df_ans['Prediction'] == df_ans['Biopsy']) & (df_ans['Biopsy'] == 1)])
B1P0 = len(df_ans[(df_ans['Prediction'] != df_ans['Biopsy']) & (df_ans['Biopsy'] == 1)])
B0P1 = len(df_ans[(df_ans['Prediction'] != df_ans['Biopsy']) & (df_ans['Biopsy'] == 0)])
B0P0 = len(df_ans[(df_ans['Prediction'] == df_ans['Biopsy']) & (df_ans['Biopsy'] == 0)])

conf = np.array([[B1P1,B0P1],[B1P0,B0P0]])
df_cm = pd.DataFrame(conf, columns = [i for i in cols], index = [i for i in rows])

f, ax= plt.subplots(figsize = (5, 5))
sns.heatmap(df_cm, annot=True, ax=ax) 
ax.xaxis.set_ticks_position('top') #Making x label be on top is common in textbooks.

print('total test case number: ', np.sum(conf))









    



total test case number:  172

Calculating sensitivity, specificity, false_positive_rate and false_negative_rate



In [38]:

    
def model_efficacy(conf):
    total_num = np.sum(conf)
    sen = conf[0][0]/(conf[0][0]+conf[1][0])
    spe = conf[1][1]/(conf[1][0]+conf[1][1])
    false_positive_rate = conf[0][1]/(conf[0][1]+conf[1][1])
    false_negative_rate = conf[1][0]/(conf[0][0]+conf[1][0])
    
    print('total_num: ',total_num)
    print('G1P1: ',conf[0][0]) #G = gold standard; P = prediction
    print('G0P1: ',conf[0][1])
    print('G1P0: ',conf[1][0])
    print('G0P0: ',conf[1][1])
    print('##########################')
    print('sensitivity: ',sen)
    print('specificity: ',spe)
    print('false_positive_rate: ',false_positive_rate)
    print('false_negative_rate: ',false_negative_rate)
    
    return total_num, sen, spe, false_positive_rate, false_negative_rate

model_efficacy(conf)









    



total_num:  172
G1P1:  4
G0P1:  8
G1P0:  4
G0P0:  156
##########################
sensitivity:  0.5
specificity:  0.975
false_positive_rate:  0.0487804878049
false_negative_rate:  0.5






    Out[38]:





(172, 0.5, 0.97499999999999998, 0.04878048780487805, 0.5)

分析原始data



In [39]:

    
import seaborn as sns
sns.jointplot(x='Age', y='Biopsy', data=df, alpha=0.1) 
#By adding alpha, we can see the density of the scattered spots clearly.









    Out[39]:





<seaborn.axisgrid.JointGrid at 0x1125c66a0>



In [40]:

    
fig, (ax1,ax2,ax3) = plt.subplots(3,1,figsize=(15,12))
sns.countplot(x='Age', data=df, ax=ax1)
sns.countplot(x='Biopsy', data=df, ax=ax2)
sns.barplot(x='Age', y='Biopsy', data=df, ax=ax3)

#Stratified
facet = sns.FacetGrid(df, hue='Biopsy',aspect=4)
facet.map(sns.kdeplot,'Age',shade= True)
facet.set(xlim=(0, df['Age'].max()))
facet.add_legend()









    Out[40]:





<seaborn.axisgrid.FacetGrid at 0x113644d30>



In [41]:

    
import seaborn as sns
sns.jointplot(x='Number of sexual partners', y='Biopsy', data=df, alpha=0.1) 
#By adding alpha, we can see the density of the scattered spots clearly.









    Out[41]:





<seaborn.axisgrid.JointGrid at 0x113578d68>



In [42]:

    
fig, (ax1,ax2) = plt.subplots(2,1,figsize=(15,8))
sns.countplot(x='Number of sexual partners', data=df, ax=ax1)
sns.barplot(x='Number of sexual partners', y='Biopsy', data=df, ax=ax2) #categorical to categorical

#continuous to categorical
facet = sns.FacetGrid(df, hue='Biopsy',aspect=4)
facet.map(sns.kdeplot,'Number of sexual partners',shade= True)
facet.set(xlim=(0, df['Number of sexual partners'].max()))
facet.add_legend()









    Out[42]:





<seaborn.axisgrid.FacetGrid at 0x122672748>



In [43]:

    
import seaborn as sns
sns.jointplot(x='Num of pregnancies', y='Biopsy', data=df, alpha=0.1)









    Out[43]:





<seaborn.axisgrid.JointGrid at 0x11262c5f8>



In [44]:

    
sns.factorplot('Num of pregnancies','Biopsy',data=df, size=5, aspect=3)









    Out[44]:





<seaborn.axisgrid.FacetGrid at 0x123212978>



In [45]:

    
#continuous to categorical
facet = sns.FacetGrid(df, hue='Biopsy',aspect=4)
facet.map(sns.kdeplot,'Num of pregnancies',shade= True)
facet.set(xlim=(0, df['Num of pregnancies'].max()))
facet.add_legend()









    Out[45]:





<seaborn.axisgrid.FacetGrid at 0x123ce1438>



In [46]:

    
import seaborn as sns
sns.jointplot(x='Citology_1', y='Biopsy', data=df, alpha=0.1) 
# Hard do see anything...









    Out[46]:





<seaborn.axisgrid.JointGrid at 0x123f149e8>



In [47]:

    
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))
sns.countplot(x='Citology_1', data=df, ax=axis1)
sns.countplot(x='Biopsy', data=df, ax=axis2)
sns.barplot(x='Citology_1', y='Biopsy', data=df, ax=axis3)  #categorical to categorical









    Out[47]:





<matplotlib.axes._subplots.AxesSubplot at 0x1244e1978>



In [48]:

    
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(15,5))
sns.countplot(x='Schiller_1', data=df, ax=axis1)
sns.countplot(x='Biopsy', data=df, ax=axis2)
sns.barplot(x='Schiller_1', y='Biopsy', data=df, ax=axis3) #categorical to categorical









    Out[48]:





<matplotlib.axes._subplots.AxesSubplot at 0x124716e10>

使用前面fillna後的df



In [49]:

    
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=1, square=True, cmap='rainbow')









    Out[49]:





<matplotlib.axes._subplots.AxesSubplot at 0x1246feeb8>



In [50]:

    
k = 15 #number of variables for heatmap
cols = corrmat.nlargest(k, 'Biopsy')['Biopsy'].index
cm = np.corrcoef(df[cols].values.T)

plt.figure(figsize=(9,9)) #可以調整大小

sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels = cols.values, xticklabels = cols.values)
plt.show()



In [ ]:

	Age	Number of sexual partners	First sexual intercourse	Num of pregnancies	Smokes	Smokes (years)	Smokes (packs/year)	Hormonal Contraceptives	Hormonal Contraceptives (years)	IUD	...	STDs: Time since first diagnosis	STDs: Time since last diagnosis	Dx:Cancer	Dx:CIN	Dx:HPV	Dx	Hinselmann	Schiller	Citology	Biopsy
0	18	4.0	15.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
1	15	1.0	14.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
2	34	1.0	?	1.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
3	52	5.0	16.0	4.0	1.0	37.0	37.0	1.0	3.0	0.0	...	?	?	1	0	1	0	0	0	0	0
4	46	3.0	21.0	4.0	0.0	0.0	0.0	1.0	15.0	0.0	...	?	?	0	0	0	0	0	0	0	0
5	42	3.0	23.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
6	51	3.0	17.0	6.0	1.0	34.0	3.4	0.0	0.0	1.0	...	?	?	0	0	0	0	1	1	0	1
7	26	1.0	26.0	3.0	0.0	0.0	0.0	1.0	2.0	1.0	...	?	?	0	0	0	0	0	0	0	0
8	45	1.0	20.0	5.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	1	0	1	1	0	0	0	0
9	44	3.0	15.0	?	1.0	1.266972909	2.8	0.0	0.0	?	...	?	?	0	0	0	0	0	0	0	0
10	44	3.0	26.0	4.0	0.0	0.0	0.0	1.0	2.0	0.0	...	?	?	0	0	0	0	0	0	0	0
11	27	1.0	17.0	3.0	0.0	0.0	0.0	1.0	8.0	0.0	...	?	?	0	0	0	0	0	0	0	0
12	45	4.0	14.0	6.0	0.0	0.0	0.0	1.0	10.0	1.0	...	?	?	0	0	0	0	0	0	0	0
13	44	2.0	25.0	2.0	0.0	0.0	0.0	1.0	5.0	0.0	...	?	?	0	0	0	0	0	0	0	0
14	43	2.0	18.0	5.0	0.0	0.0	0.0	0.0	0.0	1.0	...	?	?	0	0	0	0	0	0	0	0
15	40	3.0	18.0	2.0	0.0	0.0	0.0	1.0	15.0	0.0	...	?	?	0	0	0	0	0	0	0	0
16	41	4.0	21.0	3.0	0.0	0.0	0.0	1.0	0.25	0.0	...	?	?	0	0	0	0	0	0	0	0
17	43	3.0	15.0	8.0	0.0	0.0	0.0	1.0	3.0	0.0	...	?	?	0	0	0	0	0	0	0	0
18	42	2.0	20.0	?	0.0	0.0	0.0	1.0	7.0	1.0	...	?	?	0	0	0	0	0	0	0	0
19	40	2.0	27.0	?	0.0	0.0	0.0	0.0	0.0	1.0	...	?	?	0	0	0	0	0	0	0	0
20	43	2.0	18.0	4.0	0.0	0.0	0.0	1.0	15.0	0.0	...	?	?	0	0	0	0	0	0	0	0
21	41	3.0	17.0	4.0	0.0	0.0	0.0	1.0	10.0	0.0	...	21.0	21.0	0	0	0	0	0	0	0	0
22	40	1.0	18.0	1.0	0.0	0.0	0.0	1.0	0.25	0.0	...	2.0	2.0	0	0	0	0	0	1	1	1
23	40	1.0	20.0	2.0	0.0	0.0	0.0	1.0	15.0	0.0	...	?	?	1	0	1	0	1	1	0	1
24	40	3.0	15.0	3.0	0.0	0.0	0.0	1.0	3.0	0.0	...	?	?	0	0	0	0	0	0	0	0
25	44	3.0	19.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
26	39	5.0	23.0	2.0	0.0	0.0	0.0	0.0	0.0	1.0	...	?	?	0	0	0	0	0	0	0	0
27	39	2.0	17.0	4.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
28	37	3.0	24.0	1.0	1.0	3.0	0.04	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
29	37	6.0	26.0	1.0	0.0	0.0	0.0	1.0	0.25	0.0	...	?	?	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	33	2.0	21.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
829	34	3.0	14.0	4.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
830	35	4.0	16.0	4.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
831	40	3.0	23.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	...	8.0	8.0	0	0	0	0	0	0	0	0
832	30	2.0	18.0	0.0	0.0	0.0	0.0	1.0	1.0	0.0	...	?	?	0	0	0	0	0	0	0	0
833	34	1.0	?	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
834	30	3.0	15.0	0.0	1.0	16.0	8.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
835	24	1.0	14.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
836	37	3.0	?	0.0	0.0	0.0	0.0	1.0	0.25	0.0	...	?	?	0	0	0	0	0	0	0	0
837	31	9.0	?	1.0	1.0	11.0	5.5	1.0	0.25	0.0	...	?	?	0	0	0	0	0	0	0	0
838	35	3.0	18.0	3.0	0.0	0.0	0.0	1.0	5.0	0.0	...	?	?	0	0	0	0	0	0	0	0
839	31	3.0	19.0	1.0	0.0	0.0	0.0	1.0	0.08	1.0	...	?	?	1	0	0	1	0	0	0	0
840	24	2.0	16.0	3.0	0.0	0.0	0.0	1.0	5.0	0.0	...	?	?	0	0	0	0	0	0	0	0
841	23	2.0	15.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
842	36	3.0	16.0	3.0	1.0	6.0	0.3	1.0	2.0	0.0	...	?	?	0	0	0	0	0	0	0	0
843	30	3.0	14.0	3.0	0.0	0.0	0.0	1.0	2.0	0.0	...	?	?	0	0	0	0	0	0	0	0
844	26	8.0	15.0	1.0	1.0	9.0	1.35	1.0	5.0	1.0	...	?	?	0	0	0	0	0	0	0	0
845	19	2.0	15.0	2.0	0.0	0.0	0.0	1.0	0.75	0.0	...	?	?	0	0	0	0	0	0	0	0
846	35	2.0	17.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
847	30	3.0	22.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
848	31	3.0	18.0	1.0	0.0	0.0	0.0	1.0	0.5	0.0	...	?	?	0	0	0	0	0	0	0	0
849	32	3.0	18.0	1.0	1.0	11.0	0.16	1.0	6.0	0.0	...	?	?	1	0	1	0	0	0	0	0
850	19	1.0	14.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
851	23	2.0	15.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
852	43	3.0	17.0	3.0	0.0	0.0	0.0	1.0	5.0	0.0	...	?	?	0	0	0	0	0	0	0	0
853	34	3.0	18.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	?	?	0	0	0	0	0	0	0	0
854	32	2.0	19.0	1.0	0.0	0.0	0.0	1.0	8.0	0.0	...	?	?	0	0	0	0	0	0	0	0
855	25	2.0	17.0	0.0	0.0	0.0	0.0	1.0	0.08	0.0	...	?	?	0	0	0	0	0	0	1	0
856	33	2.0	24.0	2.0	0.0	0.0	0.0	1.0	0.08	0.0	...	?	?	0	0	0	0	0	0	0	0
857	29	2.0	20.0	1.0	0.0	0.0	0.0	1.0	0.5	0.0	...	?	?	0	0	0	0	0	0	0	0

	Age	Number of sexual partners	First sexual intercourse	Num of pregnancies	Smokes (years)	Smokes (packs/year)	Hormonal Contraceptives (years)	IUD (years)	STDs (number)	STDs:condylomatosis	...	Dx:HPV_0	Dx:HPV_1	Dx_0	Dx_1	Hinselmann_0	Hinselmann_1	Citology_0	Citology_1	Schiller_0	Schiller_1
0	18	4.0	15.0	1.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
1	15	1.0	14.0	1.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
2	34	1.0	17.0	1.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
3	52	5.0	16.0	4.0	37.000000	37.00	3.00	0.00	0.0	0.0	...	0	1	1	0	1	0	1	0	1	0
4	46	3.0	21.0	4.0	0.000000	0.00	15.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
5	42	3.0	23.0	2.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
6	51	3.0	17.0	6.0	34.000000	3.40	0.00	7.00	0.0	0.0	...	1	0	1	0	0	1	1	0	0	1
7	26	1.0	26.0	3.0	0.000000	0.00	2.00	7.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
8	45	1.0	20.0	5.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	0	1	0	1	1	0	1	0	1	0
9	44	3.0	15.0	2.0	1.266973	2.80	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
10	44	3.0	26.0	4.0	0.000000	0.00	2.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
11	27	1.0	17.0	3.0	0.000000	0.00	8.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
12	45	4.0	14.0	6.0	0.000000	0.00	10.00	5.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
13	44	2.0	25.0	2.0	0.000000	0.00	5.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
14	43	2.0	18.0	5.0	0.000000	0.00	0.00	8.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
15	40	3.0	18.0	2.0	0.000000	0.00	15.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
16	41	4.0	21.0	3.0	0.000000	0.00	0.25	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
17	43	3.0	15.0	8.0	0.000000	0.00	3.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
18	42	2.0	20.0	2.0	0.000000	0.00	7.00	6.00	2.0	1.0	...	1	0	1	0	1	0	1	0	1	0
19	40	2.0	27.0	2.0	0.000000	0.00	0.00	1.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
20	43	2.0	18.0	4.0	0.000000	0.00	15.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
21	41	3.0	17.0	4.0	0.000000	0.00	10.00	0.00	1.0	0.0	...	1	0	1	0	1	0	1	0	1	0
22	40	1.0	18.0	1.0	0.000000	0.00	0.25	0.00	2.0	1.0	...	1	0	1	0	1	0	0	1	0	1
23	40	1.0	20.0	2.0	0.000000	0.00	15.00	0.00	0.0	0.0	...	0	1	1	0	0	1	1	0	0	1
24	40	3.0	15.0	3.0	0.000000	0.00	3.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
25	44	3.0	19.0	1.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
26	39	5.0	23.0	2.0	0.000000	0.00	0.00	1.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
27	39	2.0	17.0	4.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
28	37	3.0	24.0	1.0	3.000000	0.04	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
29	37	6.0	26.0	1.0	0.000000	0.00	0.25	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	33	2.0	21.0	0.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
829	34	3.0	14.0	4.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
830	35	4.0	16.0	4.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
831	40	3.0	23.0	2.0	0.000000	0.00	0.00	0.00	1.0	0.0	...	1	0	1	0	1	0	1	0	1	0
832	30	2.0	18.0	0.0	0.000000	0.00	1.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
833	34	1.0	17.0	0.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
834	30	3.0	15.0	0.0	16.000000	8.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
835	24	1.0	14.0	0.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
836	37	3.0	17.0	0.0	0.000000	0.00	0.25	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
837	31	9.0	17.0	1.0	11.000000	5.50	0.25	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
838	35	3.0	18.0	3.0	0.000000	0.00	5.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
839	31	3.0	19.0	1.0	0.000000	0.00	0.08	8.00	0.0	0.0	...	1	0	0	1	1	0	1	0	1	0
840	24	2.0	16.0	3.0	0.000000	0.00	5.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
841	23	2.0	15.0	0.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
842	36	3.0	16.0	3.0	6.000000	0.30	2.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
843	30	3.0	14.0	3.0	0.000000	0.00	2.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
844	26	8.0	15.0	1.0	9.000000	1.35	5.00	0.17	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
845	19	2.0	15.0	2.0	0.000000	0.00	0.75	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
846	35	2.0	17.0	1.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
847	30	3.0	22.0	1.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
848	31	3.0	18.0	1.0	0.000000	0.00	0.50	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
849	32	3.0	18.0	1.0	11.000000	0.16	6.00	0.00	1.0	0.0	...	0	1	1	0	1	0	1	0	1	0
850	19	1.0	14.0	0.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
851	23	2.0	15.0	2.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
852	43	3.0	17.0	3.0	0.000000	0.00	5.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
853	34	3.0	18.0	0.0	0.000000	0.00	0.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
854	32	2.0	19.0	1.0	0.000000	0.00	8.00	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
855	25	2.0	17.0	0.0	0.000000	0.00	0.08	0.00	0.0	0.0	...	1	0	1	0	1	0	0	1	1	0
856	33	2.0	24.0	2.0	0.000000	0.00	0.08	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0
857	29	2.0	20.0	1.0	0.000000	0.00	0.50	0.00	0.0	0.0	...	1	0	1	0	1	0	1	0	1	0

	Age	Number of sexual partners	First sexual intercourse	Num of pregnancies	Smokes (years)	Smokes (packs/year)	Hormonal Contraceptives (years)	IUD (years)	STDs (number)	STDs:condylomatosis	...	Dx:HPV_0	Dx:HPV_1	Dx_0	Dx_1	Hinselmann_0	Hinselmann_1	Citology_0	Citology_1	Schiller_0	Schiller_1
count	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	...	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000	858.000000
mean	26.820513	2.511655	16.995338	2.257576	1.201241	0.446278	2.035331	0.444604	0.155012	0.051282	...	0.979021	0.020979	0.972028	0.027972	0.959207	0.040793	0.948718	0.051282	0.913753	0.086247
std	8.497948	1.644759	2.791883	1.400981	4.060623	2.210351	3.567040	1.814218	0.529617	0.220701	...	0.143398	0.143398	0.164989	0.164989	0.197925	0.197925	0.220701	0.220701	0.280892	0.280892
min	13.000000	1.000000	10.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	20.000000	2.000000	15.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000
50%	25.000000	2.000000	17.000000	2.000000	0.000000	0.000000	0.500000	0.000000	0.000000	0.000000	...	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000
75%	32.000000	3.000000	18.000000	3.000000	0.000000	0.000000	2.000000	0.000000	0.000000	0.000000	...	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000	1.000000	0.000000
max	84.000000	28.000000	32.000000	11.000000	37.000000	37.000000	30.000000	19.000000	4.000000	1.000000	...	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000

	Biopsy	Prediction
0	0	0
1	0	0
2	0	0
3	0	0
4	0	0
5	0	0
6	0	0
7	0	0
8	0	0
9	0	0
10	0	0
11	0	0
12	0	0
13	0	0
14	0	0
15	0	0
16	0	0
17	0	0
18	0	0
19	0	0
20	0	0
21	0	0
22	0	0
23	0	0
24	0	0
25	0	0
26	0	0
27	0	0
28	0	0
29	0	0
...	...	...
142	0	0
143	0	0
144	0	0
145	0	1
146	0	1
147	0	0
148	0	0
149	0	0
150	0	0
151	0	1
152	0	0
153	0	0
154	0	0
155	0	0
156	0	0
157	0	0
158	0	0
159	0	0
160	0	0
161	1	1
162	0	0
163	0	0
164	1	1
165	0	0
166	0	0
167	0	0
168	0	0
169	0	0
170	0	0
171	0	0

	Biopsy	Prediction
0	0	0
1	0	0
2	0	0
3	0	0
4	0	0
5	0	0
6	0	0
7	0	0
8	0	0
9	0	0
10	0	0
11	0	0
12	0	0
13	0	0
14	0	0
15	0	0
16	0	0
17	0	0
18	0	0
19	0	0
20	0	0
21	0	0
22	0	0
23	0	0
24	0	0
25	0	0
26	0	0
27	0	0
28	0	0
29	0	0
...	...	...
655	0	0
656	0	0
657	1	1
658	0	0
659	0	0
660	0	0
661	0	0
662	0	0
663	0	0
664	0	0
665	0	0
666	0	0
667	0	0
668	0	0
669	0	0
670	0	0
671	0	0
672	0	0
673	0	0
674	0	0
675	0	0
676	0	0
677	1	1
678	0	0
679	0	0
680	1	1
681	0	0
682	0	0
683	0	0
684	0	0

	Biopsy	Prediction
102	0	1
125	1	0
163	0	1
214	1	0
325	1	0
340	0	1
353	0	1
361	1	0
369	0	1
371	1	0
470	0	1
505	0	1
541	0	1
558	0	1
574	1	0
575	0	1
607	1	0
618	1	0
625	0	1

	Biopsy	Prediction
0	0	0
1	0	0
2	0	0
3	0	0
4	0	0
5	0	0
6	0	0
7	0	0
8	0	0
9	0	0
10	0	0
11	0	0
12	0	0
13	0	0
14	0	0
15	0	0
16	0	0
17	0	0
18	0	0
19	0	0
20	0	0
21	0	0
22	0	0
23	0	0
24	0	0
25	0	0
26	0	0
27	0	0
28	0	0
29	0	0
...	...	...
142	0	0
143	0	0
144	0	0
145	0	1
146	0	1
147	0	0
148	0	0
149	0	0
150	0	0
151	0	1
152	0	0
153	0	0
154	0	0
155	0	0
156	0	0
157	0	0
158	0	0
159	0	0
160	0	0
161	1	1
162	0	0
163	0	0
164	1	1
165	0	0
166	0	0
167	0	0
168	0	0
169	0	0
170	0	0
171	0	0

	Biopsy	Prediction
0	0	0
1	0	0
2	0	0
3	0	0
4	0	0
5	0	0
6	0	0
7	0	0
8	0	0
9	0	0
10	0	0
11	0	0
12	0	0
13	0	0
14	0	0
15	0	0
16	0	0
17	0	0
18	0	0
19	0	0
20	0	0
21	0	0
22	0	0
23	0	0
24	0	0
25	0	0
26	0	0
27	0	0
28	0	0
29	0	0
...	...	...
655	0	0
656	0	0
657	1	1
658	0	0
659	0	0
660	0	0
661	0	0
662	0	0
663	0	0
664	0	0
665	0	0
666	0	0
667	0	0
668	0	0
669	0	0
670	0	0
671	0	0
672	0	0
673	0	0
674	0	0
675	0	0
676	0	0
677	1	1
678	0	0
679	0	0
680	1	1
681	0	0
682	0	0
683	0	0
684	0	0

	Biopsy	Prediction
102	0	1
125	1	0
163	0	1
214	1	0
325	1	0
340	0	1
353	0	1
361	1	0
369	0	1
371	1	0
470	0	1
505	0	1
541	0	1
558	0	1
574	1	0
575	0	1
607	1	0
618	1	0
625	0	1

	Biopsy	Prediction
0	0	0
1	0	0
2	0	0
3	0	0
4	0	0
5	0	0
6	0	0
7	0	0
8	0	0
9	0	0
10	0	0
11	0	0
12	0	0
13	0	0
14	0	0
15	0	0
16	0	0
17	0	0
18	0	0
19	0	0
20	0	0
21	0	0
22	0	0
23	0	0
24	0	0
25	0	0
26	0	0
27	0	0
28	0	0
29	0	0
...	...	...
142	0	0
143	0	0
144	0	0
145	0	1
146	0	1
147	0	0
148	0	0
149	0	0
150	0	0
151	0	1
152	0	0
153	0	0
154	0	0
155	0	0
156	0	0
157	0	0
158	0	0
159	0	0
160	0	0
161	1	1
162	0	0
163	0	0
164	1	1
165	0	0
166	0	0
167	0	0
168	0	0
169	0	0
170	0	0
171	0	0

	Biopsy	Prediction
0	0	0
1	0	0
2	0	0
3	0	0
4	0	0
5	0	0
6	0	0
7	0	0
8	0	0
9	0	0
10	0	0
11	0	0
12	0	0
13	0	0
14	0	0
15	0	0
16	0	0
17	0	0
18	0	0
19	0	0
20	0	0
21	0	0
22	0	0
23	0	0
24	0	0
25	0	0
26	0	0
27	0	0
28	0	0
29	0	0
...	...	...
655	0	0
656	0	0
657	1	1
658	0	0
659	0	0
660	0	0
661	0	0
662	0	0
663	0	0
664	0	0
665	0	0
666	0	0
667	0	0
668	0	0
669	0	0
670	0	0
671	0	0
672	0	0
673	0	0
674	0	0
675	0	0
676	0	0
677	1	1
678	0	0
679	0	0
680	1	1
681	0	0
682	0	0
683	0	0
684	0	0

	Biopsy	Prediction
102	0	1
125	1	0
163	0	1
214	1	0
325	1	0
340	0	1
353	0	1
361	1	0
369	0	1
371	1	0
470	0	1
505	0	1
541	0	1
558	0	1
574	1	0
575	0	1
607	1	0
618	1	0
625	0	1