In [1]:
    
!pip install pandas-profiling
    
    
Requirement already satisfied: pandas-profiling in /Users/monkee/anaconda3/lib/python3.6/site-packages
Requirement already satisfied: six>=1.9 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: matplotlib>=1.4 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: pandas>=0.19 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: jinja2>=2.8 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: numpy>=1.7.1 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: python-dateutil in /Users/monkee/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: pytz in /Users/monkee/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: cycler>=0.10 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=1.5.6 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: MarkupSafe>=0.23 in /Users/monkee/anaconda3/lib/python3.6/site-packages (from jinja2>=2.8->pandas-profiling)
In [2]:
    
from IPython.core.debugger import set_trace
import pandas as pd
import pandas_profiling
    
In [3]:
    
df=pd.read_csv("donor_information.csv", parse_dates=['Aquisition Date', 'Dob'], encoding='UTF-8')
    
In [4]:
    
pandas_profiling.ProfileReport(df)
    
    Out[4]:
    
        Overview
    
    
    
        Dataset info
        
            
            
                Number of variables 
                14  
             
            
                Number of observations 
                23389  
             
            
                Total Missing (%) 
                12.6%  
             
            
                Total size in memory 
                2.5 MiB  
             
            
                Average record size in memory 
                112.0 B  
             
            
        
    
    
        Variables types
        
            
            
                Numeric 
                1  
             
            
                Categorical 
                13  
             
            
                Date 
                0  
             
            
                Text (Unique) 
                0  
             
            
                Rejected 
                0  
             
            
        
    
    
        Warnings
        Aquisition Date has a high cardinality: 2925 distinct values  WarningCountry has 23195 / 99.2% missing values MissingDob has a high cardinality: 1727 distinct values  WarningExtra Codes has 15291 / 65.4% missing values MissingExtra Codes has a high cardinality: 678 distinct values  WarningPostcode has 331 / 1.4% missing values MissingPostcode has a high cardinality: 1021 distinct values  WarningSex has 2184 / 9.3% missing values MissingSuburb has a high cardinality: 2069 distinct values  Warning 
    
    
        Variables
    
    
    
        Aquisition Date
            Categorical
        
    
    
        
            Distinct count 
            2925 
         
        
            Unique (%) 
            12.5% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    5/26/17 
    
        
             
        
        526
     
 
    2/7/13 
    
        
             
        
        159
     
 
    10/3/96 
    
        
             
        
        141
     
 
    Other values (2922) 
    
        
            22563
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        5/26/17 
        526 
        2.2% 
        
             
         
 
        2/7/13 
        159 
        0.7% 
        
             
         
 
        10/3/96 
        141 
        0.6% 
        
             
         
 
        10/4/96 
        124 
        0.5% 
        
             
         
 
        9/12/13 
        110 
        0.5% 
        
             
         
 
        9/9/13 
        106 
        0.5% 
        
             
         
 
        9/22/08 
        103 
        0.4% 
        
             
         
 
        8/16/11 
        98 
        0.4% 
        
             
         
 
        9/25/08 
        97 
        0.4% 
        
             
         
 
        6/14/11 
        93 
        0.4% 
        
             
         
 
        Other values (2915) 
        21832 
        93.3% 
        
             
         
 
    
        Country
            Categorical
        
    
    
        
            Distinct count 
            26 
         
        
            Unique (%) 
            13.4% 
         
        
            Missing (%) 
            99.2% 
         
        
            Missing (n) 
            23195 
         
    
    
        
    UNITED KINGDOM 
    
        
             
        
        75
     
 
    NEW ZEALAND 
    
        
             
        
        42
     
 
    ANONYMOUS COUNTRY 
    
        
             
        
        12
     
 
    Other values (22) 
    
        
             
        
        65
     
 
    (Missing) 
    
        
            23195
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        UNITED KINGDOM 
        75 
        0.3% 
        
             
         
 
        NEW ZEALAND 
        42 
        0.2% 
        
             
         
 
        ANONYMOUS COUNTRY 
        12 
        0.1% 
        
             
         
 
        NETHERLANDS 
        11 
        0.0% 
        
             
         
 
        UNITED STATES 
        10 
        0.0% 
        
             
         
 
        SINGAPORE 
        7 
        0.0% 
        
             
         
 
        UNITED STATES OF AMERICA 
        5 
        0.0% 
        
             
         
 
        AU 
        5 
        0.0% 
        
             
         
 
        CANADA 
        4 
        0.0% 
        
             
         
 
        IRELAND 
        3 
        0.0% 
        
             
         
 
        Other values (15) 
        20 
        0.1% 
        
             
         
 
        (Missing) 
        23195 
        99.2% 
        
             
         
 
    
        Dob
            Categorical
        
    
    
        
            Distinct count 
            1727 
         
        
            Unique (%) 
            7.4% 
         
        
            Missing (%) 
            0.0% 
         
        
            Missing (n) 
            0 
         
    
    
        
    00/00/0000 
    
        
            21479
        
        
     
 
    1/1/99 
    
        
             
        
        12
     
 
    1/1/63 
    
        
             
        
        8
     
 
    Other values (1724) 
    
        
             
        
        1890
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        00/00/0000 
        21479 
        91.8% 
        
             
         
 
        1/1/99 
        12 
        0.1% 
        
             
         
 
        1/1/63 
        8 
        0.0% 
        
             
         
 
        1/1/96 
        8 
        0.0% 
        
             
         
 
        1/1/38 
        7 
        0.0% 
        
             
         
 
        1/1/03 
        7 
        0.0% 
        
             
         
 
        1/1/98 
        7 
        0.0% 
        
             
         
 
        1/1/02 
        7 
        0.0% 
        
             
         
 
        1/1/00 
        7 
        0.0% 
        
             
         
 
        1/1/93 
        5 
        0.0% 
        
             
         
 
        Other values (1717) 
        1842 
        7.9% 
        
             
         
 
    
        Donor Number
            Numeric
        
    
    
        
            
                
                    Distinct count 
                    23389 
                 
                
                    Unique (%) 
                    100.0% 
                 
                
                    Missing (%) 
                    0.0% 
                 
                
                    Missing (n) 
                    0 
                 
                
                    Infinite (%) 
                    0.0% 
                 
                
                    Infinite (n) 
                    0 
                 
            
        
        
            
                
                    Mean 
                    43778 
                 
                
                    Minimum 
                    1009 
                 
                
                    Maximum 
                    63921 
                 
                
                    Zeros (%) 
                    0.0% 
                 
            
        
    
    
 
    
    
        
            
                Quantile statistics
                
                    
                        Minimum 
                        1009 
                     
                    
                        5-th percentile 
                        9302.4 
                     
                    
                        Q1 
                        30190 
                     
                    
                        Median 
                        51581 
                     
                    
                        Q3 
                        57925 
                     
                    
                        95-th percentile 
                        62736 
                     
                    
                        Maximum 
                        63921 
                     
                    
                        Range 
                        62912 
                     
                    
                        Interquartile range 
                        27735 
                     
                
            
            
                Descriptive statistics
                
                    
                        Standard deviation 
                        18070 
                     
                    
                        Coef of variation 
                        0.41277 
                     
                    
                        Kurtosis 
                        -0.61181 
                     
                    
                        Mean 
                        43778 
                     
                    
                        MAD 
                        15359 
                     
                    
                        Skewness 
                        -0.87179 
                     
                    
                        Sum 
                        1023916814 
                     
                    
                        Variance 
                        326530000 
                     
                    
                        Memory size 
                        182.8 KiB 
                     
                
            
        
        
             
        
        
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        61715 
        1 
        0.0% 
        
             
         
 
        12915 
        1 
        0.0% 
        
             
         
 
        6774 
        1 
        0.0% 
        
             
         
 
        56824 
        1 
        0.0% 
        
             
         
 
        60024 
        1 
        0.0% 
        
             
         
 
        57977 
        1 
        0.0% 
        
             
         
 
        62075 
        1 
        0.0% 
        
             
         
 
        51836 
        1 
        0.0% 
        
             
         
 
        49789 
        1 
        0.0% 
        
             
         
 
        55934 
        1 
        0.0% 
        
             
         
 
        Other values (23379) 
        23379 
        100.0% 
        
             
         
 
        
        
            Minimum 5 values
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        1009 
        1 
        0.0% 
        
             
         
 
        1015 
        1 
        0.0% 
        
             
         
 
        1027 
        1 
        0.0% 
        
             
         
 
        1029 
        1 
        0.0% 
        
             
         
 
        1030 
        1 
        0.0% 
        
             
         
 
            Maximum 5 values
            
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        63917 
        1 
        0.0% 
        
             
         
 
        63918 
        1 
        0.0% 
        
             
         
 
        63919 
        1 
        0.0% 
        
             
         
 
        63920 
        1 
        0.0% 
        
             
         
 
        63921 
        1 
        0.0% 
        
             
         
 
        
    
    
        Donor Source
            Categorical
        
    
    
        
            Distinct count 
            43 
         
        
            Unique (%) 
            0.2% 
         
        
            Missing (%) 
            0.1% 
         
        
            Missing (n) 
            30 
         
    
    
        
    D2D 
    
        
            5151
        
        
     
 
    ON-LINE 
    
        
            3565
        
        
     
 
    SERV 
    
        
            3512
        
        
     
 
    Other values (39) 
    
        
            11131
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        D2D 
        5151 
        22.0% 
        
             
         
 
        ON-LINE 
        3565 
        15.2% 
        
             
         
 
        SERV 
        3512 
        15.0% 
        
             
         
 
        ACQ 
        1744 
        7.5% 
        
             
         
 
        EQI 
        1577 
        6.7% 
        
             
         
 
        3RDPARTY 
        970 
        4.1% 
        
             
         
 
        PURPLE 
        949 
        4.1% 
        
             
         
 
        SEMINAR 
        826 
        3.5% 
        
             
         
 
        UNSOL 
        555 
        2.4% 
        
             
         
 
        MEMPRE10 
        530 
        2.3% 
        
             
         
 
        Other values (32) 
        3980 
        17.0% 
        
             
         
 
    
        Donor Type
            Categorical
        
    
    
        
            Distinct count 
            9 
         
        
            Unique (%) 
            0.0% 
         
        
            Missing (%) 
            0.2% 
         
        
            Missing (n) 
            40 
         
    
    
        
    IND 
    
        
            18976
        
        
     
 
    CORP 
    
        
             
        
        2662
     
 
    SCHOOL 
    
        
             
        
        613
     
 
    Other values (5) 
    
        
             
        
        1098
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        IND 
        18976 
        81.1% 
        
             
         
 
        CORP 
        2662 
        11.4% 
        
             
         
 
        SCHOOL 
        613 
        2.6% 
        
             
         
 
        SERVCLUB 
        482 
        2.1% 
        
             
         
 
        COMMORG 
        424 
        1.8% 
        
             
         
 
        HOSP 
        122 
        0.5% 
        
             
         
 
        GOVTDEPT 
        55 
        0.2% 
        
             
         
 
        TRUST 
        15 
        0.1% 
        
             
         
 
        (Missing) 
        40 
        0.2% 
        
             
         
 
    
        Extra Codes
            Categorical
        
    
    
        
            Distinct count 
            678 
         
        
            Unique (%) 
            8.4% 
         
        
            Missing (%) 
            65.4% 
         
        
            Missing (n) 
            15291 
         
    
    
        
    NO ENEWS 
    
        
             
        
        915
     
 
    PWE 
    
        
             
        
        864
     
 
    FME 
    
        
             
        
        705
     
 
    Other values (674) 
    
        
            5614
        
        
     
 
    (Missing) 
    
        
            15291
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        NO ENEWS 
        915 
        3.9% 
        
             
         
 
        PWE 
        864 
        3.7% 
        
             
         
 
        FME 
        705 
        3.0% 
        
             
         
 
        RET-PHAR 
        485 
        2.1% 
        
             
         
 
        PWE     LPCNOMIN 
        321 
        1.4% 
        
             
         
 
        F-SUPP  FME 
        296 
        1.3% 
        
             
         
 
        LAPSDMEM 
        293 
        1.3% 
        
             
         
 
        RET-NEWS 
        224 
        1.0% 
        
             
         
 
        FUNERALD 
        183 
        0.8% 
        
             
         
 
        FME     LAPSDMEM 
        159 
        0.7% 
        
             
         
 
        Other values (667) 
        3653 
        15.6% 
        
             
         
 
        (Missing) 
        15291 
        65.4% 
        
             
         
 
    
        Fme
            Categorical
        
    
    
        
            Distinct count 
            3 
         
        
            Unique (%) 
            0.0% 
         
        
            Missing (%) 
            0.1% 
         
        
            Missing (n) 
            20 
         
    
    
        
    N 
    
        
            23263
        
        
     
 
    Y 
    
        
             
        
        106
     
 
    (Missing) 
    
        
             
        
        20
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        N 
        23263 
        99.5% 
        
             
         
 
        Y 
        106 
        0.5% 
        
             
         
 
        (Missing) 
        20 
        0.1% 
        
             
         
 
    
        Member
            Categorical
        
    
    
        
            Distinct count 
            3 
         
        
            Unique (%) 
            0.0% 
         
        
            Missing (%) 
            0.1% 
         
        
            Missing (n) 
            20 
         
    
    
        
    N 
    
        
            22875
        
        
     
 
    Y 
    
        
             
        
        494
     
 
    (Missing) 
    
        
             
        
        20
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        N 
        22875 
        97.8% 
        
             
         
 
        Y 
        494 
        2.1% 
        
             
         
 
        (Missing) 
        20 
        0.1% 
        
             
         
 
    
        Postcode
            Categorical
        
    
    
        
            Distinct count 
            1021 
         
        
            Unique (%) 
            4.4% 
         
        
            Missing (%) 
            1.4% 
         
        
            Missing (n) 
            331 
         
    
    
        
    4074 
    
        
             
        
        1171
     
 
    4510 
    
        
             
        
        616
     
 
    4350 
    
        
             
        
        529
     
 
    Other values (1017) 
    
        
            20742
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        4074 
        1171 
        5.0% 
        
             
         
 
        4510 
        616 
        2.6% 
        
             
         
 
        4350 
        529 
        2.3% 
        
             
         
 
        4068 
        510 
        2.2% 
        
             
         
 
        4073 
        466 
        2.0% 
        
             
         
 
        4069 
        419 
        1.8% 
        
             
         
 
        4000 
        326 
        1.4% 
        
             
         
 
        4305 
        312 
        1.3% 
        
             
         
 
        4066 
        309 
        1.3% 
        
             
         
 
        4506 
        286 
        1.2% 
        
             
         
 
        Other values (1010) 
        18114 
        77.4% 
        
             
         
 
        (Missing) 
        331 
        1.4% 
        
             
         
 
    
        Pwe
            Categorical
        
    
    
        
            Distinct count 
            3 
         
        
            Unique (%) 
            0.0% 
         
        
            Missing (%) 
            0.1% 
         
        
            Missing (n) 
            20 
         
    
    
        
    N 
    
        
            23101
        
        
     
 
    Y 
    
        
             
        
        268
     
 
    (Missing) 
    
        
             
        
        20
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        N 
        23101 
        98.8% 
        
             
         
 
        Y 
        268 
        1.1% 
        
             
         
 
        (Missing) 
        20 
        0.1% 
        
             
         
 
    
        Sex
            Categorical
        
    
    
        
            Distinct count 
            4 
         
        
            Unique (%) 
            0.0% 
         
        
            Missing (%) 
            9.3% 
         
        
            Missing (n) 
            2184 
         
    
    
        
    F 
    
        
            11924
        
        
     
 
    M 
    
        
            8436
        
        
     
 
    B 
    
        
             
        
        845
     
 
    (Missing) 
    
        
             
        
        2184
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        F 
        11924 
        51.0% 
        
             
         
 
        M 
        8436 
        36.1% 
        
             
         
 
        B 
        845 
        3.6% 
        
             
         
 
        (Missing) 
        2184 
        9.3% 
        
             
         
 
    
        Suburb
            Categorical
        
    
    
        
            Distinct count 
            2069 
         
        
            Unique (%) 
            8.9% 
         
        
            Missing (%) 
            0.7% 
         
        
            Missing (n) 
            159 
         
    
    
        
    SINNAMON PARK 
    
        
             
        
        345
     
 
    INDOOROOPILLY 
    
        
             
        
        341
     
 
    MORAYFIELD 
    
        
             
        
        286
     
 
    Other values (2065) 
    
        
            22258
        
        
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        SINNAMON PARK 
        345 
        1.5% 
        
             
         
 
        INDOOROOPILLY 
        341 
        1.5% 
        
             
         
 
        MORAYFIELD 
        286 
        1.2% 
        
             
         
 
        UNKNOWN 
        283 
        1.2% 
        
             
         
 
        CABOOLTURE 
        278 
        1.2% 
        
             
         
 
        BURPENGARY 
        260 
        1.1% 
        
             
         
 
        WESTLAKE 
        258 
        1.1% 
        
             
         
 
        NARANGBA 
        246 
        1.1% 
        
             
         
 
        MIDDLE PARK 
        236 
        1.0% 
        
             
         
 
        RIVERHILLS 
        227 
        1.0% 
        
             
         
 
        Other values (2058) 
        20470 
        87.5% 
        
             
         
 
    
        Vip
            Categorical
        
    
    
        
            Distinct count 
            3 
         
        
            Unique (%) 
            0.0% 
         
        
            Missing (%) 
            0.1% 
         
        
            Missing (n) 
            22 
         
    
    
        
    N 
    
        
            23240
        
        
     
 
    Y 
    
        
             
        
        127
     
 
    (Missing) 
    
        
             
        
        22
     
 
    
    
    
    
        Value 
        Count 
        Frequency (%) 
          
     
    
    
        N 
        23240 
        99.4% 
        
             
         
 
        Y 
        127 
        0.5% 
        
             
         
 
        (Missing) 
        22 
        0.1% 
        
             
         
 
    
        Sample
    
    
    
        
  
    
       
      Donor Number 
      Aquisition Date 
      Country 
      Dob 
      Extra Codes 
      Fme 
      Member 
      Pwe 
      Sex 
      Suburb 
      Vip 
      Donor Source 
      Donor Type 
      Postcode 
     
  
  
    
      0 
      1009 
      1/4/94 
      NaN 
      00/00/0000 
      NaN 
      N 
      N 
      N 
      F 
      MURARRIE 
      N 
      ACQ 
      IND 
      4172 
     
    
      1 
      1015 
      1/4/94 
      NaN 
      00/00/0000 
      NaN 
      N 
      N 
      N 
      M 
      MANLY WEST 
      N 
      TELETR05 
      IND 
      4179 
     
    
      2 
      1027 
      1/4/94 
      NaN 
      00/00/0000 
      NaN 
      N 
      N 
      N 
      F 
      BOONDALL 
      N 
      ACQ 
      IND 
      4034 
     
    
      3 
      1029 
      1/4/94 
      NaN 
      00/00/0000 
      NaN 
      N 
      N 
      N 
      F 
      EASTERN HEIGHTS 
      N 
      ACQ 
      IND 
      4305 
     
    
      4 
      1030 
      1/4/94 
      NaN 
      00/00/0000 
      NaN 
      N 
      N 
      N 
      B 
      CLONTARF 
      N 
      ACQ 
      IND 
      4019 
     
  
    
In [ ]:
    
pfr = pandas_profiling.ProfileReport(df)
pfr.to_file("donor_information.html")
    
In [ ]:
    
#### Print existing ProfileReport object inline
pfr
    
In [ ]:
    
    
Content source: bigdatabigheart/Datasets
Similar notebooks: