Tutorial



In [1]:

    
import pandas as pd
from autoc import DataExploration, NaImputer, PreProcessor
from autoc.naimputer import missing_map
from autoc.outliersdetection import OutliersDetection
from autoc.utils.getdata import get_dataset
from autoc.utils.helpers import cserie
%matplotlib inline
import matplotlib.pyplot as plt

Titanic dataset



In [2]:

    
# Loading Titanic dataset 
titanic = get_dataset('titanic')



In [3]:

    
titanic.head()









    Out[3]:






  
    
      
      survived
      pclass
      sex
      age
      sibsp
      parch
      fare
      embarked
      class
      who
      adult_male
      deck
      embark_town
      alive
      alone
    
  
  
    
      0
      0
      3
      male
      22
      1
      0
      7.2500
      S
      Third
      man
      True
      NaN
      Southampton
      no
      False
    
    
      1
      1
      1
      female
      38
      1
      0
      71.2833
      C
      First
      woman
      False
      C
      Cherbourg
      yes
      False
    
    
      2
      1
      3
      female
      26
      0
      0
      7.9250
      S
      Third
      woman
      False
      NaN
      Southampton
      yes
      True
    
    
      3
      1
      1
      female
      35
      1
      0
      53.1000
      S
      First
      woman
      False
      C
      Southampton
      yes
      False
    
    
      4
      0
      3
      male
      35
      0
      0
      8.0500
      S
      Third
      man
      True
      NaN
      Southampton
      no
      True

DataExploration

The DataExploraion class is designed to provide helpers for basic Dataexploration task



In [4]:

    
# Instantiate the class this way 
exploration_titanic = DataExploration(titanic)



In [5]:

    
# The structure function gives a good summary of important characteristics of the dataset like
# missing values, nb_unique values, cst columns, types of the column ...
exploration_titanic.structure()









    Out[5]:






  
    
      
      dtypes_p
      dtypes_r
      nb_missing
      perc_missing
      nb_unique_values
      constant_columns
      na_columns
      is_key
      dtype_infer
      string_length
    
  
  
    
      survived
      int64
      numeric
      0
      0.000000
      2
      False
      False
      False
      integer
      NaN
    
    
      pclass
      int64
      numeric
      0
      0.000000
      3
      False
      False
      False
      integer
      NaN
    
    
      sex
      object
      factor
      0
      0.000000
      2
      False
      False
      False
      string
      6
    
    
      age
      float64
      numeric
      177
      0.198653
      88
      False
      False
      False
      floating
      NaN
    
    
      sibsp
      int64
      numeric
      0
      0.000000
      7
      False
      False
      False
      integer
      NaN
    
    
      parch
      int64
      numeric
      0
      0.000000
      7
      False
      False
      False
      integer
      NaN
    
    
      fare
      float64
      numeric
      0
      0.000000
      248
      False
      False
      False
      floating
      NaN
    
    
      embarked
      object
      factor
      2
      0.002245
      3
      False
      False
      False
      mixed
      1
    
    
      class
      object
      factor
      0
      0.000000
      3
      False
      False
      False
      string
      6
    
    
      who
      object
      factor
      0
      0.000000
      3
      False
      False
      False
      string
      5
    
    
      adult_male
      bool
      factor
      0
      0.000000
      2
      False
      False
      False
      boolean
      NaN
    
    
      deck
      object
      factor
      688
      0.772166
      7
      False
      False
      False
      mixed
      1
    
    
      embark_town
      object
      factor
      2
      0.002245
      3
      False
      False
      False
      mixed
      11
    
    
      alive
      object
      factor
      0
      0.000000
      2
      False
      False
      False
      string
      3
    
    
      alone
      bool
      factor
      0
      0.000000
      2
      False
      False
      False
      boolean
      NaN



In [6]:

    
# If you want more specific primitive :
exploration_titanic.nacolcount()









    Out[6]:






  
    
      
      Nanumber
      Napercentage
    
  
  
    
      survived
      0
      0.000000
    
    
      pclass
      0
      0.000000
    
    
      sex
      0
      0.000000
    
    
      age
      177
      0.198653
    
    
      sibsp
      0
      0.000000
    
    
      parch
      0
      0.000000
    
    
      fare
      0
      0.000000
    
    
      embarked
      2
      0.002245
    
    
      class
      0
      0.000000
    
    
      who
      0
      0.000000
    
    
      adult_male
      0
      0.000000
    
    
      deck
      688
      0.772166
    
    
      embark_town
      2
      0.002245
    
    
      alive
      0
      0.000000
    
    
      alone
      0
      0.000000



In [7]:

    
cserie(exploration_titanic.narows_full) # no rows of only missing values









    Out[7]:





[]



In [8]:

    
exploration_titanic.count_unique()









    Out[8]:





survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64



In [9]:

    
# More complete numeric summary than describe()
exploration_titanic.numeric_summary() # you can access to numeric









    Out[9]:






  
    
      
      Count
      Min
      FirstQuartile
      Median
      Mean
      Std
      Mad
      Skewness
      Kurtosis
      Thirdquartile
      Max
    
  
  
    
      survived
      891
      0.00
      0.0000
      0.0000
      0.383838
      0.486592
      0.473013
      0.478523
      -1.775005
      1
      1.0000
    
    
      pclass
      891
      1.00
      2.0000
      3.0000
      2.308642
      0.836071
      0.761968
      -0.630548
      -1.280015
      3
      3.0000
    
    
      age
      714
      0.42
      20.1250
      28.0000
      29.699118
      14.526497
      11.322944
      0.389108
      0.178274
      38
      80.0000
    
    
      sibsp
      891
      0.00
      0.0000
      0.0000
      0.523008
      1.102743
      0.713780
      3.695352
      17.880420
      1
      8.0000
    
    
      parch
      891
      0.00
      0.0000
      0.0000
      0.381594
      0.806057
      0.580742
      2.749117
      9.778125
      0
      6.0000
    
    
      fare
      891
      0.00
      7.9104
      14.4542
      32.204208
      49.693429
      28.163692
      4.787317
      33.398141
      31
      512.3292



In [10]:

    
# Look at quantiles



In [11]:

    
exploration_titanic.dfquantiles(nb_quantiles=10)

Primitive list :

Print Warning



In [12]:

    
# print Consistency infos 
# This function helps you to trakc potential consistency errors in the dataset
# like duplicates columns, constant columns, full missing rows, full missing columns. 
exploration_titanic.print_infos('consistency', print_empty=False)









    



{'duplicated_rows': {'action': 'delete',
                     'comment': 'You should delete this rows with df.drop_duplicates()',
                     'level': 'ERROR',
                     'value': Int64Index([ 47,  76,  77,  87,  95, 101, 121, 133, 173, 196,
            ...
            838, 844, 846, 859, 863, 870, 877, 878, 884, 886],
           dtype='int64', length=107)}}

Fancier Functions



In [13]:

    
# Nearzerovariance function inspired from caret 
exploration_titanic.nearzerovar()









    



             freq_ratio    nzv  percent_unique zero_var
survived       1.605263  False        0.224467    False
pclass         2.273148  False        0.336700    False
sex            1.837580  False        0.224467    False
age            1.111111  False        9.876543    False
sibsp          2.909091  False        0.785634    False
parch          5.745763  False        0.785634    False
fare           1.023810  False       27.833895    False
embarked       3.833333  False        0.336700    False
class          2.273148  False        0.336700    False
who            1.981550  False        0.336700    False
adult_male     1.516949  False        0.224467    False
deck           1.255319  False        0.785634    False
embark_town    3.833333  False        0.336700    False
alive          1.605263  False        0.224467    False
alone          1.516949  False        0.224467    False






    Out[13]:





Index([], dtype='object')



In [14]:

    
# Find highly correlated columns 
exploration_titanic.findcorr() # no highly numerical correlated columns









    Out[14]:





[]



In [15]:

    
exploration_titanic.findupcol()
# no duplicated cols









    Out[15]:





[]



In [16]:

    
# Recheck duplicated row
titanic.duplicated().sum()









    Out[16]:





107

Outliers Detection

This class is a simple class to detect one dimension outliers.



In [17]:

    
outlier_detection = OutliersDetection(titanic)



In [18]:

    
outlier_detection.basic_cutoff









    Out[18]:





{'cutoff_iqr': 2, 'cutoff_mad': 2, 'cutoff_z': 3}



In [19]:

    
outlier_detection.strong_cutoff









    Out[19]:





{'cutoff_iqr': 6, 'cutoff_mad': 6, 'cutoff_z': 6}



In [20]:

    
soft_outliers_fare = outlier_detection.outlier_detection_serie_1d('fare',cutoff_params=outlier_detection.basic_cutoff)
strong_outliers_fare = outlier_detection.outlier_detection_serie_1d('fare',cutoff_params=outlier_detection.strong_cutoff)



In [21]:

    
# finding index of your Dataframe
index_strong_outliers = (strong_outliers_fare.is_outlier == 1)



In [22]:

    
titanic.fare.describe()









    Out[22]:





count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: fare, dtype: float64



In [23]:

    
# a lot of outliers because distribution is lognormal
titanic.loc[index_strong_outliers, :].head()









    Out[23]:






  
    
      
      survived
      pclass
      sex
      age
      sibsp
      parch
      fare
      embarked
      class
      who
      adult_male
      deck
      embark_town
      alive
      alone
    
  
  
    
      27
      0
      1
      male
      19
      3
      2
      263.0000
      S
      First
      man
      True
      C
      Southampton
      no
      False
    
    
      31
      1
      1
      female
      NaN
      1
      0
      146.5208
      C
      First
      woman
      False
      B
      Cherbourg
      yes
      False
    
    
      34
      0
      1
      male
      28
      1
      0
      82.1708
      C
      First
      man
      True
      NaN
      Cherbourg
      no
      False
    
    
      52
      1
      1
      female
      49
      1
      0
      76.7292
      C
      First
      woman
      False
      D
      Cherbourg
      yes
      False
    
    
      61
      1
      1
      female
      38
      0
      0
      80.0000
      NaN
      First
      woman
      False
      B
      NaN
      yes
      True



In [24]:

    
titanic.fare.hist()









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x1045de190>



In [25]:

    
outlier_detection.outlier_detection_1d(cutoff_params=outlier_detection.basic_cutoff).head(20)









    Out[25]:






  
    
      
      fare_iqr_score
      fare_mad_score
      fare_z_score
      fare_is_outlier
    
  
  
    
      0
      -0.312011
      -0.703808
      -0.502445
      0
    
    
      1
      2.461242
      5.551871
      0.786845
      1
    
    
      2
      -0.282777
      -0.637865
      -0.488854
      0
    
    
      3
      1.673732
      3.775469
      0.420730
      1
    
    
      4
      -0.277363
      -0.625653
      -0.486337
      0
    
    
      5
      -0.259680
      -0.585764
      -0.478116
      0
    
    
      6
      1.620136
      3.654572
      0.395814
      1
    
    
      7
      0.286744
      0.646813
      -0.224083
      0
    
    
      8
      -0.143827
      -0.324433
      -0.424256
      0
    
    
      9
      0.676348
      1.525651
      -0.042956
      0
    
    
      10
      0.097265
      0.219402
      -0.312172
      0
    
    
      11
      0.523864
      1.181689
      -0.113846
      0
    
    
      12
      -0.277363
      -0.625653
      -0.486337
      0
    
    
      13
      0.728501
      1.643294
      -0.018709
      0
    
    
      14
      -0.285843
      -0.644781
      -0.490280
      0
    
    
      15
      0.066948
      0.151016
      -0.326267
      0
    
    
      16
      0.635386
      1.433251
      -0.061999
      0
    
    
      17
      -0.062981
      -0.142067
      -0.386671
      0
    
    
      18
      0.153567
      0.346404
      -0.285997
      0
    
    
      19
      -0.313093
      -0.706251
      -0.502949
      0

Prerocessor



In [26]:

    
# initialize preprocessing 
preprocessor = PreProcessor(titanic, copy=True)
print("We made a copy so id titanic :  {} different from id preprocessor.data {}".format(
        id(titanic),id(preprocessor.data)))









    



We made a copy so id titanic :  4407004560 different from id preprocessor.data 4747383056



In [27]:

    
# using infos consistency from DataExploration 
preprocessor.print_infos('consistency')









    



{'duplicated_rows': {'action': 'delete',
                     'comment': 'You should delete this rows with df.drop_duplicates()',
                     'level': 'ERROR',
                     'value': Int64Index([ 47,  76,  77,  87,  95, 101, 121, 133, 173, 196,
            ...
            838, 844, 846, 859, 863, 870, 877, 878, 884, 886],
           dtype='int64', length=107)}}



In [28]:

    
# basic cleaning delete constant columns 
titanic_clean = preprocessor.basic_cleaning()









    



We are removing the folowing columns : []
We are removing the folowing rows : [521, 522, 526, 531, 870, 47, 560, 563, 564, 568, 573, 588, 589, 598, 87, 601, 95, 612, 101, 614, 121, 635, 636, 640, 641, 644, 133, 646, 650, 656, 666, 674, 173, 692, 696, 196, 709, 198, 201, 213, 732, 733, 734, 223, 738, 739, 241, 757, 758, 760, 260, 773, 274, 790, 792, 800, 295, 808, 300, 304, 313, 320, 324, 837, 838, 844, 846, 335, 343, 859, 863, 354, 355, 358, 359, 364, 877, 878, 368, 884, 886, 384, 832, 409, 410, 413, 418, 420, 425, 428, 431, 454, 76, 459, 77, 464, 466, 470, 476, 481, 485, 488, 490, 494, 613, 500, 511]



In [32]:

    
titanic_clean.shape # We removed the dupliated columns









    Out[32]:





(784, 15)



In [33]:

    
titanic.shape









    Out[33]:





(891, 15)



In [29]:

    
preprocessor.infer_subtypes() # this function tries to indentify different subtypes of data









    Out[29]:





{'adult_male': {'dtype': dtype('bool'), 'subtype': 'binary'},
 'age': {'dtype': dtype('float64'), 'subtype': None},
 'alive': {'dtype': dtype('O'), 'subtype': 'binary'},
 'alone': {'dtype': dtype('bool'), 'subtype': 'binary'},
 'class': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'deck': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'embark_town': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'embarked': {'dtype': dtype('O'), 'subtype': 'text_categorical'},
 'fare': {'dtype': dtype('float64'), 'subtype': None},
 'parch': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'pclass': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'sex': {'dtype': dtype('O'), 'subtype': 'binary'},
 'sibsp': {'dtype': dtype('int64'), 'subtype': 'ordinal'},
 'survived': {'dtype': dtype('int64'), 'subtype': 'binary'},
 'who': {'dtype': dtype('O'), 'subtype': 'text_categorical'}}



In [30]:

    
preprocessor.subtypes









    Out[30]:





['text_raw', 'text_categorical', 'ordinal', 'binary', 'other']

Airbnb Dataset

This is a dataset from airbnb users found (the dataset used here is train_users_2.csv from the this airbnb kaggle competition



In [34]:

    
df_airbnb = get_dataset('airbnb_users')

DataExploration



In [35]:

    
exploration_airbnb = DataExploration(df_airbnb)



In [36]:

    
exploration_airbnb.print_infos('consistency')

{}



In [37]:

    
exploration_airbnb.structure()









    Out[37]:






  
    
      
      dtypes_p
      dtypes_r
      nb_missing
      perc_missing
      nb_unique_values
      constant_columns
      na_columns
      is_key
      dtype_infer
      string_length
    
  
  
    
      id
      object
      character
      0
      0.000000
      213451
      False
      False
      True
      string
      10
    
    
      date_account_created
      object
      character
      0
      0.000000
      1634
      False
      False
      False
      string
      10
    
    
      timestamp_first_active
      int64
      numeric
      0
      0.000000
      213451
      False
      False
      True
      integer
      NaN
    
    
      date_first_booking
      object
      character
      124543
      0.583473
      1976
      False
      False
      False
      mixed
      10
    
    
      gender
      object
      factor
      0
      0.000000
      4
      False
      False
      False
      string
      9
    
    
      age
      float64
      numeric
      87990
      0.412226
      127
      False
      False
      False
      floating
      NaN
    
    
      signup_method
      object
      factor
      0
      0.000000
      3
      False
      False
      False
      string
      8
    
    
      signup_flow
      int64
      numeric
      0
      0.000000
      17
      False
      False
      False
      integer
      NaN
    
    
      language
      object
      character
      0
      0.000000
      25
      False
      False
      False
      string
      2
    
    
      affiliate_channel
      object
      factor
      0
      0.000000
      8
      False
      False
      False
      string
      13
    
    
      affiliate_provider
      object
      character
      0
      0.000000
      18
      False
      False
      False
      string
      19
    
    
      first_affiliate_tracked
      object
      factor
      6065
      0.028414
      7
      False
      False
      False
      mixed
      13
    
    
      signup_app
      object
      factor
      0
      0.000000
      4
      False
      False
      False
      string
      7
    
    
      first_device_type
      object
      factor
      0
      0.000000
      9
      False
      False
      False
      string
      18
    
    
      first_browser
      object
      character
      0
      0.000000
      52
      False
      False
      False
      string
      20
    
    
      country_destination
      object
      character
      0
      0.000000
      12
      False
      False
      False
      string
      5



In [38]:

    
exploration_airbnb.sign_summary() # Get sign summary (look for -1 na encoded value for example)









    Out[38]:






  
    
      
      NumOfNegative
      PctOfNegative
      NumOfPositive
      PctOfPositive
    
  
  
    
      timestamp_first_active
      0
      0.000000
      213451
      1.000000
    
    
      age
      0
      0.000000
      125461
      0.587774
    
    
      signup_flow
      164739
      0.771788
      213451
      1.000000

Outliers Detection



In [39]:

    
airbnb_od = OutliersDetection(df_airbnb)



In [40]:

    
# OutliersDetection is a subclass of DataExploration
airbnb_od.structure()









    Out[40]:






  
    
      
      dtypes_p
      dtypes_r
      nb_missing
      perc_missing
      nb_unique_values
      constant_columns
      na_columns
      is_key
      dtype_infer
      string_length
    
  
  
    
      id
      object
      character
      0
      0.000000
      213451
      False
      False
      True
      string
      10
    
    
      date_account_created
      object
      character
      0
      0.000000
      1634
      False
      False
      False
      string
      10
    
    
      timestamp_first_active
      int64
      numeric
      0
      0.000000
      213451
      False
      False
      True
      integer
      NaN
    
    
      date_first_booking
      object
      character
      124543
      0.583473
      1976
      False
      False
      False
      mixed
      10
    
    
      gender
      object
      factor
      0
      0.000000
      4
      False
      False
      False
      string
      9
    
    
      age
      float64
      numeric
      87990
      0.412226
      127
      False
      False
      False
      floating
      NaN
    
    
      signup_method
      object
      factor
      0
      0.000000
      3
      False
      False
      False
      string
      8
    
    
      signup_flow
      int64
      numeric
      0
      0.000000
      17
      False
      False
      False
      integer
      NaN
    
    
      language
      object
      character
      0
      0.000000
      25
      False
      False
      False
      string
      2
    
    
      affiliate_channel
      object
      factor
      0
      0.000000
      8
      False
      False
      False
      string
      13
    
    
      affiliate_provider
      object
      character
      0
      0.000000
      18
      False
      False
      False
      string
      19
    
    
      first_affiliate_tracked
      object
      factor
      6065
      0.028414
      7
      False
      False
      False
      mixed
      13
    
    
      signup_app
      object
      factor
      0
      0.000000
      4
      False
      False
      False
      string
      7
    
    
      first_device_type
      object
      factor
      0
      0.000000
      9
      False
      False
      False
      string
      18
    
    
      first_browser
      object
      character
      0
      0.000000
      52
      False
      False
      False
      string
      20
    
    
      country_destination
      object
      character
      0
      0.000000
      12
      False
      False
      False
      string
      5



In [41]:

    
airbnb_od.numeric_summary() # you can access to numeric









    Out[41]:






  
    
      
      Count
      Min
      FirstQuartile
      Median
      Mean
      Std
      Mad
      Skewness
      Kurtosis
      Thirdquartile
      Max
    
  
  
    
      timestamp_first_active
      213451
      2.009032e+13
      2.012123e+13
      2.013091e+13
      2.013085e+13
      9.253717e+09
      6.898231e+09
      -0.860883
      -3.142610
      2.014031e+13
      2.014063e+13
    
    
      age
      125461
      1.000000e+00
      2.800000e+01
      3.400000e+01
      4.966834e+01
      1.556666e+02
      2.838745e+01
      12.422586
      153.568929
      4.300000e+01
      2.014000e+03
    
    
      signup_flow
      213451
      0.000000e+00
      0.000000e+00
      0.000000e+00
      3.267387e+00
      7.637707e+00
      5.169522e+00
      2.234366
      3.324444
      0.000000e+00
      2.500000e+01



In [42]:

    
airbnb_od.strong_cutoff









    Out[42]:





{'cutoff_iqr': 6, 'cutoff_mad': 6, 'cutoff_z': 6}



In [43]:

    
outliers_age = airbnb_od.outlier_detection_serie_1d('age', cutoff_params=airbnb_od.strong_cutoff)
outliers_age.head(10)



In [44]:

    
print("nb strong outliers : {}".format(outliers_age.is_outlier.sum()))









    



nb strong outliers : 2413



In [45]:

    
index_outliers_age = cserie(outliers_age.is_outlier==1, index=True)



In [46]:

    
df_airbnb.loc[index_outliers_age,:]









    Out[46]:






  
    
      
      id
      date_account_created
      timestamp_first_active
      date_first_booking
      gender
      age
      signup_method
      signup_flow
      language
      affiliate_channel
      affiliate_provider
      first_affiliate_tracked
      signup_app
      first_device_type
      first_browser
      country_destination
    
  
  
    
      388
      v2x0ms9c62
      2010-04-11
      20100411065602
      2010-04-13
      -unknown-
      2014
      basic
      3
      en
      other
      craigslist
      untracked
      Web
      Windows Desktop
      Firefox
      FR
    
    
      398
      9ouah6tc30
      2010-04-12
      20100412231534
      2010-04-12
      FEMALE
      104
      facebook
      3
      en
      other
      craigslist
      linked
      Web
      iPhone
      Mobile Safari
      FR
    
    
      627
      dc3udjfdij
      2010-05-19
      20100519012455
      2010-06-16
      -unknown-
      105
      basic
      2
      en
      other
      craigslist
      omg
      Web
      Mac Desktop
      Safari
      FR
    
    
      673
      umf1wdk9uc
      2010-05-25
      20100525155541
      NaN
      FEMALE
      2014
      basic
      2
      en
      other
      craigslist
      untracked
      Web
      Mac Desktop
      Safari
      NDF
    
    
      1040
      m82epwn7i8
      2010-07-14
      20100714230556
      2010-07-15
      MALE
      2014
      facebook
      0
      en
      other
      craigslist
      untracked
      Web
      Mac Desktop
      Chrome
      US
    
    
      1177
      2th813zdx7
      2010-07-25
      20100725234419
      2010-07-26
      MALE
      2013
      facebook
      3
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Chrome
      US
    
    
      1190
      qc9se9qucz
      2010-07-27
      20100727002029
      2010-07-27
      -unknown-
      105
      basic
      2
      en
      other
      craigslist
      untracked
      Web
      Mac Desktop
      Firefox
      US
    
    
      1200
      3amf04n3o3
      2010-07-27
      20100727190447
      2010-07-29
      FEMALE
      2014
      basic
      2
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      IE
      US
    
    
      1208
      cguxptdi6h
      2010-07-28
      20100728034415
      2010-07-28
      -unknown-
      105
      basic
      3
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Firefox
      US
    
    
      1239
      6vpmryt377
      2010-07-30
      20100730055204
      2010-07-30
      FEMALE
      2014
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Firefox
      CA
    
    
      1257
      uxy91xb5p2
      2010-08-01
      20100801071023
      2010-08-01
      MALE
      2014
      facebook
      3
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      Chrome
      US
    
    
      1308
      bno0vva4uz
      2010-08-06
      20100806133725
      NaN
      MALE
      2014
      basic
      3
      en
      other
      craigslist
      linked
      Web
      Windows Desktop
      Chrome
      NDF
    
    
      1314
      4wir86n2az
      2010-08-06
      20100806201512
      NaN
      MALE
      104
      basic
      2
      en
      direct
      direct
      untracked
      Web
      Other/Unknown
      -unknown-
      NDF
    
    
      1474
      h3rrmak4tu
      2010-08-21
      20100821225214
      NaN
      MALE
      2014
      facebook
      2
      pt
      other
      craigslist
      untracked
      Web
      Mac Desktop
      Safari
      NDF
    
    
      1502
      fou0j7fhnm
      2010-08-24
      20100824205045
      2010-08-25
      FEMALE
      2014
      basic
      3
      en
      sem-non-brand
      google
      untracked
      Web
      Windows Desktop
      Chrome
      US
    
    
      1529
      1j5uk4f0ay
      2010-08-27
      20100827135415
      NaN
      MALE
      105
      facebook
      2
      en
      other
      craigslist
      linked
      Web
      Windows Desktop
      IE
      NDF
    
    
      1615
      uqncyj8byz
      2010-09-04
      20100904210638
      2010-10-03
      FEMALE
      101
      basic
      2
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      IE
      US
    
    
      1619
      9lkved7fhg
      2010-09-05
      20100905050114
      2010-09-05
      FEMALE
      98
      basic
      3
      en
      seo
      google
      untracked
      Web
      Mac Desktop
      Safari
      US
    
    
      1739
      eef74e94rg
      2010-09-15
      20100915023255
      2010-09-20
      -unknown-
      105
      facebook
      2
      en
      sem-non-brand
      google
      tracked-other
      Web
      Other/Unknown
      Mobile Firefox
      US
    
    
      1826
      kfeh5350pi
      2010-09-21
      20100921134556
      2010-09-26
      -unknown-
      2014
      facebook
      2
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Firefox
      other
    
    
      1860
      9aouhu15rk
      2010-09-23
      20100923114758
      2011-03-25
      FEMALE
      2014
      basic
      2
      en
      seo
      google
      linked
      Web
      Windows Desktop
      IE
      US
    
    
      2228
      i0j7vqzk2m
      2010-10-29
      20101029182448
      NaN
      MALE
      115
      facebook
      0
      en
      other
      craigslist
      NaN
      Web
      Other/Unknown
      -unknown-
      NDF
    
    
      2235
      9s9mdj9xeb
      2010-10-30
      20101030104540
      NaN
      MALE
      104
      facebook
      2
      en
      content
      google
      omg
      Web
      Mac Desktop
      Safari
      NDF
    
    
      2289
      v1xp5ggiti
      2010-11-04
      20101104235028
      2011-05-16
      -unknown-
      105
      basic
      2
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      Chrome
      US
    
    
      2582
      aq0lqrjf2y
      2010-12-06
      20101206173457
      2010-12-06
      -unknown-
      2014
      facebook
      3
      en
      sem-non-brand
      google
      linked
      Web
      iPad
      Mobile Safari
      FR
    
    
      2654
      kkfa80noq4
      2010-12-15
      20101215031123
      2010-12-16
      FEMALE
      105
      basic
      2
      en
      other
      craigslist
      untracked
      Web
      iPhone
      Mobile Safari
      US
    
    
      2657
      qg2p3l847j
      2010-12-15
      20101215040851
      NaN
      FEMALE
      107
      basic
      2
      en
      other
      craigslist
      untracked
      Web
      Windows Desktop
      Chrome
      NDF
    
    
      2775
      34tpc91y6j
      2010-12-29
      20101229034206
      2011-09-29
      MALE
      105
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Safari
      US
    
    
      2825
      wohy9u0us5
      2011-01-03
      20110103163350
      NaN
      FEMALE
      99
      basic
      3
      en
      sem-non-brand
      google
      untracked
      Web
      iPad
      Mobile Safari
      NDF
    
    
      2846
      ee45osn71s
      2011-01-05
      20110105173958
      NaN
      FEMALE
      110
      facebook
      2
      en
      direct
      direct
      linked
      Web
      Mac Desktop
      Safari
      NDF
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      208435
      xddkqb6u5c
      2014-06-22
      20140622015513
      NaN
      MALE
      105
      basic
      0
      en
      sem-non-brand
      google
      omg
      Web
      Mac Desktop
      Safari
      NDF
    
    
      208675
      wucmeipz5m
      2014-06-22
      20140622170254
      NaN
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      Chrome
      NDF
    
    
      208704
      fo3mxaa34u
      2014-06-22
      20140622180847
      2014-06-22
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      linked
      Web
      Mac Desktop
      Safari
      FR
    
    
      208819
      0dn4tfj6gw
      2014-06-22
      20140622214130
      NaN
      FEMALE
      1949
      basic
      0
      en
      direct
      direct
      untracked
      Moweb
      Android Phone
      Chrome Mobile
      NDF
    
    
      209074
      py0sivmitv
      2014-06-23
      20140623062745
      2014-06-23
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Chrome
      US
    
    
      209222
      p2bg0ux0uf
      2014-06-23
      20140623173318
      2014-06-28
      -unknown-
      105
      basic
      0
      en
      sem-non-brand
      google
      omg
      Web
      Windows Desktop
      IE
      US
    
    
      209228
      v4o93j53zn
      2014-06-23
      20140623174300
      NaN
      MALE
      110
      facebook
      0
      en
      direct
      direct
      linked
      Web
      Mac Desktop
      Chrome
      NDF
    
    
      209672
      1zzmeakk1b
      2014-06-24
      20140624051116
      NaN
      MALE
      105
      facebook
      0
      en
      direct
      direct
      linked
      Moweb
      iPhone
      Mobile Safari
      NDF
    
    
      210086
      0fjffldr3f
      2014-06-24
      20140624224114
      2014-06-24
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      Firefox
      US
    
    
      210232
      nhzq5d8nby
      2014-06-25
      20140625030229
      2014-06-25
      FEMALE
      105
      basic
      0
      en
      sem-brand
      google
      omg
      Web
      Mac Desktop
      Safari
      FR
    
    
      210257
      smzfy2c47g
      2014-06-25
      20140625035237
      2014-07-21
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Safari
      US
    
    
      210335
      oldf6jxjv1
      2014-06-25
      20140625054539
      2014-12-23
      -unknown-
      105
      basic
      25
      en
      direct
      direct
      untracked
      iOS
      iPhone
      -unknown-
      US
    
    
      210793
      ty6g90h81q
      2014-06-26
      20140626003400
      NaN
      FEMALE
      105
      basic
      0
      en
      sem-non-brand
      google
      omg
      Web
      Windows Desktop
      IE
      NDF
    
    
      210928
      xr911lto8m
      2014-06-26
      20140626041212
      2014-07-02
      MALE
      105
      basic
      0
      en
      direct
      direct
      linked
      Web
      Mac Desktop
      Safari
      GB
    
    
      211074
      9ppttyasd8
      2014-06-26
      20140626131631
      NaN
      FEMALE
      110
      facebook
      0
      fr
      sem-non-brand
      google
      omg
      Web
      Windows Desktop
      IE
      NDF
    
    
      211196
      xcm902wnfb
      2014-06-26
      20140626191640
      2015-02-08
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      linked
      Moweb
      iPhone
      Mobile Safari
      US
    
    
      211228
      n9ks4ugwqg
      2014-06-26
      20140626202148
      2014-07-01
      MALE
      110
      basic
      25
      en
      direct
      direct
      untracked
      iOS
      iPhone
      -unknown-
      US
    
    
      211373
      dtwwccb707
      2014-06-27
      20140627002940
      NaN
      FEMALE
      105
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Mac Desktop
      Safari
      NDF
    
    
      211388
      r5m2au58pp
      2014-06-27
      20140627004836
      2014-07-24
      -unknown-
      105
      basic
      0
      en
      sem-brand
      google
      omg
      Web
      Mac Desktop
      Firefox
      US
    
    
      211424
      mi1cn9suyv
      2014-06-27
      20140627022018
      2014-06-27
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      linked
      Web
      iPad
      Mobile Safari
      US
    
    
      211496
      025sv1949e
      2014-06-27
      20140627043908
      2014-07-11
      FEMALE
      1926
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      Firefox
      US
    
    
      211552
      mi21cgnfr3
      2014-06-27
      20140627064132
      NaN
      FEMALE
      101
      basic
      0
      en
      sem-brand
      google
      omg
      Web
      Windows Desktop
      Chrome
      NDF
    
    
      211728
      ldwhkrhtk6
      2014-06-27
      20140627184539
      2014-06-27
      MALE
      105
      basic
      0
      en
      direct
      direct
      linked
      Web
      Mac Desktop
      Chrome
      US
    
    
      212720
      a6r17kfpyi
      2014-06-29
      20140629193647
      2014-06-29
      -unknown-
      105
      basic
      0
      en
      direct
      direct
      omg
      Web
      Mac Desktop
      Safari
      US
    
    
      212786
      4wub3u531r
      2014-06-29
      20140629214926
      NaN
      MALE
      110
      facebook
      0
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      IE
      NDF
    
    
      212981
      l6y2gwgoni
      2014-06-30
      20140630035617
      2014-07-27
      -unknown-
      105
      basic
      0
      en
      sem-brand
      google
      omg
      Web
      Mac Desktop
      Chrome
      US
    
    
      213106
      1c83n850up
      2014-06-30
      20140630085741
      NaN
      MALE
      110
      facebook
      25
      en
      direct
      direct
      untracked
      iOS
      iPhone
      -unknown-
      NDF
    
    
      213182
      kh8odg7wnv
      2014-06-30
      20140630164858
      2014-07-04
      FEMALE
      105
      basic
      0
      en
      direct
      direct
      linked
      Web
      iPhone
      Mobile Safari
      ES
    
    
      213199
      lyuno62d1q
      2014-06-30
      20140630171821
      2014-07-07
      MALE
      105
      basic
      0
      en
      direct
      direct
      untracked
      Web
      Windows Desktop
      Chrome
      US
    
    
      213345
      gfend4omwv
      2014-06-30
      20140630205707
      2014-07-01
      FEMALE
      105
      basic
      0
      en
      direct
      direct
      omg
      Web
      iPhone
      Mobile Safari
      US
    
  

2413 rows × 16 columns

Naimputer



In [47]:

    
#plt.style.use('ggplot') # ggplot2 style for mathplotlib



In [48]:

    
naimp = NaImputer(df_airbnb)



In [49]:

    
naimp.data_isna.corr()









    Out[49]:






  
    
      
      is_na_date_first_booking
      is_na_age
      is_na_first_affiliate_tracked
    
  
  
    
      is_na_date_first_booking
      1.000000
      0.314193
      0.045084
    
    
      is_na_age
      0.314193
      1.000000
      0.087806
    
    
      is_na_first_affiliate_tracked
      0.045084
      0.087806
      1.000000



In [50]:

    
naimp.plot_corrplot_na()



In [51]:

    
missing_map(df_airbnb, nmax=200)









    Out[51]:





<matplotlib.axes._subplots.AxesSubplot at 0x11bd9efd0>



In [54]:

    
naimp.get_isna_ttest('age', type_test='ks')









    Out[54]:






  
    
      
      pvalue
      statistic
      type_test
    
  
  
    
      timestamp_first_active
      0.000000e+00
      0.090653
      ks
    
    
      signup_flow
      1.989118e-318
      0.084137
      ks



In [55]:

    
naimp.get_isna_ttest('age', type_test='ttest')









    Out[55]:






  
    
      
      pvalue
      statistic
      type_test
    
  
  
    
      timestamp_first_active
      1.804771e-236
      32.877547
      ttest
    
    
      signup_flow
      0.000000e+00
      54.941500
      ttest



In [57]:

    
naimp.get_overlapping_matrix()









    Out[57]:






  
    
      
      date_first_booking
      age
      first_affiliate_tracked
    
  
  
    
      date_first_booking
      1.000000
      0.542897
      0.034743
    
    
      age
      0.768428
      1.000000
      0.045835
    
    
      first_affiliate_tracked
      0.713438
      0.664963
      1.000000



In [58]:

    
naimp.nacolcount()









    Out[58]:






  
    
      
      Nanumber
      Napercentage
    
  
  
    
      id
      0
      0.000000
    
    
      date_account_created
      0
      0.000000
    
    
      timestamp_first_active
      0
      0.000000
    
    
      date_first_booking
      124543
      0.583473
    
    
      gender
      0
      0.000000
    
    
      age
      87990
      0.412226
    
    
      signup_method
      0
      0.000000
    
    
      signup_flow
      0
      0.000000
    
    
      language
      0
      0.000000
    
    
      affiliate_channel
      0
      0.000000
    
    
      affiliate_provider
      0
      0.000000
    
    
      first_affiliate_tracked
      6065
      0.028414
    
    
      signup_app
      0
      0.000000
    
    
      first_device_type
      0
      0.000000
    
    
      first_browser
      0
      0.000000
    
    
      country_destination
      0
      0.000000



In [ ]:

	iqr_score	mad_score	z_score
1	0.266667	0.385429	-0.074958
2	1.466667	2.119857	0.040675
3	0.533333	0.770857	-0.049261
4	0.466667	0.674500	-0.055685
6	0.800000	1.156286	-0.023565
7	0.866667	1.252643	-0.017141
8	1.066667	1.541714	0.002131
9	0.800000	1.156286	-0.023565
10	0.133333	0.192714	-0.087806
11	0.866667	1.252643	-0.017141

	survived	pclass	sex	age	sibsp	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22	1	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38	1	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35	1	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35	0	8.0500	S	Third	man	True	NaN	Southampton	no	True

	dtypes_p	dtypes_r	nb_missing	perc_missing	nb_unique_values	constant_columns	na_columns	is_key	dtype_infer	string_length
survived	int64	numeric	0	0.000000	2	False	False	False	integer	NaN
pclass	int64	numeric	0	0.000000	3	False	False	False	integer	NaN
sex	object	factor	0	0.000000	2	False	False	False	string	6
age	float64	numeric	177	0.198653	88	False	False	False	floating	NaN
sibsp	int64	numeric	0	0.000000	7	False	False	False	integer	NaN
parch	int64	numeric	0	0.000000	7	False	False	False	integer	NaN
fare	float64	numeric	0	0.000000	248	False	False	False	floating	NaN
embarked	object	factor	2	0.002245	3	False	False	False	mixed	1
class	object	factor	0	0.000000	3	False	False	False	string	6
who	object	factor	0	0.000000	3	False	False	False	string	5
adult_male	bool	factor	0	0.000000	2	False	False	False	boolean	NaN
deck	object	factor	688	0.772166	7	False	False	False	mixed	1
embark_town	object	factor	2	0.002245	3	False	False	False	mixed	11
alive	object	factor	0	0.000000	2	False	False	False	string	3
alone	bool	factor	0	0.000000	2	False	False	False	boolean	NaN

	Count	Min	FirstQuartile	Median	Mean	Std	Mad	Skewness	Kurtosis	Thirdquartile	Max
survived	891	0.00	0.0000	0.0000	0.383838	0.486592	0.473013	0.478523	-1.775005	1	1.0000
pclass	891	1.00	2.0000	3.0000	2.308642	0.836071	0.761968	-0.630548	-1.280015	3	3.0000
age	714	0.42	20.1250	28.0000	29.699118	14.526497	11.322944	0.389108	0.178274	38	80.0000
sibsp	891	0.00	0.0000	0.0000	0.523008	1.102743	0.713780	3.695352	17.880420	1	8.0000
parch	891	0.00	0.0000	0.0000	0.381594	0.806057	0.580742	2.749117	9.778125	0	6.0000
fare	891	0.00	7.9104	14.4542	32.204208	49.693429	28.163692	4.787317	33.398141	31	512.3292

	survived	pclass	age	sibsp	parch	fare
0.0	0	1	0.42	0	0	0.0000
0.1	0	1	14.00	0	0	7.5500
0.2	0	1	19.00	0	0	7.8542
0.3	0	2	22.00	0	0	8.0500
0.4	0	2	25.00	0	0	10.5000
0.5	0	3	28.00	0	0	14.4542
0.6	0	3	31.80	0	0	21.6792
0.7	1	3	36.00	1	0	27.0000
0.8	1	3	41.00	1	1	39.6875
0.9	1	3	50.00	1	2	77.9583
1.0	1	3	80.00	8	6	512.3292

	survived	pclass	sex	age	sibsp	parch	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
27	0	1	male	19	3	2	263.0000	S	First	man	True	C	Southampton	no	False
31	1	1	female	NaN	1	0	146.5208	C	First	woman	False	B	Cherbourg	yes	False
34	0	1	male	28	1	0	82.1708	C	First	man	True	NaN	Cherbourg	no	False
52	1	1	female	49	1	0	76.7292	C	First	woman	False	D	Cherbourg	yes	False
61	1	1	female	38	0	0	80.0000	NaN	First	woman	False	B	NaN	yes	True

	fare_iqr_score	fare_mad_score	fare_z_score	fare_is_outlier
0	-0.312011	-0.703808	-0.502445	0
1	2.461242	5.551871	0.786845	1
2	-0.282777	-0.637865	-0.488854	0
3	1.673732	3.775469	0.420730	1
4	-0.277363	-0.625653	-0.486337	0
5	-0.259680	-0.585764	-0.478116	0
6	1.620136	3.654572	0.395814	1
7	0.286744	0.646813	-0.224083	0
8	-0.143827	-0.324433	-0.424256	0
9	0.676348	1.525651	-0.042956	0
10	0.097265	0.219402	-0.312172	0
11	0.523864	1.181689	-0.113846	0
12	-0.277363	-0.625653	-0.486337	0
13	0.728501	1.643294	-0.018709	0
14	-0.285843	-0.644781	-0.490280	0
15	0.066948	0.151016	-0.326267	0
16	0.635386	1.433251	-0.061999	0
17	-0.062981	-0.142067	-0.386671	0
18	0.153567	0.346404	-0.285997	0
19	-0.313093	-0.706251	-0.502949	0

	dtypes_p	dtypes_r	nb_missing	perc_missing	nb_unique_values	constant_columns	na_columns	is_key	dtype_infer	string_length
id	object	character	0	0.000000	213451	False	False	True	string	10
date_account_created	object	character	0	0.000000	1634	False	False	False	string	10
timestamp_first_active	int64	numeric	0	0.000000	213451	False	False	True	integer	NaN
date_first_booking	object	character	124543	0.583473	1976	False	False	False	mixed	10
gender	object	factor	0	0.000000	4	False	False	False	string	9
age	float64	numeric	87990	0.412226	127	False	False	False	floating	NaN
signup_method	object	factor	0	0.000000	3	False	False	False	string	8
signup_flow	int64	numeric	0	0.000000	17	False	False	False	integer	NaN
language	object	character	0	0.000000	25	False	False	False	string	2
affiliate_channel	object	factor	0	0.000000	8	False	False	False	string	13
affiliate_provider	object	character	0	0.000000	18	False	False	False	string	19
first_affiliate_tracked	object	factor	6065	0.028414	7	False	False	False	mixed	13
signup_app	object	factor	0	0.000000	4	False	False	False	string	7
first_device_type	object	factor	0	0.000000	9	False	False	False	string	18
first_browser	object	character	0	0.000000	52	False	False	False	string	20
country_destination	object	character	0	0.000000	12	False	False	False	string	5

	Count	Min	FirstQuartile	Median	Mean	Std	Mad	Skewness	Kurtosis	Thirdquartile	Max
timestamp_first_active	213451	2.009032e+13	2.012123e+13	2.013091e+13	2.013085e+13	9.253717e+09	6.898231e+09	-0.860883	-3.142610	2.014031e+13	2.014063e+13
age	125461	1.000000e+00	2.800000e+01	3.400000e+01	4.966834e+01	1.556666e+02	2.838745e+01	12.422586	153.568929	4.300000e+01	2.014000e+03
signup_flow	213451	0.000000e+00	0.000000e+00	0.000000e+00	3.267387e+00	7.637707e+00	5.169522e+00	2.234366	3.324444	0.000000e+00	2.500000e+01

	id	date_account_created	timestamp_first_active	date_first_booking	gender	age	signup_method	signup_flow	language	affiliate_channel	affiliate_provider	first_affiliate_tracked	signup_app	first_device_type	first_browser	country_destination
388	v2x0ms9c62	2010-04-11	20100411065602	2010-04-13	-unknown-	2014	basic	3	en	other	craigslist	untracked	Web	Windows Desktop	Firefox	FR
398	9ouah6tc30	2010-04-12	20100412231534	2010-04-12	FEMALE	104	facebook	3	en	other	craigslist	linked	Web	iPhone	Mobile Safari	FR
627	dc3udjfdij	2010-05-19	20100519012455	2010-06-16	-unknown-	105	basic	2	en	other	craigslist	omg	Web	Mac Desktop	Safari	FR
673	umf1wdk9uc	2010-05-25	20100525155541	NaN	FEMALE	2014	basic	2	en	other	craigslist	untracked	Web	Mac Desktop	Safari	NDF
1040	m82epwn7i8	2010-07-14	20100714230556	2010-07-15	MALE	2014	facebook	0	en	other	craigslist	untracked	Web	Mac Desktop	Chrome	US
1177	2th813zdx7	2010-07-25	20100725234419	2010-07-26	MALE	2013	facebook	3	en	direct	direct	untracked	Web	Mac Desktop	Chrome	US
1190	qc9se9qucz	2010-07-27	20100727002029	2010-07-27	-unknown-	105	basic	2	en	other	craigslist	untracked	Web	Mac Desktop	Firefox	US
1200	3amf04n3o3	2010-07-27	20100727190447	2010-07-29	FEMALE	2014	basic	2	en	direct	direct	untracked	Web	Windows Desktop	IE	US
1208	cguxptdi6h	2010-07-28	20100728034415	2010-07-28	-unknown-	105	basic	3	en	direct	direct	untracked	Web	Mac Desktop	Firefox	US
1239	6vpmryt377	2010-07-30	20100730055204	2010-07-30	FEMALE	2014	basic	0	en	direct	direct	untracked	Web	Mac Desktop	Firefox	CA
1257	uxy91xb5p2	2010-08-01	20100801071023	2010-08-01	MALE	2014	facebook	3	en	direct	direct	untracked	Web	Windows Desktop	Chrome	US
1308	bno0vva4uz	2010-08-06	20100806133725	NaN	MALE	2014	basic	3	en	other	craigslist	linked	Web	Windows Desktop	Chrome	NDF
1314	4wir86n2az	2010-08-06	20100806201512	NaN	MALE	104	basic	2	en	direct	direct	untracked	Web	Other/Unknown	-unknown-	NDF
1474	h3rrmak4tu	2010-08-21	20100821225214	NaN	MALE	2014	facebook	2	pt	other	craigslist	untracked	Web	Mac Desktop	Safari	NDF
1502	fou0j7fhnm	2010-08-24	20100824205045	2010-08-25	FEMALE	2014	basic	3	en	sem-non-brand	google	untracked	Web	Windows Desktop	Chrome	US
1529	1j5uk4f0ay	2010-08-27	20100827135415	NaN	MALE	105	facebook	2	en	other	craigslist	linked	Web	Windows Desktop	IE	NDF
1615	uqncyj8byz	2010-09-04	20100904210638	2010-10-03	FEMALE	101	basic	2	en	direct	direct	untracked	Web	Windows Desktop	IE	US
1619	9lkved7fhg	2010-09-05	20100905050114	2010-09-05	FEMALE	98	basic	3	en	seo	google	untracked	Web	Mac Desktop	Safari	US
1739	eef74e94rg	2010-09-15	20100915023255	2010-09-20	-unknown-	105	facebook	2	en	sem-non-brand	google	tracked-other	Web	Other/Unknown	Mobile Firefox	US
1826	kfeh5350pi	2010-09-21	20100921134556	2010-09-26	-unknown-	2014	facebook	2	en	direct	direct	untracked	Web	Mac Desktop	Firefox	other
1860	9aouhu15rk	2010-09-23	20100923114758	2011-03-25	FEMALE	2014	basic	2	en	seo	google	linked	Web	Windows Desktop	IE	US
2228	i0j7vqzk2m	2010-10-29	20101029182448	NaN	MALE	115	facebook	0	en	other	craigslist	NaN	Web	Other/Unknown	-unknown-	NDF
2235	9s9mdj9xeb	2010-10-30	20101030104540	NaN	MALE	104	facebook	2	en	content	google	omg	Web	Mac Desktop	Safari	NDF
2289	v1xp5ggiti	2010-11-04	20101104235028	2011-05-16	-unknown-	105	basic	2	en	direct	direct	untracked	Web	Windows Desktop	Chrome	US
2582	aq0lqrjf2y	2010-12-06	20101206173457	2010-12-06	-unknown-	2014	facebook	3	en	sem-non-brand	google	linked	Web	iPad	Mobile Safari	FR
2654	kkfa80noq4	2010-12-15	20101215031123	2010-12-16	FEMALE	105	basic	2	en	other	craigslist	untracked	Web	iPhone	Mobile Safari	US
2657	qg2p3l847j	2010-12-15	20101215040851	NaN	FEMALE	107	basic	2	en	other	craigslist	untracked	Web	Windows Desktop	Chrome	NDF
2775	34tpc91y6j	2010-12-29	20101229034206	2011-09-29	MALE	105	basic	0	en	direct	direct	untracked	Web	Mac Desktop	Safari	US
2825	wohy9u0us5	2011-01-03	20110103163350	NaN	FEMALE	99	basic	3	en	sem-non-brand	google	untracked	Web	iPad	Mobile Safari	NDF
2846	ee45osn71s	2011-01-05	20110105173958	NaN	FEMALE	110	facebook	2	en	direct	direct	linked	Web	Mac Desktop	Safari	NDF
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
208435	xddkqb6u5c	2014-06-22	20140622015513	NaN	MALE	105	basic	0	en	sem-non-brand	google	omg	Web	Mac Desktop	Safari	NDF
208675	wucmeipz5m	2014-06-22	20140622170254	NaN	-unknown-	105	basic	0	en	direct	direct	untracked	Web	Windows Desktop	Chrome	NDF
208704	fo3mxaa34u	2014-06-22	20140622180847	2014-06-22	-unknown-	105	basic	0	en	direct	direct	linked	Web	Mac Desktop	Safari	FR
208819	0dn4tfj6gw	2014-06-22	20140622214130	NaN	FEMALE	1949	basic	0	en	direct	direct	untracked	Moweb	Android Phone	Chrome Mobile	NDF
209074	py0sivmitv	2014-06-23	20140623062745	2014-06-23	-unknown-	105	basic	0	en	direct	direct	untracked	Web	Mac Desktop	Chrome	US
209222	p2bg0ux0uf	2014-06-23	20140623173318	2014-06-28	-unknown-	105	basic	0	en	sem-non-brand	google	omg	Web	Windows Desktop	IE	US
209228	v4o93j53zn	2014-06-23	20140623174300	NaN	MALE	110	facebook	0	en	direct	direct	linked	Web	Mac Desktop	Chrome	NDF
209672	1zzmeakk1b	2014-06-24	20140624051116	NaN	MALE	105	facebook	0	en	direct	direct	linked	Moweb	iPhone	Mobile Safari	NDF
210086	0fjffldr3f	2014-06-24	20140624224114	2014-06-24	-unknown-	105	basic	0	en	direct	direct	untracked	Web	Windows Desktop	Firefox	US
210232	nhzq5d8nby	2014-06-25	20140625030229	2014-06-25	FEMALE	105	basic	0	en	sem-brand	google	omg	Web	Mac Desktop	Safari	FR
210257	smzfy2c47g	2014-06-25	20140625035237	2014-07-21	-unknown-	105	basic	0	en	direct	direct	untracked	Web	Mac Desktop	Safari	US
210335	oldf6jxjv1	2014-06-25	20140625054539	2014-12-23	-unknown-	105	basic	25	en	direct	direct	untracked	iOS	iPhone	-unknown-	US
210793	ty6g90h81q	2014-06-26	20140626003400	NaN	FEMALE	105	basic	0	en	sem-non-brand	google	omg	Web	Windows Desktop	IE	NDF
210928	xr911lto8m	2014-06-26	20140626041212	2014-07-02	MALE	105	basic	0	en	direct	direct	linked	Web	Mac Desktop	Safari	GB
211074	9ppttyasd8	2014-06-26	20140626131631	NaN	FEMALE	110	facebook	0	fr	sem-non-brand	google	omg	Web	Windows Desktop	IE	NDF
211196	xcm902wnfb	2014-06-26	20140626191640	2015-02-08	-unknown-	105	basic	0	en	direct	direct	linked	Moweb	iPhone	Mobile Safari	US
211228	n9ks4ugwqg	2014-06-26	20140626202148	2014-07-01	MALE	110	basic	25	en	direct	direct	untracked	iOS	iPhone	-unknown-	US
211373	dtwwccb707	2014-06-27	20140627002940	NaN	FEMALE	105	basic	0	en	direct	direct	untracked	Web	Mac Desktop	Safari	NDF
211388	r5m2au58pp	2014-06-27	20140627004836	2014-07-24	-unknown-	105	basic	0	en	sem-brand	google	omg	Web	Mac Desktop	Firefox	US
211424	mi1cn9suyv	2014-06-27	20140627022018	2014-06-27	-unknown-	105	basic	0	en	direct	direct	linked	Web	iPad	Mobile Safari	US
211496	025sv1949e	2014-06-27	20140627043908	2014-07-11	FEMALE	1926	basic	0	en	direct	direct	untracked	Web	Windows Desktop	Firefox	US
211552	mi21cgnfr3	2014-06-27	20140627064132	NaN	FEMALE	101	basic	0	en	sem-brand	google	omg	Web	Windows Desktop	Chrome	NDF
211728	ldwhkrhtk6	2014-06-27	20140627184539	2014-06-27	MALE	105	basic	0	en	direct	direct	linked	Web	Mac Desktop	Chrome	US
212720	a6r17kfpyi	2014-06-29	20140629193647	2014-06-29	-unknown-	105	basic	0	en	direct	direct	omg	Web	Mac Desktop	Safari	US
212786	4wub3u531r	2014-06-29	20140629214926	NaN	MALE	110	facebook	0	en	direct	direct	untracked	Web	Windows Desktop	IE	NDF
212981	l6y2gwgoni	2014-06-30	20140630035617	2014-07-27	-unknown-	105	basic	0	en	sem-brand	google	omg	Web	Mac Desktop	Chrome	US
213106	1c83n850up	2014-06-30	20140630085741	NaN	MALE	110	facebook	25	en	direct	direct	untracked	iOS	iPhone	-unknown-	NDF
213182	kh8odg7wnv	2014-06-30	20140630164858	2014-07-04	FEMALE	105	basic	0	en	direct	direct	linked	Web	iPhone	Mobile Safari	ES
213199	lyuno62d1q	2014-06-30	20140630171821	2014-07-07	MALE	105	basic	0	en	direct	direct	untracked	Web	Windows Desktop	Chrome	US
213345	gfend4omwv	2014-06-30	20140630205707	2014-07-01	FEMALE	105	basic	0	en	direct	direct	omg	Web	iPhone	Mobile Safari	US

	is_na_date_first_booking	is_na_age	is_na_first_affiliate_tracked
is_na_date_first_booking	1.000000	0.314193	0.045084
is_na_age	0.314193	1.000000	0.087806
is_na_first_affiliate_tracked	0.045084	0.087806	1.000000

	pvalue	statistic	type_test
timestamp_first_active	0.000000e+00	0.090653	ks
signup_flow	1.989118e-318	0.084137	ks

	pvalue	statistic	type_test
timestamp_first_active	1.804771e-236	32.877547	ttest
signup_flow	0.000000e+00	54.941500	ttest

	date_first_booking	age	first_affiliate_tracked
date_first_booking	1.000000	0.542897	0.034743
age	0.768428	1.000000	0.045835
first_affiliate_tracked	0.713438	0.664963	1.000000