In [1]:
from autoc import DataExploration 
from autoc.utils.getdata import get_dataset
from autoc.utils.helpers import create_test_df
import pandas as pd

In [2]:
list_other_na = {'unknown', 'na', 'missing', 'n/a','not available'}

In [3]:
df_test = create_test_df()

In [4]:
exploration_test = DataExploration(df_test)

In [5]:
exploration_test.structure()


Out[5]:
dtypes_p dtypes_r nb_missing perc_missing nb_unique_values constant_columns na_columns is_key dtype_infer
id int64 numeric 0 0.000 1000 False False True integer
member_id int64 numeric 0 0.000 1000 False False True integer
na_col float64 numeric 1000 1.000 0 False True False floating
id_na float64 numeric 3 0.003 997 False False False floating
constant_col object factor 0 0.000 1 True False False string
constant_col_num int64 numeric 0 0.000 1 True False False integer
character_factor object factor 0 0.000 7 False False False string
num_factor int64 numeric 0 0.000 4 False False False integer
nearzerovar_variable object factor 0 0.000 2 False False False string
binary_variable int64 numeric 0 0.000 2 False False False integer
character_variable object character 0 0.000 1000 False False True string
duplicated_column int64 numeric 0 0.000 1000 False False True integer
many_missing_70 float64 numeric 700 0.700 1 True False False floating
character_variable_fillna object factor 300 0.300 3 False False False mixed
numeric_variable_fillna float64 numeric 200 0.200 2 False False False floating
num_variable float64 numeric 0 0.000 1 True False False floating
int_factor_10 int64 numeric 0 0.000 10 False False False integer
outlier float64 numeric 0 0.000 999 False False False floating
datetime datetime64[ns] character 0 0.000 1000 False False True datetime
None_100 float64 numeric 100 0.100 1 True False False floating
None_na_200 float64 numeric 200 0.200 1 True False False floating
character_variable_up1 object factor 0 0.000 3 False False False string
character_variable_up2 object factor 0 0.000 3 False False False string
other_na object factor 0 0.000 9 False False False string

In [6]:
df_test.id_na


Out[6]:
0         1
1       NaN
2       NaN
3       NaN
4         5
5         6
6         7
7         8
8         9
9        10
10       11
11       12
12       13
13       14
14       15
15       16
16       17
17       18
18       19
19       20
20       21
21       22
22       23
23       24
24       25
25       26
26       27
27       28
28       29
29       30
       ... 
970     971
971     972
972     973
973     974
974     975
975     976
976     977
977     978
978     979
979     980
980     981
981     982
982     983
983     984
984     985
985     986
986     987
987     988
988     989
989     990
990     991
991     992
992     993
993     994
994     995
995     996
996     997
997     998
998     999
999    1000
Name: id_na, dtype: float64

In [7]:
exploration_test.get_infos_consistency()


Out[7]:
{'constant_columns': {'action': 'delete',
  'level': 'WARNING',
  'value': ['na_col', 'constant_col', 'constant_col_num', 'num_variable']},
 'dup_columns': {'action': 'delete',
  'level': 'ERROR',
  'value': [['id', 'duplicated_column']]},
 'nacols_full': {'action': 'delete', 'level': 'ERROR', 'value': ['na_col']},
 'narows_full': {'action': 'delete', 'level': 'ERROR', 'value': []},
 'nb_duplicated_rows': {'action': 'delete', 'level': 'ERROR', 'value': 0}}

In [8]:
exploration_test.print_infos()


{'constant_columns': {'action': 'delete',
                      'level': 'WARNING',
                      'value': ['na_col',
                                'constant_col',
                                'constant_col_num',
                                'num_variable']},
 'dup_columns': {'action': 'delete',
                 'level': 'ERROR',
                 'value': [['id', 'duplicated_column']]},
 'nacols_full': {'action': 'delete', 'level': 'ERROR', 'value': ['na_col']},
 'narows_full': {'action': 'delete', 'level': 'ERROR', 'value': []},
 'nb_duplicated_rows': {'action': 'delete', 'level': 'ERROR', 'value': 0}}

In [9]:
df_test['Upper_text_1'] = 'AHH'

In [10]:
df_test.dtypes


Out[10]:
id                                    int64
member_id                             int64
na_col                              float64
id_na                               float64
constant_col                         object
constant_col_num                      int64
character_factor                     object
num_factor                            int64
nearzerovar_variable                 object
binary_variable                       int64
character_variable                   object
duplicated_column                     int64
many_missing_70                     float64
character_variable_fillna            object
numeric_variable_fillna             float64
num_variable                        float64
int_factor_10                         int64
outlier                             float64
datetime                     datetime64[ns]
None_100                            float64
None_na_200                         float64
character_variable_up1               object
character_variable_up2               object
other_na                             object
Upper_text_1                         object
dtype: object

In [11]:
df_test.applymap(lambda x: x.lower() if type(x) == str else x)


Out[11]:
id member_id na_col id_na constant_col constant_col_num character_factor num_factor nearzerovar_variable binary_variable ... num_variable int_factor_10 outlier datetime None_100 None_na_200 character_variable_up1 character_variable_up2 other_na Upper_text_1
0 1 10 NaN 1 constant 0 B 1 one_value 1 ... 100 6 0.504816 2015-01-01 00:00:00 1 1 a a missing ahh
1 2 20 NaN NaN constant 0 C 3 most_common_value 1 ... 100 2 10.000000 2015-01-01 01:00:00 1 1 a a missing ahh
2 3 30 NaN NaN constant 0 E 2 most_common_value 1 ... 100 8 -0.297964 2015-01-01 02:00:00 1 1 a a missing ahh
3 4 40 NaN NaN constant 0 D 3 most_common_value 1 ... 100 8 -1.243812 2015-01-01 03:00:00 1 1 a a missing ahh
4 5 50 NaN 5 constant 0 F 2 most_common_value 1 ... 100 9 0.759264 2015-01-01 04:00:00 1 1 a a missing ahh
5 6 60 NaN 6 constant 0 C 3 most_common_value 0 ... 100 1 0.517270 2015-01-01 05:00:00 1 1 a a missing ahh
6 7 70 NaN 7 constant 0 C 1 most_common_value 0 ... 100 8 0.418787 2015-01-01 06:00:00 1 1 a a missing ahh
7 8 80 NaN 8 constant 0 C 3 most_common_value 0 ... 100 5 -0.578671 2015-01-01 07:00:00 1 1 a a missing ahh
8 9 90 NaN 9 constant 0 B 2 most_common_value 1 ... 100 6 -0.203731 2015-01-01 08:00:00 1 1 a a missing ahh
9 10 100 NaN 10 constant 0 C 2 most_common_value 0 ... 100 8 1.616109 2015-01-01 09:00:00 1 1 a a missing ahh
10 11 110 NaN 11 constant 0 D 1 most_common_value 1 ... 100 3 5.000000 2015-01-01 10:00:00 1 1 a a missing ahh
11 12 120 NaN 12 constant 0 B 3 most_common_value 1 ... 100 3 -1.135743 2015-01-01 11:00:00 1 1 a a missing ahh
12 13 130 NaN 13 constant 0 D 3 most_common_value 0 ... 100 6 -1.648627 2015-01-01 12:00:00 1 1 a a missing ahh
13 14 140 NaN 14 constant 0 A 1 most_common_value 0 ... 100 3 -0.672021 2015-01-01 13:00:00 1 1 a a missing ahh
14 15 150 NaN 15 constant 0 A 4 most_common_value 0 ... 100 5 -0.661877 2015-01-01 14:00:00 1 1 a a missing ahh
15 16 160 NaN 16 constant 0 D 1 most_common_value 0 ... 100 4 0.901657 2015-01-01 15:00:00 1 1 a a missing ahh
16 17 170 NaN 17 constant 0 E 3 most_common_value 1 ... 100 8 -0.848503 2015-01-01 16:00:00 1 1 a a missing ahh
17 18 180 NaN 18 constant 0 C 3 most_common_value 0 ... 100 3 -0.282732 2015-01-01 17:00:00 1 1 a a missing ahh
18 19 190 NaN 19 constant 0 F 4 most_common_value 1 ... 100 4 -1.800709 2015-01-01 18:00:00 1 1 a a missing ahh
19 20 200 NaN 20 constant 0 C 3 most_common_value 1 ... 100 0 0.659969 2015-01-01 19:00:00 1 1 a a missing ahh
20 21 210 NaN 21 constant 0 F 2 most_common_value 1 ... 100 3 0.918322 2015-01-01 20:00:00 1 1 a a missing ahh
21 22 220 NaN 22 constant 0 G 2 most_common_value 0 ... 100 0 -0.337731 2015-01-01 21:00:00 1 1 a a missing ahh
22 23 230 NaN 23 constant 0 B 2 most_common_value 0 ... 100 8 -1.386793 2015-01-01 22:00:00 1 1 a a missing ahh
23 24 240 NaN 24 constant 0 F 4 most_common_value 0 ... 100 1 1.239776 2015-01-01 23:00:00 1 1 a a missing ahh
24 25 250 NaN 25 constant 0 A 3 most_common_value 1 ... 100 9 0.589520 2015-01-02 00:00:00 1 1 a a missing ahh
25 26 260 NaN 26 constant 0 A 4 most_common_value 0 ... 100 1 0.060490 2015-01-02 01:00:00 1 1 a a missing ahh
26 27 270 NaN 27 constant 0 G 2 most_common_value 1 ... 100 9 0.053333 2015-01-02 02:00:00 1 1 a a missing ahh
27 28 280 NaN 28 constant 0 F 4 most_common_value 1 ... 100 2 1.232756 2015-01-02 03:00:00 1 1 a a missing ahh
28 29 290 NaN 29 constant 0 E 1 most_common_value 0 ... 100 8 -1.098578 2015-01-02 04:00:00 1 1 a a missing ahh
29 30 300 NaN 30 constant 0 D 3 most_common_value 1 ... 100 3 0.579508 2015-01-02 05:00:00 1 1 a a missing ahh
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
970 971 9710 NaN 971 constant 0 G 2 most_common_value 1 ... 100 6 -0.687444 2015-02-10 10:00:00 NaN NaN c d do_not_touch ahh
971 972 9720 NaN 972 constant 0 C 1 most_common_value 0 ... 100 9 1.475201 2015-02-10 11:00:00 NaN NaN c d do_not_touch ahh
972 973 9730 NaN 973 constant 0 A 4 most_common_value 1 ... 100 2 -0.519341 2015-02-10 12:00:00 NaN NaN c d do_not_touch ahh
973 974 9740 NaN 974 constant 0 B 1 most_common_value 0 ... 100 5 0.068614 2015-02-10 13:00:00 NaN NaN c d do_not_touch ahh
974 975 9750 NaN 975 constant 0 G 1 most_common_value 1 ... 100 7 -0.723333 2015-02-10 14:00:00 NaN NaN c d do_not_touch ahh
975 976 9760 NaN 976 constant 0 D 2 most_common_value 0 ... 100 3 -0.491493 2015-02-10 15:00:00 NaN NaN c d do_not_touch ahh
976 977 9770 NaN 977 constant 0 E 4 most_common_value 1 ... 100 4 -1.262389 2015-02-10 16:00:00 NaN NaN c d do_not_touch ahh
977 978 9780 NaN 978 constant 0 E 3 most_common_value 0 ... 100 4 0.630329 2015-02-10 17:00:00 NaN NaN c d do_not_touch ahh
978 979 9790 NaN 979 constant 0 B 3 most_common_value 0 ... 100 2 1.556459 2015-02-10 18:00:00 NaN NaN c d do_not_touch ahh
979 980 9800 NaN 980 constant 0 C 2 most_common_value 0 ... 100 2 -0.639053 2015-02-10 19:00:00 NaN NaN c d do_not_touch ahh
980 981 9810 NaN 981 constant 0 A 4 most_common_value 0 ... 100 8 -0.307174 2015-02-10 20:00:00 NaN NaN c d do_not_touch ahh
981 982 9820 NaN 982 constant 0 B 3 most_common_value 0 ... 100 3 -0.187863 2015-02-10 21:00:00 NaN NaN c d do_not_touch ahh
982 983 9830 NaN 983 constant 0 B 4 most_common_value 0 ... 100 0 -0.572205 2015-02-10 22:00:00 NaN NaN c d do_not_touch ahh
983 984 9840 NaN 984 constant 0 C 2 most_common_value 0 ... 100 8 -0.414235 2015-02-10 23:00:00 NaN NaN c d do_not_touch ahh
984 985 9850 NaN 985 constant 0 C 2 most_common_value 0 ... 100 7 -0.651123 2015-02-11 00:00:00 NaN NaN c d do_not_touch ahh
985 986 9860 NaN 986 constant 0 D 4 most_common_value 1 ... 100 4 0.769402 2015-02-11 01:00:00 NaN NaN c d do_not_touch ahh
986 987 9870 NaN 987 constant 0 A 2 most_common_value 0 ... 100 1 -0.602695 2015-02-11 02:00:00 NaN NaN c d do_not_touch ahh
987 988 9880 NaN 988 constant 0 C 3 most_common_value 1 ... 100 6 0.216869 2015-02-11 03:00:00 NaN NaN c d do_not_touch ahh
988 989 9890 NaN 989 constant 0 D 1 most_common_value 1 ... 100 9 -1.052458 2015-02-11 04:00:00 NaN NaN c d do_not_touch ahh
989 990 9900 NaN 990 constant 0 F 1 most_common_value 1 ... 100 8 -0.896159 2015-02-11 05:00:00 NaN NaN c d do_not_touch ahh
990 991 9910 NaN 991 constant 0 B 1 most_common_value 0 ... 100 4 -0.530823 2015-02-11 06:00:00 NaN NaN c d do_not_touch ahh
991 992 9920 NaN 992 constant 0 E 3 most_common_value 1 ... 100 3 -0.860887 2015-02-11 07:00:00 NaN NaN c d do_not_touch ahh
992 993 9930 NaN 993 constant 0 G 3 most_common_value 1 ... 100 3 1.259120 2015-02-11 08:00:00 NaN NaN c d do_not_touch ahh
993 994 9940 NaN 994 constant 0 C 2 most_common_value 0 ... 100 7 0.876362 2015-02-11 09:00:00 NaN NaN c d do_not_touch ahh
994 995 9950 NaN 995 constant 0 E 2 most_common_value 1 ... 100 4 -0.498452 2015-02-11 10:00:00 NaN NaN c d do_not_touch ahh
995 996 9960 NaN 996 constant 0 B 4 most_common_value 1 ... 100 9 0.171826 2015-02-11 11:00:00 NaN NaN c d do_not_touch ahh
996 997 9970 NaN 997 constant 0 D 1 most_common_value 1 ... 100 2 0.080841 2015-02-11 12:00:00 NaN NaN c d do_not_touch ahh
997 998 9980 NaN 998 constant 0 A 1 most_common_value 1 ... 100 9 0.193660 2015-02-11 13:00:00 NaN NaN c d do_not_touch ahh
998 999 9990 NaN 999 constant 0 D 2 most_common_value 0 ... 100 7 -2.874406 2015-02-11 14:00:00 NaN NaN c d do_not_touch ahh
999 1000 10000 NaN 1000 constant 0 G 2 most_common_value 1 ... 100 3 -0.095891 2015-02-11 15:00:00 NaN NaN c d do_not_touch ahh

1000 rows × 25 columns


In [16]:
arr =exploration_test.detect_other_na(auto_replace=False)


We detected id                             0
member_id                      0
na_col                         0
id_na                          0
constant_col                   0
constant_col_num               0
character_factor               0
num_factor                     0
nearzerovar_variable           0
binary_variable                0
character_variable             0
duplicated_column              0
many_missing_70                0
character_variable_fillna      0
numeric_variable_fillna        0
num_variable                   0
int_factor_10                  0
outlier                        0
datetime                       0
None_100                       0
None_na_200                    0
character_variable_up1         0
character_variable_up2         0
other_na                     800
Upper_text_1                   0
dtype: int64 other type of missing values

In [20]:
df_2 =df_test.where((arr==False), )

In [22]:
df_2.isnull().sum()


Out[22]:
id                              0
member_id                       0
na_col                       1000
id_na                           3
constant_col                    0
constant_col_num                0
character_factor                0
num_factor                      0
nearzerovar_variable            0
binary_variable                 0
character_variable              0
duplicated_column               0
many_missing_70               700
character_variable_fillna     300
numeric_variable_fillna       200
num_variable                    0
int_factor_10                   0
outlier                         0
datetime                        0
None_100                      100
None_na_200                   200
character_variable_up1          0
character_variable_up2          0
other_na                      800
Upper_text_1                    0
dtype: int64

Test Basic Cleaning


In [13]:
df.shape


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-0fe90f8063b0> in <module>()
----> 1 df.shape

NameError: name 'df' is not defined

In [ ]:
df_test.datetime.dtype == '<M8[ns]'

In [ ]: