In [1]:
from autoc import DataExploration
from autoc.utils.getdata import get_dataset
from autoc.utils.helpers import create_test_df
import pandas as pd
In [2]:
# Candidate string spellings of a "missing value" marker.
# NOTE(review): despite the `list_` prefix this is a set, and no other cell
# shown in this notebook references it — confirm whether it is still needed.
list_other_na = {
    'missing',
    'n/a',
    'na',
    'not available',
    'unknown',
}
In [3]:
# Build the test DataFrame with the create_test_df helper imported from
# autoc.utils.helpers; every later cell works on this frame.
df_test = create_test_df()
In [4]:
# Wrap df_test in autoc's DataExploration; the following cells query this object.
exploration_test = DataExploration(df_test)
In [5]:
# Per-column summary table: pandas dtype, coarse type, missing count and share,
# number of unique values, constant / all-NA / key flags and inferred dtype
# (these are the columns of the displayed output).
exploration_test.structure()
Out[5]:
dtypes_p
dtypes_r
nb_missing
perc_missing
nb_unique_values
constant_columns
na_columns
is_key
dtype_infer
id
int64
numeric
0
0.000
1000
False
False
True
integer
member_id
int64
numeric
0
0.000
1000
False
False
True
integer
na_col
float64
numeric
1000
1.000
0
False
True
False
floating
id_na
float64
numeric
3
0.003
997
False
False
False
floating
constant_col
object
factor
0
0.000
1
True
False
False
string
constant_col_num
int64
numeric
0
0.000
1
True
False
False
integer
character_factor
object
factor
0
0.000
7
False
False
False
string
num_factor
int64
numeric
0
0.000
4
False
False
False
integer
nearzerovar_variable
object
factor
0
0.000
2
False
False
False
string
binary_variable
int64
numeric
0
0.000
2
False
False
False
integer
character_variable
object
character
0
0.000
1000
False
False
True
string
duplicated_column
int64
numeric
0
0.000
1000
False
False
True
integer
many_missing_70
float64
numeric
700
0.700
1
True
False
False
floating
character_variable_fillna
object
factor
300
0.300
3
False
False
False
mixed
numeric_variable_fillna
float64
numeric
200
0.200
2
False
False
False
floating
num_variable
float64
numeric
0
0.000
1
True
False
False
floating
int_factor_10
int64
numeric
0
0.000
10
False
False
False
integer
outlier
float64
numeric
0
0.000
999
False
False
False
floating
datetime
datetime64[ns]
character
0
0.000
1000
False
False
True
datetime
None_100
float64
numeric
100
0.100
1
True
False
False
floating
None_na_200
float64
numeric
200
0.200
1
True
False
False
floating
character_variable_up1
object
factor
0
0.000
3
False
False
False
string
character_variable_up2
object
factor
0
0.000
3
False
False
False
string
other_na
object
factor
0
0.000
9
False
False
False
string
In [6]:
# Show the id_na column: float64, with NaN at index 1-3 (see output).
df_test['id_na']
Out[6]:
0 1
1 NaN
2 NaN
3 NaN
4 5
5 6
6 7
7 8
8 9
9 10
10 11
11 12
12 13
13 14
14 15
15 16
16 17
17 18
18 19
19 20
20 21
21 22
22 23
23 24
24 25
25 26
26 27
27 28
28 29
29 30
...
970 971
971 972
972 973
973 974
974 975
975 976
976 977
977 978
978 979
979 980
980 981
981 982
982 983
983 984
984 985
985 986
986 987
987 988
988 989
989 990
990 991
991 992
992 993
993 994
994 995
995 996
996 997
997 998
998 999
999 1000
Name: id_na, dtype: float64
In [7]:
# Returns a dict of consistency findings. In the output, each entry
# (constant_columns, dup_columns, nacols_full, narows_full, nb_duplicated_rows)
# carries an 'action', a 'level' and a 'value'.
exploration_test.get_infos_consistency()
Out[7]:
{'constant_columns': {'action': 'delete',
'level': 'WARNING',
'value': ['na_col', 'constant_col', 'constant_col_num', 'num_variable']},
'dup_columns': {'action': 'delete',
'level': 'ERROR',
'value': [['id', 'duplicated_column']]},
'nacols_full': {'action': 'delete', 'level': 'ERROR', 'value': ['na_col']},
'narows_full': {'action': 'delete', 'level': 'ERROR', 'value': []},
'nb_duplicated_rows': {'action': 'delete', 'level': 'ERROR', 'value': 0}}
In [8]:
# Prints the consistency report; the printed dict has the same content as the
# value returned by get_infos_consistency() in the previous cell.
exploration_test.print_infos()
{'constant_columns': {'action': 'delete',
'level': 'WARNING',
'value': ['na_col',
'constant_col',
'constant_col_num',
'num_variable']},
'dup_columns': {'action': 'delete',
'level': 'ERROR',
'value': [['id', 'duplicated_column']]},
'nacols_full': {'action': 'delete', 'level': 'ERROR', 'value': ['na_col']},
'narows_full': {'action': 'delete', 'level': 'ERROR', 'value': []},
'nb_duplicated_rows': {'action': 'delete', 'level': 'ERROR', 'value': 0}}
In [9]:
# Add a constant upper-case string column (the scalar is broadcast to every row).
# NOTE: this mutates df_test in place, after exploration_test was built from it.
df_test['Upper_text_1'] = 'AHH'
In [10]:
# List the dtype of every column; the new Upper_text_1 column shows up as object.
df_test.dtypes
Out[10]:
id int64
member_id int64
na_col float64
id_na float64
constant_col object
constant_col_num int64
character_factor object
num_factor int64
nearzerovar_variable object
binary_variable int64
character_variable object
duplicated_column int64
many_missing_70 float64
character_variable_fillna object
numeric_variable_fillna float64
num_variable float64
int_factor_10 int64
outlier float64
datetime datetime64[ns]
None_100 float64
None_na_200 float64
character_variable_up1 object
character_variable_up2 object
other_na object
Upper_text_1 object
dtype: object
In [11]:
# Lower-case every string cell and leave all other values untouched.
# isinstance() replaces the exact-type comparison `type(x) == str`, which
# silently skips str subclasses (e.g. numpy.str_).
# The result is only displayed; df_test itself is not modified.
# NOTE: DataFrame.applymap is deprecated in recent pandas in favour of
# DataFrame.map; it is kept here for compatibility with the pandas version
# this notebook was written against.
df_test.applymap(lambda x: x.lower() if isinstance(x, str) else x)
Out[11]:
id
member_id
na_col
id_na
constant_col
constant_col_num
character_factor
num_factor
nearzerovar_variable
binary_variable
...
num_variable
int_factor_10
outlier
datetime
None_100
None_na_200
character_variable_up1
character_variable_up2
other_na
Upper_text_1
0
1
10
NaN
1
constant
0
B
1
one_value
1
...
100
6
0.504816
2015-01-01 00:00:00
1
1
a
a
missing
ahh
1
2
20
NaN
NaN
constant
0
C
3
most_common_value
1
...
100
2
10.000000
2015-01-01 01:00:00
1
1
a
a
missing
ahh
2
3
30
NaN
NaN
constant
0
E
2
most_common_value
1
...
100
8
-0.297964
2015-01-01 02:00:00
1
1
a
a
missing
ahh
3
4
40
NaN
NaN
constant
0
D
3
most_common_value
1
...
100
8
-1.243812
2015-01-01 03:00:00
1
1
a
a
missing
ahh
4
5
50
NaN
5
constant
0
F
2
most_common_value
1
...
100
9
0.759264
2015-01-01 04:00:00
1
1
a
a
missing
ahh
5
6
60
NaN
6
constant
0
C
3
most_common_value
0
...
100
1
0.517270
2015-01-01 05:00:00
1
1
a
a
missing
ahh
6
7
70
NaN
7
constant
0
C
1
most_common_value
0
...
100
8
0.418787
2015-01-01 06:00:00
1
1
a
a
missing
ahh
7
8
80
NaN
8
constant
0
C
3
most_common_value
0
...
100
5
-0.578671
2015-01-01 07:00:00
1
1
a
a
missing
ahh
8
9
90
NaN
9
constant
0
B
2
most_common_value
1
...
100
6
-0.203731
2015-01-01 08:00:00
1
1
a
a
missing
ahh
9
10
100
NaN
10
constant
0
C
2
most_common_value
0
...
100
8
1.616109
2015-01-01 09:00:00
1
1
a
a
missing
ahh
10
11
110
NaN
11
constant
0
D
1
most_common_value
1
...
100
3
5.000000
2015-01-01 10:00:00
1
1
a
a
missing
ahh
11
12
120
NaN
12
constant
0
B
3
most_common_value
1
...
100
3
-1.135743
2015-01-01 11:00:00
1
1
a
a
missing
ahh
12
13
130
NaN
13
constant
0
D
3
most_common_value
0
...
100
6
-1.648627
2015-01-01 12:00:00
1
1
a
a
missing
ahh
13
14
140
NaN
14
constant
0
A
1
most_common_value
0
...
100
3
-0.672021
2015-01-01 13:00:00
1
1
a
a
missing
ahh
14
15
150
NaN
15
constant
0
A
4
most_common_value
0
...
100
5
-0.661877
2015-01-01 14:00:00
1
1
a
a
missing
ahh
15
16
160
NaN
16
constant
0
D
1
most_common_value
0
...
100
4
0.901657
2015-01-01 15:00:00
1
1
a
a
missing
ahh
16
17
170
NaN
17
constant
0
E
3
most_common_value
1
...
100
8
-0.848503
2015-01-01 16:00:00
1
1
a
a
missing
ahh
17
18
180
NaN
18
constant
0
C
3
most_common_value
0
...
100
3
-0.282732
2015-01-01 17:00:00
1
1
a
a
missing
ahh
18
19
190
NaN
19
constant
0
F
4
most_common_value
1
...
100
4
-1.800709
2015-01-01 18:00:00
1
1
a
a
missing
ahh
19
20
200
NaN
20
constant
0
C
3
most_common_value
1
...
100
0
0.659969
2015-01-01 19:00:00
1
1
a
a
missing
ahh
20
21
210
NaN
21
constant
0
F
2
most_common_value
1
...
100
3
0.918322
2015-01-01 20:00:00
1
1
a
a
missing
ahh
21
22
220
NaN
22
constant
0
G
2
most_common_value
0
...
100
0
-0.337731
2015-01-01 21:00:00
1
1
a
a
missing
ahh
22
23
230
NaN
23
constant
0
B
2
most_common_value
0
...
100
8
-1.386793
2015-01-01 22:00:00
1
1
a
a
missing
ahh
23
24
240
NaN
24
constant
0
F
4
most_common_value
0
...
100
1
1.239776
2015-01-01 23:00:00
1
1
a
a
missing
ahh
24
25
250
NaN
25
constant
0
A
3
most_common_value
1
...
100
9
0.589520
2015-01-02 00:00:00
1
1
a
a
missing
ahh
25
26
260
NaN
26
constant
0
A
4
most_common_value
0
...
100
1
0.060490
2015-01-02 01:00:00
1
1
a
a
missing
ahh
26
27
270
NaN
27
constant
0
G
2
most_common_value
1
...
100
9
0.053333
2015-01-02 02:00:00
1
1
a
a
missing
ahh
27
28
280
NaN
28
constant
0
F
4
most_common_value
1
...
100
2
1.232756
2015-01-02 03:00:00
1
1
a
a
missing
ahh
28
29
290
NaN
29
constant
0
E
1
most_common_value
0
...
100
8
-1.098578
2015-01-02 04:00:00
1
1
a
a
missing
ahh
29
30
300
NaN
30
constant
0
D
3
most_common_value
1
...
100
3
0.579508
2015-01-02 05:00:00
1
1
a
a
missing
ahh
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
970
971
9710
NaN
971
constant
0
G
2
most_common_value
1
...
100
6
-0.687444
2015-02-10 10:00:00
NaN
NaN
c
d
do_not_touch
ahh
971
972
9720
NaN
972
constant
0
C
1
most_common_value
0
...
100
9
1.475201
2015-02-10 11:00:00
NaN
NaN
c
d
do_not_touch
ahh
972
973
9730
NaN
973
constant
0
A
4
most_common_value
1
...
100
2
-0.519341
2015-02-10 12:00:00
NaN
NaN
c
d
do_not_touch
ahh
973
974
9740
NaN
974
constant
0
B
1
most_common_value
0
...
100
5
0.068614
2015-02-10 13:00:00
NaN
NaN
c
d
do_not_touch
ahh
974
975
9750
NaN
975
constant
0
G
1
most_common_value
1
...
100
7
-0.723333
2015-02-10 14:00:00
NaN
NaN
c
d
do_not_touch
ahh
975
976
9760
NaN
976
constant
0
D
2
most_common_value
0
...
100
3
-0.491493
2015-02-10 15:00:00
NaN
NaN
c
d
do_not_touch
ahh
976
977
9770
NaN
977
constant
0
E
4
most_common_value
1
...
100
4
-1.262389
2015-02-10 16:00:00
NaN
NaN
c
d
do_not_touch
ahh
977
978
9780
NaN
978
constant
0
E
3
most_common_value
0
...
100
4
0.630329
2015-02-10 17:00:00
NaN
NaN
c
d
do_not_touch
ahh
978
979
9790
NaN
979
constant
0
B
3
most_common_value
0
...
100
2
1.556459
2015-02-10 18:00:00
NaN
NaN
c
d
do_not_touch
ahh
979
980
9800
NaN
980
constant
0
C
2
most_common_value
0
...
100
2
-0.639053
2015-02-10 19:00:00
NaN
NaN
c
d
do_not_touch
ahh
980
981
9810
NaN
981
constant
0
A
4
most_common_value
0
...
100
8
-0.307174
2015-02-10 20:00:00
NaN
NaN
c
d
do_not_touch
ahh
981
982
9820
NaN
982
constant
0
B
3
most_common_value
0
...
100
3
-0.187863
2015-02-10 21:00:00
NaN
NaN
c
d
do_not_touch
ahh
982
983
9830
NaN
983
constant
0
B
4
most_common_value
0
...
100
0
-0.572205
2015-02-10 22:00:00
NaN
NaN
c
d
do_not_touch
ahh
983
984
9840
NaN
984
constant
0
C
2
most_common_value
0
...
100
8
-0.414235
2015-02-10 23:00:00
NaN
NaN
c
d
do_not_touch
ahh
984
985
9850
NaN
985
constant
0
C
2
most_common_value
0
...
100
7
-0.651123
2015-02-11 00:00:00
NaN
NaN
c
d
do_not_touch
ahh
985
986
9860
NaN
986
constant
0
D
4
most_common_value
1
...
100
4
0.769402
2015-02-11 01:00:00
NaN
NaN
c
d
do_not_touch
ahh
986
987
9870
NaN
987
constant
0
A
2
most_common_value
0
...
100
1
-0.602695
2015-02-11 02:00:00
NaN
NaN
c
d
do_not_touch
ahh
987
988
9880
NaN
988
constant
0
C
3
most_common_value
1
...
100
6
0.216869
2015-02-11 03:00:00
NaN
NaN
c
d
do_not_touch
ahh
988
989
9890
NaN
989
constant
0
D
1
most_common_value
1
...
100
9
-1.052458
2015-02-11 04:00:00
NaN
NaN
c
d
do_not_touch
ahh
989
990
9900
NaN
990
constant
0
F
1
most_common_value
1
...
100
8
-0.896159
2015-02-11 05:00:00
NaN
NaN
c
d
do_not_touch
ahh
990
991
9910
NaN
991
constant
0
B
1
most_common_value
0
...
100
4
-0.530823
2015-02-11 06:00:00
NaN
NaN
c
d
do_not_touch
ahh
991
992
9920
NaN
992
constant
0
E
3
most_common_value
1
...
100
3
-0.860887
2015-02-11 07:00:00
NaN
NaN
c
d
do_not_touch
ahh
992
993
9930
NaN
993
constant
0
G
3
most_common_value
1
...
100
3
1.259120
2015-02-11 08:00:00
NaN
NaN
c
d
do_not_touch
ahh
993
994
9940
NaN
994
constant
0
C
2
most_common_value
0
...
100
7
0.876362
2015-02-11 09:00:00
NaN
NaN
c
d
do_not_touch
ahh
994
995
9950
NaN
995
constant
0
E
2
most_common_value
1
...
100
4
-0.498452
2015-02-11 10:00:00
NaN
NaN
c
d
do_not_touch
ahh
995
996
9960
NaN
996
constant
0
B
4
most_common_value
1
...
100
9
0.171826
2015-02-11 11:00:00
NaN
NaN
c
d
do_not_touch
ahh
996
997
9970
NaN
997
constant
0
D
1
most_common_value
1
...
100
2
0.080841
2015-02-11 12:00:00
NaN
NaN
c
d
do_not_touch
ahh
997
998
9980
NaN
998
constant
0
A
1
most_common_value
1
...
100
9
0.193660
2015-02-11 13:00:00
NaN
NaN
c
d
do_not_touch
ahh
998
999
9990
NaN
999
constant
0
D
2
most_common_value
0
...
100
7
-2.874406
2015-02-11 14:00:00
NaN
NaN
c
d
do_not_touch
ahh
999
1000
10000
NaN
1000
constant
0
G
2
most_common_value
1
...
100
3
-0.095891
2015-02-11 15:00:00
NaN
NaN
c
d
do_not_touch
ahh
1000 rows × 25 columns
In [16]:
# Prints per-column counts of detected "other" missing-value markers
# (800 in other_na here); the return value is used below as a cell mask.
# NOTE(review): presumably a boolean DataFrame aligned with df_test, and
# auto_replace=False presumably leaves the data unmodified — confirm in autoc.
arr = exploration_test.detect_other_na(auto_replace=False)
We detected id 0
member_id 0
na_col 0
id_na 0
constant_col 0
constant_col_num 0
character_factor 0
num_factor 0
nearzerovar_variable 0
binary_variable 0
character_variable 0
duplicated_column 0
many_missing_70 0
character_variable_fillna 0
numeric_variable_fillna 0
num_variable 0
int_factor_10 0
outlier 0
datetime 0
None_100 0
None_na_200 0
character_variable_up1 0
character_variable_up2 0
other_na 800
Upper_text_1 0
dtype: int64 other type of missing values
In [20]:
# Keep the cells where the `arr` mask is False; all other cells become NaN
# (DataFrame.where's default replacement). The comparison is element-wise,
# so `== False` is intentional here.
df_2 = df_test.where(arr == False)
In [22]:
# Count NaN per column after masking: other_na now has 800 nulls, matching the
# 800 values reported by detect_other_na above.
df_2.isnull().sum()
Out[22]:
id 0
member_id 0
na_col 1000
id_na 3
constant_col 0
constant_col_num 0
character_factor 0
num_factor 0
nearzerovar_variable 0
binary_variable 0
character_variable 0
duplicated_column 0
many_missing_70 700
character_variable_fillna 300
numeric_variable_fillna 200
num_variable 0
int_factor_10 0
outlier 0
datetime 0
None_100 100
None_na_200 200
character_variable_up1 0
character_variable_up2 0
other_na 800
Upper_text_1 0
dtype: int64
In [13]:
# The original cell referenced `df`, which is never defined in this notebook
# and raised NameError; the frame built above is named `df_test`.
df_test.shape
In [ ]:
# Check that the `datetime` column holds nanosecond datetimes.
# '<M8[ns]' is numpy's little-endian spelling of datetime64[ns], the dtype
# reported for this column by df_test.dtypes.
df_test['datetime'].dtype == '<M8[ns]'
In [ ]:
Content source: ericfourrier/auto-clean
Similar notebooks: