Dataset info
Number of columns | 16 |
Number of rows | 7 |
Total Missing (%) | 26 |
Total size in memory | 44.9 MB |
Column types
Categorical | 0 |
Numeric | 0 |
Date | 2 |
Array | 0 |
Not available | 5 |
In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports edited modules
# before each cell runs, so changes to the local package are picked up live.
%load_ext autoreload
%autoreload 2
In [2]:
# Add the directory two levels above the notebook to the import search path.
# Presumably this is what makes the local `optimus` checkout importable in the
# next cell -- TODO confirm the expected working directory.
import sys
sys.path.append("../..")
In [3]:
from optimus import Optimus
from optimus.helpers.test import Test
You are using PySparkling of version 2.4.10, but your PySpark is of
version 2.3.1. Please make sure Spark and PySparkling versions are compatible.
In [4]:
# Create the Optimus entry point against a local Spark master. With
# verbose=True the call emits the INFO environment/Spark start-up log captured
# below. `op` is reused by later cells (op.create.df, Test(op, ...)).
op = Optimus(master='local', verbose=True)
INFO:optimus:Operative System:Windows
INFO:optimus:Just check that Spark and all necessary environments vars are present...
INFO:optimus:-----
INFO:optimus:SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
INFO:optimus:HADOOP_HOME=C:\opt\hadoop-2.7.7
INFO:optimus:PYSPARK_PYTHON=C:\Users\argenisleon\Anaconda3\python.exe
INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter
INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars "file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar" --driver-class-path "C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar" --conf "spark.sql.catalogImplementation=hive" pyspark-shell
INFO:optimus:JAVA_HOME=C:\java
INFO:optimus:Pyarrow Installed
INFO:optimus:-----
INFO:optimus:Starting or getting SparkSession and SparkContext...
INFO:optimus:Spark Version:2.3.1
INFO:optimus:
____ __ _
/ __ \____ / /_(_)___ ___ __ _______
/ / / / __ \/ __/ / __ `__ \/ / / / ___/
/ /_/ / /_/ / /_/ / / / / / / /_/ (__ )
\____/ .___/\__/_/_/ /_/ /_/\__,_/____/
/_/
INFO:optimus:Transform and Roll out...
C:/Users/argenisleon/Documents/Optimus/optimus/../parse/infer.py
INFO:optimus:Optimus successfully imported. Have fun :).
INFO:optimus:Config.ini not found
In [5]:
# Build the 16-column / 7-row fixture DataFrame that the profiler tests run on.
# NOTE(review): `pd` is not referenced in any cell visible here -- confirm it is
# needed before keeping the import.
import pandas as pd
# NOTE(review): wildcard import. This cell only uses ShortType, ByteType,
# ArrayType, FloatType, DateType, TimestampType, BooleanType, BinaryType and
# NullType; consider importing them explicitly once it is confirmed that no
# later cell relies on other names pulled in by the `*`.
from pyspark.sql.types import *
from datetime import date, datetime
# Column spec. (name, dtype) tuples state the type explicitly, either as a
# short string ("str", "int", "float") or as a Spark type instance. Bare strings
# carry only a name; the table output below shows those columns ending up as
# array<string> ("japanese name") and string (the other three).
cols = [
("names", "str"),
("height(ft)", ShortType()),
("function", "str"),
("rank", ByteType()),
("age", "int"),
("weight(t)", "float"),
"japanese name",
"last position seen",
"date arrival",
"last date seen",
("attributes", ArrayType(FloatType())),
("Date Type", DateType()),
("timestamp", TimestampType()),
("Cybertronian", BooleanType()),
("function(binary)", BinaryType()),
("NullType", NullType())
]
# Row data, one tuple per row in the same order as `cols`. Six populated rows
# (with deliberate oddities: punctuation/accents/trailing space in names, the
# literal string "None", None inside arrays, missing height/weight/position)
# followed by one row that is None in every column.
rows = [
("Optimus", -28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
None),
("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
"2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
None),
("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
"2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
None),
("Jazz", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
"2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
bytearray("First Lieutenant", "utf-8"), None),
("Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
("Metroplex_)^$", 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
[91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
(None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),
]
# Materialise the fixture through Optimus and render it as a table for a
# visual check of names, dtypes and values.
source_df = op.create.df(cols ,rows)
source_df.table()
Viewing 7 of 7 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
Date Type
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optimus
-28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
Jazz
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
Metroplex_)^$
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Viewing 7 of 7 rows / 16 columns
1 partition(s)
In [6]:
from pyspark.ml.linalg import Vectors
In [7]:
# Scratch check of how re.escape treats a single quote. The `\'` inside the
# double-quoted literal is just `'`, so `a` is the three-character string a'a;
# the recorded Out[7] shows re.escape returning it unchanged.
# NOTE(review): `a` is only used by the print() in the next cell -- this looks
# like leftover debugging; consider removing both cells.
import re
a="a\'a"
re.escape(a)
Out[7]:
"a'a"
In [8]:
# Show the raw value of the scratch string `a` (prints a'a).
print(a)
a'a
In [9]:
# Test generator for the profiler tests, bound to `op` and the fixture
# `source_df`. With name "df_profiler" and final_path="..", the later t.run()
# cell reports writing ../test_df_profiler.py.
# `imports` is a list of source lines handed to Test; presumably they are
# emitted at the top of the generated test module -- TODO confirm.
# The `nan`/`null`/`true` aliases look intended to let JSON-style expected
# values (the printed profile contains `null` and `true`) evaluate as Python.
# NOTE(review): there is no matching `false = False` alias -- confirm that no
# expected value can contain JSON `false`.
t = Test(op, source_df, "df_profiler", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"import datetime",
"from pyspark.sql import functions as F",
"from optimus.profiler.profiler import Profiler",
"null = None",
"true = True",
"p= Profiler()"], path = "df_profiler", final_path="..")
In [10]:
from pyspark.sql import functions as F
def func(col_name, attrs):
    """Return a column expression equal to ``col_name`` multiplied by 2.

    Args:
        col_name: Name of the column to reference via ``F.col``.
        attrs: Accepted for signature compatibility; not used here.

    Returns:
        The expression ``F.col(col_name) * 2``.
    """
    doubled = F.col(col_name) * 2
    return doubled
# Shorthand names for columns of `source_df`, grouped by the kind of column
# they point at. None of them is referenced in the cells visible here.
numeric_col = "height(ft)"
numeric_col_B = "rank"
# NOTE(review): numeric_col_C repeats "rank" (same as numeric_col_B) -- confirm
# whether a different numeric column (e.g. "age" or "weight(t)") was intended.
numeric_col_C = "rank"
string_col = "function"
date_col = "date arrival"
date_col_B = "last date seen"
# Name for a column to be created; "new col" does not exist in `source_df`.
new_col = "new col"
array_col = "attributes"
In [11]:
# Standalone Profiler instance used by the p.run(...) and t.create(p, ...)
# cells. The same import and construction are repeated as strings in the
# `imports` list given to Test.
from optimus.profiler.profiler import Profiler
p= Profiler()
INFO:optimus:Config.ini not found
In [13]:
# Profile every column ("*") of the fixture and render the HTML-style report.
# NOTE(review): the captured output below lists the first name as "Optim'us"
# while `rows` defines "Optimus", and execution counts jump 11 -> 13 -> 44, so
# this output appears to come from an earlier/out-of-order run. Re-run with
# Restart Kernel -> Run All before trusting it.
p.run(source_df, "*")
INFO:optimus:Processing Stats For columns...
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
Overview
Dataset info
Number of columns
16
Number of rows
7
Total Missing (%)
26
Total size in memory
44.9 MB
Column types
Categorical
0
Numeric
0
Date
2
Array
0
Not available
5
names
categorical
Unique
5
Unique (%)
71.43
Missing
1
Missing (%)
14.29
Datatypes
String
6
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
Optim'us
1
14.29%
bumbl#ebéé
1
14.29%
ironhide&
1
14.29%
Jazz
1
14.29%
Megatron
1
14.29%
Metroplex_)^$
1
14.29%
None
1
14.29%
"Missing"
1
14.29%
height(ft)
numeric
Unique
5
Unique (%)
71.43
Missing
2
Missing (%)
28.57
Datatypes
String
0
Integer
5
Decimal
0
Bool
0
Date
0
Missing
0
Null
2
Basic Stats
Mean
65.6
Minimum
-28
Maximum
300
Zeros(%)
0
Quantile statistics
Minimum
-28
5-th percentile
-28
Q1
13
Median
17
Q3
26
95-th percentile
300
Maximum
300
Range
328
Interquartile range
13
Descriptive statistics
Standard deviation
132.66612
Coef of variation
2.02235
Kurtosis
0.13863
Mean
65.6
MAD
9
Skewness
1.4049
Sum
328
Variance
17600.3
function
categorical
Unique
6
Unique (%)
85.71
Missing
1
Missing (%)
14.29
Datatypes
String
6
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
Leader
1
14.29%
Espionage
1
14.29%
Security
1
14.29%
First Lieutenant
1
14.29%
None
1
14.29%
Battle Station
1
14.29%
None
1
14.29%
"Missing"
1
14.29%
rank
numeric
Unique
3
Unique (%)
42.86
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
6
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Basic Stats
Mean
8.33333
Minimum
7
Maximum
10
Zeros(%)
0
Quantile statistics
Minimum
7
5-th percentile
7
Q1
7
Median
8
Q3
10
95-th percentile
10
Maximum
10
Range
3
Interquartile range
3
Descriptive statistics
Standard deviation
1.36626
Coef of variation
0.16395
Kurtosis
-1.5
Mean
8.33333
MAD
1
Skewness
0.3818
Sum
50
Variance
1.86667
age
numeric
Unique
1
Unique (%)
14.29
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
6
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Basic Stats
Mean
5000000.0
Minimum
5000000
Maximum
5000000
Zeros(%)
0
Quantile statistics
Minimum
5000000
5-th percentile
5000000
Q1
5000000
Median
5000000
Q3
5000000
95-th percentile
5000000
Maximum
5000000
Range
0
Interquartile range
0
Descriptive statistics
Standard deviation
0.0
Coef of variation
0.0
Kurtosis
nan
Mean
5000000.0
MAD
0
Skewness
nan
Sum
30000000
Variance
0.0
weight(t)
numeric
Unique
5
Unique (%)
71.43
Missing
2
Missing (%)
28.57
Datatypes
String
0
Integer
0
Decimal
5
Bool
0
Date
0
Missing
0
Null
2
Basic Stats
Mean
3.56
Minimum
1.8
Maximum
5.7
Zeros(%)
0
Quantile statistics
Minimum
1.8
5-th percentile
1.7999999523162842
Q1
2.0
Median
4.0
Q3
4.300000190734863
95-th percentile
5.699999809265137
Maximum
5.7
Range
3.9000000000000004
Interquartile range
2.3000001907348633
Descriptive statistics
Standard deviation
1.64712
Coef of variation
0.46267
Kurtosis
-1.43641
Mean
3.56
MAD
1.7
Skewness
0.06521
Sum
17.8
Variance
2.713
japanese name
null
Unique
6
Unique (%)
85.71
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
[Inochi, Convoy]
1
14.29%
[Bumble, Goldback]
1
14.29%
[Roadbuster]
1
14.29%
[Meister]
1
14.29%
[Megatron]
1
14.29%
[Metroflex]
1
14.29%
None
1
14.29%
"Missing"
1
14.29%
last position seen
categorical
Unique
4
Unique (%)
57.14
Missing
3
Missing (%)
42.86
Datatypes
String
4
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
3
Frequency
Value
Count
Frequency (%)
None
3
42.86%
19.442735,-99.201111
1
14.29%
10.642707,-71.612534
1
14.29%
37.789563,-122.400356
1
14.29%
33.670666,-117.841553
1
14.29%
"Missing"
3
42.86%
date arrival
categorical
Unique
1
Unique (%)
14.29
Missing
1
Missing (%)
14.29
Datatypes
String
6
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
1980/04/10
6
85.71%
None
1
14.29%
"Missing"
1
14.29%
last date seen
categorical
Unique
6
Unique (%)
85.71
Missing
1
Missing (%)
14.29
Datatypes
String
6
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
2016/09/10
1
14.29%
2015/08/10
1
14.29%
2014/07/10
1
14.29%
2013/06/10
1
14.29%
2012/05/10
1
14.29%
2011/04/10
1
14.29%
None
1
14.29%
"Missing"
1
14.29%
attributes
null
Unique
6
Unique (%)
85.71
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
[8.5344, 4300.0]
1
14.29%
[5.334, 2000.0]
1
14.29%
[7.9248, 4000.0]
1
14.29%
[3.9624, 1800.0]
1
14.29%
[, 5700.0]
1
14.29%
[91.44,]
1
14.29%
None
1
14.29%
"Missing"
1
14.29%
Date Type
date
Unique
6
Unique (%)
85.71
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
0
Decimal
0
Bool
0
Date
6
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
2016-09-10
1
14.29%
2015-08-10
1
14.29%
2014-06-24
1
14.29%
2013-06-24
1
14.29%
2012-05-10
1
14.29%
2011-04-10
1
14.29%
None
1
14.29%
"Missing"
1
14.29%
timestamp
date
Unique
1
Unique (%)
14.29
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
0
Decimal
0
Bool
0
Date
6
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
2014-06-24 00:00:00
6
85.71%
None
1
14.29%
"Missing"
1
14.29%
Cybertronian
null
Unique
1
Unique (%)
14.29
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
True
6
85.71%
None
1
14.29%
"Missing"
1
14.29%
function(binary)
null
Unique
6
Unique (%)
85.71
Missing
1
Missing (%)
14.29
Datatypes
String
0
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
Leader
1
14.29%
Espionage
1
14.29%
Security
1
14.29%
First Lieutenant
1
14.29%
None
1
14.29%
Battle Station
1
14.29%
None
1
14.29%
"Missing"
1
14.29%
NullType
null
Unique
0
Unique (%)
0.0
Missing
7
Missing (%)
100.0
Datatypes
String
0
Integer
0
Decimal
0
Bool
0
Date
0
Missing
0
Null
7
Frequency
Value
Count
Frequency (%)
None
7
100.0%
"Missing"
7
100.0%
Viewing 7 of 7 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
Date Type
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optim'us
-28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
Jazz
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
Metroplex_)^$
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Viewing 7 of 7 rows / 16 columns
1 partition(s)
INFO:optimus:run() executed in 69.73 sec
Out[13]:
<optimus.profiler.profiler.Profiler at 0x1cc2093b7b8>
In [44]:
# Register a test for Profiler method "dataset" with no suffix; the log shows it
# becomes test_dataset(). The JSON printed below appears to be the captured
# result for source_df over all columns ("*") -- presumably stored as the
# expected value; TODO confirm against Test.create.
# NOTE(review): the captured summary contains "p_missing": 371.43 (26 missing
# over 7 rows), i.e. a percentage above 100 -- confirm that this is intended
# before freezing it into the generated test.
t.create(p, "dataset", None, 'json', None, source_df,"*")
Creating test_dataset() test function...
INFO:optimus:test_dataset()
INFO:optimus:Processing Stats For columns...
{'names': {'string': 6, 'null': 1}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'string': 4, 'null': 3}, 'date arrival': {'string': 6, 'null': 1}, 'last date seen': {'string': 6, 'null': 1}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
{"count_types": {"string": 5, "int": 3, "decimal": 1, "array": 2, "date": 2, "boolean": 1, "binary": 1, "null": 1, "numeric": 0, "categorical": 0}, "total_count_dtypes": 8, "dtypes_list": ["string", "int", "decimal", "array", "date", "boolean", "binary", "null"], "columns": {"names": {"stats": {"count_uniques": 5, "min": "Jazz", "max": "ironhide&", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 71.43}, "frequency": [{"value": "Optimus", "count": 1, "percentage": 14.29}, {"value": "bumbl#eb\u00e9\u00e9 ", "count": 1, "percentage": 14.29}, {"value": "ironhide&", "count": 1, "percentage": 14.29}, {"value": "Jazz", "count": 1, "percentage": 14.29}, {"value": "Megatron", "count": 1, "percentage": 14.29}, {"value": "Metroplex_)^$", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "names", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "height(ft)": {"stats": {"count_uniques": 5, "min": -28, "max": 300, "stddev": 132.66612, "kurtosis": 0.13863, "mean": 65.6, "skewness": 1.4049, "sum": 328, "variance": 17600.3, "zeros": 0, "percentile": {"0.75": 26, "0.95": 300, "0.05": -28, "0.25": 13, "0.5": 17}, "count_na": 2, "hist": [{"count": 4.0, "lower": -28.0, "upper": 54.0}, {"count": 0.0, "lower": 54.0, "upper": 136.0}, {"count": 0.0, "lower": 136.0, "upper": 218.0}, {"count": 0.0, "lower": 218.0, "upper": 300.0}], "range": 328, "median": 17, "interquartile_range": 13, "coef_variation": 2.02235, "mad": 9, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "height(ft)", "column_dtype": "int", "dtypes_stats": {"null": 2, "missing": 0, "int": 5}, "column_type": "numeric"}, "function": {"stats": {"count_uniques": 6, "min": "Battle Station", "max": "Security", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, 
"zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, {"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "rank": {"stats": {"count_uniques": 3, "min": 7, "max": 10, "stddev": 1.36626, "kurtosis": -1.5, "mean": 8.33333, "skewness": 0.3818, "sum": 50, "variance": 1.86667, "zeros": 0, "percentile": {"0.75": 10, "0.95": 10, "0.05": 7, "0.25": 7, "0.5": 8}, "count_na": 1, "hist": [{"count": 4.0, "lower": 7.0, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 10.0}], "range": 3, "median": 8, "interquartile_range": 3, "coef_variation": 0.16395, "mad": 1, "p_count_na": 14.29, "p_count_uniques": 42.86}, "name": "rank", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "age": {"stats": {"count_uniques": 1, "min": 5000000, "max": 5000000, "stddev": 0.0, "kurtosis": null, "mean": 5000000.0, "skewness": null, "sum": 30000000, "variance": 0.0, "zeros": 0, "percentile": {"0.75": 5000000, "0.95": 5000000, "0.05": 5000000, "0.25": 5000000, "0.5": 5000000}, "count_na": 1, "hist": [{"count": 6, "lower": 5000000, "upper": 5000001}], "range": 0, "median": 5000000, "interquartile_range": 0, "coef_variation": 0.0, "mad": 0, "p_count_na": 14.29, "p_count_uniques": 14.29}, "name": "age", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "weight(t)": {"stats": {"count_uniques": 5, "min": 1.8, "max": 5.7, "stddev": 1.64712, "kurtosis": -1.43641, "mean": 3.56, "skewness": 0.06521, 
"sum": 17.8, "variance": 2.713, "zeros": 0, "percentile": {"0.75": 4.300000190734863, "0.95": 5.699999809265137, "0.05": 1.7999999523162842, "0.25": 2.0, "0.5": 4.0}, "count_na": 2, "hist": [{"count": 1.0, "lower": 1.8, "upper": 2.78}, {"count": 0.0, "lower": 2.78, "upper": 3.75}, {"count": 2.0, "lower": 3.75, "upper": 4.73}, {"count": 1.0, "lower": 4.73, "upper": 5.7}], "range": 3.9000000000000004, "median": 4.0, "interquartile_range": 2.3000001907348633, "coef_variation": 0.46267, "mad": 1.7, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "weight(t)", "column_dtype": "decimal", "dtypes_stats": {"null": 2, "missing": 0, "decimal": 5}, "column_type": "numeric"}, "japanese name": {"stats": {"count_uniques": 6, "min": ["Bumble", "Goldback"], "max": ["Roadbuster"], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[Inochi, Convoy]", "count": 1, "percentage": 14.29}, {"value": "[Bumble, Goldback]", "count": 1, "percentage": 14.29}, {"value": "[Roadbuster]", "count": 1, "percentage": 14.29}, {"value": "[Meister]", "count": 1, "percentage": 14.29}, {"value": "[Megatron]", "count": 1, "percentage": 14.29}, {"value": "[Metroflex]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "japanese name", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "last position seen": {"stats": {"count_uniques": 4, "min": "10.642707,-71.612534", "max": "37.789563,-122.400356", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 3, "p_count_na": 42.86, "p_count_uniques": 57.14}, "frequency": [{"value": null, "count": 3, "percentage": 42.86}, {"value": "19.442735,-99.201111", "count": 1, "percentage": 14.29}, {"value": "10.642707,-71.612534", "count": 1, "percentage": 14.29}, {"value": "37.789563,-122.400356", "count": 1, "percentage": 14.29}, {"value": "33.670666,-117.841553", 
"count": 1, "percentage": 14.29}], "name": "last position seen", "column_dtype": "string", "dtypes_stats": {"null": 3, "missing": 0, "string": 4}, "column_type": "categorical"}, "date arrival": {"stats": {"count_uniques": 1, "min": "1980/04/10", "max": "1980/04/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "1980/04/10", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "date arrival", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "last date seen": {"stats": {"count_uniques": 6, "min": "2011/04/10", "max": "2016/09/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016/09/10", "count": 1, "percentage": 14.29}, {"value": "2015/08/10", "count": 1, "percentage": 14.29}, {"value": "2014/07/10", "count": 1, "percentage": 14.29}, {"value": "2013/06/10", "count": 1, "percentage": 14.29}, {"value": "2012/05/10", "count": 1, "percentage": 14.29}, {"value": "2011/04/10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "last date seen", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "attributes": {"stats": {"count_uniques": 6, "min": [null, 5700.0], "max": [91.44000244140625, null], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[8.5344, 4300.0]", "count": 1, "percentage": 14.29}, {"value": "[5.334, 2000.0]", "count": 1, "percentage": 14.29}, {"value": "[7.9248, 4000.0]", "count": 1, "percentage": 14.29}, {"value": "[3.9624, 1800.0]", "count": 1, "percentage": 14.29}, {"value": "[, 5700.0]", "count": 1, "percentage": 
14.29}, {"value": "[91.44,]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "attributes", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "Date Type": {"stats": {"count_uniques": 6, "min": "2011-04-10", "max": "2016-09-10", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016-09-10", "count": 1, "percentage": 14.29}, {"value": "2015-08-10", "count": 1, "percentage": 14.29}, {"value": "2014-06-24", "count": 1, "percentage": 14.29}, {"value": "2013-06-24", "count": 1, "percentage": 14.29}, {"value": "2012-05-10", "count": 1, "percentage": 14.29}, {"value": "2011-04-10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Date Type", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "timestamp": {"stats": {"count_uniques": 1, "min": "2014-06-24 00:00:00", "max": "2014-06-24 00:00:00", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "2014-06-24 00:00:00", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "timestamp", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "Cybertronian": {"stats": {"count_uniques": 1, "min": 1, "max": 1, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": true, "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Cybertronian", "column_dtype": "boolean", "dtypes_stats": {"null": 1, "missing": 0, "boolean": 6}, "column_type": "categorical"}, "function(binary)": {"stats": {"count_uniques": 6, "min": null, "max": null, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, 
{"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function(binary)", "column_dtype": "binary", "dtypes_stats": {"null": 1, "missing": 0, "binary": 6}, "column_type": "binary"}, "NullType": {"stats": {"count_uniques": 0, "min": null, "max": null, "count_na": 7, "p_count_na": 100.0, "p_count_uniques": 0.0}, "frequency": [{"value": null, "count": 7, "percentage": 100.0}], "name": "NullType", "column_dtype": "null", "dtypes_stats": {"null": 7, "missing": 0}, "column_type": "null"}}, "name": null, "file_name": null, "summary": {"cols_count": 16, "rows_count": 7, "size": "52.5 MB", "sample_size": 10000, "missing_count": 26, "p_missing": 371.43}, "sample": {"columns": [{"title": "names"}, {"title": "height(ft)"}, {"title": "function"}, {"title": "rank"}, {"title": "age"}, {"title": "weight(t)"}, {"title": "japanese name"}, {"title": "last position seen"}, {"title": "date arrival"}, {"title": "last date seen"}, {"title": "attributes"}, {"title": "Date Type"}, {"title": "timestamp"}, {"title": "Cybertronian"}, {"title": "function(binary)"}, {"title": "NullType"}], "value": [["Optimus", -28, "Leader", 10, 5000000, 4.300000190734863, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10", "2016/09/10", [8.53439998626709, 4300.0], "2016-09-10", "2014-06-24 00:00:00", true, null, null], ["bumbl#eb\u00e9\u00e9 ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10", "2015/08/10", [5.334000110626221, 2000.0], "2015-08-10", "2014-06-24 00:00:00", true, null, null], ["ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10", "2014/07/10", [7.924799919128418, 4000.0], "2014-06-24", "2014-06-24 00:00:00", true, null, null], ["Jazz", 13, "First Lieutenant", 
8, 5000000, 1.7999999523162842, ["Meister"], "33.670666,-117.841553", "1980/04/10", "2013/06/10", [3.962399959564209, 1800.0], "2013-06-24", "2014-06-24 00:00:00", true, null, null], ["Megatron", null, "None", 10, 5000000, 5.699999809265137, ["Megatron"], null, "1980/04/10", "2012/05/10", [null, 5700.0], "2012-05-10", "2014-06-24 00:00:00", true, null, null], ["Metroplex_)^$", 300, "Battle Station", 8, 5000000, null, ["Metroflex"], null, "1980/04/10", "2011/04/10", [91.44000244140625, null], "2011-04-10", "2014-06-24 00:00:00", true, null, null], [null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]}}
In [45]:
# Run the test generator; the output below reports that it writes ../test_df_profiler.py.
# Presumably this emits the functions registered by the earlier t.create calls - confirm in Test.run.
t.run()
Creating file ../test_df_profiler.py
Done
In [46]:
# Per-column patterns handed to the profiler through the `mismatch` keyword.
# "names" and "function" carry date-format strings; "height(ft)" carries a regex
# for a dd/mm/yyyy-shaped value: two digits 00-31, "/", two digits 00-12, "/",
# then four digits (so 00 is accepted for both day and month).
# Presumably each column's values are checked against its pattern - confirm in the profiler.
mismatch = {
    "names": "dd/mm/yyyy",
    "height(ft)": r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$',
    "function": "yyyy-mm-dd",
}

# Generate test_dataset_mismatch() from the "dataset" method over all columns ("*").
# NOTE(review): the summary printed below reports p_missing = 371.43 for
# missing_count = 26 and rows_count = 7, i.e. a percentage above 100 - verify
# this before freezing it as the expected value.
t.create(
    p,
    "dataset",
    "mismatch",
    "json",
    None,
    source_df,
    "*",
    mismatch=mismatch,
)
Creating test_dataset_mismatch() test function...
INFO:optimus:test_dataset_mismatch()
INFO:optimus:Processing Stats For columns...
{'names': {'string': 6, 'null': 1}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'string': 4, 'null': 3}, 'date arrival': {'string': 6, 'null': 1}, 'last date seen': {'string': 6, 'null': 1}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
{"count_types": {"string": 5, "int": 3, "decimal": 1, "array": 2, "date": 2, "boolean": 1, "binary": 1, "null": 1, "numeric": 0, "categorical": 0}, "total_count_dtypes": 8, "dtypes_list": ["string", "int", "decimal", "array", "date", "boolean", "binary", "null"], "columns": {"names": {"stats": {"count_uniques": 5, "min": "Jazz", "max": "ironhide&", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 71.43}, "frequency": [{"value": "Optimus", "count": 1, "percentage": 14.29}, {"value": "bumbl#eb\u00e9\u00e9 ", "count": 1, "percentage": 14.29}, {"value": "ironhide&", "count": 1, "percentage": 14.29}, {"value": "Jazz", "count": 1, "percentage": 14.29}, {"value": "Megatron", "count": 1, "percentage": 14.29}, {"value": "Metroplex_)^$", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "names", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "height(ft)": {"stats": {"count_uniques": 5, "min": -28, "max": 300, "stddev": 132.66612, "kurtosis": 0.13863, "mean": 65.6, "skewness": 1.4049, "sum": 328, "variance": 17600.3, "zeros": 0, "percentile": {"0.75": 26, "0.95": 300, "0.05": -28, "0.25": 13, "0.5": 17}, "count_na": 2, "hist": [{"count": 4.0, "lower": -28.0, "upper": 54.0}, {"count": 0.0, "lower": 54.0, "upper": 136.0}, {"count": 0.0, "lower": 136.0, "upper": 218.0}, {"count": 0.0, "lower": 218.0, "upper": 300.0}], "range": 328, "median": 17, "interquartile_range": 13, "coef_variation": 2.02235, "mad": 9, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "height(ft)", "column_dtype": "int", "dtypes_stats": {"null": 2, "missing": 0, "int": 5}, "column_type": "numeric"}, "function": {"stats": {"count_uniques": 6, "min": "Battle Station", "max": "Security", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, 
"zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, {"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "rank": {"stats": {"count_uniques": 3, "min": 7, "max": 10, "stddev": 1.36626, "kurtosis": -1.5, "mean": 8.33333, "skewness": 0.3818, "sum": 50, "variance": 1.86667, "zeros": 0, "percentile": {"0.75": 10, "0.95": 10, "0.05": 7, "0.25": 7, "0.5": 8}, "count_na": 1, "hist": [{"count": 4.0, "lower": 7.0, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 10.0}], "range": 3, "median": 8, "interquartile_range": 3, "coef_variation": 0.16395, "mad": 1, "p_count_na": 14.29, "p_count_uniques": 42.86}, "name": "rank", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "age": {"stats": {"count_uniques": 1, "min": 5000000, "max": 5000000, "stddev": 0.0, "kurtosis": null, "mean": 5000000.0, "skewness": null, "sum": 30000000, "variance": 0.0, "zeros": 0, "percentile": {"0.75": 5000000, "0.95": 5000000, "0.05": 5000000, "0.25": 5000000, "0.5": 5000000}, "count_na": 1, "hist": [{"count": 6, "lower": 5000000, "upper": 5000001}], "range": 0, "median": 5000000, "interquartile_range": 0, "coef_variation": 0.0, "mad": 0, "p_count_na": 14.29, "p_count_uniques": 14.29}, "name": "age", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "weight(t)": {"stats": {"count_uniques": 5, "min": 1.8, "max": 5.7, "stddev": 1.64712, "kurtosis": -1.43641, "mean": 3.56, "skewness": 0.06521, 
"sum": 17.8, "variance": 2.713, "zeros": 0, "percentile": {"0.75": 4.300000190734863, "0.95": 5.699999809265137, "0.05": 1.7999999523162842, "0.25": 2.0, "0.5": 4.0}, "count_na": 2, "hist": [{"count": 1.0, "lower": 1.8, "upper": 2.78}, {"count": 0.0, "lower": 2.78, "upper": 3.75}, {"count": 2.0, "lower": 3.75, "upper": 4.73}, {"count": 1.0, "lower": 4.73, "upper": 5.7}], "range": 3.9000000000000004, "median": 4.0, "interquartile_range": 2.3000001907348633, "coef_variation": 0.46267, "mad": 1.7, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "weight(t)", "column_dtype": "decimal", "dtypes_stats": {"null": 2, "missing": 0, "decimal": 5}, "column_type": "numeric"}, "japanese name": {"stats": {"count_uniques": 6, "min": ["Bumble", "Goldback"], "max": ["Roadbuster"], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[Inochi, Convoy]", "count": 1, "percentage": 14.29}, {"value": "[Bumble, Goldback]", "count": 1, "percentage": 14.29}, {"value": "[Roadbuster]", "count": 1, "percentage": 14.29}, {"value": "[Meister]", "count": 1, "percentage": 14.29}, {"value": "[Megatron]", "count": 1, "percentage": 14.29}, {"value": "[Metroflex]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "japanese name", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "last position seen": {"stats": {"count_uniques": 4, "min": "10.642707,-71.612534", "max": "37.789563,-122.400356", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 3, "p_count_na": 42.86, "p_count_uniques": 57.14}, "frequency": [{"value": null, "count": 3, "percentage": 42.86}, {"value": "19.442735,-99.201111", "count": 1, "percentage": 14.29}, {"value": "10.642707,-71.612534", "count": 1, "percentage": 14.29}, {"value": "37.789563,-122.400356", "count": 1, "percentage": 14.29}, {"value": "33.670666,-117.841553", 
"count": 1, "percentage": 14.29}], "name": "last position seen", "column_dtype": "string", "dtypes_stats": {"null": 3, "missing": 0, "string": 4}, "column_type": "categorical"}, "date arrival": {"stats": {"count_uniques": 1, "min": "1980/04/10", "max": "1980/04/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "1980/04/10", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "date arrival", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "last date seen": {"stats": {"count_uniques": 6, "min": "2011/04/10", "max": "2016/09/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016/09/10", "count": 1, "percentage": 14.29}, {"value": "2015/08/10", "count": 1, "percentage": 14.29}, {"value": "2014/07/10", "count": 1, "percentage": 14.29}, {"value": "2013/06/10", "count": 1, "percentage": 14.29}, {"value": "2012/05/10", "count": 1, "percentage": 14.29}, {"value": "2011/04/10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "last date seen", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "attributes": {"stats": {"count_uniques": 6, "min": [null, 5700.0], "max": [91.44000244140625, null], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[8.5344, 4300.0]", "count": 1, "percentage": 14.29}, {"value": "[5.334, 2000.0]", "count": 1, "percentage": 14.29}, {"value": "[7.9248, 4000.0]", "count": 1, "percentage": 14.29}, {"value": "[3.9624, 1800.0]", "count": 1, "percentage": 14.29}, {"value": "[, 5700.0]", "count": 1, "percentage": 
14.29}, {"value": "[91.44,]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "attributes", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "Date Type": {"stats": {"count_uniques": 6, "min": "2011-04-10", "max": "2016-09-10", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016-09-10", "count": 1, "percentage": 14.29}, {"value": "2015-08-10", "count": 1, "percentage": 14.29}, {"value": "2014-06-24", "count": 1, "percentage": 14.29}, {"value": "2013-06-24", "count": 1, "percentage": 14.29}, {"value": "2012-05-10", "count": 1, "percentage": 14.29}, {"value": "2011-04-10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Date Type", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "timestamp": {"stats": {"count_uniques": 1, "min": "2014-06-24 00:00:00", "max": "2014-06-24 00:00:00", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "2014-06-24 00:00:00", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "timestamp", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "Cybertronian": {"stats": {"count_uniques": 1, "min": 1, "max": 1, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": true, "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Cybertronian", "column_dtype": "boolean", "dtypes_stats": {"null": 1, "missing": 0, "boolean": 6}, "column_type": "categorical"}, "function(binary)": {"stats": {"count_uniques": 6, "min": null, "max": null, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, 
{"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function(binary)", "column_dtype": "binary", "dtypes_stats": {"null": 1, "missing": 0, "binary": 6}, "column_type": "binary"}, "NullType": {"stats": {"count_uniques": 0, "min": null, "max": null, "count_na": 7, "p_count_na": 100.0, "p_count_uniques": 0.0}, "frequency": [{"value": null, "count": 7, "percentage": 100.0}], "name": "NullType", "column_dtype": "null", "dtypes_stats": {"null": 7, "missing": 0}, "column_type": "null"}}, "name": null, "file_name": null, "summary": {"cols_count": 16, "rows_count": 7, "size": "51.6 MB", "sample_size": 10000, "missing_count": 26, "p_missing": 371.43}, "sample": {"columns": [{"title": "names"}, {"title": "height(ft)"}, {"title": "function"}, {"title": "rank"}, {"title": "age"}, {"title": "weight(t)"}, {"title": "japanese name"}, {"title": "last position seen"}, {"title": "date arrival"}, {"title": "last date seen"}, {"title": "attributes"}, {"title": "Date Type"}, {"title": "timestamp"}, {"title": "Cybertronian"}, {"title": "function(binary)"}, {"title": "NullType"}], "value": [["Optimus", -28, "Leader", 10, 5000000, 4.300000190734863, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10", "2016/09/10", [8.53439998626709, 4300.0], "2016-09-10", "2014-06-24 00:00:00", true, null, null], ["bumbl#eb\u00e9\u00e9 ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10", "2015/08/10", [5.334000110626221, 2000.0], "2015-08-10", "2014-06-24 00:00:00", true, null, null], ["ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10", "2014/07/10", [7.924799919128418, 4000.0], "2014-06-24", "2014-06-24 00:00:00", true, null, null], ["Jazz", 13, "First Lieutenant", 
8, 5000000, 1.7999999523162842, ["Meister"], "33.670666,-117.841553", "1980/04/10", "2013/06/10", [3.962399959564209, 1800.0], "2013-06-24", "2014-06-24 00:00:00", true, null, null], ["Megatron", null, "None", 10, 5000000, 5.699999809265137, ["Megatron"], null, "1980/04/10", "2012/05/10", [null, 5700.0], "2012-05-10", "2014-06-24 00:00:00", true, null, null], ["Metroplex_)^$", 300, "Battle Station", 8, 5000000, null, ["Metroflex"], null, "1980/04/10", "2011/04/10", [91.44000244140625, null], "2011-04-10", "2014-06-24 00:00:00", true, null, null], [null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]}}
In [47]:
# Run the test generator; the output below reports that it writes ../test_df_profiler.py.
# Presumably this now includes test_dataset_mismatch() from the previous cell - confirm in Test.run.
t.run()
Creating file ../test_df_profiler.py
Done
In [48]:
# Generate test_columns_stats() from the "columns_stats" method over all columns ("*").
# NOTE(review): 'json' is requested, but the result printed below is a Python dict
# containing nan, datetime.date/datetime.datetime and bytearray values, none of which
# are valid JSON - verify that the generated test can represent and compare them.
t.create(p, "columns_stats", None, 'json', None, source_df,"*")
Creating test_columns_stats() test function...
INFO:optimus:test_columns_stats()
INFO:optimus:Processing Stats For columns...
{'names': {'string': 6, 'null': 1}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'string': 4, 'null': 3}, 'date arrival': {'string': 6, 'null': 1}, 'last date seen': {'string': 6, 'null': 1}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
{'count_types': {'string': 5, 'int': 3, 'decimal': 1, 'array': 2, 'date': 2, 'boolean': 1, 'binary': 1, 'null': 1, 'numeric': 0, 'categorical': 0}, 'total_count_dtypes': 8, 'dtypes_list': ['string', 'int', 'decimal', 'array', 'date', 'boolean', 'binary', 'null'], 'columns': {'names': {'stats': {'count_uniques': 5, 'min': 'Jazz', 'max': 'ironhide&', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 71.43}, 'frequency': [{'value': 'Optimus', 'count': 1, 'percentage': 14.29}, {'value': 'bumbl#ebéé ', 'count': 1, 'percentage': 14.29}, {'value': 'ironhide&', 'count': 1, 'percentage': 14.29}, {'value': 'Jazz', 'count': 1, 'percentage': 14.29}, {'value': 'Megatron', 'count': 1, 'percentage': 14.29}, {'value': 'Metroplex_)^$', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'names', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'height(ft)': {'stats': {'count_uniques': 5, 'min': -28, 'max': 300, 'stddev': 132.66612, 'kurtosis': 0.13863, 'mean': 65.6, 'skewness': 1.4049, 'sum': 328, 'variance': 17600.3, 'zeros': 0, 'percentile': {'0.75': 26, '0.95': 300, '0.05': -28, '0.25': 13, '0.5': 17}, 'count_na': 2, 'hist': [{'count': 4.0, 'lower': -28.0, 'upper': 54.0}, {'count': 0.0, 'lower': 54.0, 'upper': 136.0}, {'count': 0.0, 'lower': 136.0, 'upper': 218.0}, {'count': 0.0, 'lower': 218.0, 'upper': 300.0}], 'range': 328, 'median': 17, 'interquartile_range': 13, 'coef_variation': 2.02235, 'mad': 9, 'p_count_na': 28.57, 'p_count_uniques': 71.43}, 'name': 'height(ft)', 'column_dtype': 'int', 'dtypes_stats': {'null': 2, 'missing': 0, 'int': 5}, 'column_type': 'numeric'}, 'function': {'stats': {'count_uniques': 6, 'min': 'Battle Station', 'max': 'Security', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 
0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': 'Leader', 'count': 1, 'percentage': 14.29}, {'value': 'Espionage', 'count': 1, 'percentage': 14.29}, {'value': 'Security', 'count': 1, 'percentage': 14.29}, {'value': 'First Lieutenant', 'count': 1, 'percentage': 14.29}, {'value': 'None', 'count': 1, 'percentage': 14.29}, {'value': 'Battle Station', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'function', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'rank': {'stats': {'count_uniques': 3, 'min': 7, 'max': 10, 'stddev': 1.36626, 'kurtosis': -1.5, 'mean': 8.33333, 'skewness': 0.3818, 'sum': 50, 'variance': 1.86667, 'zeros': 0, 'percentile': {'0.75': 10, '0.95': 10, '0.05': 7, '0.25': 7, '0.5': 8}, 'count_na': 1, 'hist': [{'count': 4.0, 'lower': 7.0, 'upper': 8.5}, {'count': 0.0, 'lower': 8.5, 'upper': 10.0}], 'range': 3, 'median': 8, 'interquartile_range': 3, 'coef_variation': 0.16395, 'mad': 1, 'p_count_na': 14.29, 'p_count_uniques': 42.86}, 'name': 'rank', 'column_dtype': 'int', 'dtypes_stats': {'null': 1, 'missing': 0, 'int': 6}, 'column_type': 'numeric'}, 'age': {'stats': {'count_uniques': 1, 'min': 5000000, 'max': 5000000, 'stddev': 0.0, 'kurtosis': nan, 'mean': 5000000.0, 'skewness': nan, 'sum': 30000000, 'variance': 0.0, 'zeros': 0, 'percentile': {'0.75': 5000000, '0.95': 5000000, '0.05': 5000000, '0.25': 5000000, '0.5': 5000000}, 'count_na': 1, 'hist': [{'count': 6, 'lower': 5000000, 'upper': 5000001}], 'range': 0, 'median': 5000000, 'interquartile_range': 0, 'coef_variation': 0.0, 'mad': 0, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'name': 'age', 'column_dtype': 'int', 'dtypes_stats': {'null': 1, 'missing': 0, 'int': 6}, 'column_type': 'numeric'}, 'weight(t)': {'stats': {'count_uniques': 5, 'min': 1.8, 'max': 5.7, 'stddev': 1.64712, 'kurtosis': -1.43641, 'mean': 3.56, 'skewness': 0.06521, 'sum': 17.8, 
'variance': 2.713, 'zeros': 0, 'percentile': {'0.75': 4.300000190734863, '0.95': 5.699999809265137, '0.05': 1.7999999523162842, '0.25': 2.0, '0.5': 4.0}, 'count_na': 2, 'hist': [{'count': 1.0, 'lower': 1.8, 'upper': 2.78}, {'count': 0.0, 'lower': 2.78, 'upper': 3.75}, {'count': 2.0, 'lower': 3.75, 'upper': 4.73}, {'count': 1.0, 'lower': 4.73, 'upper': 5.7}], 'range': 3.9000000000000004, 'median': 4.0, 'interquartile_range': 2.3000001907348633, 'coef_variation': 0.46267, 'mad': 1.7, 'p_count_na': 28.57, 'p_count_uniques': 71.43}, 'name': 'weight(t)', 'column_dtype': 'decimal', 'dtypes_stats': {'null': 2, 'missing': 0, 'decimal': 5}, 'column_type': 'numeric'}, 'japanese name': {'stats': {'count_uniques': 6, 'min': ['Bumble', 'Goldback'], 'max': ['Roadbuster'], 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '[Inochi, Convoy]', 'count': 1, 'percentage': 14.29}, {'value': '[Bumble, Goldback]', 'count': 1, 'percentage': 14.29}, {'value': '[Roadbuster]', 'count': 1, 'percentage': 14.29}, {'value': '[Meister]', 'count': 1, 'percentage': 14.29}, {'value': '[Megatron]', 'count': 1, 'percentage': 14.29}, {'value': '[Metroflex]', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'japanese name', 'column_dtype': 'array', 'dtypes_stats': {'null': 1, 'missing': 0, 'array': 6}, 'column_type': 'array'}, 'last position seen': {'stats': {'count_uniques': 4, 'min': '10.642707,-71.612534', 'max': '37.789563,-122.400356', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 3, 'p_count_na': 42.86, 'p_count_uniques': 57.14}, 'frequency': [{'value': None, 'count': 3, 'percentage': 42.86}, {'value': '19.442735,-99.201111', 'count': 1, 'percentage': 14.29}, {'value': '10.642707,-71.612534', 'count': 1, 'percentage': 14.29}, {'value': '37.789563,-122.400356', 'count': 1, 'percentage': 14.29}, {'value': '33.670666,-117.841553', 'count': 1, 
'percentage': 14.29}], 'name': 'last position seen', 'column_dtype': 'string', 'dtypes_stats': {'null': 3, 'missing': 0, 'string': 4}, 'column_type': 'categorical'}, 'date arrival': {'stats': {'count_uniques': 1, 'min': '1980/04/10', 'max': '1980/04/10', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'frequency': [{'value': '1980/04/10', 'count': 6, 'percentage': 85.71}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'date arrival', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'last date seen': {'stats': {'count_uniques': 6, 'min': '2011/04/10', 'max': '2016/09/10', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '2016/09/10', 'count': 1, 'percentage': 14.29}, {'value': '2015/08/10', 'count': 1, 'percentage': 14.29}, {'value': '2014/07/10', 'count': 1, 'percentage': 14.29}, {'value': '2013/06/10', 'count': 1, 'percentage': 14.29}, {'value': '2012/05/10', 'count': 1, 'percentage': 14.29}, {'value': '2011/04/10', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'last date seen', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'attributes': {'stats': {'count_uniques': 6, 'min': [None, 5700.0], 'max': [91.44000244140625, None], 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '[8.5344, 4300.0]', 'count': 1, 'percentage': 14.29}, {'value': '[5.334, 2000.0]', 'count': 1, 'percentage': 14.29}, {'value': '[7.9248, 4000.0]', 'count': 1, 'percentage': 14.29}, {'value': '[3.9624, 1800.0]', 'count': 1, 'percentage': 14.29}, {'value': '[, 5700.0]', 'count': 1, 'percentage': 14.29}, 
{'value': '[91.44,]', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'attributes', 'column_dtype': 'array', 'dtypes_stats': {'null': 1, 'missing': 0, 'array': 6}, 'column_type': 'array'}, 'Date Type': {'stats': {'count_uniques': 6, 'min': datetime.date(2011, 4, 10), 'max': datetime.date(2016, 9, 10), 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '2016-09-10', 'count': 1, 'percentage': 14.29}, {'value': '2015-08-10', 'count': 1, 'percentage': 14.29}, {'value': '2014-06-24', 'count': 1, 'percentage': 14.29}, {'value': '2013-06-24', 'count': 1, 'percentage': 14.29}, {'value': '2012-05-10', 'count': 1, 'percentage': 14.29}, {'value': '2011-04-10', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'Date Type', 'column_dtype': 'date', 'dtypes_stats': {'null': 1, 'missing': 0, 'date': 6}, 'column_type': 'date'}, 'timestamp': {'stats': {'count_uniques': 1, 'min': datetime.datetime(2014, 6, 24, 0, 0), 'max': datetime.datetime(2014, 6, 24, 0, 0), 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'frequency': [{'value': datetime.datetime(2014, 6, 24, 0, 0), 'count': 6, 'percentage': 85.71}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'timestamp', 'column_dtype': 'date', 'dtypes_stats': {'null': 1, 'missing': 0, 'date': 6}, 'column_type': 'date'}, 'Cybertronian': {'stats': {'count_uniques': 1, 'min': 1, 'max': 1, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'frequency': [{'value': True, 'count': 6, 'percentage': 85.71}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'Cybertronian', 'column_dtype': 'boolean', 'dtypes_stats': {'null': 1, 'missing': 0, 'boolean': 6}, 'column_type': 'categorical'}, 'function(binary)': {'stats': {'count_uniques': 6, 'min': bytearray(b'Battle Station'), 'max': bytearray(b'Security'), 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': 
[{'value': 'Leader', 'count': 1, 'percentage': 14.29}, {'value': 'Espionage', 'count': 1, 'percentage': 14.29}, {'value': 'Security', 'count': 1, 'percentage': 14.29}, {'value': 'First Lieutenant', 'count': 1, 'percentage': 14.29}, {'value': 'None', 'count': 1, 'percentage': 14.29}, {'value': 'Battle Station', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'function(binary)', 'column_dtype': 'binary', 'dtypes_stats': {'null': 1, 'missing': 0, 'binary': 6}, 'column_type': 'binary'}, 'NullType': {'stats': {'count_uniques': 0, 'min': None, 'max': None, 'count_na': 7, 'p_count_na': 100.0, 'p_count_uniques': 0.0}, 'frequency': [{'value': None, 'count': 7, 'percentage': 100.0}], 'name': 'NullType', 'column_dtype': 'null', 'dtypes_stats': {'null': 7, 'missing': 0}, 'column_type': 'null'}}}
In [49]:
# Run the test generator; the output below reports that it writes ../test_df_profiler.py.
# Presumably this now includes test_columns_stats() from the previous cell - confirm in Test.run.
t.run()
Creating file ../test_df_profiler.py
Done
In [12]:
# Generate test_columns_agg() from the "columns_agg" method over all columns ("*").
# NOTE(review): the dict printed below ends with top-level 'p_count_na',
# 'p_count_uniques', 'range', 'median', 'interquartile_range', 'coef_variation'
# and 'mad' keys next to the per-column entries. They look like per-column stats
# leaking to the top level - confirm before freezing this as expected output.
t.create(p, "columns_agg", None, 'json', None, source_df,"*")
Creating test_columns_agg() test function...
INFO:optimus:test_columns_agg()
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
{'names': {'count_uniques': 5, 'min': 'Jazz', 'max': 'ironhide&', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'height(ft)': {'count_uniques': 5, 'min': -28, 'max': 300, 'count_na': 2, 'stddev': 132.66612, 'kurtosis': 0.13863, 'mean': 65.6, 'skewness': 1.4049, 'sum': 328, 'variance': 17600.3, 'zeros': 0, 'percentile': {'0.75': 26, '0.95': 300, '0.05': -28, '0.25': 13, '0.5': 17}, 'hist': [{'count': 4.0, 'lower': -28.0, 'upper': 54.0}, {'count': 0.0, 'lower': 54.0, 'upper': 136.0}, {'count': 0.0, 'lower': 136.0, 'upper': 218.0}, {'count': 0.0, 'lower': 218.0, 'upper': 300.0}]}, 'function': {'count_uniques': 6, 'min': 'Battle Station', 'max': 'Security', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'rank': {'count_uniques': 3, 'min': 7, 'max': 10, 'count_na': 1, 'stddev': 1.36626, 'kurtosis': -1.5, 'mean': 8.33333, 'skewness': 0.3818, 'sum': 50, 'variance': 1.86667, 'zeros': 0, 'percentile': {'0.75': 10, '0.95': 10, '0.05': 7, '0.25': 7, '0.5': 8}, 'hist': [{'count': 4.0, 'lower': 7.0, 'upper': 8.5}, {'count': 0.0, 'lower': 8.5, 'upper': 10.0}]}, 'age': {'count_uniques': 1, 'min': 5000000, 'max': 5000000, 'count_na': 1, 'stddev': 0.0, 'kurtosis': nan, 'mean': 5000000.0, 'skewness': nan, 'sum': 30000000, 'variance': 0.0, 'zeros': 0, 'percentile': {'0.75': 5000000, '0.95': 5000000, '0.05': 5000000, '0.25': 5000000, '0.5': 5000000}, 'hist': [{'count': 6, 'lower': 5000000, 'upper': 5000001}]}, 'weight(t)': {'count_uniques': 5, 'min': 1.8, 'max': 5.7, 'count_na': 2, 'stddev': 1.64712, 'kurtosis': -1.43641, 'mean': 3.56, 'skewness': 0.06521, 'sum': 17.8, 'variance': 2.713, 'zeros': 0, 'percentile': {'0.75': 4.300000190734863, '0.95': 5.699999809265137, '0.05': 1.7999999523162842, '0.25': 2.0, '0.5': 4.0}, 'hist': [{'count': 1.0, 'lower': 1.8, 'upper': 2.78}, {'count': 0.0, 'lower': 2.78, 'upper': 3.75}, {'count': 
2.0, 'lower': 3.75, 'upper': 4.73}, {'count': 1.0, 'lower': 4.73, 'upper': 5.7}]}, 'japanese name': {'count_uniques': 6, 'min': ['Bumble', 'Goldback'], 'max': ['Roadbuster'], 'count_na': 1}, 'last position seen': {'count_uniques': 4, 'min': '10.642707,-71.612534', 'max': '37.789563,-122.400356', 'count_na': 3, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'date arrival': {'count_uniques': 1, 'min': '1980/04/10', 'max': '1980/04/10', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'last date seen': {'count_uniques': 6, 'min': '2011/04/10', 'max': '2016/09/10', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'attributes': {'count_uniques': 6, 'min': [None, 5700.0], 'max': [91.44000244140625, None], 'count_na': 1}, 'Date Type': {'count_uniques': 6, 'min': datetime.date(2011, 4, 10), 'max': datetime.date(2016, 9, 10), 'count_na': 1}, 'timestamp': {'count_uniques': 1, 'min': datetime.datetime(2014, 6, 24, 0, 0), 'max': datetime.datetime(2014, 6, 24, 0, 0), 'count_na': 1}, 'Cybertronian': {'count_uniques': 1, 'min': 1, 'max': 1, 'count_na': 1}, 'function(binary)': {'count_uniques': 6, 'min': bytearray(b'Battle Station'), 'max': bytearray(b'Security'), 'count_na': 1}, 'NullType': {'count_uniques': 0, 'min': None, 'max': None, 'count_na': 7}, 'p_count_na': 100.0, 'p_count_uniques': 0.0, 'range': 3.9000000000000004, 'median': 4.0, 'interquartile_range': 2.3000001907348633, 'coef_variation': 0.46267, 'mad': 1.7}
In [14]:
# Run the test generator; the output below reports that it writes ../test_df_profiler.py.
# Presumably this now includes test_columns_agg() from the previous cell - confirm in Test.run.
t.run()
Creating file ../test_df_profiler.py
Done
In [39]:
# Scratch value: a dict-looking string that is deliberately not valid Python or JSON
# ('=' instead of ':' and an unbalanced single quote), used to probe quoting below.
a = "{'name'=a'a}"
In [40]:
# print() shows the raw characters of the scratch string, without repr quoting.
print(a)
{'name'=a'a}
In [42]:
# NOTE(review): this import lives mid-notebook; consider moving it to the
# imports cell at the top.
import json
# json.dumps on a str does not parse its content: it serializes the text as a
# JSON string literal, i.e. wraps it in double quotes. The backslashes in
# Out[42] come from Python's repr of the resulting str, not from JSON escaping.
json.dumps("{'name'=a'a}")
Out[42]:
'"{\'name\'=a\'a}"'
In [11]:
from optimus.profiler.profiler import Profiler
In [12]:
# Profile source_df with the column selector "*"; the log below shows all 16
# columns being processed. The call renders the report that follows and
# returns the Profiler instance (see Out[12]).
# Slow for a 7-row frame: the log reports ~69 s for this run.
op.profiler.run(source_df, "*")
INFO:optimus:Processing Stats For columns...
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
Overview
Dataset info
Number of columns
16
Number of rows
7
Total Missing (%)
26
Total size in memory
45.3 MB
Column types
Categorical
0
Numeric
0
Date
2
Array
2
Not available
1
names
categorical
Unique
5
Unique (%)
Missing
1
Missing (%)
Datatypes
String
6
Integer
Decimal
Bool
Date
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
Optimus
1
14.29%
bumbl#ebéé
1
14.29%
ironhide&
1
14.29%
Jazz
1
14.29%
Megatron
1
14.29%
Metroplex_)^$
1
14.29%
None
1
14.29%
"Missing"
1
%
height(ft)
numeric
Unique
5
Unique (%)
Missing
2
Missing (%)
Datatypes
String
Integer
5
Decimal
Bool
Date
Missing
0
Null
2
Basic Stats
Mean
65.6
Minimum
-28
Maximum
300
Zeros(%)
0
Quantile statistics
Minimum
-28
5-th percentile
-28
Q1
13
Median
17
Q3
26
95-th percentile
300
Maximum
300
Range
Interquartile range
Descriptive statistics
Standard deviation
132.66612
Coef of variation
Kurtosis
0.13863
Mean
65.6
MAD
Skewness
1.4049
Sum
328
Variance
17600.3
function
categorical
Unique
6
Unique (%)
Missing
1
Missing (%)
Datatypes
String
6
Integer
Decimal
Bool
Date
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
Leader
1
14.29%
Espionage
1
14.29%
Security
1
14.29%
First Lieutenant
1
14.29%
None
1
14.29%
Battle Station
1
14.29%
None
1
14.29%
"Missing"
1
%
rank
numeric
Unique
3
Unique (%)
Missing
1
Missing (%)
Datatypes
String
Integer
6
Decimal
Bool
Date
Missing
0
Null
1
Basic Stats
Mean
8.33333
Minimum
7
Maximum
10
Zeros(%)
0
Quantile statistics
Minimum
7
5-th percentile
7
Q1
7
Median
8
Q3
10
95-th percentile
10
Maximum
10
Range
Interquartile range
Descriptive statistics
Standard deviation
1.36626
Coef of variation
Kurtosis
-1.5
Mean
8.33333
MAD
Skewness
0.3818
Sum
50
Variance
1.86667
age
numeric
Unique
1
Unique (%)
Missing
1
Missing (%)
Datatypes
String
Integer
6
Decimal
Bool
Date
Missing
0
Null
1
Basic Stats
Mean
5000000.0
Minimum
5000000
Maximum
5000000
Zeros(%)
0
Quantile statistics
Minimum
5000000
5-th percentile
5000000
Q1
5000000
Median
5000000
Q3
5000000
95-th percentile
5000000
Maximum
5000000
Range
Interquartile range
Descriptive statistics
Standard deviation
0.0
Coef of variation
Kurtosis
nan
Mean
5000000.0
MAD
Skewness
nan
Sum
30000000
Variance
0.0
weight(t)
numeric
Unique
5
Unique (%)
Missing
2
Missing (%)
Datatypes
String
Integer
Decimal
5
Bool
Date
Missing
0
Null
2
Basic Stats
Mean
3.56
Minimum
1.8
Maximum
5.7
Zeros(%)
0
Quantile statistics
Minimum
1.8
5-th percentile
1.7999999523162842
Q1
2.0
Median
4.0
Q3
4.300000190734863
95-th percentile
5.699999809265137
Maximum
5.7
Range
Interquartile range
Descriptive statistics
Standard deviation
1.64712
Coef of variation
Kurtosis
-1.43641
Mean
3.56
MAD
Skewness
0.06521
Sum
17.8
Variance
2.713
japanese name
array
Unique
6
Unique (%)
Missing
1
Missing (%)
Datatypes
String
Integer
Decimal
Bool
Date
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
[Inochi, Convoy]
1
14.29%
[Bumble, Goldback]
1
14.29%
[Roadbuster]
1
14.29%
[Meister]
1
14.29%
[Megatron]
1
14.29%
[Metroflex]
1
14.29%
None
1
14.29%
"Missing"
1
%
last position seen
categorical
Unique
4
Unique (%)
Missing
3
Missing (%)
Datatypes
String
4
Integer
Decimal
Bool
Date
Missing
0
Null
3
Frequency
Value
Count
Frequency (%)
None
3
42.86%
19.442735,-99.201111
1
14.29%
10.642707,-71.612534
1
14.29%
37.789563,-122.400356
1
14.29%
33.670666,-117.841553
1
14.29%
"Missing"
3
%
date arrival
categorical
Unique
1
Unique (%)
Missing
1
Missing (%)
Datatypes
String
6
Integer
Decimal
Bool
Date
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
1980/04/10
6
85.71%
None
1
14.29%
"Missing"
1
%
last date seen
categorical
Unique
6
Unique (%)
Missing
1
Missing (%)
Datatypes
String
6
Integer
Decimal
Bool
Date
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
2016/09/10
1
14.29%
2015/08/10
1
14.29%
2014/07/10
1
14.29%
2013/06/10
1
14.29%
2012/05/10
1
14.29%
2011/04/10
1
14.29%
None
1
14.29%
"Missing"
1
%
attributes
array
Unique
6
Unique (%)
Missing
1
Missing (%)
Datatypes
String
Integer
Decimal
Bool
Date
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
[8.5344, 4300.0]
1
14.29%
[5.334, 2000.0]
1
14.29%
[7.9248, 4000.0]
1
14.29%
[3.9624, 1800.0]
1
14.29%
[, 5700.0]
1
14.29%
[91.44,]
1
14.29%
None
1
14.29%
"Missing"
1
%
Date Type
date
Unique
6
Unique (%)
Missing
1
Missing (%)
Datatypes
String
Integer
Decimal
Bool
Date
6
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
2016-09-10
1
14.29%
2015-08-10
1
14.29%
2014-06-24
1
14.29%
2013-06-24
1
14.29%
2012-05-10
1
14.29%
2011-04-10
1
14.29%
None
1
14.29%
"Missing"
1
%
timestamp
date
Unique
1
Unique (%)
Missing
1
Missing (%)
Datatypes
String
Integer
Decimal
Bool
Date
6
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
2014-06-24 00:00:00
6
85.71%
None
1
14.29%
"Missing"
1
%
Cybertronian
categorical
Unique
1
Unique (%)
Missing
1
Missing (%)
Datatypes
String
Integer
Decimal
Bool
6
Date
Missing
0
Null
1
Frequency
Value
Count
Frequency (%)
True
6
85.71%
None
1
14.29%
"Missing"
1
%
NullType
null
Unique
0
Unique (%)
Missing
7
Missing (%)
Datatypes
String
Integer
Decimal
Bool
Date
Missing
0
Null
7
Frequency
Value
Count
Frequency (%)
None
7
100.0%
"Missing"
7
%
INFO:optimus:run() executed in 68.73 sec
Out[12]:
<optimus.profiler.profiler.Profiler at 0x24242023b70>
In [15]:
# Despite the name, range() returns the column's min and max in a nested dict
# ({column: {'range': {'max': ..., 'min': ...}}}, see Out[15]), not a single
# max - min difference.
source_df.cols.range("height(ft)")
Out[15]:
{'height(ft)': {'range': {'max': 300, 'min': -28}}}
In [ ]:
Content source: ironmussa/Optimus
Similar notebooks: