This notebook creates the tests as Python code. All of these cells must be run to execute the tests.


In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell is executed (presumably so edits to the local sources imported below are
# picked up without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the directory two levels above this notebook to the import path.
# NOTE(review): presumably this is the repository root so that `optimus` is imported
# from the working tree rather than from an installed package — confirm.
sys.path.append("../..")

In [3]:
from optimus import Optimus
from optimus.helpers.test import Test


    You are using PySparkling of version 2.4.10, but your PySpark is of
    version 2.3.1. Please make sure Spark and PySparkling versions are compatible. 

In [4]:
# Create the Optimus entry point used by every later cell. Per the recorded log this
# starts or reuses a SparkSession/SparkContext; verbose=True presumably enables the
# INFO-level logging shown in the output.
op = Optimus(master='local', verbose=True)


INFO:optimus:Operative System:Windows
INFO:optimus:Just check that Spark and all necessary environments vars are present...
INFO:optimus:-----
INFO:optimus:SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
INFO:optimus:HADOOP_HOME=C:\opt\hadoop-2.7.7
INFO:optimus:PYSPARK_PYTHON=C:\Users\argenisleon\Anaconda3\python.exe
INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter
INFO:optimus:PYSPARK_SUBMIT_ARGS=--jars "file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar,file:///C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar" --driver-class-path "C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/RedshiftJDBC42-1.2.16.1027.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mysql-connector-java-8.0.16.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/ojdbc8.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/postgresql-42.2.5.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/presto-jdbc-0.224.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/spark-cassandra-connector_2.11-2.4.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/sqlite-jdbc-3.27.2.1.jar;C:/Users/argenisleon/Documents/Optimus/optimus/jars/mssql-jdbc-7.4.1.jre8.jar" --conf "spark.sql.catalogImplementation=hive" pyspark-shell
INFO:optimus:JAVA_HOME=C:\java
INFO:optimus:Pyarrow Installed
INFO:optimus:-----
INFO:optimus:Starting or getting SparkSession and SparkContext...
INFO:optimus:Spark Version:2.3.1
INFO:optimus:
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              
INFO:optimus:Transform and Roll out...
C:/Users/argenisleon/Documents/Optimus/optimus/../parse/infer.py
INFO:optimus:Optimus successfully imported. Have fun :).
INFO:optimus:Config.ini not found

In [5]:
import pandas as pd
from pyspark.sql.types import *
from datetime import date, datetime


# Column schema of the fixture DataFrame. An entry is either a (name, type) pair or a
# bare column name; in the recorded output the bare names end up as array<string>
# ("japanese name") or string columns.
cols = [
    ("names", "str"),
    ("height(ft)", ShortType()),
    ("function", "str"),
    ("rank", ByteType()),
    ("age", "int"),
    ("weight(t)", "float"),
    "japanese name",
    "last position seen",
    "date arrival",
    "last date seen",
    ("attributes", ArrayType(FloatType())),
    ("Date Type", DateType()),
    ("timestamp", TimestampType()),
    ("Cybertronian", BooleanType()),
    ("function(binary)", BinaryType()),
    ("NullType", NullType()),
]

# One tuple per row, values in the same order as `cols` (16 fields each).
# The rows deliberately mix clean values, special characters, None entries,
# partially-null arrays and a final row that is entirely None.
rows = [
    (
        "Optimus", -28, "Leader", 10, 5000000, 4.30,
        ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10", "2016/09/10",
        [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True,
        bytearray("Leader", "utf-8"), None,
    ),
    (
        "bumbl#ebéé  ", 17, "Espionage", 7, 5000000, 2.0,
        ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10", "2015/08/10",
        [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True,
        bytearray("Espionage", "utf-8"), None,
    ),
    (
        "ironhide&", 26, "Security", 7, 5000000, 4.0,
        ["Roadbuster"], "37.789563,-122.400356", "1980/04/10", "2014/07/10",
        [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True,
        bytearray("Security", "utf-8"), None,
    ),
    (
        "Jazz", 13, "First Lieutenant", 8, 5000000, 1.80,
        ["Meister"], "33.670666,-117.841553", "1980/04/10", "2013/06/10",
        [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
        bytearray("First Lieutenant", "utf-8"), None,
    ),
    (
        "Megatron", None, "None", 10, 5000000, 5.70,
        ["Megatron"], None, "1980/04/10", "2012/05/10",
        [None, 5700.0], date(2012, 5, 10), datetime(2014, 6, 24), True,
        bytearray("None", "utf-8"), None,
    ),
    (
        "Metroplex_)^$", 300, "Battle Station", 8, 5000000, None,
        ["Metroflex"], None, "1980/04/10", "2011/04/10",
        [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True,
        bytearray("Battle Station", "utf-8"), None,
    ),
    # Fully-null row: one None per column.
    (None,) * 16,
]

# Build the fixture DataFrame and display it as the cell output.
source_df = op.create.df(cols, rows)
source_df.table()


Viewing 7 of 7 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
Date Type
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optimus
-28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
Jazz
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
Metroplex_)^$
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Viewing 7 of 7 rows / 16 columns
1 partition(s)

End Init Section

Profiler


In [6]:
from pyspark.ml.linalg import Vectors

In [7]:
import re

# Scratch check of how re.escape treats a single quote; in the recorded output the
# string comes back unchanged.
a = "a'a"
re.escape(a)


Out[7]:
"a'a"

In [8]:
# Print the plain string (no repr quoting) to compare against the re.escape result.
print(a)


a'a

In [9]:
# Test generator for the profiler suite, built on the fixture DataFrame.
# `imports` is a list of Python source lines; presumably they are emitted at the top of
# the generated test module (confirm against optimus.helpers.test.Test).
# NOTE(review): `null` and `true` aliases are defined for JSON-style literals but there
# is no `false = False`; the JSON recorded in this notebook contains no `false` — confirm
# that stays true for every generated test.
t = Test(
    op,
    source_df,
    "df_profiler",
    imports=[
        "from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
        "import numpy as np",
        "nan = np.nan",
        "import datetime",
        "from pyspark.sql import functions as F",
        "from optimus.profiler.profiler import Profiler",
        "null = None",
        "true = True",
        "p= Profiler()",
    ],
    path="df_profiler",
    final_path="..",
)

In [10]:
from pyspark.sql import functions as F


def func(col_name, attrs):
    """Build a column expression that multiplies ``col_name`` by two.

    ``F`` is ``pyspark.sql.functions`` as imported at the top of this cell.
    ``attrs`` is accepted but not used by this implementation.
    """
    doubled = F.col(col_name) * 2
    return doubled


# Column-name shortcuts into the fixture DataFrame.
# NOTE(review): numeric_col_B and numeric_col_C both point at "rank" — confirm intended.
numeric_col, numeric_col_B, numeric_col_C = "height(ft)", "rank", "rank"
string_col = "function"
date_col, date_col_B = "date arrival", "last date seen"
new_col = "new col"
array_col = "attributes"

In [11]:
from optimus.profiler.profiler import Profiler
# Profiler instance used by the cells below; it mirrors the "p= Profiler()" line that
# is passed in the `imports` list of the Test(...) call.
p= Profiler()


INFO:optimus:Config.ini not found

In [13]:
# Profile source_df with the "*" column selector (the recorded log shows all 16 columns
# being processed) and render the report. The recorded run took about 70 s, and run()
# returned the Profiler itself, hence the object repr shown as the cell result.
# NOTE(review): the recorded report lists "Optim'us" while the rows defined in this
# notebook use "Optimus", and the execution counts are not sequential — the outputs look
# stale; re-run with Restart Kernel -> Run All before trusting them.
p.run(source_df, "*")


INFO:optimus:Processing Stats For columns...
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to

Overview

Dataset info

Number of columns 16
Number of rows 7
Total Missing (%) 26
Total size in memory 44.9 MB

Column types

Categorical 0
Numeric 0
Date 2
Array 0
Not available 5

names

categorical
Unique 5
Unique (%) 71.43
Missing 1
Missing (%) 14.29

Datatypes

String 6
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
Optim'us 1 14.29%
bumbl#ebéé 1 14.29%
ironhide& 1 14.29%
Jazz 1 14.29%
Megatron 1 14.29%
Metroplex_)^$ 1 14.29%
None 1 14.29%
"Missing" 1 14.29%

height(ft)

numeric
Unique 5
Unique (%) 71.43
Missing 2
Missing (%) 28.57

Datatypes

String 0
Integer 5
Decimal 0
Bool 0
Date 0
Missing 0
Null 2

Basic Stats

Mean 65.6
Minimum -28
Maximum 300
Zeros(%) 0

Quantile statistics

Minimum -28
5-th percentile -28
Q1 13
Median 17
Q3 26
95-th percentile 300
Maximum 300
Range 328
Interquartile range 13

Descriptive statistics

Standard deviation 132.66612
Coef of variation 2.02235
Kurtosis 0.13863
Mean 65.6
MAD 9
Skewness 1.4049
Sum 328
Variance 17600.3

function

categorical
Unique 6
Unique (%) 85.71
Missing 1
Missing (%) 14.29

Datatypes

String 6
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
Leader 1 14.29%
Espionage 1 14.29%
Security 1 14.29%
First Lieutenant 1 14.29%
None 1 14.29%
Battle Station 1 14.29%
None 1 14.29%
"Missing" 1 14.29%

rank

numeric
Unique 3
Unique (%) 42.86
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 6
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Basic Stats

Mean 8.33333
Minimum 7
Maximum 10
Zeros(%) 0

Quantile statistics

Minimum 7
5-th percentile 7
Q1 7
Median 8
Q3 10
95-th percentile 10
Maximum 10
Range 3
Interquartile range 3

Descriptive statistics

Standard deviation 1.36626
Coef of variation 0.16395
Kurtosis -1.5
Mean 8.33333
MAD 1
Skewness 0.3818
Sum 50
Variance 1.86667

age

numeric
Unique 1
Unique (%) 14.29
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 6
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Basic Stats

Mean 5000000.0
Minimum 5000000
Maximum 5000000
Zeros(%) 0

Quantile statistics

Minimum 5000000
5-th percentile 5000000
Q1 5000000
Median 5000000
Q3 5000000
95-th percentile 5000000
Maximum 5000000
Range 0
Interquartile range 0

Descriptive statistics

Standard deviation 0.0
Coef of variation 0.0
Kurtosis nan
Mean 5000000.0
MAD 0
Skewness nan
Sum 30000000
Variance 0.0

weight(t)

numeric
Unique 5
Unique (%) 71.43
Missing 2
Missing (%) 28.57

Datatypes

String 0
Integer 0
Decimal 5
Bool 0
Date 0
Missing 0
Null 2

Basic Stats

Mean 3.56
Minimum 1.8
Maximum 5.7
Zeros(%) 0

Quantile statistics

Minimum 1.8
5-th percentile 1.7999999523162842
Q1 2.0
Median 4.0
Q3 4.300000190734863
95-th percentile 5.699999809265137
Maximum 5.7
Range 3.9000000000000004
Interquartile range 2.3000001907348633

Descriptive statistics

Standard deviation 1.64712
Coef of variation 0.46267
Kurtosis -1.43641
Mean 3.56
MAD 1.7
Skewness 0.06521
Sum 17.8
Variance 2.713

japanese name

null
Unique 6
Unique (%) 85.71
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
[Inochi, Convoy] 1 14.29%
[Bumble, Goldback] 1 14.29%
[Roadbuster] 1 14.29%
[Meister] 1 14.29%
[Megatron] 1 14.29%
[Metroflex] 1 14.29%
None 1 14.29%
"Missing" 1 14.29%

last position seen

categorical
Unique 4
Unique (%) 57.14
Missing 3
Missing (%) 42.86

Datatypes

String 4
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 3

Frequency

Value Count Frequency (%)
None 3 42.86%
19.442735,-99.201111 1 14.29%
10.642707,-71.612534 1 14.29%
37.789563,-122.400356 1 14.29%
33.670666,-117.841553 1 14.29%
"Missing" 3 42.86%

date arrival

categorical
Unique 1
Unique (%) 14.29
Missing 1
Missing (%) 14.29

Datatypes

String 6
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
1980/04/10 6 85.71%
None 1 14.29%
"Missing" 1 14.29%

last date seen

categorical
Unique 6
Unique (%) 85.71
Missing 1
Missing (%) 14.29

Datatypes

String 6
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
2016/09/10 1 14.29%
2015/08/10 1 14.29%
2014/07/10 1 14.29%
2013/06/10 1 14.29%
2012/05/10 1 14.29%
2011/04/10 1 14.29%
None 1 14.29%
"Missing" 1 14.29%

attributes

null
Unique 6
Unique (%) 85.71
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
[8.5344, 4300.0] 1 14.29%
[5.334, 2000.0] 1 14.29%
[7.9248, 4000.0] 1 14.29%
[3.9624, 1800.0] 1 14.29%
[, 5700.0] 1 14.29%
[91.44,] 1 14.29%
None 1 14.29%
"Missing" 1 14.29%

Date Type

date
Unique 6
Unique (%) 85.71
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 0
Decimal 0
Bool 0
Date 6
Missing 0
Null 1

Frequency

Value Count Frequency (%)
2016-09-10 1 14.29%
2015-08-10 1 14.29%
2014-06-24 1 14.29%
2013-06-24 1 14.29%
2012-05-10 1 14.29%
2011-04-10 1 14.29%
None 1 14.29%
"Missing" 1 14.29%

timestamp

date
Unique 1
Unique (%) 14.29
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 0
Decimal 0
Bool 0
Date 6
Missing 0
Null 1

Frequency

Value Count Frequency (%)
2014-06-24 00:00:00 6 85.71%
None 1 14.29%
"Missing" 1 14.29%

Cybertronian

null
Unique 1
Unique (%) 14.29
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
True 6 85.71%
None 1 14.29%
"Missing" 1 14.29%

function(binary)

null
Unique 6
Unique (%) 85.71
Missing 1
Missing (%) 14.29

Datatypes

String 0
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 1

Frequency

Value Count Frequency (%)
Leader 1 14.29%
Espionage 1 14.29%
Security 1 14.29%
First Lieutenant 1 14.29%
None 1 14.29%
Battle Station 1 14.29%
None 1 14.29%
"Missing" 1 14.29%

NullType

null
Unique 0
Unique (%) 0.0
Missing 7
Missing (%) 100.0

Datatypes

String 0
Integer 0
Decimal 0
Bool 0
Date 0
Missing 0
Null 7

Frequency

Value Count Frequency (%)
None 7 100.0%
"Missing" 7 100.0%
Viewing 7 of 7 rows / 16 columns
1 partition(s)
names
1 (string)
nullable
height(ft)
2 (smallint)
nullable
function
3 (string)
nullable
rank
4 (tinyint)
nullable
age
5 (int)
nullable
weight(t)
6 (float)
nullable
japanese name
7 (array<string>)
nullable
last position seen
8 (string)
nullable
date arrival
9 (string)
nullable
last date seen
10 (string)
nullable
attributes
11 (array<float>)
nullable
Date Type
12 (date)
nullable
timestamp
13 (timestamp)
nullable
Cybertronian
14 (boolean)
nullable
function(binary)
15 (binary)
nullable
NullType
16 (null)
nullable
Optim'us
-28
Leader
10
5000000
4.300000190734863
['Inochi',⋅'Convoy']
19.442735,-99.201111
1980/04/10
2016/09/10
[8.53439998626709,⋅4300.0]
2016-09-10
2014-06-24⋅00:00:00
True
bytearray(b'Leader')
None
bumbl#ebéé⋅⋅
17
Espionage
7
5000000
2.0
['Bumble',⋅'Goldback']
10.642707,-71.612534
1980/04/10
2015/08/10
[5.334000110626221,⋅2000.0]
2015-08-10
2014-06-24⋅00:00:00
True
bytearray(b'Espionage')
None
ironhide&
26
Security
7
5000000
4.0
['Roadbuster']
37.789563,-122.400356
1980/04/10
2014/07/10
[7.924799919128418,⋅4000.0]
2014-06-24
2014-06-24⋅00:00:00
True
bytearray(b'Security')
None
Jazz
13
First⋅Lieutenant
8
5000000
1.7999999523162842
['Meister']
33.670666,-117.841553
1980/04/10
2013/06/10
[3.962399959564209,⋅1800.0]
2013-06-24
2014-06-24⋅00:00:00
True
bytearray(b'First⋅Lieutenant')
None
Megatron
None
None
10
5000000
5.699999809265137
['Megatron']
None
1980/04/10
2012/05/10
[None,⋅5700.0]
2012-05-10
2014-06-24⋅00:00:00
True
bytearray(b'None')
None
Metroplex_)^$
300
Battle⋅Station
8
5000000
None
['Metroflex']
None
1980/04/10
2011/04/10
[91.44000244140625,⋅None]
2011-04-10
2014-06-24⋅00:00:00
True
bytearray(b'Battle⋅Station')
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
Viewing 7 of 7 rows / 16 columns
1 partition(s)
INFO:optimus:run() executed in 69.73 sec
Out[13]:
<optimus.profiler.profiler.Profiler at 0x1cc2093b7b8>

In [44]:
# Register the test_dataset() test function (name as shown in the recorded output) by
# calling the "dataset" method on the profiler `p` with (source_df, "*"); the result is
# printed as JSON below.
# NOTE(review): the positional arguments appear to be (object, method name, name suffix,
# output format, additional method, *method args) — confirm against Test.create.
t.create(p, "dataset", None, 'json', None, source_df,"*")


Creating test_dataset() test function...
INFO:optimus:test_dataset()
INFO:optimus:Processing Stats For columns...
{'names': {'string': 6, 'null': 1}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'string': 4, 'null': 3}, 'date arrival': {'string': 6, 'null': 1}, 'last date seen': {'string': 6, 'null': 1}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
{"count_types": {"string": 5, "int": 3, "decimal": 1, "array": 2, "date": 2, "boolean": 1, "binary": 1, "null": 1, "numeric": 0, "categorical": 0}, "total_count_dtypes": 8, "dtypes_list": ["string", "int", "decimal", "array", "date", "boolean", "binary", "null"], "columns": {"names": {"stats": {"count_uniques": 5, "min": "Jazz", "max": "ironhide&", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 71.43}, "frequency": [{"value": "Optimus", "count": 1, "percentage": 14.29}, {"value": "bumbl#eb\u00e9\u00e9  ", "count": 1, "percentage": 14.29}, {"value": "ironhide&", "count": 1, "percentage": 14.29}, {"value": "Jazz", "count": 1, "percentage": 14.29}, {"value": "Megatron", "count": 1, "percentage": 14.29}, {"value": "Metroplex_)^$", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "names", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "height(ft)": {"stats": {"count_uniques": 5, "min": -28, "max": 300, "stddev": 132.66612, "kurtosis": 0.13863, "mean": 65.6, "skewness": 1.4049, "sum": 328, "variance": 17600.3, "zeros": 0, "percentile": {"0.75": 26, "0.95": 300, "0.05": -28, "0.25": 13, "0.5": 17}, "count_na": 2, "hist": [{"count": 4.0, "lower": -28.0, "upper": 54.0}, {"count": 0.0, "lower": 54.0, "upper": 136.0}, {"count": 0.0, "lower": 136.0, "upper": 218.0}, {"count": 0.0, "lower": 218.0, "upper": 300.0}], "range": 328, "median": 17, "interquartile_range": 13, "coef_variation": 2.02235, "mad": 9, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "height(ft)", "column_dtype": "int", "dtypes_stats": {"null": 2, "missing": 0, "int": 5}, "column_type": "numeric"}, "function": {"stats": {"count_uniques": 6, "min": "Battle Station", "max": "Security", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, 
"zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, {"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "rank": {"stats": {"count_uniques": 3, "min": 7, "max": 10, "stddev": 1.36626, "kurtosis": -1.5, "mean": 8.33333, "skewness": 0.3818, "sum": 50, "variance": 1.86667, "zeros": 0, "percentile": {"0.75": 10, "0.95": 10, "0.05": 7, "0.25": 7, "0.5": 8}, "count_na": 1, "hist": [{"count": 4.0, "lower": 7.0, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 10.0}], "range": 3, "median": 8, "interquartile_range": 3, "coef_variation": 0.16395, "mad": 1, "p_count_na": 14.29, "p_count_uniques": 42.86}, "name": "rank", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "age": {"stats": {"count_uniques": 1, "min": 5000000, "max": 5000000, "stddev": 0.0, "kurtosis": null, "mean": 5000000.0, "skewness": null, "sum": 30000000, "variance": 0.0, "zeros": 0, "percentile": {"0.75": 5000000, "0.95": 5000000, "0.05": 5000000, "0.25": 5000000, "0.5": 5000000}, "count_na": 1, "hist": [{"count": 6, "lower": 5000000, "upper": 5000001}], "range": 0, "median": 5000000, "interquartile_range": 0, "coef_variation": 0.0, "mad": 0, "p_count_na": 14.29, "p_count_uniques": 14.29}, "name": "age", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "weight(t)": {"stats": {"count_uniques": 5, "min": 1.8, "max": 5.7, "stddev": 1.64712, "kurtosis": -1.43641, "mean": 3.56, "skewness": 0.06521, 
"sum": 17.8, "variance": 2.713, "zeros": 0, "percentile": {"0.75": 4.300000190734863, "0.95": 5.699999809265137, "0.05": 1.7999999523162842, "0.25": 2.0, "0.5": 4.0}, "count_na": 2, "hist": [{"count": 1.0, "lower": 1.8, "upper": 2.78}, {"count": 0.0, "lower": 2.78, "upper": 3.75}, {"count": 2.0, "lower": 3.75, "upper": 4.73}, {"count": 1.0, "lower": 4.73, "upper": 5.7}], "range": 3.9000000000000004, "median": 4.0, "interquartile_range": 2.3000001907348633, "coef_variation": 0.46267, "mad": 1.7, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "weight(t)", "column_dtype": "decimal", "dtypes_stats": {"null": 2, "missing": 0, "decimal": 5}, "column_type": "numeric"}, "japanese name": {"stats": {"count_uniques": 6, "min": ["Bumble", "Goldback"], "max": ["Roadbuster"], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[Inochi, Convoy]", "count": 1, "percentage": 14.29}, {"value": "[Bumble, Goldback]", "count": 1, "percentage": 14.29}, {"value": "[Roadbuster]", "count": 1, "percentage": 14.29}, {"value": "[Meister]", "count": 1, "percentage": 14.29}, {"value": "[Megatron]", "count": 1, "percentage": 14.29}, {"value": "[Metroflex]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "japanese name", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "last position seen": {"stats": {"count_uniques": 4, "min": "10.642707,-71.612534", "max": "37.789563,-122.400356", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 3, "p_count_na": 42.86, "p_count_uniques": 57.14}, "frequency": [{"value": null, "count": 3, "percentage": 42.86}, {"value": "19.442735,-99.201111", "count": 1, "percentage": 14.29}, {"value": "10.642707,-71.612534", "count": 1, "percentage": 14.29}, {"value": "37.789563,-122.400356", "count": 1, "percentage": 14.29}, {"value": "33.670666,-117.841553", 
"count": 1, "percentage": 14.29}], "name": "last position seen", "column_dtype": "string", "dtypes_stats": {"null": 3, "missing": 0, "string": 4}, "column_type": "categorical"}, "date arrival": {"stats": {"count_uniques": 1, "min": "1980/04/10", "max": "1980/04/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "1980/04/10", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "date arrival", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "last date seen": {"stats": {"count_uniques": 6, "min": "2011/04/10", "max": "2016/09/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016/09/10", "count": 1, "percentage": 14.29}, {"value": "2015/08/10", "count": 1, "percentage": 14.29}, {"value": "2014/07/10", "count": 1, "percentage": 14.29}, {"value": "2013/06/10", "count": 1, "percentage": 14.29}, {"value": "2012/05/10", "count": 1, "percentage": 14.29}, {"value": "2011/04/10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "last date seen", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "attributes": {"stats": {"count_uniques": 6, "min": [null, 5700.0], "max": [91.44000244140625, null], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[8.5344, 4300.0]", "count": 1, "percentage": 14.29}, {"value": "[5.334, 2000.0]", "count": 1, "percentage": 14.29}, {"value": "[7.9248, 4000.0]", "count": 1, "percentage": 14.29}, {"value": "[3.9624, 1800.0]", "count": 1, "percentage": 14.29}, {"value": "[, 5700.0]", "count": 1, "percentage": 
14.29}, {"value": "[91.44,]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "attributes", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "Date Type": {"stats": {"count_uniques": 6, "min": "2011-04-10", "max": "2016-09-10", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016-09-10", "count": 1, "percentage": 14.29}, {"value": "2015-08-10", "count": 1, "percentage": 14.29}, {"value": "2014-06-24", "count": 1, "percentage": 14.29}, {"value": "2013-06-24", "count": 1, "percentage": 14.29}, {"value": "2012-05-10", "count": 1, "percentage": 14.29}, {"value": "2011-04-10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Date Type", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "timestamp": {"stats": {"count_uniques": 1, "min": "2014-06-24 00:00:00", "max": "2014-06-24 00:00:00", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "2014-06-24 00:00:00", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "timestamp", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "Cybertronian": {"stats": {"count_uniques": 1, "min": 1, "max": 1, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": true, "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Cybertronian", "column_dtype": "boolean", "dtypes_stats": {"null": 1, "missing": 0, "boolean": 6}, "column_type": "categorical"}, "function(binary)": {"stats": {"count_uniques": 6, "min": null, "max": null, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, 
{"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function(binary)", "column_dtype": "binary", "dtypes_stats": {"null": 1, "missing": 0, "binary": 6}, "column_type": "binary"}, "NullType": {"stats": {"count_uniques": 0, "min": null, "max": null, "count_na": 7, "p_count_na": 100.0, "p_count_uniques": 0.0}, "frequency": [{"value": null, "count": 7, "percentage": 100.0}], "name": "NullType", "column_dtype": "null", "dtypes_stats": {"null": 7, "missing": 0}, "column_type": "null"}}, "name": null, "file_name": null, "summary": {"cols_count": 16, "rows_count": 7, "size": "52.5 MB", "sample_size": 10000, "missing_count": 26, "p_missing": 371.43}, "sample": {"columns": [{"title": "names"}, {"title": "height(ft)"}, {"title": "function"}, {"title": "rank"}, {"title": "age"}, {"title": "weight(t)"}, {"title": "japanese name"}, {"title": "last position seen"}, {"title": "date arrival"}, {"title": "last date seen"}, {"title": "attributes"}, {"title": "Date Type"}, {"title": "timestamp"}, {"title": "Cybertronian"}, {"title": "function(binary)"}, {"title": "NullType"}], "value": [["Optimus", -28, "Leader", 10, 5000000, 4.300000190734863, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10", "2016/09/10", [8.53439998626709, 4300.0], "2016-09-10", "2014-06-24 00:00:00", true, null, null], ["bumbl#eb\u00e9\u00e9  ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10", "2015/08/10", [5.334000110626221, 2000.0], "2015-08-10", "2014-06-24 00:00:00", true, null, null], ["ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10", "2014/07/10", [7.924799919128418, 4000.0], "2014-06-24", "2014-06-24 00:00:00", true, null, null], ["Jazz", 13, "First Lieutenant", 
8, 5000000, 1.7999999523162842, ["Meister"], "33.670666,-117.841553", "1980/04/10", "2013/06/10", [3.962399959564209, 1800.0], "2013-06-24", "2014-06-24 00:00:00", true, null, null], ["Megatron", null, "None", 10, 5000000, 5.699999809265137, ["Megatron"], null, "1980/04/10", "2012/05/10", [null, 5700.0], "2012-05-10", "2014-06-24 00:00:00", true, null, null], ["Metroplex_)^$", 300, "Battle Station", 8, 5000000, null, ["Metroflex"], null, "1980/04/10", "2011/04/10", [91.44000244140625, null], "2011-04-10", "2014-06-24 00:00:00", true, null, null], [null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]}}

In [45]:
# Emit the test file; the recorded output below shows ../test_df_profiler.py being created.
# Presumably this includes the test function built by the preceding t.create(...) cell — confirm against Test.run.
t.run()


Creating file ../test_df_profiler.py
Done

In [46]:
# Column name -> expected format (a date mask or a regular expression),
# forwarded to the test builder through the `mismatch` keyword.
# NOTE(review): presumably consumed by p.dataset(...) — confirm against Test.create.
mismatch = {
    "names": "dd/mm/yyyy",
    "height(ft)": r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$',
    "function": "yyyy-mm-dd",
}

# The recorded output names the generated function test_dataset_mismatch().
t.create(
    p,
    "dataset",
    "mismatch",
    'json',
    None,
    source_df,
    "*",
    mismatch=mismatch,
)


Creating test_dataset_mismatch() test function...
INFO:optimus:test_dataset_mismatch()
INFO:optimus:Processing Stats For columns...
{'names': {'string': 6, 'null': 1}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'string': 4, 'null': 3}, 'date arrival': {'string': 6, 'null': 1}, 'last date seen': {'string': 6, 'null': 1}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
{"count_types": {"string": 5, "int": 3, "decimal": 1, "array": 2, "date": 2, "boolean": 1, "binary": 1, "null": 1, "numeric": 0, "categorical": 0}, "total_count_dtypes": 8, "dtypes_list": ["string", "int", "decimal", "array", "date", "boolean", "binary", "null"], "columns": {"names": {"stats": {"count_uniques": 5, "min": "Jazz", "max": "ironhide&", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 71.43}, "frequency": [{"value": "Optimus", "count": 1, "percentage": 14.29}, {"value": "bumbl#eb\u00e9\u00e9  ", "count": 1, "percentage": 14.29}, {"value": "ironhide&", "count": 1, "percentage": 14.29}, {"value": "Jazz", "count": 1, "percentage": 14.29}, {"value": "Megatron", "count": 1, "percentage": 14.29}, {"value": "Metroplex_)^$", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "names", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "height(ft)": {"stats": {"count_uniques": 5, "min": -28, "max": 300, "stddev": 132.66612, "kurtosis": 0.13863, "mean": 65.6, "skewness": 1.4049, "sum": 328, "variance": 17600.3, "zeros": 0, "percentile": {"0.75": 26, "0.95": 300, "0.05": -28, "0.25": 13, "0.5": 17}, "count_na": 2, "hist": [{"count": 4.0, "lower": -28.0, "upper": 54.0}, {"count": 0.0, "lower": 54.0, "upper": 136.0}, {"count": 0.0, "lower": 136.0, "upper": 218.0}, {"count": 0.0, "lower": 218.0, "upper": 300.0}], "range": 328, "median": 17, "interquartile_range": 13, "coef_variation": 2.02235, "mad": 9, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "height(ft)", "column_dtype": "int", "dtypes_stats": {"null": 2, "missing": 0, "int": 5}, "column_type": "numeric"}, "function": {"stats": {"count_uniques": 6, "min": "Battle Station", "max": "Security", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, 
"zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, {"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "rank": {"stats": {"count_uniques": 3, "min": 7, "max": 10, "stddev": 1.36626, "kurtosis": -1.5, "mean": 8.33333, "skewness": 0.3818, "sum": 50, "variance": 1.86667, "zeros": 0, "percentile": {"0.75": 10, "0.95": 10, "0.05": 7, "0.25": 7, "0.5": 8}, "count_na": 1, "hist": [{"count": 4.0, "lower": 7.0, "upper": 8.5}, {"count": 0.0, "lower": 8.5, "upper": 10.0}], "range": 3, "median": 8, "interquartile_range": 3, "coef_variation": 0.16395, "mad": 1, "p_count_na": 14.29, "p_count_uniques": 42.86}, "name": "rank", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "age": {"stats": {"count_uniques": 1, "min": 5000000, "max": 5000000, "stddev": 0.0, "kurtosis": null, "mean": 5000000.0, "skewness": null, "sum": 30000000, "variance": 0.0, "zeros": 0, "percentile": {"0.75": 5000000, "0.95": 5000000, "0.05": 5000000, "0.25": 5000000, "0.5": 5000000}, "count_na": 1, "hist": [{"count": 6, "lower": 5000000, "upper": 5000001}], "range": 0, "median": 5000000, "interquartile_range": 0, "coef_variation": 0.0, "mad": 0, "p_count_na": 14.29, "p_count_uniques": 14.29}, "name": "age", "column_dtype": "int", "dtypes_stats": {"null": 1, "missing": 0, "int": 6}, "column_type": "numeric"}, "weight(t)": {"stats": {"count_uniques": 5, "min": 1.8, "max": 5.7, "stddev": 1.64712, "kurtosis": -1.43641, "mean": 3.56, "skewness": 0.06521, 
"sum": 17.8, "variance": 2.713, "zeros": 0, "percentile": {"0.75": 4.300000190734863, "0.95": 5.699999809265137, "0.05": 1.7999999523162842, "0.25": 2.0, "0.5": 4.0}, "count_na": 2, "hist": [{"count": 1.0, "lower": 1.8, "upper": 2.78}, {"count": 0.0, "lower": 2.78, "upper": 3.75}, {"count": 2.0, "lower": 3.75, "upper": 4.73}, {"count": 1.0, "lower": 4.73, "upper": 5.7}], "range": 3.9000000000000004, "median": 4.0, "interquartile_range": 2.3000001907348633, "coef_variation": 0.46267, "mad": 1.7, "p_count_na": 28.57, "p_count_uniques": 71.43}, "name": "weight(t)", "column_dtype": "decimal", "dtypes_stats": {"null": 2, "missing": 0, "decimal": 5}, "column_type": "numeric"}, "japanese name": {"stats": {"count_uniques": 6, "min": ["Bumble", "Goldback"], "max": ["Roadbuster"], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[Inochi, Convoy]", "count": 1, "percentage": 14.29}, {"value": "[Bumble, Goldback]", "count": 1, "percentage": 14.29}, {"value": "[Roadbuster]", "count": 1, "percentage": 14.29}, {"value": "[Meister]", "count": 1, "percentage": 14.29}, {"value": "[Megatron]", "count": 1, "percentage": 14.29}, {"value": "[Metroflex]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "japanese name", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "last position seen": {"stats": {"count_uniques": 4, "min": "10.642707,-71.612534", "max": "37.789563,-122.400356", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 3, "p_count_na": 42.86, "p_count_uniques": 57.14}, "frequency": [{"value": null, "count": 3, "percentage": 42.86}, {"value": "19.442735,-99.201111", "count": 1, "percentage": 14.29}, {"value": "10.642707,-71.612534", "count": 1, "percentage": 14.29}, {"value": "37.789563,-122.400356", "count": 1, "percentage": 14.29}, {"value": "33.670666,-117.841553", 
"count": 1, "percentage": 14.29}], "name": "last position seen", "column_dtype": "string", "dtypes_stats": {"null": 3, "missing": 0, "string": 4}, "column_type": "categorical"}, "date arrival": {"stats": {"count_uniques": 1, "min": "1980/04/10", "max": "1980/04/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "1980/04/10", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "date arrival", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "last date seen": {"stats": {"count_uniques": 6, "min": "2011/04/10", "max": "2016/09/10", "stddev": null, "kurtosis": null, "mean": null, "skewness": null, "sum": null, "variance": null, "zeros": 0, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016/09/10", "count": 1, "percentage": 14.29}, {"value": "2015/08/10", "count": 1, "percentage": 14.29}, {"value": "2014/07/10", "count": 1, "percentage": 14.29}, {"value": "2013/06/10", "count": 1, "percentage": 14.29}, {"value": "2012/05/10", "count": 1, "percentage": 14.29}, {"value": "2011/04/10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "last date seen", "column_dtype": "string", "dtypes_stats": {"null": 1, "missing": 0, "string": 6}, "column_type": "categorical"}, "attributes": {"stats": {"count_uniques": 6, "min": [null, 5700.0], "max": [91.44000244140625, null], "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "[8.5344, 4300.0]", "count": 1, "percentage": 14.29}, {"value": "[5.334, 2000.0]", "count": 1, "percentage": 14.29}, {"value": "[7.9248, 4000.0]", "count": 1, "percentage": 14.29}, {"value": "[3.9624, 1800.0]", "count": 1, "percentage": 14.29}, {"value": "[, 5700.0]", "count": 1, "percentage": 
14.29}, {"value": "[91.44,]", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "attributes", "column_dtype": "array", "dtypes_stats": {"null": 1, "missing": 0, "array": 6}, "column_type": "array"}, "Date Type": {"stats": {"count_uniques": 6, "min": "2011-04-10", "max": "2016-09-10", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "2016-09-10", "count": 1, "percentage": 14.29}, {"value": "2015-08-10", "count": 1, "percentage": 14.29}, {"value": "2014-06-24", "count": 1, "percentage": 14.29}, {"value": "2013-06-24", "count": 1, "percentage": 14.29}, {"value": "2012-05-10", "count": 1, "percentage": 14.29}, {"value": "2011-04-10", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Date Type", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "timestamp": {"stats": {"count_uniques": 1, "min": "2014-06-24 00:00:00", "max": "2014-06-24 00:00:00", "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": "2014-06-24 00:00:00", "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "timestamp", "column_dtype": "date", "dtypes_stats": {"null": 1, "missing": 0, "date": 6}, "column_type": "date"}, "Cybertronian": {"stats": {"count_uniques": 1, "min": 1, "max": 1, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 14.29}, "frequency": [{"value": true, "count": 6, "percentage": 85.71}, {"value": null, "count": 1, "percentage": 14.29}], "name": "Cybertronian", "column_dtype": "boolean", "dtypes_stats": {"null": 1, "missing": 0, "boolean": 6}, "column_type": "categorical"}, "function(binary)": {"stats": {"count_uniques": 6, "min": null, "max": null, "count_na": 1, "p_count_na": 14.29, "p_count_uniques": 85.71}, "frequency": [{"value": "Leader", "count": 1, "percentage": 14.29}, {"value": "Espionage", "count": 1, "percentage": 14.29}, 
{"value": "Security", "count": 1, "percentage": 14.29}, {"value": "First Lieutenant", "count": 1, "percentage": 14.29}, {"value": "None", "count": 1, "percentage": 14.29}, {"value": "Battle Station", "count": 1, "percentage": 14.29}, {"value": null, "count": 1, "percentage": 14.29}], "name": "function(binary)", "column_dtype": "binary", "dtypes_stats": {"null": 1, "missing": 0, "binary": 6}, "column_type": "binary"}, "NullType": {"stats": {"count_uniques": 0, "min": null, "max": null, "count_na": 7, "p_count_na": 100.0, "p_count_uniques": 0.0}, "frequency": [{"value": null, "count": 7, "percentage": 100.0}], "name": "NullType", "column_dtype": "null", "dtypes_stats": {"null": 7, "missing": 0}, "column_type": "null"}}, "name": null, "file_name": null, "summary": {"cols_count": 16, "rows_count": 7, "size": "51.6 MB", "sample_size": 10000, "missing_count": 26, "p_missing": 371.43}, "sample": {"columns": [{"title": "names"}, {"title": "height(ft)"}, {"title": "function"}, {"title": "rank"}, {"title": "age"}, {"title": "weight(t)"}, {"title": "japanese name"}, {"title": "last position seen"}, {"title": "date arrival"}, {"title": "last date seen"}, {"title": "attributes"}, {"title": "Date Type"}, {"title": "timestamp"}, {"title": "Cybertronian"}, {"title": "function(binary)"}, {"title": "NullType"}], "value": [["Optimus", -28, "Leader", 10, 5000000, 4.300000190734863, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10", "2016/09/10", [8.53439998626709, 4300.0], "2016-09-10", "2014-06-24 00:00:00", true, null, null], ["bumbl#eb\u00e9\u00e9  ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10", "2015/08/10", [5.334000110626221, 2000.0], "2015-08-10", "2014-06-24 00:00:00", true, null, null], ["ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10", "2014/07/10", [7.924799919128418, 4000.0], "2014-06-24", "2014-06-24 00:00:00", true, null, null], ["Jazz", 13, "First Lieutenant", 
8, 5000000, 1.7999999523162842, ["Meister"], "33.670666,-117.841553", "1980/04/10", "2013/06/10", [3.962399959564209, 1800.0], "2013-06-24", "2014-06-24 00:00:00", true, null, null], ["Megatron", null, "None", 10, 5000000, 5.699999809265137, ["Megatron"], null, "1980/04/10", "2012/05/10", [null, 5700.0], "2012-05-10", "2014-06-24 00:00:00", true, null, null], ["Metroplex_)^$", 300, "Battle Station", 8, 5000000, null, ["Metroflex"], null, "1980/04/10", "2011/04/10", [91.44000244140625, null], "2011-04-10", "2014-06-24 00:00:00", true, null, null], [null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null]]}}

In [47]:
# Regenerate the test file after adding the mismatch test; the recorded output shows
# ../test_df_profiler.py being created again.
t.run()


Creating file ../test_df_profiler.py
Done

In [48]:
# Build the test for "columns_stats" on every column ("*") of source_df;
# the recorded output names the generated function test_columns_stats().
# NOTE(review): the recorded result contains nan for the 'age' kurtosis/skewness, and two
# distinct nan objects never compare equal — confirm the generated assertion can pass.
t.create(p, "columns_stats", None, 'json', None, source_df,"*")


Creating test_columns_stats() test function...
INFO:optimus:test_columns_stats()
INFO:optimus:Processing Stats For columns...
{'names': {'string': 6, 'null': 1}, 'height(ft)': {'smallint': 5, 'null': 2}, 'function': {'string': 6, 'null': 1}, 'rank': {'tinyint': 6, 'null': 1}, 'age': {'int': 6, 'null': 1}, 'weight(t)': {'float': 5, 'null': 2}, 'japanese name': {'array': 6, 'null': 1}, 'last position seen': {'string': 4, 'null': 3}, 'date arrival': {'string': 6, 'null': 1}, 'last date seen': {'string': 6, 'null': 1}, 'attributes': {'array': 6, 'null': 1}, 'Date Type': {'date': 6, 'null': 1}, 'timestamp': {'timestamp': 6, 'null': 1}, 'Cybertronian': {'boolean': 6, 'null': 1}, 'function(binary)': {'binary': 6, 'null': 1}, 'NullType': {'null': 7}}
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to
{'count_types': {'string': 5, 'int': 3, 'decimal': 1, 'array': 2, 'date': 2, 'boolean': 1, 'binary': 1, 'null': 1, 'numeric': 0, 'categorical': 0}, 'total_count_dtypes': 8, 'dtypes_list': ['string', 'int', 'decimal', 'array', 'date', 'boolean', 'binary', 'null'], 'columns': {'names': {'stats': {'count_uniques': 5, 'min': 'Jazz', 'max': 'ironhide&', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 71.43}, 'frequency': [{'value': 'Optimus', 'count': 1, 'percentage': 14.29}, {'value': 'bumbl#ebéé  ', 'count': 1, 'percentage': 14.29}, {'value': 'ironhide&', 'count': 1, 'percentage': 14.29}, {'value': 'Jazz', 'count': 1, 'percentage': 14.29}, {'value': 'Megatron', 'count': 1, 'percentage': 14.29}, {'value': 'Metroplex_)^$', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'names', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'height(ft)': {'stats': {'count_uniques': 5, 'min': -28, 'max': 300, 'stddev': 132.66612, 'kurtosis': 0.13863, 'mean': 65.6, 'skewness': 1.4049, 'sum': 328, 'variance': 17600.3, 'zeros': 0, 'percentile': {'0.75': 26, '0.95': 300, '0.05': -28, '0.25': 13, '0.5': 17}, 'count_na': 2, 'hist': [{'count': 4.0, 'lower': -28.0, 'upper': 54.0}, {'count': 0.0, 'lower': 54.0, 'upper': 136.0}, {'count': 0.0, 'lower': 136.0, 'upper': 218.0}, {'count': 0.0, 'lower': 218.0, 'upper': 300.0}], 'range': 328, 'median': 17, 'interquartile_range': 13, 'coef_variation': 2.02235, 'mad': 9, 'p_count_na': 28.57, 'p_count_uniques': 71.43}, 'name': 'height(ft)', 'column_dtype': 'int', 'dtypes_stats': {'null': 2, 'missing': 0, 'int': 5}, 'column_type': 'numeric'}, 'function': {'stats': {'count_uniques': 6, 'min': 'Battle Station', 'max': 'Security', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 
0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': 'Leader', 'count': 1, 'percentage': 14.29}, {'value': 'Espionage', 'count': 1, 'percentage': 14.29}, {'value': 'Security', 'count': 1, 'percentage': 14.29}, {'value': 'First Lieutenant', 'count': 1, 'percentage': 14.29}, {'value': 'None', 'count': 1, 'percentage': 14.29}, {'value': 'Battle Station', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'function', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'rank': {'stats': {'count_uniques': 3, 'min': 7, 'max': 10, 'stddev': 1.36626, 'kurtosis': -1.5, 'mean': 8.33333, 'skewness': 0.3818, 'sum': 50, 'variance': 1.86667, 'zeros': 0, 'percentile': {'0.75': 10, '0.95': 10, '0.05': 7, '0.25': 7, '0.5': 8}, 'count_na': 1, 'hist': [{'count': 4.0, 'lower': 7.0, 'upper': 8.5}, {'count': 0.0, 'lower': 8.5, 'upper': 10.0}], 'range': 3, 'median': 8, 'interquartile_range': 3, 'coef_variation': 0.16395, 'mad': 1, 'p_count_na': 14.29, 'p_count_uniques': 42.86}, 'name': 'rank', 'column_dtype': 'int', 'dtypes_stats': {'null': 1, 'missing': 0, 'int': 6}, 'column_type': 'numeric'}, 'age': {'stats': {'count_uniques': 1, 'min': 5000000, 'max': 5000000, 'stddev': 0.0, 'kurtosis': nan, 'mean': 5000000.0, 'skewness': nan, 'sum': 30000000, 'variance': 0.0, 'zeros': 0, 'percentile': {'0.75': 5000000, '0.95': 5000000, '0.05': 5000000, '0.25': 5000000, '0.5': 5000000}, 'count_na': 1, 'hist': [{'count': 6, 'lower': 5000000, 'upper': 5000001}], 'range': 0, 'median': 5000000, 'interquartile_range': 0, 'coef_variation': 0.0, 'mad': 0, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'name': 'age', 'column_dtype': 'int', 'dtypes_stats': {'null': 1, 'missing': 0, 'int': 6}, 'column_type': 'numeric'}, 'weight(t)': {'stats': {'count_uniques': 5, 'min': 1.8, 'max': 5.7, 'stddev': 1.64712, 'kurtosis': -1.43641, 'mean': 3.56, 'skewness': 0.06521, 'sum': 17.8, 
'variance': 2.713, 'zeros': 0, 'percentile': {'0.75': 4.300000190734863, '0.95': 5.699999809265137, '0.05': 1.7999999523162842, '0.25': 2.0, '0.5': 4.0}, 'count_na': 2, 'hist': [{'count': 1.0, 'lower': 1.8, 'upper': 2.78}, {'count': 0.0, 'lower': 2.78, 'upper': 3.75}, {'count': 2.0, 'lower': 3.75, 'upper': 4.73}, {'count': 1.0, 'lower': 4.73, 'upper': 5.7}], 'range': 3.9000000000000004, 'median': 4.0, 'interquartile_range': 2.3000001907348633, 'coef_variation': 0.46267, 'mad': 1.7, 'p_count_na': 28.57, 'p_count_uniques': 71.43}, 'name': 'weight(t)', 'column_dtype': 'decimal', 'dtypes_stats': {'null': 2, 'missing': 0, 'decimal': 5}, 'column_type': 'numeric'}, 'japanese name': {'stats': {'count_uniques': 6, 'min': ['Bumble', 'Goldback'], 'max': ['Roadbuster'], 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '[Inochi, Convoy]', 'count': 1, 'percentage': 14.29}, {'value': '[Bumble, Goldback]', 'count': 1, 'percentage': 14.29}, {'value': '[Roadbuster]', 'count': 1, 'percentage': 14.29}, {'value': '[Meister]', 'count': 1, 'percentage': 14.29}, {'value': '[Megatron]', 'count': 1, 'percentage': 14.29}, {'value': '[Metroflex]', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'japanese name', 'column_dtype': 'array', 'dtypes_stats': {'null': 1, 'missing': 0, 'array': 6}, 'column_type': 'array'}, 'last position seen': {'stats': {'count_uniques': 4, 'min': '10.642707,-71.612534', 'max': '37.789563,-122.400356', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 3, 'p_count_na': 42.86, 'p_count_uniques': 57.14}, 'frequency': [{'value': None, 'count': 3, 'percentage': 42.86}, {'value': '19.442735,-99.201111', 'count': 1, 'percentage': 14.29}, {'value': '10.642707,-71.612534', 'count': 1, 'percentage': 14.29}, {'value': '37.789563,-122.400356', 'count': 1, 'percentage': 14.29}, {'value': '33.670666,-117.841553', 'count': 1, 
'percentage': 14.29}], 'name': 'last position seen', 'column_dtype': 'string', 'dtypes_stats': {'null': 3, 'missing': 0, 'string': 4}, 'column_type': 'categorical'}, 'date arrival': {'stats': {'count_uniques': 1, 'min': '1980/04/10', 'max': '1980/04/10', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'frequency': [{'value': '1980/04/10', 'count': 6, 'percentage': 85.71}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'date arrival', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'last date seen': {'stats': {'count_uniques': 6, 'min': '2011/04/10', 'max': '2016/09/10', 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '2016/09/10', 'count': 1, 'percentage': 14.29}, {'value': '2015/08/10', 'count': 1, 'percentage': 14.29}, {'value': '2014/07/10', 'count': 1, 'percentage': 14.29}, {'value': '2013/06/10', 'count': 1, 'percentage': 14.29}, {'value': '2012/05/10', 'count': 1, 'percentage': 14.29}, {'value': '2011/04/10', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'last date seen', 'column_dtype': 'string', 'dtypes_stats': {'null': 1, 'missing': 0, 'string': 6}, 'column_type': 'categorical'}, 'attributes': {'stats': {'count_uniques': 6, 'min': [None, 5700.0], 'max': [91.44000244140625, None], 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '[8.5344, 4300.0]', 'count': 1, 'percentage': 14.29}, {'value': '[5.334, 2000.0]', 'count': 1, 'percentage': 14.29}, {'value': '[7.9248, 4000.0]', 'count': 1, 'percentage': 14.29}, {'value': '[3.9624, 1800.0]', 'count': 1, 'percentage': 14.29}, {'value': '[, 5700.0]', 'count': 1, 'percentage': 14.29}, 
{'value': '[91.44,]', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'attributes', 'column_dtype': 'array', 'dtypes_stats': {'null': 1, 'missing': 0, 'array': 6}, 'column_type': 'array'}, 'Date Type': {'stats': {'count_uniques': 6, 'min': datetime.date(2011, 4, 10), 'max': datetime.date(2016, 9, 10), 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': [{'value': '2016-09-10', 'count': 1, 'percentage': 14.29}, {'value': '2015-08-10', 'count': 1, 'percentage': 14.29}, {'value': '2014-06-24', 'count': 1, 'percentage': 14.29}, {'value': '2013-06-24', 'count': 1, 'percentage': 14.29}, {'value': '2012-05-10', 'count': 1, 'percentage': 14.29}, {'value': '2011-04-10', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'Date Type', 'column_dtype': 'date', 'dtypes_stats': {'null': 1, 'missing': 0, 'date': 6}, 'column_type': 'date'}, 'timestamp': {'stats': {'count_uniques': 1, 'min': datetime.datetime(2014, 6, 24, 0, 0), 'max': datetime.datetime(2014, 6, 24, 0, 0), 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'frequency': [{'value': datetime.datetime(2014, 6, 24, 0, 0), 'count': 6, 'percentage': 85.71}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'timestamp', 'column_dtype': 'date', 'dtypes_stats': {'null': 1, 'missing': 0, 'date': 6}, 'column_type': 'date'}, 'Cybertronian': {'stats': {'count_uniques': 1, 'min': 1, 'max': 1, 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 14.29}, 'frequency': [{'value': True, 'count': 6, 'percentage': 85.71}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'Cybertronian', 'column_dtype': 'boolean', 'dtypes_stats': {'null': 1, 'missing': 0, 'boolean': 6}, 'column_type': 'categorical'}, 'function(binary)': {'stats': {'count_uniques': 6, 'min': bytearray(b'Battle Station'), 'max': bytearray(b'Security'), 'count_na': 1, 'p_count_na': 14.29, 'p_count_uniques': 85.71}, 'frequency': 
[{'value': 'Leader', 'count': 1, 'percentage': 14.29}, {'value': 'Espionage', 'count': 1, 'percentage': 14.29}, {'value': 'Security', 'count': 1, 'percentage': 14.29}, {'value': 'First Lieutenant', 'count': 1, 'percentage': 14.29}, {'value': 'None', 'count': 1, 'percentage': 14.29}, {'value': 'Battle Station', 'count': 1, 'percentage': 14.29}, {'value': None, 'count': 1, 'percentage': 14.29}], 'name': 'function(binary)', 'column_dtype': 'binary', 'dtypes_stats': {'null': 1, 'missing': 0, 'binary': 6}, 'column_type': 'binary'}, 'NullType': {'stats': {'count_uniques': 0, 'min': None, 'max': None, 'count_na': 7, 'p_count_na': 100.0, 'p_count_uniques': 0.0}, 'frequency': [{'value': None, 'count': 7, 'percentage': 100.0}], 'name': 'NullType', 'column_dtype': 'null', 'dtypes_stats': {'null': 7, 'missing': 0}, 'column_type': 'null'}}}

In [49]:
# Regenerate the test file after adding the columns_stats test; the recorded output shows
# ../test_df_profiler.py being created again.
t.run()


Creating file ../test_df_profiler.py
Done

In [12]:
# Build the test for "columns_agg" on every column ("*") of source_df;
# the recorded output names the generated function test_columns_agg().
# NOTE(review): in the recorded result, 'p_count_na', 'p_count_uniques', 'range', 'median',
# 'interquartile_range', 'coef_variation' and 'mad' appear at the top level next to the
# column entries instead of inside one — confirm this is intended before freezing it
# as the expected value.
t.create(p, "columns_agg", None, 'json', None, source_df,"*")


Creating test_columns_agg() test function...
INFO:optimus:test_columns_agg()
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
{'names': {'count_uniques': 5, 'min': 'Jazz', 'max': 'ironhide&', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'height(ft)': {'count_uniques': 5, 'min': -28, 'max': 300, 'count_na': 2, 'stddev': 132.66612, 'kurtosis': 0.13863, 'mean': 65.6, 'skewness': 1.4049, 'sum': 328, 'variance': 17600.3, 'zeros': 0, 'percentile': {'0.75': 26, '0.95': 300, '0.05': -28, '0.25': 13, '0.5': 17}, 'hist': [{'count': 4.0, 'lower': -28.0, 'upper': 54.0}, {'count': 0.0, 'lower': 54.0, 'upper': 136.0}, {'count': 0.0, 'lower': 136.0, 'upper': 218.0}, {'count': 0.0, 'lower': 218.0, 'upper': 300.0}]}, 'function': {'count_uniques': 6, 'min': 'Battle Station', 'max': 'Security', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'rank': {'count_uniques': 3, 'min': 7, 'max': 10, 'count_na': 1, 'stddev': 1.36626, 'kurtosis': -1.5, 'mean': 8.33333, 'skewness': 0.3818, 'sum': 50, 'variance': 1.86667, 'zeros': 0, 'percentile': {'0.75': 10, '0.95': 10, '0.05': 7, '0.25': 7, '0.5': 8}, 'hist': [{'count': 4.0, 'lower': 7.0, 'upper': 8.5}, {'count': 0.0, 'lower': 8.5, 'upper': 10.0}]}, 'age': {'count_uniques': 1, 'min': 5000000, 'max': 5000000, 'count_na': 1, 'stddev': 0.0, 'kurtosis': nan, 'mean': 5000000.0, 'skewness': nan, 'sum': 30000000, 'variance': 0.0, 'zeros': 0, 'percentile': {'0.75': 5000000, '0.95': 5000000, '0.05': 5000000, '0.25': 5000000, '0.5': 5000000}, 'hist': [{'count': 6, 'lower': 5000000, 'upper': 5000001}]}, 'weight(t)': {'count_uniques': 5, 'min': 1.8, 'max': 5.7, 'count_na': 2, 'stddev': 1.64712, 'kurtosis': -1.43641, 'mean': 3.56, 'skewness': 0.06521, 'sum': 17.8, 'variance': 2.713, 'zeros': 0, 'percentile': {'0.75': 4.300000190734863, '0.95': 5.699999809265137, '0.05': 1.7999999523162842, '0.25': 2.0, '0.5': 4.0}, 'hist': [{'count': 1.0, 'lower': 1.8, 'upper': 2.78}, {'count': 0.0, 'lower': 2.78, 'upper': 3.75}, {'count': 
2.0, 'lower': 3.75, 'upper': 4.73}, {'count': 1.0, 'lower': 4.73, 'upper': 5.7}]}, 'japanese name': {'count_uniques': 6, 'min': ['Bumble', 'Goldback'], 'max': ['Roadbuster'], 'count_na': 1}, 'last position seen': {'count_uniques': 4, 'min': '10.642707,-71.612534', 'max': '37.789563,-122.400356', 'count_na': 3, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'date arrival': {'count_uniques': 1, 'min': '1980/04/10', 'max': '1980/04/10', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'last date seen': {'count_uniques': 6, 'min': '2011/04/10', 'max': '2016/09/10', 'count_na': 1, 'stddev': None, 'kurtosis': None, 'mean': None, 'skewness': None, 'sum': None, 'variance': None, 'zeros': 0}, 'attributes': {'count_uniques': 6, 'min': [None, 5700.0], 'max': [91.44000244140625, None], 'count_na': 1}, 'Date Type': {'count_uniques': 6, 'min': datetime.date(2011, 4, 10), 'max': datetime.date(2016, 9, 10), 'count_na': 1}, 'timestamp': {'count_uniques': 1, 'min': datetime.datetime(2014, 6, 24, 0, 0), 'max': datetime.datetime(2014, 6, 24, 0, 0), 'count_na': 1}, 'Cybertronian': {'count_uniques': 1, 'min': 1, 'max': 1, 'count_na': 1}, 'function(binary)': {'count_uniques': 6, 'min': bytearray(b'Battle Station'), 'max': bytearray(b'Security'), 'count_na': 1}, 'NullType': {'count_uniques': 0, 'min': None, 'max': None, 'count_na': 7}, 'p_count_na': 100.0, 'p_count_uniques': 0.0, 'range': 3.9000000000000004, 'median': 4.0, 'interquartile_range': 2.3000001907348633, 'coef_variation': 0.46267, 'mad': 1.7}

In [14]:
# Regenerate the test file after adding the columns_agg test; the recorded output shows
# ../test_df_profiler.py being created again.
t.run()


Creating file ../test_df_profiler.py
Done

In [39]:
# Scratch value: a dict-like string mixing single quotes and '=' (not valid JSON or Python),
# printed in the next cell.
a = "{'name'=a'a}"

In [40]:
# Show the raw scratch string exactly as stored (no repr quoting or escaping).
print(a)


{'name'=a'a}

In [42]:
import json
# Serializing a str with json.dumps yields a JSON string: the text is wrapped in
# double quotes and the single quotes inside are left as-is. The backslashes
# visible in Out[42] come from Python's repr of the resulting str (which contains
# both quote kinds), not from JSON escaping.
json.dumps("{'name'=a'a}")


Out[42]:
'"{\'name\'=a\'a}"'

In [11]:
from optimus.profiler.profiler import Profiler

In [12]:
# Profile `source_df` with the "*" column selector; the log below shows all 16
# columns being processed and the report rendered per column. This run took
# about 69 seconds according to the final log line.
# The call is the last expression of the cell, so the returned Profiler object's
# repr is also shown as Out[12].
op.profiler.run(source_df, "*")


INFO:optimus:Processing Stats For columns...
INFO:optimus:Batch Stats 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Batch Histogram 0. Processing columns['names', 'height(ft)', 'function', 'rank', 'age', 'weight(t)', 'japanese name', 'last position seen', 'date arrival', 'last date seen', 'attributes', 'Date Type', 'timestamp', 'Cybertronian', 'function(binary)', 'NullType']
INFO:optimus:'kurtosis' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:'skewness' function in 'age' column is returning 'nan'. Is that what you expected?. Seems that 'age' has 'nan' values
INFO:optimus:Processing Frequency ...
INFO:optimus:`names`,`function`,`japanese name`,`last position seen`,`date arrival`,`last date seen`,`attributes`,`Date Type`,`timestamp`,`Cybertronian`,`function(binary)`,`NullType` column(s) was not processed because is/are not byte,short,big,int,double,float
INFO:optimus:`names`,`function`,`last position seen`,`date arrival`,`last date seen`,`timestamp`,`Cybertronian`,`NullType` column(s) was not processed because is/are not array,vector,byte,date,binary
INFO:optimus:Using 'column_exp' to process column 'japanese name' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'attributes' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'Date Type' with function _cast_to
INFO:optimus:Using 'column_exp' to process column 'function(binary)' with function _cast_to

Overview

Dataset info

Number of columns 16
Number of rows 7
Total Missing (%) 26
Total size in memory 45.3 MB

Column types

Categorical 0
Numeric 0
Date 2
Array 2
Not available 1

names

categorical
Unique 5
Unique (%)
Missing 1
Missing (%)

Datatypes

String 6
Integer
Decimal
Bool
Date
Missing 0
Null 1

Frequency

Value Count Frequency (%)
Optimus 1 14.29%
bumbl#ebéé 1 14.29%
ironhide& 1 14.29%
Jazz 1 14.29%
Megatron 1 14.29%
Metroplex_)^$ 1 14.29%
None 1 14.29%
"Missing" 1 %

height(ft)

numeric
Unique 5
Unique (%)
Missing 2
Missing (%)

Datatypes

String
Integer 5
Decimal
Bool
Date
Missing 0
Null 2

Basic Stats

Mean 65.6
Minimum -28
Maximum 300
Zeros(%) 0

Quantile statistics

Minimum -28
5-th percentile -28
Q1 13
Median 17
Q3 26
95-th percentile 300
Maximum 300
Range
Interquartile range

Descriptive statistics

Standard deviation 132.66612
Coef of variation
Kurtosis 0.13863
Mean 65.6
MAD
Skewness 1.4049
Sum 328
Variance 17600.3

function

categorical
Unique 6
Unique (%)
Missing 1
Missing (%)

Datatypes

String 6
Integer
Decimal
Bool
Date
Missing 0
Null 1

Frequency

Value Count Frequency (%)
Leader 1 14.29%
Espionage 1 14.29%
Security 1 14.29%
First Lieutenant 1 14.29%
None 1 14.29%
Battle Station 1 14.29%
None 1 14.29%
"Missing" 1 %

rank

numeric
Unique 3
Unique (%)
Missing 1
Missing (%)

Datatypes

String
Integer 6
Decimal
Bool
Date
Missing 0
Null 1

Basic Stats

Mean 8.33333
Minimum 7
Maximum 10
Zeros(%) 0

Quantile statistics

Minimum 7
5-th percentile 7
Q1 7
Median 8
Q3 10
95-th percentile 10
Maximum 10
Range
Interquartile range

Descriptive statistics

Standard deviation 1.36626
Coef of variation
Kurtosis -1.5
Mean 8.33333
MAD
Skewness 0.3818
Sum 50
Variance 1.86667

age

numeric
Unique 1
Unique (%)
Missing 1
Missing (%)

Datatypes

String
Integer 6
Decimal
Bool
Date
Missing 0
Null 1

Basic Stats

Mean 5000000.0
Minimum 5000000
Maximum 5000000
Zeros(%) 0

Quantile statistics

Minimum 5000000
5-th percentile 5000000
Q1 5000000
Median 5000000
Q3 5000000
95-th percentile 5000000
Maximum 5000000
Range
Interquartile range

Descriptive statistics

Standard deviation 0.0
Coef of variation
Kurtosis nan
Mean 5000000.0
MAD
Skewness nan
Sum 30000000
Variance 0.0

weight(t)

numeric
Unique 5
Unique (%)
Missing 2
Missing (%)

Datatypes

String
Integer
Decimal 5
Bool
Date
Missing 0
Null 2

Basic Stats

Mean 3.56
Minimum 1.8
Maximum 5.7
Zeros(%) 0

Quantile statistics

Minimum 1.8
5-th percentile 1.7999999523162842
Q1 2.0
Median 4.0
Q3 4.300000190734863
95-th percentile 5.699999809265137
Maximum 5.7
Range
Interquartile range

Descriptive statistics

Standard deviation 1.64712
Coef of variation
Kurtosis -1.43641
Mean 3.56
MAD
Skewness 0.06521
Sum 17.8
Variance 2.713

japanese name

array
Unique 6
Unique (%)
Missing 1
Missing (%)

Datatypes

String
Integer
Decimal
Bool
Date
Missing 0
Null 1

Frequency

Value Count Frequency (%)
[Inochi, Convoy] 1 14.29%
[Bumble, Goldback] 1 14.29%
[Roadbuster] 1 14.29%
[Meister] 1 14.29%
[Megatron] 1 14.29%
[Metroflex] 1 14.29%
None 1 14.29%
"Missing" 1 %

last position seen

categorical
Unique 4
Unique (%)
Missing 3
Missing (%)

Datatypes

String 4
Integer
Decimal
Bool
Date
Missing 0
Null 3

Frequency

Value Count Frequency (%)
None 3 42.86%
19.442735,-99.201111 1 14.29%
10.642707,-71.612534 1 14.29%
37.789563,-122.400356 1 14.29%
33.670666,-117.841553 1 14.29%
"Missing" 3 %

date arrival

categorical
Unique 1
Unique (%)
Missing 1
Missing (%)

Datatypes

String 6
Integer
Decimal
Bool
Date
Missing 0
Null 1

Frequency

Value Count Frequency (%)
1980/04/10 6 85.71%
None 1 14.29%
"Missing" 1 %

last date seen

categorical
Unique 6
Unique (%)
Missing 1
Missing (%)

Datatypes

String 6
Integer
Decimal
Bool
Date
Missing 0
Null 1

Frequency

Value Count Frequency (%)
2016/09/10 1 14.29%
2015/08/10 1 14.29%
2014/07/10 1 14.29%
2013/06/10 1 14.29%
2012/05/10 1 14.29%
2011/04/10 1 14.29%
None 1 14.29%
"Missing" 1 %

attributes

array
Unique 6
Unique (%)
Missing 1
Missing (%)

Datatypes

String
Integer
Decimal
Bool
Date
Missing 0
Null 1

Frequency

Value Count Frequency (%)
[8.5344, 4300.0] 1 14.29%
[5.334, 2000.0] 1 14.29%
[7.9248, 4000.0] 1 14.29%
[3.9624, 1800.0] 1 14.29%
[, 5700.0] 1 14.29%
[91.44,] 1 14.29%
None 1 14.29%
"Missing" 1 %

Date Type

date
Unique 6
Unique (%)
Missing 1
Missing (%)

Datatypes

String
Integer
Decimal
Bool
Date 6
Missing 0
Null 1

Frequency

Value Count Frequency (%)
2016-09-10 1 14.29%
2015-08-10 1 14.29%
2014-06-24 1 14.29%
2013-06-24 1 14.29%
2012-05-10 1 14.29%
2011-04-10 1 14.29%
None 1 14.29%
"Missing" 1 %

timestamp

date
Unique 1
Unique (%)
Missing 1
Missing (%)

Datatypes

String
Integer
Decimal
Bool
Date 6
Missing 0
Null 1

Frequency

Value Count Frequency (%)
2014-06-24 00:00:00 6 85.71%
None 1 14.29%
"Missing" 1 %

Cybertronian

categorical
Unique 1
Unique (%)
Missing 1
Missing (%)

Datatypes

String
Integer
Decimal
Bool 6
Date
Missing 0
Null 1

Frequency

Value Count Frequency (%)
True 6 85.71%
None 1 14.29%
"Missing" 1 %

NullType

null
Unique 0
Unique (%)
Missing 7
Missing (%)

Datatypes

String
Integer
Decimal
Bool
Date
Missing 0
Null 7

Frequency

Value Count Frequency (%)
None 7 100.0%
"Missing" 7 %
INFO:optimus:run() executed in 68.73 sec
Out[12]:
<optimus.profiler.profiler.Profiler at 0x24242023b70>

In [15]:
# Fetch the min/max bounds of the "height(ft)" column. As Out[15] shows, the
# result is a nested dict: {column_name: {'range': {'max': ..., 'min': ...}}}.
source_df.cols.range("height(ft)")


Out[15]:
{'height(ft)': {'range': {'max': 300, 'min': -28}}}

In [ ]: