In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import sys
sys.path.append("../..")
In [3]:
from optimus import Optimus
from optimus.helpers.test import Test
In [4]:
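# Start Optimus on a local Spark master; verbose=True enables extra logging.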
op = Optimus(master='local', verbose=True)
In [5]:
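# Build the main test DataFrame. Columns may be declared as (name, type) tuples
# (Python type strings or Spark types) or as bare names, in which case the type is inferred.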
import pandas as pd
from pyspark.sql.types import *
from datetime import date, datetime
cols = [
("names", "str"),
("height(ft)", ShortType()),
("function", "str"),
("rank", ByteType()),
("age", "int"),
("weight(t)", "float"),
"japanese name",
"last position seen",
"date arrival",
"last date seen",
("attributes", ArrayType(FloatType())),
("Date Type", DateType()),
("timestamp", TimestampType()),
("Cybertronian", BooleanType()),
("function(binary)", BinaryType()),
("NullType", NullType())
]
rows = [
("Optim'us", -28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
None),
("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
"2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
None),
("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
"2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
None),
("Jazz", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
"2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
bytearray("First Lieutenant", "utf-8"), None),
("Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
("Metroplex_)^$", 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
[91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
(None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),
]
source_df = op.create.df(cols, rows)
source_df.table()
In [10]:
import pandas as pd
from pyspark.sql.types import *
from datetime import date, datetime
cols = [
("names", "str"),
("height(ft)", ShortType()),
("function", "str"),
("rank", ByteType()),
("age", "int"),
("weight(t)", "float"),
"japanese name",
"last position seen",
"date arrival",
"last date seen",
("attributes", ArrayType(FloatType())),
("Date Type", DateType()),
("timestamp", TimestampType()),
("Cybertronian", BooleanType())
]
rows = [
("Optim'us", -28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True),
("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
"2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True),
("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
"2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True),
("Jazz", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
"2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True),
("Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
date(2012, 5, 10), datetime(2014, 6, 24), True),
("Metroplex_)^$", 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
[91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True),
]
source_df_string_to_index = op.create.df(cols, rows)
source_df_string_to_index.table()
In [9]:
from pyspark.ml.linalg import Vectors
In [10]:
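# The Test helper collects the cases registered with t.create() and, on t.run(),
# appears to write them out as test files under final_path.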
t = Test(op, None, "create_df", imports=["import datetime",
"from pyspark.sql import functions as F"], path = "..", final_path="..")
In [11]:
one_column = {"rows":["Argenis", "Favio", "Matthew"], "cols":["name"]}
plain = {"rows":[("BOB", 1),("JoSe", 2)],"cols":["name","age"]}
plain_infer_false = {"rows":[("BOB", 1),("JoSe", 2)],"cols":["name","age"],"infer_schema":False}
with_data_types = {"rows":[("BOB", 1),("JoSe", 2)],"cols":[("name", StringType(), True),("age", IntegerType(), False)]}
nullable = {"rows":[("BOB", 1),("JoSe", 2)],"cols":[("name", StringType()),("age", IntegerType())]}
df1 = op.create.df(**one_column)
df2 = op.create.df(**plain)
df3 = op.create.df(**plain_infer_false)
df4 = op.create.df(**with_data_types)
df5 = op.create.df(**nullable)
In [ ]:
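# t.create(df, method, suffix, output_type, *args, **kwargs) appears to run df.<method>(*args)
# (or take df itself as the result when method is None) and record the output as the expected
# value of a generated test named after method and suffix.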
t.create(df1, None, "one_column", "df", **one_column)
In [ ]:
t.create(df2, None, "plain", "df", **plain)
In [ ]:
t.create(df3, None, "plain_infer_false", "df", **plain_infer_false)
In [ ]:
t.create(df4, None, "with_data_types", "df", **with_data_types)
In [ ]:
t.create(df5, None, "nullable", "df", **nullable)
In [ ]:
t.run()
In [28]:
from pyspark.ml.linalg import Vectors
In [29]:
t = Test(op, source_df, "df_cols", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"import datetime",
"from pyspark.sql import functions as F"], path = "df_cols", final_path="..")
In [30]:
from pyspark.sql import functions as F
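# Sample column expression, used with the cols.apply_expr cases further below.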
def func(col_name, attrs):
    return F.col(col_name) * 2
numeric_col = "height(ft)"
numeric_col_B = "rank"
numeric_col_C = "rank"
string_col = "function"
date_col = "date arrival"
date_col_B = "last date seen"
new_col = "new col"
array_col = "attributes"
In [31]:
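# string_to_index presumably wraps Spark ML's StringIndexer, mapping each distinct value to a numeric index.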
t.create(source_df_string_to_index, "cols.string_to_index", None, "df", None, "rank")
In [16]:
source_df_index_to_string = source_df_string_to_index.cols.string_to_index("rank")
In [36]:
# FIX: at creation time we lose the metadata; need to find a way to attach it at DataFrame creation
t.delete(source_df_index_to_string, "cols.index_to_string", None, "df", None, "rank***STRING_TO_INDEX")
In [37]:
t.run()
In [ ]:
t.create(source_df_string_to_index, "cols.values_to_cols", None, "df", None, "rank")
In [ ]:
t.run()
In [ ]:
t.create(source_df_string_to_index, "cols.values_to_cols", "all_columns", "df", None, ["names","height(ft)"])
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.remove", None, "df", None, string_col, "i")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.remove", "list", "df", string_col, ["a","i","Es"])
In [ ]:
t.create(None, "cols.remove", "list_output", "df", string_col, ["a","i","Es"], output_cols=string_col+"_new")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.min", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.min", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.create(None, "cols.max", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.max", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.create(None, "cols.range", None, "json",None, numeric_col)
In [ ]:
t.create(None, "cols.range", "all_columns", "json",None, "*")
In [ ]:
t.run()
In [ ]:
source_df.table()
In [ ]:
t.create(None, "cols.median", None, "json", None,numeric_col)
In [ ]:
t.create(None, "cols.median", "all_columns", "json", None, "*")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.percentile", None, "json", None, numeric_col, [0.05, 0.25], 1)
In [ ]:
t.create(None, "cols.percentile", "all_columns", "json", None, "*", [0.05, 0.25], 1)
In [ ]:
t.create(None, "cols.mad", None, "json", None, numeric_col)
In [ ]:
t.create(None, "cols.mad", "all_columns", "json", None, "*")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.std", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.std", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.create(None, "cols.kurt", None, "json", None, numeric_col)
t.run()
In [ ]:
t.create(None, "cols.kurt", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.create(None, "cols.mean", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.mean", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.create(None, "cols.skewness", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.skewness", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.create(None, "cols.sum", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.sum", "all_columns", "json", None,"*")
t.run()
In [ ]:
t.create(None, "cols.variance", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.variance", "all_columns", "json", None, "*")
t.run()
In [ ]:
source_df.table()
In [ ]:
from pyspark.sql import functions as F
source_df.select(F.abs(F.col("age")))
In [ ]:
t.create(None, "cols.abs", None, "df", None,"weight(t)")
In [ ]:
t.create(None, "cols.abs", "all_columns", "json", None, "*")
In [ ]:
source_df.table()
In [ ]:
from pyspark.sql import functions as F
source_df.select(F.abs("weight(t)"))
In [30]:
t.create(None, "cols.mode", None, "json", None, numeric_col)
In [34]:
%%time
t.create(None, "cols.mode", "all_columns", "json", None,"*")
In [32]:
t.run()
In [ ]:
t.create(None, "cols.count", None, "json")
In [ ]:
t.create(None, "cols.count_na", None, "json", None, numeric_col)
In [ ]:
t.create(None, "cols.count_na", "all_columns", "json",None, "*")
In [ ]:
t.run()
In [ ]:
source_df.cols.names("rank",["str","int","float"],True)
In [ ]:
t.create(None, "cols.count_zeros", None, "json", numeric_col)
In [ ]:
t.create(None, "cols.count_zeros", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.run()
In [ ]:
source_df.cols.names()
In [ ]:
t.create(None, "cols.value_counts", None, "json", None, numeric_col)
t.run()
In [ ]:
t.create(None, "cols.value_counts", "all_columns", "json", None, "*")
t.run()
In [ ]:
t.create(None, "cols.count_uniques", None, "json", None, numeric_col)
t.run()
In [ ]:
t.create(None, "cols.count_uniques", "all_columns", "json",None, "*")
t.run()
In [ ]:
t.create(None, "cols.unique", None, "json", None,numeric_col)
t.run()
In [ ]:
t.create(None, "cols.unique", "all_columns", "json", None,"*")
t.run()
In [ ]:
t.create(None, "cols.add", None, "df", [numeric_col, numeric_col_B])
In [ ]:
t.create(None, "cols.add", "all_columns", "df", "*"),
In [ ]:
t.create(None, "cols.sub", None, "df", [numeric_col, numeric_col_B])
In [ ]:
t.create(None, "cols.sub", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.mul", None, "df", [numeric_col, numeric_col_B])
In [ ]:
t.create(None, "cols.mul", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.div", None, "df", [numeric_col, numeric_col_B])
In [ ]:
t.create(None, "cols.div", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.z_score", None, "df", numeric_col)
In [ ]:
t.create(None, "cols.z_score", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.iqr", None, "json", None, numeric_col)
In [ ]:
t.create(None, "cols.iqr", "all_columns", "json",None, "*")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.lower", None, "df", string_col)
In [ ]:
t.create(None, "cols.lower", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.upper", None, "df", string_col)
In [ ]:
t.create(None, "cols.upper", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.trim", None, "df", numeric_col)
In [ ]:
t.create(None, "cols.trim", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.reverse", None, "df", string_col)
In [ ]:
t.create(None, "cols.reverse", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.remove_accents", None, "df", string_col)
In [ ]:
t.create(None, "cols.remove_accents", "all_columns", "df", string_col)
In [ ]:
source_df.table()
In [ ]:
t.create(None, "cols.remove_special_chars", None, "df", string_col)
In [ ]:
t.create(None, "cols.remove_special_chars", "all_columns","df", None, "*")
t.run()
# t.create(None, "cols.value_counts", None, "json", None, numeric_col)
In [ ]:
source_df.cols.remove_special_chars("*").table()
In [ ]:
t.create(None, "cols.remove_white_spaces", None, "df", string_col)
In [ ]:
t.create(None, "cols.remove_white_spaces", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.date_transform", None, "df", date_col, "yyyy/MM/dd", "dd-MM-YYYY")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.date_transform", "all_columns", "df", [date_col, date_col_B], "yyyy/MM/dd", "dd-MM-YYYY")
In [ ]:
# t.create(None, "cols.years_between", None, "df", date_col, "yyyy/MM/dd")
t.delete(None, "cols.years_between", None, "df", date_col, "yyyy/MM/dd")
In [ ]:
# t.create(None, "cols.years_between", "multiple_columns", "df", [date_col, date_col_B], "yyyy/MM/dd")
t.delete(None, "cols.years_between", "multiple_columns", "df", [date_col, date_col_B], "yyyy/MM/dd")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.impute", None, "df", numeric_col_B)
In [25]:
%%time
t.create(None, "cols.impute", "all_columns","df", None ,"names","categorical")
In [26]:
t.run()
In [ ]:
t.create(None, "cols.hist", None, "json", None, ["height(ft)",numeric_col_B], 4)
t.run()
In [ ]:
t.create(None,"cols.hist","all_columns","json",None, "Date Type",4)
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.frequency", None, "dict", None, numeric_col_B, 4)
t.run()
In [ ]:
t.create(None, "cols.frequency", "all_columns", "dict", None, "*", 4)
t.run()
In [ ]:
t.create(None, "cols.schema_dtype", None, "json", numeric_col_B)
In [ ]:
# Problems with casting
# t.delete(None, "cols.schema_dtype", "all_columns", "json", "*")
t.run()
In [ ]:
t.create(None, "cols.dtypes", None, "json", None, numeric_col_B)
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.dtypes", "all_columns", "json", "*")
In [36]:
t.create(None, "cols.select_by_dtypes", "str", "df", None, "str")
In [ ]:
t.create(None, "cols.select_by_dtypes", "int", "df", "int")
In [ ]:
t.create(None, "cols.select_by_dtypes", "float", "df", "float")
In [ ]:
t.create(None, "cols.select_by_dtypes", "array", "df", "array")
In [ ]:
t.create(None, "cols.names", None, "json")
In [ ]:
t.create(None, "cols.qcut", None, "df", numeric_col_B, 4)
In [ ]:
t.create(None, "cols.qcut", "all_columns", "df", "*", 4)
In [ ]:
t.create(None, "cols.clip", None, "df", numeric_col_B, 3, 5)
In [ ]:
t.create(None, "cols.clip", "all_columns", "df", "*", 3, 5)
In [ ]:
t.create(None, "cols.replace", "full", "df", None,string_col,["First Lieutenant","Battle"], "Match", search_by="full")
In [ ]:
t.create(None, "cols.replace", "words", "df", None,string_col,["Security", "Leader"], "Match", search_by="words")
t.run()
In [ ]:
t.create(None, "cols.replace", "chars", "df", None,string_col,["F", "E"], "Match", search_by="chars")
In [ ]:
t.create(None, "cols.replace", "numeric", "df", None,"age",5000000, 5, search_by="numeric")
In [ ]:
t.run()
In [ ]:
# The assert is failing; I can see why
t.create(None, "cols.replace", "all_columns", "df", None, "*", ["Jazz", "Leader"], "Match")
t.run()
In [ ]:
# It's necessary to save the function
t.delete(None, "cols.apply_expr", None, "df", numeric_col_B, func)
In [ ]:
# It's necessary to save the function
t.delete(None, "cols.apply_expr", "all_columns", "df", [numeric_col_B, numeric_col_C], func)
In [ ]:
t.create(None, "cols.append", "number", "df", new_col, 1)
In [ ]:
df_col = op.create.df(
[
("new_col", "str", True),
],[
("q"),("w"), ("e"), ("r"),
])
In [ ]:
t.create(None, "cols.append", "dataframes", "df", None, df_col)
In [ ]:
#t.create(None, "cols.append", "advance", "df", [("new_col_4", "test"),
# ("new_col_5", df[numeric_col_B] * 2),
# ("new_col_6", [1, 2, 3])
# ]),
In [ ]:
t.create(None, "cols.rename", None, "df", numeric_col_B, numeric_col_B + "(old)")
In [ ]:
t.create(None, "cols.rename", "list", "df", [numeric_col, numeric_col + "(tons)", numeric_col_B, numeric_col_B + "(old)"])
In [ ]:
t.create(None, "cols.rename", "function", "df", str.upper)
In [ ]:
t.create(None, "cols.drop", None, "df", numeric_col_B)
In [ ]:
t.create(None, "cols.cast", None, "df", string_col, "string")
In [ ]:
t.create(None, "cols.cast", "all_columns", "df", "*", "string")
In [ ]:
t.run()
In [ ]:
# Problems with precision
t.delete(None, "cols.cast", "vector", "df", array_col, Vectors)
In [ ]:
t.create(None, "cols.keep", None, "df", numeric_col_B)
In [ ]:
t.create(None, "cols.move", "after", "df", numeric_col_B, "after", array_col)
In [ ]:
t.create(None, "cols.move", "before", "df", numeric_col_B, "before", array_col)
In [ ]:
t.create(None, "cols.move", "beginning", "df", numeric_col_B, "beginning")
In [ ]:
t.create(None, "cols.move", "end", "df", numeric_col_B, "end")
In [ ]:
t.create(None, "cols.select", None, "df", 0, numeric_col)
In [ ]:
t.create(None, "cols.select", "regex", "df", "n.*", regex=True),
In [ ]:
t.create(None, "cols.sort", None, "df")
t.run()
In [ ]:
t.create(None, "cols.sort", "desc", "df", None,"desc")
In [ ]:
t.create(None, "cols.sort", "asc", "df", None, "asc")
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.fill_na", None, "df", numeric_col, "1")
In [ ]:
t.create(None, "cols.fill_na", "array", "df", None, "japanese name", ["1","2"])
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.fill_na", "bool", "df", None, "Cybertronian", False)
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.fill_na", "all_columns", "df", ["names","height(ft)", "function", "rank", "age"], "2")
In [ ]:
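# cols.nest concatenates several columns into one output column; shape controls whether the
# result is a string (joined with separator), an array, or an ML vector.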
t.create(None, "cols.nest", None, "df", None, [numeric_col, numeric_col_B], separator=" ",output_col=new_col)
In [ ]:
# t.create(None, "cols.nest", "mix", "df", [F.col(numeric_col_C), F.col(numeric_col_B)], "E", separator="--")
In [ ]:
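# Vector assembly cannot handle nulls, so drop the all-null column and any rows with NAs first.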
df_na = source_df.cols.drop("NullType").rows.drop_na("*")
t.create(df_na, "cols.nest", "vector_all_columns", "df", None, [numeric_col_C, numeric_col_B], shape="vector", output_col=new_col)
In [ ]:
t.create(df_na, "cols.nest", "vector", "df", None, [numeric_col_C, numeric_col_B], shape="vector", output_col=new_col)
In [ ]:
t.create(None, "cols.nest", "array", "df", None, [numeric_col, numeric_col_B, numeric_col_C], shape="array", output_col=new_col)
In [14]:
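# count_by_dtypes tallies values per data type; with infer=True it appears to also detect
# richer types (URLs, emails, dates, ...) from string content, as exercised with dtypes_df below.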
t.create(None, "cols.count_by_dtypes", None, "dict", None, "*", infer=False)
In [15]:
t.create(None, "cols.count_by_dtypes", "infer", "dict", None, "*", infer=True)
In [16]:
t.run()
In [ ]:
dtypes_df = op.create.df(
[
("col 1", "str", True),
("col 2", "str", True),
("col 3", "int", True),
],
[
("male","male",1),
("optimus","bumblebee",1),
("3","4.1",1),
("true","False",1),
("[1,2,3,4]","(1,2,3,4)",1),
("{1,2,3,4}","{'key1' :1 , 'key2':2}",1),
("1.1.1.1","123.123.123.123",1),
("http://hi-optimuse.com","https://hi-bumblebee.com",1),
("optimus@cybertron.com","bumblebee@cybertron.com",1),
("5123456789123456","373655783158306",1),
("11529","30345",1),
("04/10/1980","04/10/1980",1),
("null","Null",1),
("","",1),
(None,None,1)
], infer_schema=True)
In [ ]:
t.create(dtypes_df, "cols.count_by_dtypes", "infer", "dict", None, "*", infer=True)
In [ ]:
t.run()
In [ ]:
t.create(source_df, "cols.count_by_dtypes", None, "dict", None, "*", infer=False)
In [ ]:
t.run()
In [ ]:
source_df.table()
In [17]:
import logging
import sys
from datetime import date, datetime
from pyspark.sql.types import *
from optimus import Optimus
mismatch_df = op.create.df(
[
("names", "str", True),
("height(ft)", "int", True),
("function", "str", True),
("rank", "int", True),
("age", "int", True),
("weight(t)", "float", True),
("japanese name", ArrayType(StringType()), True),
("last position seen", "str", True),
("date arrival", "str", True),
("last date seen", "str", True),
("attributes", ArrayType(FloatType()), True),
("DateType", DateType()),
("Timestamp", TimestampType()),
("Cybertronian", "bool", True),
("function(binary)", "binary", False),
("NullType", "null", True),
],
[
("31/12/2019", 28, "1978-12-20", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
None),
("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
"2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
None),
("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
"2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
None),
("Jazz", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
"2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
bytearray("First Lieutenant", "utf-8"), None),
("Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
("Metroplex_)^$", 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
[91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
("1", 2, "3", 4, 5, 6.0, ["7"], 8, "1980/04/10", "2011/04/10",
[11.0], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("15", "utf-8"), None)
], infer_schema=True)
In [18]:
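# Mismatch spec: column -> expected format (a named date format or a regex); values that do
# not match are presumably counted as mismatches.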
mismatch = {"names":"dd/mm/yyyy","height(ft)":r'^([0-2][0-9]|(3)[0-1])(\/)(((0)[0-9])|((1)[0-2]))(\/)\d{4}$',"function":"yyyy-mm-dd"}
In [68]:
m = {"names":"int"}
In [69]:
mismatch_df.table()
In [19]:
t.create(mismatch_df, "cols.count_mismatch", None, "dict", None, {"names":"int"})
In [20]:
t.run()
In [83]:
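# cols.unnest splits one column into several: strings are split on the separator, arrays by
# position; index selects which split(s) to keep.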
t.create(None, "cols.unnest", "string_multi_index", "df", None, date_col, "/", splits=3, index=2)
In [ ]:
t.create(None, "cols.unnest", "string_multi_index", "df", None, date_col, "/", splits=3, index=[1,2])
In [ ]:
t.create(None, "cols.unnest", "string_infer_split", "df", None, date_col, "/")
In [ ]:
t.create(None, "cols.unnest", "string_no_index", "df", None, date_col, "/", splits=3)
In [ ]:
t.create(None, "cols.unnest", "string_output_columns", "df", None, date_col, "/", splits=3, output_cols= [("year", "month","day")])
In [ ]:
t.create(None, "cols.unnest", "array_index", "df", None, array_col, index=2)
In [ ]:
t.create(None, "cols.unnest", "array_multi_index", "df", None, array_col, index=[1,2])
In [ ]:
t.run()
In [ ]:
t.create(None, "cols.unnest", "string_multi_colum_multi_index_multi_output", "df", None, ["date arrival","last date seen"], "/", index=[(1,2),(1,2)], output_cols=[("year1","month1"),("year2","month2")])
In [ ]:
t.create(None, "cols.unnest", "string_multi_colum_multi_output", "df", None, ["date arrival","last date seen"], "/", output_cols=[("year1","month1"),("year2","month2")])
In [ ]:
t.create(None, "cols.unnest", "array", "df", array_col)
In [ ]:
t.create(None, "cols.unnest", "array_all_columns", "df", array_col)
In [ ]:
t.create(None, "cols.is_na", "all_columns", "df", "*")
In [ ]:
t.create(None, "cols.is_na", None, "df", numeric_col)
In [ ]:
t.run()
In [ ]:
from pyspark.sql.types import *
from optimus import Optimus
from optimus.helpers.json import json_enconding
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector
import numpy as np
nan = np.nan
import datetime
In [ ]:
actual_df = op.load.json('https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json')
expected_df = op.create.df(
[('billingId', LongType(), True), ('birth', StringType(), True), ('dummyCol', StringType(), True),
('firstName', StringType(), True), ('id', LongType(), True), ('lastName', StringType(), True),
('price', LongType(), True), ('product', StringType(), True)],
[(123, '1980/07/07', 'never', 'Luis', 1, 'Alvarez$$%!', 10, 'Cake')])
# assert (expected_df.collect() == actual_df.collect())
from deepdiff import DeepDiff # For Deep Difference of 2 objects
actual_df.table()
expected_df.table()
# source_df.table()
# print(actual_df.to_json())
# print(expected_df.to_json())
a1 = actual_df.to_json()
e1 = expected_df.to_json()
In [ ]:
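# DeepDiff compares the two JSON serializations; an empty diff means the DataFrames match.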
ddiff = DeepDiff(a1, e1, ignore_order=False)
print(ddiff)
In [ ]:
t = Test(op, df, "df_rows", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"import datetime",
"from pyspark.sql import functions as F",
"from optimus.functions import abstract_udf as audf"])
In [ ]:
rows = [
("Optim'us", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
None)
]
In [ ]:
df = op.load.url("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv")
In [ ]:
t = Test(op, source_df, "op_io", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"import datetime",
"from pyspark.sql import functions as F"],path = "op_io", final_path="..")
In [ ]:
t.create(op, "load.csv", "local_csv", "df", "../../examples/data/foo.csv")
t.create(op, "load.json", "local_json", "df", "../../examples/data/foo.json")
t.create(op, "load.parquet", "local_parquet", "df", "../../examples/data/foo.parquet")
t.create(op, "load.csv", "remote_csv", "df", "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv")
In [ ]:
t.create(op, "load.json", "remote_json", "df", "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json")
In [ ]:
t.create(op, "load.parquet", "remote_parquet", "df", "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.parquet")
In [ ]:
from optimus.profiler.profiler import Profiler
p = Profiler()
print(p.run(source_df, "japanese name"))
In [ ]:
# df_string = source_df.cols.cast("*","str")
t.create(source_df, "save.csv", None, None, "test.csv")
In [ ]:
t.create(None, "save.json", None, None, "test.json")
In [ ]:
t.create(None, "save.parquet", None, None, "test.parquet")
In [ ]:
t.run()
In [ ]:
source_df.table()
In [86]:
import pandas as pd
from pyspark.sql.types import *
from datetime import date, datetime
cols = [
("names", "str"),
("height(ft)", ShortType()),
("function", "str"),
("rank", ByteType()),
("age", "int"),
("weight(t)", "float"),
"japanese name",
"last position seen",
"date arrival",
"last date seen",
("attributes", ArrayType(FloatType())),
("Date Type", DateType()),
("timestamp", TimestampType()),
("Cybertronian", BooleanType()),
("function(binary)", BinaryType()),
("NullType", NullType())
]
rows = [
("Optim'us", -28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
"2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
None),
("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
"2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
None),
("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
"2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
None),
("Jazz", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
"2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
bytearray("First Lieutenant", "utf-8"), None),
("Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
("Metroplex_)^$", 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
[91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
(None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None),
]
source_df = op.create.df(cols, rows)
source_df.table()
In [21]:
t = Test(op, source_df, "df_outliers", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"import datetime",
"from pyspark.sql import functions as F"], path = "df_outliers", final_path="..")
In [22]:
from pyspark.sql import functions as F
def func(col_name, attrs):
    return F.col(col_name) * 2
numeric_col = "height(ft)"
numeric_col_B = "rank"
numeric_col_C = "rank"
string_col = "function"
date_col = "date arrival"
date_col_B = "last date seen"
new_col = "new col"
array_col = "attributes"
In [23]:
source_df.table()
In [24]:
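# Tukey's method flags values outside the standard IQR fences, [Q1 - 1.5*IQR, Q3 + 1.5*IQR].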
t.create(None, "outliers.tukey", None, "df","select", numeric_col)
In [32]:
source_df.outliers.tukey(numeric_col).drop().table()
In [26]:
t.create(None, "outliers.tukey", None, "df","drop", numeric_col)
In [27]:
t.create(None, "outliers.tukey", None, "json", "whiskers", numeric_col)
In [28]:
t.create(None, "outliers.tukey", None, "json", "count", numeric_col)
In [29]:
t.create(None, "outliers.tukey", None, "json", "non_outliers_count", numeric_col)
In [30]:
t.create(None, "outliers.tukey", None, "json", "info", numeric_col)
In [31]:
t.run()
In [ ]:
threshold = 0.5
In [ ]:
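# z-score outliers: rows where |x - mean| / stddev exceeds the threshold.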
t.create(None, "outliers.z_score", None, "df","select", numeric_col, threshold)
In [ ]:
source_df.outliers.z_score('height(ft)',0.5).select()
In [ ]:
t.create(None, "outliers.z_score", None, "df","drop", numeric_col, threshold)
In [ ]:
t.create(None, "outliers.z_score", None, "json", "count", numeric_col, threshold)
In [ ]:
t.create(None, "outliers.z_score", None, "json", "non_outliers_count", numeric_col, threshold)
In [ ]:
t.create(None, "outliers.z_score", None, "json", "info", numeric_col, threshold)
In [ ]:
t.run()
In [ ]:
threshold = 0.5
relative_error = 10000
In [ ]:
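# The modified z-score uses the median and MAD instead of mean/stddev; relative_error is
# presumably forwarded to Spark's approxQuantile when computing the median.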
t.create(None, "outliers.modified_z_score", None, "df","select", numeric_col, threshold, relative_error)
In [ ]:
t.create(None, "outliers.modified_z_score", None, "df","drop", numeric_col, threshold, relative_error)
In [ ]:
t.create(None, "outliers.modified_z_score", None, "json","count", numeric_col, threshold, relative_error)
In [ ]:
t.create(None, "outliers.modified_z_score", None, "json","non_outliers_count", numeric_col, threshold, relative_error)
In [ ]:
t.create(None, "outliers.modified_z_score", None, "json","info", numeric_col, threshold, relative_error)
In [ ]:
t.run()
In [84]:
threshold = 0.5
relative_error = 10000
In [ ]:
t.create(None, "outliers.mad", None, "df","select", numeric_col, threshold, relative_error)
In [ ]:
t.create(None, "outliers.mad", None, "df","drop", numeric_col, threshold, relative_error)
In [ ]:
t.create(None, "outliers.mad", None, "json","count", numeric_col, threshold, relative_error)
In [93]:
t.create(None, "outliers.mad", None, "json","non_outliers_count", numeric_col, threshold, relative_error)
In [90]:
t.create(None, "outliers.mad", None, "json","info", numeric_col, threshold, relative_error)
In [94]:
t.run()
In [ ]:
source_df = op.read.csv("../../examples/data/random.csv", header=True, sep=";").limit(10)
In [ ]:
source_df.table()
In [ ]:
t = Test(op, source_df, "df_keycollision", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"import datetime",
"from pyspark.sql import functions as F",
"from optimus.ml import keycollision as keyCol"],
path = "df_keycollision", final_path="..")
from optimus.ml import keycollision as keyCol
In [ ]:
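# OpenRefine-style key collision: values that normalize to the same fingerprint key are
# clustered together; n-gram fingerprints use character n-grams instead of whole tokens.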
t.create(keyCol, "fingerprint", None, "df",None, source_df, "STATE")
t.run()
In [ ]:
t.create(keyCol, "fingerprint_cluster", None, "json", None, source_df, "STATE")
In [ ]:
t.run()
In [ ]:
t.create(keyCol, "n_gram_fingerprint", None, "df", None, source_df, "STATE")
In [ ]:
t.create(keyCol, "n_gram_fingerprint_cluster", None, "json", None, source_df, "STATE", 2)
In [ ]:
t.run()
In [ ]:
source_df = op.read.csv("../../examples/data/random.csv", header=True, sep=";").limit(1000)
In [ ]:
t = Test(op, source_df, "df_distance_cluster", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"import datetime",
"from pyspark.sql import functions as F",
"from optimus.ml import distancecluster as dc"], path = "df_distance_cluster", final_path="..")
from optimus.ml import distancecluster as dc
In [ ]:
df.table()
In [ ]:
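# levenshtein_cluster presumably groups values whose pairwise edit distance is small,
# returning the clusters as a dict.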
t.create(dc, "levenshtein_cluster", None, 'dict', None, source_df, "STATE")
In [ ]:
t.run()
In [ ]:
df_cancer = op.spark.read.csv('../data_cancer.csv', sep=',', header=True, inferSchema=True)
In [ ]:
columns = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean',
'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean',
'fractal_dimension_mean']
In [ ]:
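# op.ml.gbt presumably trains a gradient-boosted tree classifier on the feature columns with
# "diagnosis" as the label, returning the scored DataFrame and the fitted model.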
df_model, rf_model = op.ml.gbt(df_cancer, columns, "diagnosis")
In [ ]:
df_model.table()
In [24]:
source_df = op.create.df([
("words", "str", True),
("num", "int", True),
("animals", "str", True),
("thing", StringType(), True),
("second", "int", True),
("filter", StringType(), True)
],
[
(" I like fish ", 1, "dog dog", "housé", 5, "a"),
(" zombies", 2, "cat", "tv", 6, "b"),
("simpsons cat lady", 2, "frog", "table", 7, "1"),
(None, 3, "eagle", "glass", 8, "c"),
])
In [25]:
from optimus.audf import abstract_udf as audf
t = Test(op, source_df, "df_rows", imports=["from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector",
"import numpy as np",
"nan = np.nan",
"from optimus.audf import abstract_udf as audf",
"import datetime",
"from pyspark.sql import functions as F"], path = "df_rows", final_path="..")
In [4]:
row =[("this is a word", 2, "this is an animal",
"this is a thing", 64, "this is a filter",)]
In [14]:
t.create(None, "rows.append", None, "df", None, row)
In [116]:
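# A boolean Column expression; rows.select keeps only the rows where it evaluates to True.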
fil = (source_df["num"] == 1)
In [120]:
t.create(None, "rows.select", None, "df", None, fil)
In [40]:
t.create(None, "rows.select_by_dtypes", None, "df", None, "filter", "integer")
In [113]:
fil = (source_df["num"] == 2) | (source_df["second"] == 5)
print(str(fil))
# type(fil)
In [114]:
t.create(None, "rows.drop", None, "df", None, fil)
In [43]:
t.create(None, "rows.drop_by_dtypes", None, "df", None, "filter", "integer")
In [52]:
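# abstract_udf appears to wrap a plain Python function as a Spark UDF with the given return
# type; the resulting column expression can be used as a row predicate.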
def func_data_type(value, attr):
    return value > 1

a = audf("num", func_data_type, "boolean")
In [53]:
t.create(None, "rows.drop", "audf", "df", None, a)
In [51]:
t.create(None, "rows.sort", None, "df", None, "num", "desc")
In [ ]:
t.create(None, "rows.is_in", None, "df", None, "num", 2)
In [26]:
t.create(None, "rows.between", None, "df", None, "second", 6, 8)
In [27]:
t.create(None, "rows.between", "equal", "df", None, "second", 6, 8, equal=True)
In [63]:
t.create(None, "rows.between", "invert_equal", "df", None, "second", 6, 8, invert=True, equal=True)
In [ ]:
t.create(None, "rows.between", "bounds", "df", None, "second", bounds=[(6,7),(7,8)], invert=True, equal=True)
In [ ]:
t.run()