In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")

In [3]:
from optimus import Optimus

from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType

In [4]:
# Create optimus
op = Optimus()

In [5]:
df = op.create.df([
                ("words", "str", True),
                ("num", "int", True),
                ("animals", "str", True),
                ("thing", StringType(), True),
                ("second", "int", True),
                ("filter", StringType(), True)
            ],
            [
                ("  I like     fish  ", 1, "dog dog", "housé", 5 , "a"),
                ("    zombies", 2, "cat", "tv", 6, "b"),
                ("simpsons   cat lady", 2, "frog", "table", 7, "1"),
                (None, 3, "eagle", "glass", 8, "c")
                
            ])

df.show()


+-------------------+---+-------+-----+------+------+
|              words|num|animals|thing|second|filter|
+-------------------+---+-------+-----+------+------+
|  I like     fish  |  1|dog dog|housé|     5|     a|
|            zombies|  2|    cat|   tv|     6|     b|
|simpsons   cat lady|  2|   frog|table|     7|     1|
|               null|  3|  eagle|glass|     8|     c|
+-------------------+---+-------+-----+------+------+


In [6]:
df.dtypes


Out[6]:
[('words', 'string'),
 ('num', 'int'),
 ('animals', 'string'),
 ('thing', 'string'),
 ('second', 'int'),
 ('filter', 'string')]

Append row

Spark

Not available in plain Spark. You would need to create a one-row dataframe and union it with the original dataframe to append a row.
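
A minimal sketch of that workaround in plain Spark (assuming the SparkSession that Optimus already started can be retrieved with getOrCreate):

In [ ]:
# Plain-Spark sketch (not the Optimus API): wrap the new row in a one-row
# dataframe that reuses df's schema, then union the two dataframes.
spark = SparkSession.builder.getOrCreate()
new_row = [("this is a word", 2, "this is an animal", "this is a thing", 64, "this is a filter")]
df.union(spark.createDataFrame(new_row, df.schema)).show()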


In [7]:
df.rows.append(["this is a word",2, "this is an animal", "this is a thing", 64, "this is a filter"]).table()


Viewing 5 of 5 rows / 6 columns
8 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
this⸱is⸱a⸱word | 2 | this⸱is⸱an⸱animal | this⸱is⸱a⸱thing | 64 | this⸱is⸱a⸱filter

Append a dataframe


In [ ]:
df_bat = op.create.df(
    [
        ("words", "str", True),
        ("num", "int", True),
        ("animals", "str", True),
        ("thing", StringType(), True),
        ("two strings", StringType(), True),
        ("filter", StringType(), True),
        ("num 2", "string", True),
        ("col_array", ArrayType(StringType()), True),
        ("col_int", ArrayType(IntegerType()), True)

    ],[
        ("I am batman", 1, "bat", "housé", "cat-car", "z", "10", ["screen", "sorry"], [11, 21, 31]),

    ])

In [ ]:
df.rows.append(df_bat).table()
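
In plain Spark, appending a dataframe with a different set of columns takes more work; a sketch, assuming Spark 3.1+ so that unionByName can fill the missing columns with nulls:

In [ ]:
# Plain-Spark sketch: unionByName matches columns by name; on Spark 3.1+,
# allowMissingColumns=True fills the columns missing on either side with nulls.
df.unionByName(df_bat, allowMissingColumns=True).show()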

Sort

Sort rows by a column, descending (this is the default order)


In [8]:
df.rows.sort("animals").table()


Viewing 4 of 4 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b

In [9]:
df.rows.sort("animals", "desc").table()


Viewing 4 of 4 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
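
Both sort calls above map to a plain-Spark orderBy on a descending column expression; a minimal sketch:

In [ ]:
# Plain-Spark sketch of the same descending sort (not the Optimus API)
from pyspark.sql import functions as F

df.orderBy(F.col("animals").desc()).show()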

Sort by multiple columns


In [10]:
df.rows.sort([("animals","desc"),("thing","asc")]).table()


Viewing 4 of 4 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
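
The plain-Spark counterpart passes one column expression per (column, order) pair to orderBy; a minimal sketch:

In [ ]:
# Plain-Spark sketch of the same multi-column sort
from pyspark.sql import functions as F

df.orderBy(F.col("animals").desc(), F.col("thing").asc()).show()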

In [11]:
a = [("animals", "desc"), ("thing", "asc")]
# Each sort spec is a (column, order) tuple; print just the column names
for c in a:
    print(c[0])


animals
thing

Select


In [12]:
df.rows.select(df["num"]==1).table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
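
The same row selection in plain Spark is simply a filter on the boolean expression; a minimal sketch:

In [ ]:
# Plain-Spark sketch: rows.select with a boolean column expression behaves
# like filter/where.
df.filter(df["num"] == 1).show()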

Select by type

Spark

Not available in vanilla Spark.
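
One way to approximate it in plain Spark is to cast the column and keep the rows where the cast succeeds; a sketch (not necessarily what Optimus does internally):

In [ ]:
# Plain-Spark sketch: cast("int") yields null when the string does not parse
# as an integer, so keeping non-null casts selects the integer-like rows.
from pyspark.sql import functions as F

df.filter(F.col("filter").cast("int").isNotNull()).show()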


In [13]:
df.rows.select_by_dtypes("filter", "integer").table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1

Drop

Spark

Dropping rows by a condition is not available in vanilla Spark.


In [14]:
df.rows.drop((df["num"]==2) | (df["second"]==5)).table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
None | 3 | eagle | glass | 8 | c
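
The usual plain-Spark workaround is to filter on the negated condition; a minimal sketch:

In [ ]:
# Plain-Spark sketch: dropping rows that match a condition is a filter on the
# negated condition.
df.filter(~((df["num"] == 2) | (df["second"] == 5))).show()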

In [15]:
df.rows.drop_by_dtypes("filter", "int").table()


Viewing 3 of 3 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
None | 3 | eagle | glass | 8 | c

Drop by type


In [16]:
df.rows.drop_by_dtypes("filter", "integer").table()


Viewing 3 of 3 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
None | 3 | eagle | glass | 8 | c
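
Inverting the cast trick from the select-by-type section gives a rough plain-Spark approximation of this drop; a sketch:

In [ ]:
# Plain-Spark sketch: keep only the rows whose "filter" value does NOT parse
# as an integer (cast returns null for non-numeric strings).
from pyspark.sql import functions as F

df.filter(F.col("filter").cast("int").isNull()).show()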

Drop using an abstract UDF


In [17]:
from optimus.audf import abstract_udf as audf

# Return True for every row that should be dropped (num greater than 1)
def func_data_type(value, attr):
    return value > 1


df.rows.drop(audf("num", func_data_type, "boolean")).table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
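
A rough plain-Spark equivalent of the abstract-UDF drop above, using an ordinary boolean UDF and a negated filter (a sketch, not the audf implementation):

In [ ]:
# Plain-Spark sketch: wrap the predicate in a boolean UDF and drop the rows
# where it evaluates to True by filtering on the negation.
from pyspark.sql import functions as F
from pyspark.sql.types import BooleanType

greater_than_one = F.udf(lambda value: value > 1, BooleanType())
df.filter(~greater_than_one(F.col("num"))).show()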