In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")

In [3]:
from optimus import Optimus

from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType

In [4]:
# Create optimus
op = Optimus()

In [5]:
df = op.create.df([
                ("words", "str", True),
                ("num", "int", True),
                ("animals", "str", True),
                ("thing", StringType(), True),
                ("second", "int", True),
                ("filter", StringType(), True)
            ],
            [
                ("  I like     fish  ", 1, "dog dog", "housé", 5 , "a"),
                ("    zombies", 2, "cat", "tv", 6, "b"),
                ("simpsons   cat lady", 2, "frog", "table", 7, "1"),
                (None, 3, "eagle", "glass", 8, "c")
                
            ])

df.show()


+-------------------+---+-------+-----+------+------+
|              words|num|animals|thing|second|filter|
+-------------------+---+-------+-----+------+------+
|  I like     fish  |  1|dog dog|housé|     5|     a|
|            zombies|  2|    cat|   tv|     6|     b|
|simpsons   cat lady|  2|   frog|table|     7|     1|
|               null|  3|  eagle|glass|     8|     c|
+-------------------+---+-------+-----+------+------+


In [6]:
df.dtypes


Out[6]:
[('words', 'string'),
 ('num', 'int'),
 ('animals', 'string'),
 ('thing', 'string'),
 ('second', 'int'),
 ('filter', 'string')]

Append row

Spark

Not available in plain Spark. You would need to create a one-row dataframe and union it with the original dataframe to append a row.
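
A minimal sketch of that workaround in plain Spark (assuming the SparkSession that Optimus already started can be retrieved with getOrCreate):

In [ ]:
# Plain-Spark sketch (not the Optimus API): wrap the new row in a one-row
# dataframe that reuses df's schema, then union the two dataframes.
spark = SparkSession.builder.getOrCreate()
new_row = [("this is a word", 2, "this is an animal", "this is a thing", 64, "this is a filter")]
df.union(spark.createDataFrame(new_row, df.schema)).show()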


In [7]:
df.rows.append(["this is a word",2, "this is an animal", "this is a thing", 64, "this is a filter"]).table()


Viewing 5 of 5 rows / 6 columns
8 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
this⸱is⸱a⸱word | 2 | this⸱is⸱an⸱animal | this⸱is⸱a⸱thing | 64 | this⸱is⸱a⸱filter

Append a dataframe


In [ ]:
df_bat = op.create.df(
    [
        ("words", "str", True),
        ("num", "int", True),
        ("animals", "str", True),
        ("thing", StringType(), True),
        ("two strings", StringType(), True),
        ("filter", StringType(), True),
        ("num 2", "string", True),
        ("col_array", ArrayType(StringType()), True),
        ("col_int", ArrayType(IntegerType()), True)

    ],[
        ("I am batman", 1, "bat", "housé", "cat-car", "z", "10", ["screen", "sorry"], [11, 21, 31]),

    ])

In [ ]:
df.rows.append(df_bat).table()
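
In plain Spark, appending a dataframe with a different set of columns takes more work; a sketch, assuming Spark 3.1+ so that unionByName can fill the missing columns with nulls:

In [ ]:
# Plain-Spark sketch: unionByName matches columns by name; on Spark 3.1+,
# allowMissingColumns=True fills the columns missing on either side with nulls.
df.unionByName(df_bat, allowMissingColumns=True).show()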

Sort

Sort rows by a column, descending (this is the default order)


In [8]:
df.rows.sort("animals").table()


Viewing 4 of 4 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b

In [9]:
df.rows.sort("animals", "desc").table()


Viewing 4 of 4 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
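
Both sort calls above map to a plain-Spark orderBy on a descending column expression; a minimal sketch:

In [ ]:
# Plain-Spark sketch of the same descending sort (not the Optimus API)
from pyspark.sql import functions as F

df.orderBy(F.col("animals").desc()).show()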

Sort by multiple columns


In [10]:
df.rows.sort([("animals","desc"),("thing","asc")]).table()


Viewing 4 of 4 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1
None | 3 | eagle | glass | 8 | c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
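
The plain-Spark counterpart passes one column expression per (column, order) pair to orderBy; a minimal sketch:

In [ ]:
# Plain-Spark sketch of the same multi-column sort
from pyspark.sql import functions as F

df.orderBy(F.col("animals").desc(), F.col("thing").asc()).show()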

In [11]:
a = [("animals", "desc"), ("thing", "asc")]
# Each sort spec is a (column, order) tuple; print just the column names
for c in a:
    print(c[0])


animals
thing

Select


In [12]:
df.rows.select(df["num"]==1).table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
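
The same row selection in plain Spark is simply a filter on the boolean expression; a minimal sketch:

In [ ]:
# Plain-Spark sketch: rows.select with a boolean column expression behaves
# like filter/where.
df.filter(df["num"] == 1).show()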

Select by type

Spark

Not available in vanilla Spark.
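
One way to approximate it in plain Spark is to cast the column and keep the rows where the cast succeeds; a sketch (not necessarily what Optimus does internally):

In [ ]:
# Plain-Spark sketch: cast("int") yields null when the string does not parse
# as an integer, so keeping non-null casts selects the integer-like rows.
from pyspark.sql import functions as F

df.filter(F.col("filter").cast("int").isNotNull()).show()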


In [13]:
df.rows.select_by_dtypes("filter", "integer").table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
simpsons⸱⸱⸱cat⸱lady | 2 | frog | table | 7 | 1

Drop

Spark

Dropping rows by a condition is not available in vanilla Spark.


In [14]:
df.rows.drop((df["num"]==2) | (df["second"]==5)).table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
None | 3 | eagle | glass | 8 | c
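
The usual plain-Spark workaround is to filter on the negated condition; a minimal sketch:

In [ ]:
# Plain-Spark sketch: dropping rows that match a condition is a filter on the
# negated condition.
df.filter(~((df["num"] == 2) | (df["second"] == 5))).show()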

In [15]:
df.rows.drop_by_dtypes("filter", "int").table()


Viewing 3 of 3 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
None | 3 | eagle | glass | 8 | c

Drop by type


In [16]:
df.rows.drop_by_dtypes("filter", "integer").table()


Viewing 3 of 3 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
⸱⸱⸱⸱zombies | 2 | cat | tv | 6 | b
None | 3 | eagle | glass | 8 | c
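
Inverting the cast trick from the select-by-type section gives a rough plain-Spark approximation of this drop; a sketch:

In [ ]:
# Plain-Spark sketch: keep only the rows whose "filter" value does NOT parse
# as an integer (cast returns null for non-numeric strings).
from pyspark.sql import functions as F

df.filter(F.col("filter").cast("int").isNull()).show()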

Drop using an abstract UDF


In [17]:
from optimus.audf import abstract_udf as audf

# Return True for every row that should be dropped (num greater than 1)
def func_data_type(value, attr):
    return value > 1


df.rows.drop(audf("num", func_data_type, "boolean")).table()


Viewing 1 of 1 rows / 6 columns
4 partition(s)

words (string) | num (int) | animals (string) | thing (string) | second (int) | filter (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱ | 1 | dog⸱dog | housé | 5 | a
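
A rough plain-Spark equivalent of the abstract-UDF drop above, using an ordinary boolean UDF and a negated filter (a sketch, not the audf implementation):

In [ ]:
# Plain-Spark sketch: wrap the predicate in a boolean UDF and drop the rows
# where it evaluates to True by filtering on the negation.
from pyspark.sql import functions as F
from pyspark.sql.types import BooleanType

greater_than_one = F.udf(lambda value: value > 1, BooleanType())
df.filter(~greater_than_one(F.col("num"))).show()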