In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import sys
sys.path.append("..")
In [3]:
from optimus import Optimus
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType
In [4]:
# Create an Optimus instance
op = Optimus()
In [5]:
# Build a test DataFrame from (name, type, nullable) column specs and a list of row tuples
df = op.create.df([
    ("words", "str", True),
    ("num", "int", True),
    ("animals", "str", True),
    ("thing", StringType(), True),
    ("second", "int", True),
    ("filter", StringType(), True)
], [
    (" I like fish ", 1, "dog dog", "housé", 5, "a"),
    (" zombies", 2, "cat", "tv", 6, "b"),
    ("simpsons cat lady", 2, "frog", "table", 7, "1"),
    (None, 3, "eagle", "glass", 8, "c")
])
df.show()
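`op.create.df` builds an ordinary Spark DataFrame under the hood; as a rough plain-PySpark sketch (reusing the active session via `SparkSession.builder.getOrCreate()`, not any Optimus-specific accessor), the same table could be built like this:
In [ ]:
# Sketch: the equivalent DataFrame with plain PySpark and an explicit schema
spark = SparkSession.builder.getOrCreate()
schema = StructType([
    StructField("words", StringType(), True),
    StructField("num", IntegerType(), True),
    StructField("animals", StringType(), True),
    StructField("thing", StringType(), True),
    StructField("second", IntegerType(), True),
    StructField("filter", StringType(), True)
])
spark.createDataFrame([
    (" I like fish ", 1, "dog dog", "housé", 5, "a"),
    (" zombies", 2, "cat", "tv", 6, "b"),
    ("simpsons cat lady", 2, "frog", "table", 7, "1"),
    (None, 3, "eagle", "glass", 8, "c")
], schema).show()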
In [6]:
df.dtypes
In [7]:
df.rows.append(["this is a word",2, "this is an animal", "this is a thing", 64, "this is a filter"]).table()
In [ ]:
# Build a second DataFrame with extra columns (including array columns) to append to df
df_bat = op.create.df([
    ("words", "str", True),
    ("num", "int", True),
    ("animals", "str", True),
    ("thing", StringType(), True),
    ("two strings", StringType(), True),
    ("filter", StringType(), True),
    ("num 2", "string", True),
    ("col_array", ArrayType(StringType()), True),
    ("col_int", ArrayType(IntegerType()), True)
], [
    ("I am batman", 1, "bat", "housé", "cat-car", "z", "10", ["screen", "sorry"], [11, 21, 31]),
])
In [ ]:
# Append a whole DataFrame to df
df.rows.append(df_bat).table()
In [8]:
df.rows.sort("animals").table()
In [9]:
df.rows.sort("animals", "desc").table()
In [10]:
df.rows.sort([("animals","desc"),("thing","asc")]).table()
In [11]:
a = [("animals","desc"),("thing","asc")]
for c in a:
print(c[0])
In [12]:
df.rows.select(df["num"]==1).table()
In [13]:
# Keep only the rows whose value in "filter" is an integer
df.rows.select_by_dtypes("filter", "integer").table()
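One way to express a similar row filter in plain PySpark is to keep the rows whose "filter" value casts cleanly to an integer (a sketch, not necessarily how Optimus implements select_by_dtypes):
In [ ]:
# Keep rows where "filter" can be cast to int (cast returns null on failure)
from pyspark.sql import functions as F
df.filter(F.col("filter").cast("int").isNotNull()).show()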
In [14]:
df.rows.drop((df["num"]==2) | (df["second"]==5)).table()
In [15]:
df.rows.drop_by_dtypes("filter", "int").table()
In [16]:
df.rows.drop_by_dtypes("filter", "integer").table()
In [17]:
# Drop rows using an abstract UDF: drop the rows where "num" is greater than 1
from optimus.audf import abstract_udf as audf

def func_data_type(value, attr):
    return value > 1

df.rows.drop(audf("num", func_data_type, "boolean")).table()
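A hedged sketch of the same idea with a standard PySpark UDF, assuming the drop removes the rows where the predicate returns True:
In [ ]:
# Build a boolean UDF and filter out the rows where it returns True
from pyspark.sql.functions import udf
greater_than_one = udf(lambda v: v is not None and v > 1, BooleanType())
df.filter(~greater_than_one(df["num"])).show()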