In [1]:
# UNCOMMENT THIS IF YOU'RE USING GOOGLE COLAB!
#!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q http://apache.osuosl.org/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz
#!tar xf spark-2.4.1-bin-hadoop2.7.tgz
#!pip install optimuspyspark
# AFTER RUNNING THIS CELL, YOU MUST RESTART THE RUNTIME TO USE UPDATED VERSIONS OF PACKAGES!
In [2]:
# UNCOMMENT THIS IF YOU'RE USING GOOGLE COLAB!
#import os
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["SPARK_HOME"] = "/content/spark-2.3.3-bin-hadoop2.7"
In [3]:
# Run this cell only if you cloned the Optimus repo and want to hack on it from the notebook; otherwise install it with pip
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("..")
In [4]:
from optimus import Optimus
C:\Users\argenisleon\Anaconda3\lib\site-packages\socks.py:58: DeprecationWarning: Using or importing the ABCs from 'collections' instead of from 'collections.abc' is deprecated, and in 3.8 it will stop working
from collections import Callable
You are using PySparkling of version 2.4.10, but your PySpark is of
version 2.3.1. Please make sure Spark and PySparkling versions are compatible.
In [5]:
# Create optimus
op = Optimus(master="local", app_name="optimus", verbose=True)
INFO:optimus:Just check that Spark and all necessary environments vars are present...
INFO:optimus:-----
INFO:optimus:SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
INFO:optimus:HADOOP_HOME=C:\opt\hadoop-2.7.7
INFO:optimus:PYSPARK_PYTHON=C:\Users\argenisleon\Anaconda3\python.exe
INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter
INFO:optimus:PYSPARK_SUBMIT_ARGS=--packages com.databricks:spark-avro_2.11:4.0.0 --conf "spark.sql.catalogImplementation=hive" pyspark-shell
INFO:optimus:JAVA_HOME=C:\java
INFO:optimus:Pyarrow Installed
INFO:optimus:-----
INFO:optimus:Starting or getting SparkSession and SparkContext...
INFO:optimus:Spark Version:2.3.1
INFO:optimus:
____ __ _
/ __ \____ / /_(_)___ ___ __ _______
/ / / / __ \/ __/ / __ `__ \/ / / / ___/
/ /_/ / /_/ / /_/ / / / / / / /_/ (__ )
\____/ .___/\__/_/_/ /_/ /_/\__,_/____/
/_/
INFO:optimus:Transform and Roll out...
INFO:optimus:Optimus successfully imported. Have fun :).
In [ ]:
op = Optimus(packages="datastax:spark-cassandra-connector:1.6.1-s_2.10")
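The packages argument passes Maven coordinates to Spark so the listed dependency (here the Cassandra connector) is fetched and put on the classpath when the session starts. For reference, a minimal sketch of the plain-PySpark equivalent, assuming you build the SparkSession yourself instead of letting Optimus do it:
In [ ]:
# Plain-PySpark equivalent of Optimus' `packages` argument (sketch).
from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .master("local")
         .appName("optimus")
         .config("spark.jars.packages", "datastax:spark-cassandra-connector:1.6.1-s_2.10")
         .getOrCreate())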
In [7]:
op.spark
Out[7]:
SparkSession - hive
In [8]:
op.sc
Out[8]:
In [7]:
op.packages
Out[7]:
['com.databricks:spark-avro_2.11:4.0.0']
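op.spark and op.sc expose the underlying SparkSession and SparkContext, so the regular PySpark API stays available alongside Optimus. A trivial sketch:
In [ ]:
# Use the wrapped SparkSession directly for plain Spark work.
op.spark.range(5).show()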
In [6]:
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType
from datetime import date, datetime
df = op.create.df(
    [
        "names",
        "height(ft)",
        "function",
        "rank",
        "age",
        "weight(t)",
        "japanese name",
        "last position seen",
        "date arrival",
        "last date seen",
        "attributes",
        "DateType",
        "Tiemstamp",
        "Cybertronian",
        "function(binary)",
        "NullType",
    ],
    [
        ("Optim'us", 28, "Leader", 10, 5000000, 4.30, ["Inochi", "Convoy"], "19.442735,-99.201111", "1980/04/10",
         "2016/09/10", [8.5344, 4300.0], date(2016, 9, 10), datetime(2014, 6, 24), True, bytearray("Leader", "utf-8"),
         None),
        ("bumbl#ebéé ", 17, "Espionage", 7, 5000000, 2.0, ["Bumble", "Goldback"], "10.642707,-71.612534", "1980/04/10",
         "2015/08/10", [5.334, 2000.0], date(2015, 8, 10), datetime(2014, 6, 24), True, bytearray("Espionage", "utf-8"),
         None),
        ("ironhide&", 26, "Security", 7, 5000000, 4.0, ["Roadbuster"], "37.789563,-122.400356", "1980/04/10",
         "2014/07/10", [7.9248, 4000.0], date(2014, 6, 24), datetime(2014, 6, 24), True, bytearray("Security", "utf-8"),
         None),
        ("Jazz", 13, "First Lieutenant", 8, 5000000, 1.80, ["Meister"], "33.670666,-117.841553", "1980/04/10",
         "2013/06/10", [3.9624, 1800.0], date(2013, 6, 24), datetime(2014, 6, 24), True,
         bytearray("First Lieutenant", "utf-8"), None),
        ("Megatron", None, "None", 10, 5000000, 5.70, ["Megatron"], None, "1980/04/10", "2012/05/10", [None, 5700.0],
         date(2012, 5, 10), datetime(2014, 6, 24), True, bytearray("None", "utf-8"), None),
        ("Metroplex_)^$", 300, "Battle Station", 8, 5000000, None, ["Metroflex"], None, "1980/04/10", "2011/04/10",
         [91.44, None], date(2011, 4, 10), datetime(2014, 6, 24), True, bytearray("Battle Station", "utf-8"), None),
    ]).h_repartition(1)
df.table()
Viewing 6 of 6 rows / 16 columns
1 partition(s)
(all columns nullable)
names (string) | height(ft) (int) | function (string) | rank (int) | age (int) | weight(t) (float) | japanese name (array<string>) | last position seen (string) | date arrival (string) | last date seen (string) | attributes (array<float>) | DateType (date) | Tiemstamp (timestamp) | Cybertronian (boolean) | function(binary) (binary) | NullType (null)
Optim'us | 28 | Leader | 10 | 5000000 | 4.300000190734863 | ['Inochi', 'Convoy'] | 19.442735,-99.201111 | 1980/04/10 | 2016/09/10 | [8.53439998626709, 4300.0] | 2016-09-10 | 2014-06-24 00:00:00 | True | None | None
bumbl#ebéé  | 17 | Espionage | 7 | 5000000 | 2.0 | ['Bumble', 'Goldback'] | 10.642707,-71.612534 | 1980/04/10 | 2015/08/10 | [5.334000110626221, 2000.0] | 2015-08-10 | 2014-06-24 00:00:00 | True | None | None
ironhide& | 26 | Security | 7 | 5000000 | 4.0 | ['Roadbuster'] | 37.789563,-122.400356 | 1980/04/10 | 2014/07/10 | [7.924799919128418, 4000.0] | 2014-06-24 | 2014-06-24 00:00:00 | True | None | None
Jazz | 13 | First Lieutenant | 8 | 5000000 | 1.7999999523162842 | ['Meister'] | 33.670666,-117.841553 | 1980/04/10 | 2013/06/10 | [3.962399959564209, 1800.0] | 2013-06-24 | 2014-06-24 00:00:00 | True | None | None
Megatron | None | None | 10 | 5000000 | 5.699999809265137 | ['Megatron'] | None | 1980/04/10 | 2012/05/10 | [None, 5700.0] | 2012-05-10 | 2014-06-24 00:00:00 | True | None | None
Metroplex_)^$ | 300 | Battle Station | 8 | 5000000 | None | ['Metroflex'] | None | 1980/04/10 | 2011/04/10 | [91.44000244140625, None] | 2011-04-10 | 2014-06-24 00:00:00 | True | None | None
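The pyspark.sql.types imports above are handy if you prefer to build a DataFrame with an explicit schema through plain PySpark instead of letting Optimus infer the types. A minimal sketch using only a few of the 16 columns from the example data:
In [ ]:
# Sketch: explicit-schema DataFrame with plain PySpark (subset of the columns above).
schema = StructType([
    StructField("names", StringType(), True),
    StructField("height(ft)", IntegerType(), True),
    StructField("function", StringType(), True),
    StructField("Cybertronian", BooleanType(), True),
])
rows = [("Optim'us", 28, "Leader", True),
        ("bumbl#ebéé ", 17, "Espionage", True)]
op.spark.createDataFrame(rows, schema).show()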
In [ ]:
import pandas as pd
pdf = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                    'B': {0: 1, 1: 3, 2: 5},
                    'C': {0: 2, 1: 4, 2: 6}})
sdf = op.create.df(pdf=pdf)
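This converts the pandas DataFrame into a Spark DataFrame. The plain-PySpark equivalent is a direct createDataFrame call on the wrapped session (a minimal sketch):
In [ ]:
# Equivalent pandas-to-Spark conversion with the plain SparkSession API.
sdf2 = op.spark.createDataFrame(pdf)
sdf2.show()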
In [ ]:
op.append([df,df], like="rows").table()
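append with like="rows" stacks the two DataFrames vertically. The plain-PySpark counterpart is a union, which assumes both DataFrames share the same schema (a minimal sketch):
In [ ]:
# Plain-PySpark row append: union requires matching schemas.
df.union(df).show()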
In [20]:
df_csv =op.load.csv("data/foo.csv").table(5)
Viewing 5 of 19 rows / 8 columns
1 partition(s)
(all columns nullable)
id (int) | firstName (string) | lastName (string) | billingId (int) | product (string) | price (int) | birth (string) | dummyCol (string)
1 | Luis | Alvarez$$%! | 123 | Cake | 10 | 1980/07/07 | never
2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna
3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give
4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you
5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up
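op.load.csv infers the header and column types here (note the int columns above). If you need finer control over parsing, you can fall back to the plain Spark CSV reader; the options below are standard Spark options, shown only as a sketch:
In [ ]:
# Plain Spark CSV reader with explicit options, for comparison.
df_csv2 = op.spark.read.csv("data/foo.csv", header=True, inferSchema=True, sep=",")
df_csv2.show(5)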
In [14]:
df_tsv = op.load.tsv("data/foo.tsv")
df_tsv.table(5)
Viewing 4 of 4 rows / 5 columns
1 partition(s)
(all columns nullable)
Sepal length (string) | Sepal width (double) | Petal length (double) | Petal width (string) | Species (string)
5.0 | 3.6 | 1.4 | 0.2 | I. setosa
5.0 | 3.6 | 1.4 | 0.2 | I. setosa
5.0 | 3.6 | 1.4 | 0.2 | I. setosa
5.0 | 3.6 | 1.4 | 0.2 | I. setosa
In [21]:
df_json =op.load.json("data/foo.json").table(5)
Viewing 5 of 19 rows / 8 columns
1 partition(s)
(all columns nullable)
billingId (bigint) | birth (string) | dummyCol (string) | firstName (string) | id (bigint) | lastName (string) | price (bigint) | product (string)
123 | 1980/07/07 | never | Luis | 1 | Alvarez$$%! | 10 | Cake
423 | 1950/07/08 | gonna | André | 2 | Ampère | 8 | piza
551 | 1990/07/09 | give | NiELS | 3 | Böhr//((%% | 8 | pizza
521 | 1954/07/10 | you | PAUL | 4 | dirac$ | 8 | pizza
634 | 1990/07/11 | up | Albert | 5 | Einstein | 8 | pizza
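Note that Spark's JSON reader infers the schema and returns the columns in alphabetical order (billingId first), not in the order they appear in the file. If you want a specific order, select it explicitly (a minimal sketch):
In [ ]:
# Restore a specific column order after loading JSON.
df_json.select("id", "firstName", "lastName", "billingId",
               "product", "price", "birth", "dummyCol").show(5)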
In [22]:
df_parquet =op.load.parquet("data/foo.parquet").table(5)
Viewing 5 of 19 rows / 8 columns
1 partition(s)
(all columns nullable)
id (int) | firstName (string) | lastName (string) | billingId (int) | product (string) | price (int) | birth (string) | dummyCol (string)
1 | Luis | Alvarez$$%! | 123 | Cake | 10 | 1980/07/07 | never
2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna
3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give
4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you
5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up
df_avro =op.load.avro("data/foo.avro").table(5)
In [15]:
df_excel = op.load.excel("data/titanic3.xls")
df_excel.table(5)
Viewing 5 of 1309 rows / 14 columns
8 partition(s)
(all columns nullable)
pclass (bigint) | survived (bigint) | name (string) | sex (string) | age (double) | sibsp (bigint) | parch (bigint) | ticket (string) | fare (double) | cabin (string) | embarked (string) | boat (string) | body (double) | home_dest (string)
1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | nan | St Louis, MO
1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | 11 | nan | Montreal, PQ / Chesterville, ON
1 | 0 | Allison, Miss. Helen Loraine | female | 2.0 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | nan | nan | Montreal, PQ / Chesterville, ON
1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | nan | 135.0 | Montreal, PQ / Chesterville, ON
1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | nan | nan | Montreal, PQ / Chesterville, ON
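Excel is not a native Spark format, so a simple alternative is to read the sheet with pandas and convert it, reusing the pandas-to-Spark path shown earlier. A minimal sketch; it assumes pandas can read the .xls file (which typically requires the xlrd package):
In [ ]:
# Read the spreadsheet with pandas, then convert to a Spark DataFrame.
import pandas as pd
pdf_titanic = pd.read_excel("data/titanic3.xls")
df_titanic = op.create.df(pdf=pdf_titanic)
df_titanic.table(5)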
In [16]:
df_csv =op.load.csv("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv")
df_csv.table()
Viewing 19 of 19 rows / 8 columns
1 partition(s)
(all columns nullable)
id (int) | firstName (string) | lastName (string) | billingId (int) | product (string) | price (int) | birth (string) | dummyCol (string)
1 | Luis | Alvarez$$%! | 123 | Cake | 10 | 1980/07/07 | never
2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna
3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give
4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you
5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up
6 | Galileo |              GALiLEI | 672 | arepa | 5 | 1930/08/12 | never
7 | CaRL | Ga%%%uss | 323 | taco | 3 | 1970/07/13 | gonna
8 | David | H$$$ilbert | 624 | taaaccoo | 3 | 1950/07/14 | let
9 | Johannes | KEPLER | 735 | taco | 3 | 1920/04/22 | you
10 | JaMES | M$$ax%%well | 875 | taco | 3 | 1923/03/12 | down
11 | Isaac | Newton | 992 | pasta | 9 | 1999/02/15 | never
12 | Emmy%% | Nöether$ | 234 | pasta | 9 | 1993/12/08 | gonna
13 | Max!!! | Planck!!! | 111 | hamburguer | 4 | 1994/01/04 | run
14 | Fred | Hoy&&&le | 553 | pizzza | 8 | 1997/06/27 | around
15 | (((   Heinrich ))))) | Hertz | 116 | pizza | 8 | 1956/11/30 | and
16 | William | Gilbert### | 886 | BEER | 2 | 1958/03/26 | desert
17 | Marie | CURIE | 912 | Rice | 1 | 2000/03/22 | you
18 | Arthur | COM%%%pton | 812 | 110790 | 5 | 1899/01/01 | #
19 | JAMES | Chadwick | 467 | null | 10 | 1921/05/03 | #
In [17]:
df_json =op.load.json("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.json")
df_json.table()
Viewing 19 of 19 rows / 8 columns
1 partition(s)
(all columns nullable)
billingId (bigint) | birth (string) | dummyCol (string) | firstName (string) | id (bigint) | lastName (string) | price (bigint) | product (string)
123 | 1980/07/07 | never | Luis | 1 | Alvarez$$%! | 10 | Cake
423 | 1950/07/08 | gonna | André | 2 | Ampère | 8 | piza
551 | 1990/07/09 | give | NiELS | 3 | Böhr//((%% | 8 | pizza
521 | 1954/07/10 | you | PAUL | 4 | dirac$ | 8 | pizza
634 | 1990/07/11 | up | Albert | 5 | Einstein | 8 | pizza
672 | 1930/08/12 | never | Galileo | 6 |              GALiLEI | 5 | arepa
323 | 1970/07/13 | gonna | CaRL | 7 | Ga%%%uss | 3 | taco
624 | 1950/07/14 | let | David | 8 | H$$$ilbert | 3 | taaaccoo
735 | 1920/04/22 | you | Johannes | 9 | KEPLER | 3 | taco
875 | 1923/03/12 | down | JaMES | 10 | M$$ax%%well | 3 | taco
992 | 1999/02/15 | never | Isaac | 11 | Newton | 9 | pasta
234 | 1993/12/08 | gonna | Emmy%% | 12 | Nöether$ | 9 | pasta
111 | 1994/01/04 | run | Max!!! | 13 | Planck!!! | 4 | hamburguer
553 | 1997/06/27 | around | Fred | 14 | Hoy&&&le | 8 | pizzza
116 | 1956/11/30 | and | (((   Heinrich ))))) | 15 | Hertz | 8 | pizza
886 | 1958/03/26 | desert | William | 16 | Gilbert### | 2 | BEER
912 | 2000/03/22 | you | Marie | 17 | CURIE | 1 | Rice
812 | 1899/01/01 | # | Arthur | 18 | COM%%%pton | 5 | 110790
467 | 1921/05/03 | # | JAMES | 19 | Chadwick | 10 | null
In [25]:
df_parquet =op.load.parquet("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.parquet")
df_parquet.table()
C:\Users\ARGENI~1\AppData\Local\Temp\tmpvhxr0yyy.parquet
Viewing 19 of 19 rows / 8 columns
1 partition(s)
(all columns nullable)
id (int) | firstName (string) | lastName (string) | billingId (int) | product (string) | price (int) | birth (string) | dummyCol (string)
1 | Luis | Alvarez$$%! | 123 | Cake | 10 | 1980/07/07 | never
2 | André | Ampère | 423 | piza | 8 | 1950/07/08 | gonna
3 | NiELS | Böhr//((%% | 551 | pizza | 8 | 1990/07/09 | give
4 | PAUL | dirac$ | 521 | pizza | 8 | 1954/07/10 | you
5 | Albert | Einstein | 634 | pizza | 8 | 1990/07/11 | up
6 | Galileo |              GALiLEI | 672 | arepa | 5 | 1930/08/12 | never
7 | CaRL | Ga%%%uss | 323 | taco | 3 | 1970/07/13 | gonna
8 | David | H$$$ilbert | 624 | taaaccoo | 3 | 1950/07/14 | let
9 | Johannes | KEPLER | 735 | taco | 3 | 1920/04/22 | you
10 | JaMES | M$$ax%%well | 875 | taco | 3 | 1923/03/12 | down
11 | Isaac | Newton | 992 | pasta | 9 | 1999/02/15 | never
12 | Emmy%% | Nöether$ | 234 | pasta | 9 | 1993/12/08 | gonna
13 | Max!!! | Planck!!! | 111 | hamburguer | 4 | 1994/01/04 | run
14 | Fred | Hoy&&&le | 553 | pizzza | 8 | 1997/06/27 | around
15 | (((   Heinrich ))))) | Hertz | 116 | pizza | 8 | 1956/11/30 | and
16 | William | Gilbert### | 886 | BEER | 2 | 1958/03/26 | desert
17 | Marie | CURIE | 912 | Rice | 1 | 2000/03/22 | you
18 | Arthur | COM%%%pton | 812 | 110790 | 5 | 1899/01/01 | #
19 | JAMES | Chadwick | 467 | null | 10 | 1921/05/03 | #
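The temporary path printed above suggests that remote files are first downloaded to a local file before Spark reads them. If you prefer to control that step yourself, you can download explicitly and load the local copy; a minimal sketch using the standard library (the local filename is just an example):
In [ ]:
# Download a remote file explicitly, then load the local copy.
import urllib.request
local_path, _ = urllib.request.urlretrieve(
    "https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv",
    "foo_downloaded.csv")
op.load.csv(local_path).table(5)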
df_avro =op.load.avro("https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.avro", "avro") df_avro.table()
In [ ]:
df_csv.save.csv("test.csv")
In [ ]:
df_csv.save.json("test.json")
In [ ]:
df_csv.save.parquet("test.parquet")
df_csv.save.avro("test.avro")