In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to library code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the parent directory to the module search path -- presumably so the
# in-repo `optimus` package is importable from this notebook's folder (verify against repo layout).
sys.path.append("..")

In [3]:
from optimus import Optimus

Start Optimus locally or on a cluster


In [4]:
# Build the Optimus entry point against a local Spark master.
# With verbose=True the startup checks and Spark session info are logged below.
op = Optimus(
    master="local",
    app_name="optimus",
    verbose=True,
)


INFO:optimus:Just check that Spark and all necessary environments vars are present...
INFO:optimus:-----
INFO:optimus:SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
INFO:optimus:HADOOP_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
INFO:optimus:PYSPARK_PYTHON=C:\Users\argenisleon\Anaconda3\python.exe
INFO:optimus:PYSPARK_DRIVER_PYTHON=jupyter
INFO:optimus:PYSPARK_SUBMIT_ARGS=--conf "spark.sql.catalogImplementation=hive" pyspark-shell
INFO:optimus:JAVA_HOME=C:\java
INFO:optimus:Pyarrow Installed
INFO:optimus:-----
INFO:optimus:Starting or getting SparkSession and SparkContext...
INFO:optimus:Spark Version:2.3.1
INFO:optimus:
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              
INFO:optimus:Transform and Roll out...
INFO:optimus:Optimus successfully imported. Have fun :).

Create a Spark dataframe from a CSV file


In [28]:
# Read the meteorite landings CSV into a dataframe, then repartition it
# with Optimus' h_repartition() helper.
df = (
    op.load
    .csv("data/Meteorite_Landings.csv")
    .h_repartition()
)

In [18]:
# Show a preview of the dataframe: column names and types, a sample of rows,
# plus the total row/column count and the number of partitions.
df.table()


Viewing 100 of 45716 rows / 10 columns
32 partition(s)
name
1 (string)
nullable
id
2 (int)
nullable
nametype
3 (string)
nullable
recclass
4 (string)
nullable
mass (g)
5 (double)
nullable
fall
6 (string)
nullable
year
7 (string)
nullable
reclat
8 (double)
nullable
reclong
9 (double)
nullable
GeoLocation
10 (string)
nullable
Acfer⸱232
240
Valid
H5
725.0
Found
01/01/1991⸱12:00:00⸱AM
27.73944
4.32833
(27.739440,⸱4.328330)
Elephant⸱Moraine⸱90232
8641
Valid
L6
16.9
Found
01/01/1990⸱12:00:00⸱AM
-76.28795
156.46841
(-76.287950,⸱156.468410)
Grove⸱Mountains⸱020090
30681
Valid
Martian⸱(shergottite)
7.5
Found
01/01/2003⸱12:00:00⸱AM
-72.99944
75.26111
(-72.999440,⸱75.261110)
Northwest⸱Africa⸱891
31912
Valid
H4
70.8
Found
01/01/2001⸱12:00:00⸱AM
None
None
None
Queen⸱Alexandra⸱Range⸱93098
19187
Valid
H6
1.2
Found
01/01/1993⸱12:00:00⸱AM
-84.5757
162.56524
(-84.575700,⸱162.565240)
Queen⸱Alexandra⸱Range⸱94691
20322
Valid
H6
9.6
Found
01/01/1994⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Meteorite⸱Hills⸱00977
16211
Valid
H5
13.2
Found
01/01/2000⸱12:00:00⸱AM
-79.68333
159.75
(-79.683330,⸱159.750000)
Grove⸱Mountains⸱020114
46531
Valid
L3
1.0
Found
01/01/2003⸱12:00:00⸱AM
-72.98194
75.25167
(-72.981940,⸱75.251670)
Pecora⸱Escarpment⸱91483
18774
Valid
H5
5.5
Found
01/01/1991⸱12:00:00⸱AM
-85.55819
-68.31586
(-85.558190,⸱-68.315860)
Ramlat⸱as⸱Sahmah⸱390
55656
Valid
H3.8-6
0.69
Found
01/01/2010⸱12:00:00⸱AM
20.0949
55.69318
(20.094900,⸱55.693180)
Northwest⸱Africa⸱2847
33321
Valid
H4
283.0
Found
01/01/2004⸱12:00:00⸱AM
None
None
None
Jiddat⸱al⸱Harasis⸱235
35554
Valid
H4
13.08
Found
01/01/2005⸱12:00:00⸱AM
19.97115
56.39807
(19.971150,⸱56.398070)
Lewis⸱Cliff⸱86496
13426
Valid
H5
4.9
Found
01/01/1986⸱12:00:00⸱AM
-84.24519
161.39823
(-84.245190,⸱161.398230)
Jemlapur
12079
Valid
L6
450.0
Fell
01/01/1901⸱12:00:00⸱AM
None
None
None
Dominion⸱Range⸱03289
7687
Valid
H5
85.79
Found
01/01/2003⸱12:00:00⸱AM
None
None
None
Miller⸱Range⸱07439
53215
Valid
CO3
4.4
Found
01/01/2007⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
LaPaz⸱Icefield⸱031346
35928
Valid
L3
5.4
Found
01/01/2003⸱12:00:00⸱AM
None
None
None
Lewis⸱Cliff⸱88617
14355
Valid
L3.5
3.2
Found
01/01/1988⸱12:00:00⸱AM
-84.26961
161.37881
(-84.269610,⸱161.378810)
Ksar⸱Ghilane⸱009
54560
Valid
H6
173.0
Found
01/01/2011⸱12:00:00⸱AM
32.76277
9.86542
(32.762770,⸱9.865420)
Queen⸱Alexandra⸱Range⸱94459
20091
Valid
L6
4.0
Found
01/01/1994⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Northwest⸱Africa⸱2563
33084
Valid
L5
1175.0
Found
01/01/2004⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
Sayh⸱al⸱Uhaymir⸱450
45941
Valid
L4
1713.81
Found
01/01/2006⸱12:00:00⸱AM
20.8825
57.35723
(20.882500,⸱57.357230)
Sayh⸱al⸱Uhaymir⸱447
45938
Valid
LL~3
32.8
Found
01/01/2007⸱12:00:00⸱AM
20.55037
56.6854
(20.550370,⸱56.685400)
Queen⸱Alexandra⸱Range⸱93608
19695
Valid
L5
34.4
Found
01/01/1993⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Grove⸱Mountains⸱054625
48528
Valid
L5
2.59
Found
01/01/2006⸱12:00:00⸱AM
-72.998889
75.187222
(-72.998889,⸱75.187222)
Elephant⸱Moraine⸱90205
8614
Valid
L6
46.0
Found
01/01/1990⸱12:00:00⸱AM
-76.28209
156.49089
(-76.282090,⸱156.490890)
Queen⸱Alexandra⸱Range⸱99337
21787
Valid
LL5
1.78
Found
01/01/1999⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Queen⸱Alexandra⸱Range⸱94260
19897
Valid
L6
2.9
Found
01/01/1994⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Frontier⸱Mountain⸱97040
10769
Valid
H4/5
0.5
Found
01/01/1997⸱12:00:00⸱AM
-72.98833
160.40722
(-72.988330,⸱160.407220)
MacAlpine⸱Hills⸱02544
14850
Valid
LL6
189.09
Found
01/01/2002⸱12:00:00⸱AM
None
None
None
Dar⸱al⸱Gani⸱842
6389
Valid
L6
347.0
Found
01/01/1998⸱12:00:00⸱AM
26.8965
16.57833
(26.896500,⸱16.578330)
Allan⸱Hills⸱84142
744
Valid
L6
78.5
Found
01/01/1984⸱12:00:00⸱AM
-77.01141
156.94615
(-77.011410,⸱156.946150)
Pine⸱Bluffs
18824
Valid
H
2700.0
Found
01/01/1935⸱12:00:00⸱AM
41.18333
-104.06667
(41.183330,⸱-104.066670)
Roberts⸱Massif⸱04254
44658
Valid
L5
54.9
Found
01/01/2004⸱12:00:00⸱AM
None
None
None
Acfer⸱223
231
Valid
L/LL6
1398.0
Found
01/01/1991⸱12:00:00⸱AM
27.54194
3.82778
(27.541940,⸱3.827780)
Queen⸱Alexandra⸱Range⸱99692
22138
Valid
LL5
3.5
Found
01/01/1999⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Sayh⸱al⸱Uhaymir⸱096
23288
Valid
H4/5
158.0
Found
01/01/2000⸱12:00:00⸱AM
21.10912
56.9335
(21.109120,⸱56.933500)
Meteorite⸱Hills⸱01029
16262
Valid
LL5
218.3
Found
01/01/2001⸱12:00:00⸱AM
-79.68333
159.75
(-79.683330,⸱159.750000)
Grove⸱Mountains⸱021785
46565
Valid
L5
71.74
Found
01/01/2003⸱12:00:00⸱AM
-72.775
75.33889
(-72.775000,⸱75.338890)
Dar⸱al⸱Gani⸱945
6485
Valid
Eucrite
300.0
Found
01/01/2000⸱12:00:00⸱AM
27.18867
16.37583
(27.188670,⸱16.375830)
Northwest⸱Africa⸱3047
31178
Valid
H6
77.6
Found
01/01/2003⸱12:00:00⸱AM
None
None
None
Grove⸱Mountains⸱021827
49890
Valid
H5
0.63
Found
01/01/2003⸱12:00:00⸱AM
-72.77722
75.33917
(-72.777220,⸱75.339170)
Elephant⸱Moraine⸱92156
9557
Valid
CR2
0.4
Found
01/01/1992⸱12:00:00⸱AM
-76.01295
155.83949
(-76.012950,⸱155.839490)
Queen⸱Alexandra⸱Range⸱97660
21116
Valid
LL6
13.9
Found
01/01/1997⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
MacAlpine⸱Hills⸱04930
35045
Valid
H3
0.88
Found
01/01/2004⸱12:00:00⸱AM
None
None
None
Miller⸱Range⸱090982
55245
Valid
CK6
1.1
Found
01/01/2009⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
Roosevelt⸱County⸱054
22709
Valid
H5
361.5
Found
01/01/1971⸱12:00:00⸱AM
34.3
-103.43333
(34.300000,⸱-103.433330)
Queen⸱Alexandra⸱Range⸱94684
20315
Valid
L5
4.4
Found
01/01/1994⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Northwest⸱Africa⸱4809
45733
Valid
LL4
1780.0
Found
01/01/2006⸱12:00:00⸱AM
None
None
None
Grove⸱Mountains⸱021871
49897
Valid
H5
1.5
Found
01/01/2003⸱12:00:00⸱AM
-72.7753
75.3381
(-72.775300,⸱75.338100)
Queen⸱Alexandra⸱Range⸱99109
21561
Valid
LL5
1.9
Found
01/01/1999⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Grove⸱Mountains⸱021679
46787
Valid
L5
27.55
Found
01/01/2003⸱12:00:00⸱AM
-72.77722
75.33889
(-72.777220,⸱75.338890)
Queen⸱Alexandra⸱Range⸱97141
20599
Valid
LL5
20.9
Found
01/01/1997⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Northwest⸱Africa⸱4608
45554
Valid
L6
833.5
Found
01/01/2006⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
Grove⸱Mountains⸱021561
49834
Valid
H4
3.2
Found
01/01/2003⸱12:00:00⸱AM
-72.93611
75.30556
(-72.936110,⸱75.305560)
Wiluna
24281
Valid
H5
150000.0
Fell
01/01/1967⸱12:00:00⸱AM
-26.59278
120.32833
(-26.592780,⸱120.328330)
Grove⸱Mountains⸱020162
46717
Valid
H3
2.31
Found
01/01/2003⸱12:00:00⸱AM
-72.97611
75.26694
(-72.976110,⸱75.266940)
Asuka⸱880900
3609
Valid
H4
22.0
Found
01/01/1988⸱12:00:00⸱AM
-72.0
26.0
(-72.000000,⸱26.000000)
Reckling⸱Peak⸱86705
22408
Valid
H5
68.5
Found
01/01/1986⸱12:00:00⸱AM
-76.23773
158.67436
(-76.237730,⸱158.674360)
Dar⸱al⸱Gani⸱885
6432
Valid
H6
102.0
Found
01/01/2000⸱12:00:00⸱AM
26.1
16.06667
(26.100000,⸱16.066670)
Lewis⸱Cliff⸱88698
14435
Valid
Iron
0.8
Found
01/01/1988⸱12:00:00⸱AM
-84.27807
161.39748
(-84.278070,⸱161.397480)
Miller⸱Range⸱090729
54339
Valid
L6
0.6
Found
01/01/2009⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
Grosvenor⸱Mountains⸱95589
11317
Valid
L6
127.8
Found
01/01/1995⸱12:00:00⸱AM
-85.66667
175.0
(-85.666670,⸱175.000000)
Reckling⸱Peak⸱92411
22420
Valid
H5
21.3
Found
01/01/1992⸱12:00:00⸱AM
-76.23116
158.32857
(-76.231160,⸱158.328570)
Northwest⸱Africa⸱847
17873
Valid
H3
1851.0
Found
01/01/2001⸱12:00:00⸱AM
None
None
None
Daraj⸱115
6574
Valid
H6
417.0
Found
01/01/1986⸱12:00:00⸱AM
29.37806
11.88667
(29.378060,⸱11.886670)
Queen⸱Alexandra⸱Range⸱99201
21652
Valid
LL5
1.1
Found
01/01/1999⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Dar⸱al⸱Gani⸱830
6377
Valid
Ureilite
53.1
Found
01/01/2000⸱12:00:00⸱AM
27.04833
16.38433
(27.048330,⸱16.384330)
Katagum
35465
Valid
L6
1500.0
Fell
01/01/1999⸱12:00:00⸱AM
11.33333
10.08333
(11.333330,⸱10.083330)
Jiddat⸱al⸱Harasis⸱334
45858
Valid
L6
261.65
Found
01/01/2006⸱12:00:00⸱AM
19.72743
55.72533
(19.727430,⸱55.725330)
Jepara
53840
Valid
Pallasite,⸱PMG
499500.0
Found
01/01/2008⸱12:00:00⸱AM
-6.6
110.73333
(-6.600000,⸱110.733330)
Lewis⸱Cliff⸱88149
13902
Valid
H5
1.5
Found
01/01/1988⸱12:00:00⸱AM
-84.27543
161.40962
(-84.275430,⸱161.409620)
Northwest⸱Africa⸱1484
17261
Valid
H5
16.4
Found
01/01/2001⸱12:00:00⸱AM
None
None
None
Asuka⸱881402
4111
Valid
L3.8
74.11
Found
01/01/1988⸱12:00:00⸱AM
-72.0
26.0
(-72.000000,⸱26.000000)
Lewis⸱Cliff⸱86120
13059
Valid
H6
32.9
Found
01/01/1986⸱12:00:00⸱AM
-84.25934
161.35931
(-84.259340,⸱161.359310)
Meteorite⸱Hills⸱00624
15858
Valid
L5
103.91
Found
01/01/2000⸱12:00:00⸱AM
-79.68333
155.75
(-79.683330,⸱155.750000)
Reid⸱002
22557
Valid
L6
66.5
Found
01/01/1974⸱12:00:00⸱AM
-30.06667
128.96667
(-30.066670,⸱128.966670)
Elephant⸱Moraine⸱83263
7903
Valid
H6
10.2
Found
01/01/1983⸱12:00:00⸱AM
-76.33722
157.11417
(-76.337220,⸱157.114170)
Jiddat⸱al⸱Harasis⸱226
35545
Valid
Mesosiderite
308.67
Found
01/01/2005⸱12:00:00⸱AM
19.97493
56.4269
(19.974930,⸱56.426900)
Miller⸱Range⸱090732
54342
Valid
L5
7.1
Found
01/01/2009⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
Shangdu
23523
Valid
Iron,⸱IIIAB
247000.0
Found
01/01/1957⸱12:00:00⸱AM
42.5
114.0
(42.500000,⸱114.000000)
Northwest⸱Africa⸱2599
33120
Valid
LL5
234.5
Found
None
0.0
0.0
(0.000000,⸱0.000000)
Hammadah⸱al⸱Hamra⸱089
11572
Valid
H5
1379.0
Found
01/01/1995⸱12:00:00⸱AM
28.46717
13.317
(28.467170,⸱13.317000)
Graves⸱Nunataks⸱98153
11143
Valid
H4
35.1
Found
01/01/1998⸱12:00:00⸱AM
-86.71667
-141.5
(-86.716670,⸱-141.500000)
Queen⸱Alexandra⸱Range⸱99659
22105
Valid
LL5
7.3
Found
01/01/1999⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Cumulus⸱Hills⸱04069
32525
Valid
Pallasite
44700.0
Found
01/01/2003⸱12:00:00⸱AM
None
None
None
Grove⸱Mountains⸱054613
50645
Valid
L5
2.14
Found
01/01/2006⸱12:00:00⸱AM
-72.99889
75.18722
(-72.998890,⸱75.187220)
Elephant⸱Moraine⸱92074
9476
Valid
L6
24.0
Found
01/01/1992⸱12:00:00⸱AM
-76.05375
156.2343
(-76.053750,⸱156.234300)
Queen⸱Alexandra⸱Range⸱97312
20770
Valid
LL5
4.2
Found
01/01/1997⸱12:00:00⸱AM
-84.0
168.0
(-84.000000,⸱168.000000)
Miller⸱Range⸱05093
44489
Valid
LL6
6.9
Found
01/01/2005⸱12:00:00⸱AM
None
None
None
Roosevelt⸱County⸱071
22726
Valid
L4
5.3
Found
01/01/1989⸱12:00:00⸱AM
34.1
-103.48333
(34.100000,⸱-103.483330)
Hedeskoga
11869
Valid
H5
3500.0
Fell
01/01/1922⸱12:00:00⸱AM
55.46667
13.78333
(55.466670,⸱13.783330)
Meteorite⸱Hills⸱01229
16461
Valid
H6
8.5
Found
01/01/2001⸱12:00:00⸱AM
-79.68333
159.75
(-79.683330,⸱159.750000)
Asuka⸱881313
4022
Valid
LL3.8
19.26
Found
01/01/1988⸱12:00:00⸱AM
-72.0
26.0
(-72.000000,⸱26.000000)
Northwest⸱Africa⸱4033
34305
Valid
L/LL5
1042.0
Found
01/01/2004⸱12:00:00⸱AM
None
None
None
Northwest⸱Africa⸱7299
55713
Valid
Brachinite
20.0
Found
01/01/2011⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
Jiddat⸱al⸱Harasis⸱685
56238
Valid
H4
1.51
Found
01/01/2011⸱12:00:00⸱AM
19.4996
55.5247
(19.499600,⸱55.524700)
Patuxent⸱Range⸱10277
57079
Valid
H6
1.1
Found
01/01/2010⸱12:00:00⸱AM
0.0
0.0
(0.000000,⸱0.000000)
Dhofar⸱1441
48543
Valid
Achondrite-ung
267.8
Found
01/01/2003⸱12:00:00⸱AM
18.436
54.48383
(18.436000,⸱54.483830)
Northwest⸱Africa⸱2860
33334
Valid
H4
878.0
Found
01/01/2004⸱12:00:00⸱AM
None
None
None
Viewing 100 of 45716 rows / 10 columns
32 partition(s)

In [41]:
# Histogram of the "mass (g)" column (values are bucketized first, per the log output).
df.plot.hist("mass (g)")


INFO:optimus:bucketizer() executed in 0.09 sec
INFO:optimus:hist() executed in 1.58 sec
INFO:optimus:hist() executed in 4.1 sec

In [42]:
# Frequency plot of the "mass (g)" column -- presumably counts of the most common values; confirm in Optimus docs.
df.plot.frequency("mass (g)")



In [29]:
# Box plot of the "mass (g)" column (percentiles are computed first, per the log output).
df.plot.box("mass (g)")


INFO:optimus:percentile() executed in 0.62 sec
<Figure size 864x360 with 0 Axes>

In [30]:
# Load a second dataset straight from a URL; Optimus downloads train.csv
# and builds the dataframe from it (see the log output below).
df1 = op.load.csv(
    "https://raw.githubusercontent.com/dvgodoy/handyspark/master/tests/rawdata/train.csv"
)


INFO:optimus:Downloading train.csv from https://raw.githubusercontent.com/dvgodoy/handyspark/master/tests/rawdata/train.csv
INFO:optimus:Downloaded 61194 bytes
INFO:optimus:Creating DataFrame for train.csv. Please wait...
INFO:optimus:Successfully created DataFrame for 'train.csv'

In [153]:
# Scatter plot of "Age" against "Fare". The log shows one bucketizer pass per column;
# buckets=30 presumably sets the number of bins per axis -- confirm in Optimus docs.
df1.plot.scatter(["Age", "Fare"], buckets=30)


INFO:optimus:bucketizer() executed in 0.16 sec
INFO:optimus:bucketizer() executed in 0.15 sec

In [155]:
# Box plot of the "Age" column.
df1.plot.box("Age")


INFO:optimus:percentile() executed in 0.08 sec
<Figure size 864x360 with 0 Axes>

In [156]:
# Box plot of the "Fare" column.
df1.plot.box("Fare")


INFO:optimus:percentile() executed in 0.06 sec
<Figure size 864x360 with 0 Axes>

In [ ]: