notebook.community

Edit and run



In [1]:

    
# import packages
import h2o, os.path
import sparkling_water as sw
from pyspark import SparkConf, SparkContext



In [2]:

    
# Build the Spark configuration: a single local master, an application name,
# and the executor memory setting.
conf = (SparkConf()
        .setMaster("local")
        .setAppName("My app")
        .set("spark.executor.memory", "512mb"))

# Only one SparkContext may be active at a time, so calling
# SparkContext(conf=conf) again when this cell is re-run in a live kernel
# raises an error. getOrCreate reuses the running context when there is one
# and otherwise creates it from `conf`, which keeps the cell idempotent.
sc = SparkContext.getOrCreate(conf=conf)



In [3]:

    
# Attach this Python session to an H2O cluster that is already running,
# identified by the node address and port below.
ip, port = "192.168.0.14", 54323
h2o.init(ip=ip, port=port)









    



Warning: Version mismatch. H2O is version 3.1.0.3118, but the python package is version 3.0.1.4.






    




H2O cluster uptime: 
3 hours 18 minutes 42 seconds 8 milliseconds 
H2O cluster version: 
3.1.0.3118
H2O cluster name: 
sparkling-water-kuba
H2O cluster total nodes: 
1
H2O cluster total memory: 
466.5 MB
H2O cluster total cores: 
4
H2O cluster allowed cores: 
4
H2O cluster healthy: 
True
H2O Connection ip: 
192.168.0.14
H2O Connection port: 
54323



In [4]:

    
# Build a small in-memory sample (the integers 1 through 5) and hand it to
# the Spark context to turn it into an RDD.
data = list(range(1, 6))
distData = sc.parallelize(data)



In [5]:

    
# Upload the RDD into the connected H2O cluster as a frame. The recorded cell
# output reports the uploaded frame as having 5 rows and 1 column.
# NOTE(review): upload_frame_from_rdd lives in sparkling_water, outside this
# notebook; how it converts RDD elements to columns is not visible here.
h2o_frame = sw.Utils.upload_frame_from_rdd(distData)









    



Parse Progress: [##################################################] 100%
Uploaded pyf2fba1d8-ccd5-4d31-9981-67d355a4b10d into cluster with 5 rows and 1 cols



In [6]:

    
# Show H2O's summary of the uploaded frame. In the recorded output this covers
# the frame dimensions, chunk compression, distribution across nodes, and
# per-column statistics.
h2o_frame.describe()









    



Rows: 5 Cols: 1

Chunk compression summary:







    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C1N
1-Byte Integers (w/o NAs)
1
100.0
     73  B
100.0






    



Frame distribution summary:







    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
192.168.0.14:54321
     73  B
5.0
1.0
1.0
mean
     73  B
5.0
1.0
1.0
min
     73  B
5.0
1.0
1.0
max
     73  B
5.0
1.0
1.0
stddev
      0  B
0.0
0.0
0.0
total
     73  B
5.0
1.0
1.0






    



Column-by-Column Summary:







    





C1
type
int
mins
1.0
maxs
5.0
sigma
1.58113883008
zero_count
0
missing_count
0



In [7]:

    
# Download the dataset
# Write the H2O frame to a file named "downloaded_dataset" in the current
# user's home directory.
home = os.path.expanduser("~")
# os.path.join adds a separator only when one is needed, so a home directory
# that already ends with a separator does not produce a doubled one.
filename = os.path.join(home, "downloaded_dataset")
h2o.download_csv(h2o_frame, filename)



In [ ]:

H2O cluster uptime:	3 hours 18 minutes 42 seconds 8 milliseconds
H2O cluster version:	3.1.0.3118
H2O cluster name:	sparkling-water-kuba
H2O cluster total nodes:	1
H2O cluster total memory:	466.5 MB
H2O cluster total cores:	4
H2O cluster allowed cores:	4
H2O cluster healthy:	True
H2O Connection ip:	192.168.0.14
H2O Connection port:	54323

chunk_type	chunk_name	count	count_percentage	size	size_percentage
C1N	1-Byte Integers (w/o NAs)	1	100.0	73 B	100.0

	size	number_of_rows	number_of_chunks_per_column	number_of_chunks
192.168.0.14:54321	73 B	5.0	1.0	1.0
mean	73 B	5.0	1.0	1.0
min	73 B	5.0	1.0	1.0
max	73 B	5.0	1.0	1.0
stddev	0 B	0.0	0.0	0.0
total	73 B	5.0	1.0	1.0

	C1
type	int
mins	1.0
maxs	5.0
sigma	1.58113883008
zero_count	0
missing_count	0