In [1]:
# import packages
import h2o, os.path
import sparkling_water as sw
from pyspark import SparkConf, SparkContext

In [2]:
# Create the Spark context used by the rest of the notebook.
# The executor memory is written as "512m": Spark documents size values with
# single-letter unit suffixes (e.g. 512m, 2g), and the two-letter "mb"
# spelling is not guaranteed to be accepted by every Spark release.
conf = (SparkConf()
        .setMaster("local")
        .setAppName("My app")
        .set("spark.executor.memory", "512m"))
sc = SparkContext(conf=conf)

In [3]:
# Attach to an H2O cluster that is already running at the address below.
ip, port = "192.168.0.14", 54323
h2o.init(ip=ip, port=port)


Warning: Version mismatch. H2O is version 3.1.0.3118, but the python package is version 3.0.1.4.
H2O cluster uptime: 3 hours 18 minutes 42 seconds 8 milliseconds
H2O cluster version: 3.1.0.3118
H2O cluster name: sparkling-water-kuba
H2O cluster total nodes: 1
H2O cluster total memory: 466.5 MB
H2O cluster total cores: 4
H2O cluster allowed cores: 4
H2O cluster healthy: True
H2O Connection ip: 192.168.0.14
H2O Connection port: 54323

In [4]:
# Build a small RDD from an in-memory list holding the integers 1 through 5.
data = list(range(1, 6))
distData = sc.parallelize(data)

In [5]:
# Upload the RDD to the connected H2O cluster; the result is kept in
# h2o_frame so the following cells can inspect and download it.
h2o_frame = sw.Utils.upload_frame_from_rdd(distData)


Parse Progress: [##################################################] 100%
Uploaded pyf2fba1d8-ccd5-4d31-9981-67d355a4b10d into cluster with 5 rows and 1 cols

In [6]:
# Print a summary of the uploaded frame (dimensions and per-column summary).
h2o_frame.describe()


Rows: 5 Cols: 1

Chunk compression summary:

chunk_type chunk_name count count_percentage size size_percentage
C1N 1-Byte Integers (w/o NAs) 1 100.0 73 B 100.0
Frame distribution summary:

size number_of_rows number_of_chunks_per_column number_of_chunks
192.168.0.14:54321 73 B 5.0 1.0 1.0
mean 73 B 5.0 1.0 1.0
min 73 B 5.0 1.0 1.0
max 73 B 5.0 1.0 1.0
stddev 0 B 0.0 0.0 0.0
total 73 B 5.0 1.0 1.0
Column-by-Column Summary:

C1
type int
mins 1.0
maxs 5.0
sigma 1.58113883008
zero_count 0
missing_count 0

In [7]:
# Download the frame as a CSV file into the current user's home directory.
# os.path.join is used instead of manual string concatenation so that no
# doubled separator is produced when the home path already ends with one.
home = os.path.expanduser("~")
filename = os.path.join(home, "downloaded_dataset")
h2o.download_csv(h2o_frame, filename)

In [ ]: