NOTE: This notebook downloads files from WebHDFS to the Spark local file system. It does not clean up those files for you; a cleanup cell is sketched at the end of the notebook.
Credentials - keep these secret!
In [ ]:
# Cluster number, e.g. 100000
cluster = ''
# Cluster username
username = ''
# Cluster password
password = ''
# File path in HDFS
webhdfs_filepath = 'yourpath/yourfile.txt'
# Where to save the file on the Spark service file system
local_filepath = 'yourfile.txt'
Build the WebHDFS host name from your cluster number
In [ ]:
host = 'ehaasp-{0}-mastermanager.bi.services.bluemix.net'.format(cluster)
Code to connect to BigInsights on Cloud via WebHDFS - don't change this
In [ ]:
import requests
import datetime

print('READ FILE START: {0}'.format(datetime.datetime.now()))

chunk_size = 200000000  # Read in 200 MB chunks

# WebHDFS OPEN request for the file, via the cluster's secure gateway on port 8443
url = "https://{0}:8443/gateway/default/webhdfs/v1/{1}?op=OPEN".format(host, webhdfs_filepath)

# Note: SSL certificate verification is disabled (verify=False)
r = requests.get(url,
                 auth=(username, password),
                 verify=False,
                 allow_redirects=True,
                 stream=True)

chunk_num = 1
with open(local_filepath, 'wb') as f:
    for chunk in r.iter_content(chunk_size):
        if chunk:  # filter out keep-alive new chunks
            print('{0} writing chunk {1}'.format(datetime.datetime.now(), chunk_num))
            f.write(chunk)
            chunk_num += 1

print('READ FILE END: {0}'.format(datetime.datetime.now()))
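Because verify=False triggers an InsecureRequestWarning on every request, you may want to silence it (this takes effect the next time the download cell runs), and it is worth sanity-checking the download before handing the file to Spark. A minimal sketch, assuming the r response object from the cell above is still in scope:
In [ ]:
import os
import urllib3

# Optional: silence the InsecureRequestWarning raised because verify=False.
# Run this before re-running the download cell for it to take effect there.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Sanity check: the WebHDFS call returned OK and the local copy is non-empty
print('HTTP status: {0}'.format(r.status_code))
print('Local file size: {0} bytes'.format(os.path.getsize(local_filepath)))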
Read the downloaded file into a Spark DataFrame using spark-csv
In [ ]:
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

# Pipe-delimited file with no header row; inferSchema makes an extra pass to detect column types
df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='false', inferSchema='true', delimiter='|') \
    .load(local_filepath)

df.cache()
df.show()
In [ ]:
df.count()
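inferSchema='true' costs an extra pass over the data. For large files you can skip it by declaring the schema up front; a hypothetical sketch (the column names and types below are placeholders for your file's actual layout):
In [ ]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Hypothetical schema -- replace the field names and types with your own
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('value', StringType(), True)
])

df = sqlContext.read.format('com.databricks.spark.csv') \
    .options(header='false', delimiter='|') \
    .schema(schema) \
    .load(local_filepath)
Because the types are declared, Spark can start reading immediately instead of scanning the file twice.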
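Finally, as noted at the top of the notebook, the downloaded file is not cleaned up automatically. A minimal sketch to remove the local copy when you are done with the DataFrame:
In [ ]:
import os

# Delete the local copy downloaded from WebHDFS
if os.path.exists(local_filepath):
    os.remove(local_filepath)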