You can load data from cloud storage such as Object Storage.
Collect your Object Storage connection information:
- auth_url, e.g. https://identity.open.softlayer.com
- projectId
- region, e.g. dallas
- userId
- password

Collect your data set information:
- source container name, e.g. my_sample_data
- file name, e.g. my_data_set.csv
Import PixieDust and enable the Spark Job monitor
In [3]:
# Import PixieDust (IBM's notebook helper library) and enable its Spark
# job monitor so Spark job progress is visualized inside the notebook
# while the load cell below runs.
import pixiedust
pixiedust.enableJobMonitor()
In [4]:
# @hidden_cell
# Object Storage connection settings.
#
# Prefer supplying secrets via environment variables (OS_AUTH_URL,
# OS_USERID, OS_PASSWORD, OS_PROJECTID, OS_REGION, OS_SOURCE_CONTAINER,
# OS_FILENAME) rather than editing literals here: hardcoded credentials
# end up saved in the notebook file. The original placeholder values are
# kept as defaults, so behavior is unchanged when no variables are set —
# you may still fill in the placeholders directly if you must.
import os

OS_AUTH_URL = os.environ.get('OS_AUTH_URL', 'https://identity.open.softlayer.com')
OS_USERID = os.environ.get('OS_USERID', '...')
OS_PASSWORD = os.environ.get('OS_PASSWORD', '...')
OS_PROJECTID = os.environ.get('OS_PROJECTID', '...')
OS_REGION = os.environ.get('OS_REGION', '...')
OS_SOURCE_CONTAINER = os.environ.get('OS_SOURCE_CONTAINER', '...')
OS_FILENAME = os.environ.get('OS_FILENAME', '....csv')
In [5]:
# no changes are required to this cell
from ingest import Connectors
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

# Short alias for the Bluemix Object Storage connector constants so the
# options mapping below stays readable.
_bmos = Connectors.BluemixObjectStorage

# Connector options: where the service lives, who is asking, and which
# container/file to read. SOURCE_INFER_SCHEMA='1' asks the connector to
# infer column types from the CSV contents.
objectstoreloadOptions = {
    _bmos.AUTH_URL: OS_AUTH_URL,
    _bmos.USERID: OS_USERID,
    _bmos.PASSWORD: OS_PASSWORD,
    _bmos.PROJECTID: OS_PROJECTID,
    _bmos.REGION: OS_REGION,
    _bmos.SOURCE_CONTAINER: OS_SOURCE_CONTAINER,
    _bmos.SOURCE_FILE_NAME: OS_FILENAME,
    _bmos.SOURCE_INFER_SCHEMA: '1',
}

# Read the file from Object Storage into a Spark DataFrame via the
# IBM discover data source.
os_data = (
    sqlContext.read
    .format("com.ibm.spark.discover")
    .options(**objectstoreloadOptions)
    .load()
)
In [6]:
display(os_data)