""" This is done at a time where Spark did not support csv parsing 'in single line' I m sure it will come soon... """
""" === e.g. the csv file looks like this:=== field1, field2, time 5768, 49.4,'2014-12-19 04:15:00+01', 1039, 26.1, 2014-12-18 14:45:00+01' ... """
IPYTHON_OPTS="notebook --profile=pysparknb" /Users/charil/.../bin/pyspark --jars /Users/chari.../spark-examples-1.3.1-hadoop2.4.0.jar
In [5]:
from pyspark.sql import SQLContext, Row
# Wrap the notebook-provided SparkContext (`sc`) so we can create DataFrames
# from plain RDDs below.
sqlContext = SQLContext(sc)
In [ ]:
# Load the raw CSV file as an RDD of text lines.
data = sc.textFile('tableau_grid_tile_refactored.csv')
# Capture the first line, then keep only the lines that differ from it —
# this strips the header record.
header = data.take(1)[0]
rows = data.filter(lambda ln: ln != header)
# Split every remaining line into its fields.
# NOTE(review): the sample above looks comma-separated, yet we split on '|' —
# verify the delimiter against the actual file.
row_parts = rows.map(lambda ln: ln.split("|"))
# Attach column names via Row; values stay as strings (types can be
# inferred by Spark, or declared here if desired).
data_rdd = row_parts.map(lambda cols: Row(area_id=cols[0], cnt=cols[1], start_time=str(cols[2])))
# Promote the RDD of Rows to a DataFrame and preview two records.
df = sqlContext.createDataFrame(data_rdd)
df.show(2)
area_id cnt start_time
5768 49.64 2014-12-19 04:15:...
1039 266.1 2014-12-18 14:45:...