In [1]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)   # SQL entry point wrapping the existing SparkContext
In [2]:
tsv = sc.textFile('samples/1mil.tsv')
header = tsv.first()          # the first line holds the column names
fields = header.split('\t')
In [3]:
rdd = tsv.filter(
    lambda line: line != header                      # drop the header row
).map(
    lambda row: dict(zip(fields, row.split('\t')))   # {column: value} per record
)
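Later Spark 1.x releases deprecate inferring a schema from an RDD of plain dicts in favour of Row objects. A minimal sketch of the equivalent Row-based construction, assuming the same Spark 1.x API as above (row_rdd is an illustrative name):

from pyspark.sql import Row

# Row(**kwargs) turns each keyword into a column, so the dict keys
# become the column names, just as with the dict-based RDD above
row_rdd = tsv.filter(lambda line: line != header) \
             .map(lambda line: Row(**dict(zip(fields, line.split('\t')))))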
In [4]:
srdd = sqlc.inferSchema(rdd)    # build a SchemaRDD; column names come from the dict keys
srdd.registerAsTable('flows')   # make it queryable from SQL as 'flows'
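In Spark 1.1 and later, registerAsTable is deprecated in favour of a renamed method; if the call above emits a deprecation warning, the newer spelling is:

srdd.registerTempTable('flows')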
In [5]:
sqlc.sql('select count(*) from flows').collect()
Out[5]:
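With the table registered, any SQL the context supports can be run the same way. For example, a quick look at a few raw rows (the exact output depends on the columns in the file):

sqlc.sql('select * from flows limit 5').collect()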