In [1]:
# import classes
from pyspark import SparkConf, SparkContext
# configuration with all defaults
conf = SparkConf()
# spark context based on our configuration
sc = SparkContext(conf=conf)
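If the defaults are not what you want, SparkConf also takes explicit settings before the context is created. A minimal sketch, assuming a local master and an arbitrary application name (an alternative to the cell above, since only one SparkContext can be active at a time):

# minimal alternative setup; the app name is an arbitrary placeholder
conf = SparkConf().setMaster('local[*]').setAppName('variant-exploration')
sc = SparkContext(conf=conf)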
In [26]:
rdd = sc.textFile('files/*')
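sc.textFile accepts glob patterns, so every file under files/ is read line by line into a single RDD. To see how Spark split the input, the RDD exposes its partition count:

# number of partitions Spark created for the matched files
rdd.getNumPartitions()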
In [27]:
rdd.count()
Out[27]:
In [28]:
def split_line(line):
    # split the tab-separated record into its columns
    pieces = line.strip().split('\t')
    # the eighth column holds semicolon-separated key=value annotations
    annotations = pieces[7].split(';')
    # return the first five columns plus the first annotation as a [key, value] pair
    return pieces[0], pieces[1], pieces[2], pieces[3], pieces[4], annotations[0].split('=')
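split_line expects a tab-separated record with at least eight columns, where the eighth column holds semicolon-separated key=value annotations (a VCF-like layout). A quick sanity check on a hypothetical line, purely for illustration and not taken from the actual files:

# hypothetical input line
sample = 'chr1\t12345\t.\tA\tG\t99\tPASS\tDP=14;AF=0.5'
split_line(sample)
# -> ('chr1', '12345', '.', 'A', 'G', ['DP', '14'])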
In [30]:
cached = rdd \
    .filter(lambda x: not x.startswith('#')) \
    .map(split_line) \
    .cache()
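Nothing has been computed yet: filter, map, and cache are all lazy, so the parsed records are only materialized (and kept in memory) once an action touches them. A sketch for forcing that up front:

# triggers the pipeline once and populates the cache
cached.count()
# shows the storage level that cache() requested
cached.getStorageLevel()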
In [31]:
cached
Out[31]:
In [32]:
cached \
    .filter(lambda obj: int(obj[5][1]) > 2) \
    .take(5)
Out[32]:
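Here obj[5] is the [key, value] pair from the first annotation, so the filter keeps records whose first annotation has a numeric value greater than 2; it raises a ValueError when that value is not an integer. A more defensive sketch, assuming non-numeric or missing values should simply be dropped:

def annotation_value_above(obj, threshold=2):
    # obj[5] is the [key, value] pair produced by split_line
    try:
        return int(obj[5][1]) > threshold
    except (ValueError, IndexError):
        # drop records whose first annotation has no numeric value
        return False

cached.filter(annotation_value_above).take(5)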