In [1]:
# import classes
from pyspark import SparkConf, SparkContext
# configuration with all defaults
conf = SparkConf()
# create a SparkContext from our configuration
sc = SparkContext(conf=conf)

In [26]:
# read every file matching files/* into a single RDD of lines
rdd = sc.textFile('files/*')

In [27]:
# total number of input lines (header lines included)
rdd.count()


Out[27]:
137372
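
It helps to look at a raw line before writing the parser. The cell below was not part of the original run, so its output is omitted; judging by the records shown later, the input is VCF-style, tab-separated variant data.

In [ ]:
# peek at one raw input line; VCF-style files begin with '#' header lines
rdd.first()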

In [28]:
def split_line(line):
    # tab-separated VCF-style record: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, ...
    pieces = line.strip().split('\t')
    # INFO is a ';'-separated list of KEY=VALUE annotations; keep only the first one
    annotations = pieces[7].split(';')
    # return CHROM, POS, ID, REF, ALT and the first annotation as a [key, value] pair
    return pieces[0], pieces[1], pieces[2], pieces[3], pieces[4], annotations[0].split('=')
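
As a quick sanity check, split_line can be applied to a literal record. The sample line below is made up for illustration: the first five fields and the AC=3 annotation mirror a record from the real output, while the QUAL, FILTER, and AN values are placeholders.

In [ ]:
# hypothetical sample record in the same tab-separated layout
sample = 'chr1\t13656\t.\tCAG\tC\t50\tPASS\tAC=3;AN=10'
# returns ('chr1', '13656', '.', 'CAG', 'C', ['AC', '3'])
split_line(sample)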

In [30]:
# drop the '#' header lines, parse each record, and cache the result
# since it will be queried more than once
cached = rdd \
    .filter(lambda x: not x.startswith('#')) \
    .map(split_line) \
    .cache()

In [31]:
cached


Out[31]:
PythonRDD[26] at RDD at PythonRDD.scala:48

In [32]:
# take five parsed records whose first INFO value (the allele count, AC, in this data) is above 2
cached \
    .filter(lambda obj: int(obj[5][1]) > 2) \
    .take(5)


Out[32]:
[('chr1', '13656', '.', 'CAG', 'C', ['AC', '3']),
 ('chr1', '14653', 'rs62635297', 'C', 'T', ['AC', '3']),
 ('chr1', '14930', 'rs6682385', 'A', 'G', ['AC', '3']),
 ('chr1', '15903', 'rs557514207', 'G', 'GC', ['AC', '6']),
 ('chr1', '16571', 'rs12405900', 'G', 'A', ['AC', '3'])]
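
Because the parsed RDD is cached, further queries reuse the in-memory data instead of re-reading and re-parsing the files. Below is a minimal sketch of one such follow-up query; the count depends on the actual data, so no output is shown.

In [ ]:
# count the records whose AC (allele count) annotation is greater than 2
cached \
    .filter(lambda obj: obj[5][0] == 'AC' and int(obj[5][1]) > 2) \
    .count()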