In [1]:
# import classes
from pyspark import SparkConf, SparkContext
# configuration with all defaults
conf = SparkConf()
# create a SparkContext from our configuration
sc = SparkContext(conf=conf)

In [26]:
# read every file matching files/* into a single RDD of lines
rdd = sc.textFile('files/*')

In [27]:
# total number of input lines (header lines included)
rdd.count()


Out[27]:
137372
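
It helps to look at a raw line before writing the parser. The cell below was not part of the original run, so its output is omitted; judging by the records shown later, the input is VCF-style, tab-separated variant data.

In [ ]:
# peek at one raw input line; VCF-style files begin with '#' header lines
rdd.first()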

In [28]:
def split_line(line):
    # tab-separated VCF-style record: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, ...
    pieces = line.strip().split('\t')
    # INFO is a ';'-separated list of KEY=VALUE annotations; keep only the first one
    annotations = pieces[7].split(';')
    # return CHROM, POS, ID, REF, ALT and the first annotation as a [key, value] pair
    return pieces[0], pieces[1], pieces[2], pieces[3], pieces[4], annotations[0].split('=')
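
As a quick sanity check, split_line can be applied to a literal record. The sample line below is made up for illustration: the first five fields and the AC=3 annotation mirror a record from the real output, while the QUAL, FILTER, and AN values are placeholders.

In [ ]:
# hypothetical sample record in the same tab-separated layout
sample = 'chr1\t13656\t.\tCAG\tC\t50\tPASS\tAC=3;AN=10'
# returns ('chr1', '13656', '.', 'CAG', 'C', ['AC', '3'])
split_line(sample)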

In [30]:
# drop the '#' header lines, parse each record, and cache the result
# since it will be queried more than once
cached = rdd \
    .filter(lambda x: not x.startswith('#')) \
    .map(split_line) \
    .cache()

In [31]:
cached


Out[31]:
PythonRDD[26] at RDD at PythonRDD.scala:48

In [32]:
# take five parsed records whose first INFO value (the allele count, AC, in this data) is above 2
cached \
    .filter(lambda obj: int(obj[5][1]) > 2) \
    .take(5)


Out[32]:
[('chr1', '13656', '.', 'CAG', 'C', ['AC', '3']),
 ('chr1', '14653', 'rs62635297', 'C', 'T', ['AC', '3']),
 ('chr1', '14930', 'rs6682385', 'A', 'G', ['AC', '3']),
 ('chr1', '15903', 'rs557514207', 'G', 'GC', ['AC', '6']),
 ('chr1', '16571', 'rs12405900', 'G', 'A', ['AC', '3'])]
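
Because the parsed RDD is cached, further queries reuse the in-memory data instead of re-reading and re-parsing the files. Below is a minimal sketch of one such follow-up query; the count depends on the actual data, so no output is shown.

In [ ]:
# count the records whose AC (allele count) annotation is greater than 2
cached \
    .filter(lambda obj: obj[5][0] == 'AC' and int(obj[5][1]) > 2) \
    .count()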