In [1]:
import pyspark
import shutil
import time

subset_count = 1000

def inline_subset(rdd, count=10):
    # The dump is a JSON array, one entity per line: "[" first, then one
    # "{...}," per entity, so take(count+1) grabs "[" plus `count` entities.
    content = rdd.take(count + 1)
    # Swap the last entity's trailing comma for "]" to close the array.
    content[count] = content[count][:-1] + ']'
    return ''.join(content)  # plain concat; no Spark round-trip needed

print("start extracting")
start = time.time()

sc = pyspark.SparkContext('local[*]')

# textFile reads the gzipped dump lazily; take() inside inline_subset only
# pulls the first lines, so the whole archive never has to be read.
wikiFile = sc.textFile('wikidata-20170306-all.json.gz')
subset = inline_subset(wikiFile, subset_count)

with open('wikidata.json', 'w', encoding='utf-8') as out:
    out.write(subset)
    
# Alternative: write the subset back out through Spark. saveAsTextFile
# refuses to overwrite, hence the rmtree; note parallelize needs a list,
# or it would split the string into individual characters.
#shutil.rmtree('wikidata.json')
#sc.parallelize([subset]).saveAsTextFile('wikidata.json')

end = time.time()

print('subset ready!')
print(end - start, 'seconds')

sc.stop()


start extracting
subset ready!
8.364059686660767 seconds
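
A quick sanity check (a minimal sketch): since inline_subset closes the JSON array, wikidata.json should parse with the standard json module into a plain list of subset_count entity dicts, each carrying Wikidata's "id" field.

In [2]:
import json

# Load the extracted subset and confirm it is a well-formed JSON array.
with open('wikidata.json', encoding='utf-8') as f:
    entities = json.load(f)

print(len(entities), 'entities extracted')
print('first entity id:', entities[0]['id'])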

In [ ]: