In [1]:
import pyspark
import shutil
import time
subset_count = 1000
def inline_subset(rdd, count=10):
    # The dump is one large JSON array: '[' on the first line, one entity
    # per line (each ending with a comma), and ']' on the last line.
    content = rdd.take(count + 1)  # the opening '[' plus `count` entities
    # Swap the last entity's trailing comma for ']' to close the array.
    content[count] = content[count][:-1] + ']'
    return ''.join(content)
print("start extracting")
start = time.time()
sc = pyspark.SparkContext('local[*]')
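# Spark reads .gz input transparently, but gzip is not splittable, so the
# whole dump loads as a single partition.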
wikiFile = sc.textFile('wikidata-20170306-all.json.gz')
subset = inline_subset(wikiFile, subset_count)
with open('wikidata.json', 'w') as file:
    file.write(subset)
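# The commented lines below appear to be an alternative Spark-side write;
# saveAsTextFile outputs a directory, hence the shutil.rmtree cleanup.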
#shutil.rmtree('wikidata.json')
#sc.parallelize(subset).saveAsTextFile('wikidata.json')
end = time.time()
print('subset ready!')
print(end - start, 'seconds')
sc.stop()
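As a quick sanity check (a minimal sketch, assuming the wikidata.json file written above), the subset should load cleanly with the standard-library json module:
In [2]:
import json

# The subset should parse as a JSON array of `subset_count` Wikidata
# entities, each a dict with an 'id' key (e.g. 'Q1').
with open('wikidata.json') as f:
    entities = json.load(f)
print(len(entities), 'entities extracted')
print(entities[0]['id'])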
In [ ]: