In [1]:
import pyspark
import time
# Count the lines of the gzip-compressed Wikidata JSON dump with a local Spark
# context and report how long the whole run (context start-up + count) took.
# perf_counter() is a monotonic clock, so the measured interval is not affected
# by system clock adjustments during this long-running job.
start = time.perf_counter()
sc = pyspark.SparkContext('local[*]')
try:
    # NOTE(review): gzip input is not splittable, so this count presumably runs
    # as a single task even with local[*] -- confirm if runtime matters.
    wikiFile = sc.textFile('wikidata-20170306-all.json.gz')
    print('counting ...')
    print('count:', wikiFile.count())
    end = time.perf_counter()
    print(end - start, 'seconds')
finally:
    # Always release the context: only one SparkContext may be active per
    # process, so leaking it on failure would break a re-run of this cell.
    sc.stop()
In [ ]: