In [1]:
import pyspark
import time

# Count the lines of the Wikidata JSON dump with a local Spark context and
# report how long the whole run took (Spark start-up included).

# Input dump, resolved relative to the notebook's working directory.
WIKIDATA_DUMP_PATH = 'wikidata-20170306-all.json.gz'

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during this multi-minute job.
start = time.perf_counter()

sc = pyspark.SparkContext('local[*]')
try:
    # NOTE(review): gzip is not a splittable codec, so this read likely runs
    # as a single task regardless of 'local[*]' -- confirm in the Spark UI if
    # the runtime matters.
    wikiFile = sc.textFile(WIKIDATA_DUMP_PATH)

    print('counting ...')
    print('count:', wikiFile.count())

    end = time.perf_counter()
    print(end - start, 'seconds')
finally:
    # Always release the context: a SparkContext left running after a failure
    # prevents this cell from being re-run in the same kernel.
    sc.stop()


counting ...
count: 25283998
670.2925405502319 seconds

In [ ]: