In [7]:
from pyspark.sql import Row
In [8]:
# Absolute cluster path to the Geolife Beijing GPS points (one "x,y,..." CSV row per line).
DATA_PATH = "/opt/GISData/Geolife_Trajectories_1.3/beijing.csv"

# Parse each CSV line into a Row(x, y). Only the first two columns are used;
# presumably x is latitude and y is longitude — TODO confirm column order
# against the Geolife export. Malformed/non-numeric rows would raise here.
points = (
    sc.textFile(DATA_PATH)
      .map(lambda line: line.split(","))
      .map(lambda p: Row(x=float(p[0]), y=float(p[1])))
)
In [9]:
points.take(5)
Out[9]:
In [10]:
# Convert the RDD of Rows to a DataFrame so it can be queried with SQL below.
# NOTE(review): this rebinds `points` from RDD to DataFrame — the cell is not
# idempotent (a second run calls .toDF() on a DataFrame) and the raw RDD is no
# longer reachable under its own name. A stage-suffixed name (points_df) would
# be safer, but later cells reference `points`, so the binding is kept as-is.
points = points.toDF()
points.show()
In [11]:
# Expose the DataFrame to SQL so Simba's spatial extensions (POINT, CIRCLERANGE)
# can query it. NOTE(review): registerTempTable is the pre-Spark-2.0 API; on
# Spark >= 2.0 prefer createOrReplaceTempView — confirm the Simba runtime version.
points.registerTempTable("points")

# Circle-range query centre and radius. The values look like Beijing's city
# centre with x as latitude and y as longitude — TODO confirm against the
# column order used when the CSV was parsed.
CENTER_X = 39.9042
CENTER_Y = 116.4074
RADIUS = 0.5  # radius in the same (degree) units as the x/y columns

sql = (
    "SELECT * FROM points "
    "WHERE POINT(x, y) IN CIRCLERANGE(POINT({cx}, {cy}), {r})".format(
        cx=CENTER_X, cy=CENTER_Y, r=RADIUS
    )
)
# Cache the filtered result: it is counted and sampled in later cells.
# NOTE(review): `buffer` shadows the Python 2 builtin of the same name;
# kept because later cells reference it.
buffer = sqlContext.sql(sql).cache()
In [13]:
# Size of the filtered result, for comparison with the tiny sample below.
print(buffer.count())

SAMPLE_FRACTION = 0.00005  # keep the exported CSV small enough to inspect locally
SAMPLE_SEED = 42           # fixed seed so the sample is reproducible

# sample(withReplacement=False, fraction, seed); the fraction is approximate,
# so the printed sample count can vary slightly around fraction * total.
buffer_sample = buffer.sample(False, SAMPLE_FRACTION, SAMPLE_SEED)
print(buffer_sample.count())

# Collect the small sample to the driver and write it out for local use.
# (Safe only because the sample is tiny; never toPandas() the full result.)
buffer_sample.toPandas().to_csv('sample_small.csv')
In [ ]: