In [1]:
from pyspark.sql import Row
In [4]:
# Load the Geolife trajectory file as an RDD of Row(x, y) records.
# Each CSV line is assumed to hold at least two comma-separated floats
# (columns 0 and 1) — TODO confirm against the source file's schema.
points = (
    sc.textFile("/opt/GISData/Geolife_Trajectories_1.3/beijing.csv")
    .map(lambda raw_line: raw_line.split(","))
    .map(lambda fields: Row(x=float(fields[0]), y=float(fields[1])))
)
In [5]:
points.take(5)
Out[5]:
In [6]:
# Convert the RDD of Rows into a DataFrame for SQL querying.
# NOTE(review): this rebinds `points` from an RDD to a DataFrame, so the
# cell is not idempotent on a fresh re-run order — consider a distinct
# name (e.g. `points_df`) if downstream cells are updated to match.
points = points.toDF()
points.show()
In [10]:
# Register the DataFrame under a table name so the Simba-extended SQL
# below can reference it.
points.registerTempTable("points")

# Query parameters — previously inlined magic numbers. The center
# (39.9042, 116.4074) matches Beijing's lat/lon; the radius is in the
# same (unprojected) coordinate units as the x/y columns.
CENTER_X = 39.9042
CENTER_Y = 116.4074
RADIUS = 0.005

# Simba spatial extension: select all rows whose POINT(x, y) falls
# inside the circle of RADIUS around the center point.
sql = (
    "SELECT * FROM points "
    "WHERE POINT(x, y) IN CIRCLERANGE(POINT({cx}, {cy}), {r})"
    .format(cx=CENTER_X, cy=CENTER_Y, r=RADIUS)
)

# NOTE(review): `buffer` shadows the Python 2 builtin of the same name;
# kept as-is because a later cell reads this variable.
buffer = sqlContext.sql(sql)
buffer.count()
Out[10]:
In [15]:
buffer.toPandas().to_csv('sample.csv')
In [ ]: