In [65]:
from pyspark.sql import Row

In [66]:
# Load the tab-separated trajectory points: column 0 is the trajectory id,
# column 1 the point's index within the trajectory, columns 2-3 the coordinates.
points = sc.textFile("/home/and/Documents/PhD/Code/Y2Q1/SDB/Project/Code/trajs.csv") \
    .map(lambda line: line.split("\t")) \
    .map(lambda p: Row(tag="{0}_{1}".format(p[0], p[1]),
                       x=float(p[2].strip()), y=float(p[3].strip())))

In [67]:
points = points.toDF()
points.show()


+----+----------------+----------------+
| tag|               x|               y|
+----+----------------+----------------+
| 0_0|         14992.0|         13485.0|
| 0_1|14988.5546033368|13223.4685437995|
| 0_2|14687.4853092579| 12891.715878107|
| 0_3|14439.2848569753|12984.8452218598|
| 0_4|14267.2523555687|13144.5044093446|
| 0_5| 14106.380428277|13320.9723559176|
| 0_6|13885.3015873829|13582.8642967952|
| 0_7|13736.2690822009|13759.6918533465|
| 0_8|13586.4730568468|13925.4215545226|
| 0_9|13374.2170171242| 14067.417626726|
|0_10|13147.9531024768| 14224.228410651|
|0_11|12952.1665218279|14347.8667667371|
|0_12|12624.7521573678|14415.0971778723|
|0_13|12565.1769964001|14655.4791543081|
|0_14|12480.1838472365|14855.4646540428|
|0_15|12405.8350613404|  15018.58586222|
|0_16|12309.0847892043|15234.5438461127|
|0_17|12391.6712295932|15417.1355435914|
|0_18|12547.7518309049|15452.0497516498|
|0_19|         12552.0|         15453.0|
+----+----------------+----------------+
only showing top 20 rows
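
If the underlying Spark is 2.x or later, the same table could also be built with the DataFrame CSV reader instead of an RDD of Rows. This is only a sketch, not the cell that was actually run: it assumes a SparkSession named `spark` is available alongside `sc`/`sqlContext`, and that the headerless file gets the default `_c0`..`_c3` column names.

In [ ]:
# Sketch: equivalent load via the Spark 2.x CSV reader (assumes a `spark` session).
from pyspark.sql import functions as F

raw = spark.read.option("sep", "\t").csv(
    "/home/and/Documents/PhD/Code/Y2Q1/SDB/Project/Code/trajs.csv")
points_alt = raw.select(
    F.concat_ws("_", "_c0", "_c1").alias("tag"),        # trajectory id + point index
    F.trim(raw["_c2"]).cast("double").alias("x"),
    F.trim(raw["_c3"]).cast("double").alias("y"))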


In [68]:
points.registerTempTable("points")

# Circle-range queries using the CIRCLERANGE spatial predicate (Simba's spatial
# SQL syntax), centered on the point tagged 0_19, i.e. (12552, 15453), with the
# radius growing tenfold each time.
for radius in (10, 100, 1000, 10000, 100000):
    sql = ("SELECT * FROM points WHERE POINT(x, y) "
           "IN CIRCLERANGE(POINT(12552, 15453), {0})").format(radius)
    print(sqlContext.sql(sql).count())


3
9
870
49668
57011
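
The circle-range counts can be cross-checked in plain Spark SQL, without the spatial predicate, by filtering on squared Euclidean distance. A sketch for radius 1000; it should agree with the 870 reported above, up to how CIRCLERANGE treats points lying exactly on the boundary.

In [ ]:
# Plain-SQL sanity check: squared-distance filter, no spatial predicate or index.
cx, cy, r = 12552, 15453, 1000
plain = ("SELECT * FROM points WHERE "
         "(x - {0}) * (x - {0}) + (y - {1}) * (y - {1}) <= {2}").format(cx, cy, r * r)
print(sqlContext.sql(plain).count())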

In [ ]: