In [ ]:
In [ ]:
In [ ]:
In [136]:
# Load the exactly-geotagged tweets from a file holding one JSON document per
# line. Each record becomes a Tweet built from:
#   - the "$numberLong" id, UTF-8 encoded,
#   - the lower-cased text split on every non a-z character (empty tokens
#     dropped),
#   - the point stored under record["coordinates"]["coordinates"].
# NOTE(review): this is Python 2 code -- filter() is expected to return a list
# and .encode("utf-8") a byte string that the str pattern can split.
exact_values = []
with open("actual_data/exact.json") as exact_file:  # closed deterministically
    for line in exact_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        record = json.loads(line)  # parse each line once, not three times
        exact_values.append(
            Tweet(record["_id"]["$numberLong"].encode("utf-8"),
                  filter(None, re.split('[^a-z]',
                                        record["text"].lower().encode("utf-8"))),
                  coordinates=record["coordinates"]["coordinates"]))
# Strip stop words using the helper and word list defined elsewhere.
exact_values = RemoveStopWords(stopWords, exact_values)
# Build a bag-of-words count matrix over every tweet (exact + bounding-box).
# Concatenate the two tweet lists once and reuse the result instead of
# rebuilding the combined list for each use.
all_tweets = exact_values + bbox_values
exact = [value.text for value in all_tweets]
# flatten the list of token lists into a single 1-d list of tokens
exact_flatten = [item for sublist in exact for item in sublist]
# remove duplicates (the dict values are unused placeholders)
exact_dict = {w: '' for w in exact_flatten}
# assign each distinct token a column index
exact_enum = {w: idx for idx, w in enumerate(exact_dict)}
# one row per tweet, one column per distinct token
exact_matrix = np.zeros((len(all_tweets), len(exact_enum)), dtype=int)
# d maps tweet id -> row index in exact_matrix
d = dict()
for idx, tweet in enumerate(all_tweets):
    d[tweet.id] = idx
    for w in tweet.text:
        # count every occurrence of the token in this tweet
        exact_matrix[idx, exact_enum[w]] += 1
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from lshash import LSHash
# For every bounding-box tweet, index the bag-of-words rows of the exact
# tweets located inside its bounding box with LSH, query with the tweet's own
# row, and count (in i) the tweets for which the query returned any neighbour.
i = 0
for tweet in bbox_values:
    # Build the polygon once per tweet rather than once per candidate point.
    bbox_polygon = Polygon(tweet.bounding_box)
    inside_tweets = [value for value in exact_values
                     if bbox_polygon.contains(Point(value.coordinates))]
    # Need at least three candidates to ask for three neighbours.
    if len(inside_tweets) < 3:
        continue
    # Fresh 12-bit index per bounding box; created only after the early exit
    # so skipped tweets do not pay for building it.
    lsh = LSHash(12, len(exact_enum))
    for insider in inside_tweets:
        lsh.index(exact_matrix[d[insider.id]])
    cs = lsh.query(exact_matrix[d[tweet.id]], num_results=3)
    # Per the lshash README, query() returns (point_tuple, distance) pairs,
    # so an ndarray row can never be found with `row in cs`. Compare the row,
    # as a tuple, against the returned point tuples instead.
    # NOTE(review): assumes the default in-memory storage -- confirm.
    matched_vectors = set(match[0] for match in cs)
    points = [insider.coordinates for insider in inside_tweets
              if tuple(exact_matrix[d[insider.id]]) in matched_vectors]
    if cs:
        i += 1
# Single-argument print(...) gives the same output under Python 2 and 3.
print(i)
print(len(bbox_values))
In [ ]: