In [ ]:


In [ ]:


In [ ]:


In [136]:
exact_values = [Tweet(json.loads(line)["_id"]["$numberLong"].encode("utf-8"),
                filter(None,re.split('[^a-z]', 
                    json.loads(line)["text"].lower().encode("utf-8"))),
                coordinates = json.loads(line)["coordinates"]["coordinates"])
                for line 
                in open("actual_data/exact.json")]
exact_values = RemoveStopWords(stopWords, exact_values)

exact = [value.text for value in exact_values + bbox_values]
#flatten the list of lists to 1d array
exact_flatten = [item for sublist in exact for item in sublist]
#remove duplicates
exact_dict = {w:'' for w in exact_flatten}
#enumerate without duplicates
exact_enum = {w: idx for idx, w in enumerate(exact_dict)}

exact_matrix = np.zeros((len(exact_values+bbox_values), len(exact_enum)), dtype=int) 
d = dict()
for idx, tweet in enumerate(exact_values + bbox_values):
    d[tweet.id] = idx
    for w in tweet.text:
        exact_matrix[idx,exact_enum[w]] += 1

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

from lshash import LSHash
i=0
for tweet in bbox_values:
    inside_tweets = [value for value in exact_values 
                     if Polygon(tweet.bounding_box).contains(
                     Point(value.coordinates))]
    lsh = LSHash(12, len(exact_enum))
    if len(inside_tweets)<3: continue
    for insider in inside_tweets: 
        lsh.index(exact_matrix[d[insider.id]])
    
    cs = lsh.query(exact_matrix[d[tweet.id]], num_results=3)
    points = []
    for insider in inside_tweets:
        if exact_matrix[d[insider.id]] in cs:
            points.append(insider.coordinates)
    if len(cs)!=0:
        i+=1
print i
print len(bbox_values)


/Users/alexander/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:67: DeprecationWarning: elementwise == comparison failed; this will raise an error in the future.
13
753

In [ ]: