In [1]:
from __future__ import unicode_literals
import requests
import json
import requests
from bson.objectid import ObjectId
import re
from pymongo import MongoClient
import tangelo
import glob
# import pandas as pd
from pyelasticsearch import ElasticSearch
import sys, os
import utilities
# read in config file
from ConfigParser import ConfigParser
#__location__ = os.path.realpath(
# os.path.join(os.getcwd(), os.path.dirname(__file__)))
# Look for config.ini in the current working directory. glob.glob returns a list
# of matching paths, which is empty when the file is not there.
config_file = glob.glob('config.ini')
parser = ConfigParser()
# NOTE(review): ConfigParser.read() accepts a list of filenames and skips the ones
# it cannot open, so a missing config.ini only surfaces at the parser.get() calls
# below -- confirm this is the failure mode you want.
parser.read(config_file)
# Settings pulled from the [Locations] and [Endpoints] sections of the config file.
mitie_directory = parser.get('Locations', 'mitie_directory')
country_endpoint = parser.get('Endpoints', 'country_endpoint')
# The configured directory is put on sys.path before the star-import below;
# presumably that is where the mitie bindings live -- verify against config.ini.
sys.path.append(mitie_directory)
from mitie import *
# Elasticsearch client for a node on localhost:9200. The other cells shown in this
# notebook do not reference `es` directly.
es = ElasticSearch(urls='http://localhost:9200', timeout=60, max_retries=2)
# Blacklist consulted by places(): an extracted entity whose text is exactly one of
# these strings is skipped instead of being geocoded. Besides countries and
# territories the list also carries continents and a few organisations and
# abbreviations ("European Union", "United Nations", "UN", "UK", "USA").
# Matching is an exact, case-sensitive membership test.
country_names = ["Afghanistan","Åland Islands","Albania","Algeria","American Samoa",
"Andorra","Angola","Anguilla","Antarctica","Antigua and Barbuda",
"Argentina","Armenia","Aruba","Ascension Island","Australia","Austria",
"Azerbaijan","Bahamas","Bahrain","Bangladesh","Barbados","Belarus",
"Belgium","Belize","Benin","Bermuda","Bhutan","Bolivia",
"Bonaire, Sint Eustatius, and Saba","Bosnia and Herzegovina","Botswana",
"Bouvet Island","Brazil","Britain","Great Britain", "British Indian Ocean Territory",
"British Virgin Islands","Brunei","Bulgaria","Burkina Faso","Burundi","Cambodia",
"Cameroon","Canada","Canary Islands","Cape Verde","Cayman Islands","Central African Republic",
"Ceuta and Melilla","Chad","Chile","China","Christmas Island","Clipperton Island",
"Cocos [Keeling] Islands","Colombia","Comoros","Congo - Brazzaville","Congo - Kinshasa","Congo",
"Democratic Republic of Congo", "Cook Islands","Costa Rica","Côte d’Ivoire","Croatia","Cuba",
"Curaçao","Cyprus","Czech Republic","Denmark","Diego Garcia","Djibouti","Dominica",
"Dominican Republic","Ecuador","Egypt","El Salvador","Equatorial Guinea","Eritrea",
"Estonia","Ethiopia","European Union","Falkland Islands","Faroe Islands","Fiji","Finland",
"France","French Guiana","French Polynesia","French Southern Territories","Gabon","Gambia",
"Gaza","Georgia","Germany","Ghana","Gibraltar","Greece","Greenland","Grenada","Guadeloupe",
"Guam","Guatemala","Guernsey","Guinea","Guinea-Bissau","Guyana","Haiti",
"Heard Island and McDonald Islands","Honduras","Hong Kong SAR China","Hungary","Iceland",
"India","Indonesia","Iran","Iraq","Ireland","Isle of Man","Israel","Italy","Jamaica","Japan",
"Jersey","Jordan","Kazakhstan","Kenya","Kiribati","Kuwait","Kyrgyzstan","Laos","Latvia","Lebanon",
"Lesotho","Liberia","Libya","Liechtenstein","Lithuania","Luxembourg","Macau SAR China","Macedonia",
"Madagascar","Malawi","Malaysia","Maldives","Mali","Malta","Marshall Islands","Martinique","Mauritania",
"Mauritius","Mayotte","Mexico","Micronesia","Moldova","Monaco","Mongolia","Montenegro","Montserrat",
"Morocco","Mozambique","Myanmar [Burma]","Namibia","Nauru","Nepal","Netherlands","Netherlands Antilles",
"New Caledonia","New Zealand","Nicaragua","Niger","Nigeria","Niue","Norfolk Island","North Korea",
"Northern Ireland", "Northern Mariana Islands","Norway","Oman","Outlying Oceania","Pakistan","Palau",
"Palestinian Territories","Panama","Papua New Guinea","Paraguay","Peru","Philippines","Pitcairn Islands",
"Poland","Portugal","Puerto Rico","Qatar","Réunion","Romania","Russia","Rwanda","Saint Barthélemy",
"Saint Helena","Saint Kitts and Nevis","Saint Lucia","Saint Martin","Saint Pierre and Miquelon",
"Saint Vincent and the Grenadines","Samoa","San Marino","São Tomé and Príncipe","Saudi Arabia",
"Senegal","Serbia","Serbia and Montenegro","Seychelles","Sierra Leone","Singapore","Sint Maarten",
"Slovakia","Slovenia","Solomon Islands","Somalia","South Africa",
"South Georgia and the South Sandwich Islands","South Korea","South Sudan","Spain","Sri Lanka",
"Sudan","Suriname","Svalbard and Jan Mayen","Swaziland","Sweden","Switzerland","Syria","Taiwan",
"Tajikistan","Tanzania","Thailand","Timor-Leste","Togo","Tokelau","Tonga","Trinidad and Tobago",
"Tristan da Cunha","Tunisia","Turkey","Turkmenistan","Turks and Caicos Islands","Tuvalu",
"U.S. Minor Outlying Islands","U.S. Virgin Islands","Uganda","Ukraine","United Arab Emirates",
"United Kingdom","UK","United States","USA", "United States of America", "Uruguay","Uzbekistan",
"Vanuatu","Vatican City","Venezuela","Vietnam","Wallis and Futuna","Western Sahara","Yemen",
"Zambia","Zimbabwe", "Europe", "America", "Africa", "Asia", "North America", "South America",
"United Nations","UN"]
@tangelo.restful
def get():
    """Return the plain-text usage message for this service.

    Registered through ``tangelo.restful``; takes no arguments and returns a
    string describing the expected POST body and the shape of each result record.
    """
    # Keep the sample record in sync with the keys places() puts in its output.
    return """
This service expects a POST in the form '{"text":"On 12 August, the BBC reported that..."}'
It will return the places mentioned in the text along with their latitudes and longitudes in the form:
{"lat":34.567, "lon":12.345, "searchterm":"Baghdad", "placename":"Baghdad", "countrycode":"IRQ"}
"""
In [2]:
# Ordered lookup preferences, keyed by a case-sensitive regex applied to the search
# term. Each preference is a (`_source` field, wanted value) pair; the first pair that
# some hit satisfies decides the result. Entries are tried top to bottom and only the
# first matching regex is used. This is all obviously Syria and Iraq specific.
_RESULT_PREFERENCES = (
    # Governorate/Province: top-level ADM1 unit, then any area, then an inhabited place.
    ("Governorate|Province|Wilayah",
     (('feature_code', 'ADM1'), ('feature_class', 'A'), ('feature_class', 'P'))),
    # District: an area, then an inhabited place.
    ("District",
     (('feature_class', 'A'), ('feature_class', 'P'))),
    # Subdistrict: an inhabited place.
    # NOTE(review): only 'P' is tried here; if subdistricts should resolve to
    # areas ('A') first, add that preference -- confirm the intent.
    ("Subdistrict",
     (('feature_class', 'P'),)),
    # Airport: a spot/building feature, then an inhabited place.
    ("Airport",
     (('feature_class', 'S'), ('feature_class', 'P'))),
)

# Terms without any special keyword prefer an administrative seat (PPLA).
# Not sure whether this should pick a city instead. Example: "Aleppo" should go to
# Aleppo the city. But switching makes Damascus resolve to the wrong place, since the
# city of Damascus doesn't make it into the top 10 for some reason.
# NOTE(review): nothing here filters out bodies of water; the first hit is the fallback.
_DEFAULT_PREFERENCES = (('feature_code', 'PPLA'),)


def _first_hit_where(hits, field, wanted):
    """Return the first hit whose ``_source[field]`` equals ``wanted``, or None."""
    for hit in hits:
        if hit['_source'][field] == wanted:
            return hit
    return None


def _format_hit(hit, term):
    """Build [lat, lon, term, asciiname, feature_class, country_code3] from one hit."""
    source = hit['_source']
    # 'coordinates' is a "lat,lon" string in the index.
    coords = source['coordinates'].split(",")
    return [float(coords[0]), float(coords[1]), term, source['asciiname'],
            source['feature_class'], source['country_code3']]


def pick_best_result(results, term):
    """Pick the best geonames hit for ``term`` from an Elasticsearch response.

    Keywords in ``term`` (Governorate/Province/Wilayah, District, Subdistrict,
    Airport) select an ordered list of preferred feature codes/classes; the first
    hit matching the highest-ranked preference wins. When no hit matches any
    preference, the first hit is returned as a last resort.

    Parameters:
        results: Elasticsearch search response; hits are read from
            ``results['hits']['hits']``.
        term: the search term as it appeared in the text; echoed in the result.

    Returns:
        ``[lat, lon, term, asciiname, feature_class, country_code3]``, or ``[]``
        when the response is malformed or contains no hits.

    Raises:
        KeyError / ValueError / IndexError if the chosen hit lacks the expected
        ``_source`` fields or has unparsable coordinates.
    """
    try:
        hits = results['hits']['hits']
    except (KeyError, TypeError):
        # Not a search response (missing keys, None, a bare list, ...).
        return []
    if not hits:
        # end if there are no results
        return []

    preferences = _DEFAULT_PREFERENCES
    for pattern, ordered_preferences in _RESULT_PREFERENCES:
        if re.search(pattern, term):
            preferences = ordered_preferences
            break

    for field, wanted in preferences:
        hit = _first_hit_where(hits, field, wanted)
        if hit is not None:
            return _format_hit(hit, term)

    # last resort, just take the first result.
    return _format_hit(hits[0], term)
In [53]:
# Lower-case context words that extract_feature_class() treats as evidence for
# feature class 'P'.
P_list = (
    "city",
    "town",
    "village",
    "settlement",
    "capital",
    "cities",
    "villages",
    "towns",
    "neighborhood",
    "neighborhoods",
)

# Lower-case context words that extract_feature_class() treats as evidence for
# feature class 'A'.
A_list = (
    "governorate",
    "province",
    "muhafazat",
)
#
def subset_results(results, feature_class):
    """Return the first hit in ``results`` whose feature class is ``feature_class``.

    ``results`` is an iterable of hits carrying ``_source.feature_class``.
    The result is None when no hit qualifies.
    """
    candidates = (hit for hit in results
                  if hit['_source']['feature_class'] == feature_class)
    return next(candidates, None)
# Is there an exact match?
def check_names(results, term):
    """Return the first hit whose ``name`` equals ``term``, ignoring case.

    Parameters:
        results: iterable of hits, each carrying ``_source.name``.
        term: the place name to look for.

    Returns:
        The matching hit, or None when there are no hits or none match exactly.
    """
    if not results:
        return None
    # Lower-case the term once rather than on every comparison.
    wanted = term.lower()
    for hit in results:
        if hit['_source']['name'].lower() == wanted:
            return hit
    return None
## Planned approach: filter the candidates based on the context words,
## then rank what is left by edit distance. (Neither step is implemented yet.)
def pick_best_result2(results, term, context):
    """Pick a geonames hit for ``term``: an exact name match if any, else the top hit.

    Parameters:
        results: Elasticsearch search response; hits are read from
            ``results['hits']['hits']``.
        term: place name as found in the text; compared case-insensitively with
            each hit's ``name`` and echoed in the result.
        context: words surrounding the mention. Accepted for interface
            compatibility but not used for ranking yet.

    Returns:
        ``[lat, lon, term, asciiname, feature_class, country_code3]``, or ``[]``
        when the response is malformed or has no hits.
    """
    try:
        hits = results['hits']['hits']
    except (KeyError, TypeError):
        # Same tolerance as pick_best_result: a malformed response yields no location.
        return []

    # Only look for an exact match when there is something to search.
    place = check_names(hits, term) if hits else None
    if not place:
        print("No nothing")
        if not hits:
            return []
        # No exact name match: fall back to the top-ranked hit.
        place = hits[0]

    source = place['_source']
    coords = source['coordinates'].split(",")
    print("I'm at the end")
    return [float(coords[0]), float(coords[1]), term, source['asciiname'],
            source['feature_class'], source['country_code3']]
def extract_feature_class(results, term, context):
    """Guess which geonames feature classes to search from the context words.

    ``results`` and ``term`` are accepted but not consulted. ``context`` is an
    iterable of strings and is compared case-insensitively against P_list and
    A_list, with P_list taking precedence.

    Returns ``['P']``, ``['A']``, or ``['A', 'P', 'S']`` when no hint word is present.
    """
    lowered_words = {word.lower() for word in context}
    if not lowered_words.isdisjoint(P_list):
        return ['P']
    if not lowered_words.isdisjoint(A_list):
        return ['A']
    return ['A', 'P', 'S']
In [54]:
# Memoises geonames lookups across calls to places(). Keys have the form
# "<searchterm>___<feature classes>___<country filter>".
place_cache = {}


def places(text):
    """Find location mentions in ``text`` and geocode them against geonames.

    Steps:
      1. POST the text to ``country_endpoint``; the response body becomes the
         single-element country filter for every lookup.
      2. Run ``utilities.mitie_context`` to get entities with tags and context.
      3. Skip entities listed in ``country_names``; for entities tagged as
         locations, strip administrative keywords from the name, choose feature
         classes from the context, query geonames (through ``place_cache``) and
         pick a hit with ``pick_best_result2``.

    Parameters:
        text: the document to geocode.

    Returns:
        A JSON string encoding a list of
        ``{"lat", "lon", "searchterm", "placename", "countrycode"}`` objects.
        The list is empty when the country lookup fails.
    """
    # When served through tangelo the text would come from the request body:
    #params = json.loads(tangelo.request_body().read())
    #text = params['text']
    locations = []
    try:
        country = requests.post(country_endpoint, data=json.dumps({"text": text}))
        country_filter = [country.text]
        print(country_filter)
    except (ValueError, requests.exceptions.RequestException):
        # Network-level failures raise RequestException rather than ValueError;
        # either way there is no country filter, so report no locations.
        return json.dumps(locations)

    out = utilities.mitie_context(text)
    for entity in out['entities']:
        if entity['text'] in country_names:
            print(" (Country/blacklist. Skipping...)")
            continue
        if entity['tag'] not in ("LOCATION", "Location"):
            continue
        print(entity)
        try:
            searchterm = re.sub(r"Governorate|District|Subdistrict|Airport", "", entity['text']).strip() #put this in query_geonames?
            searchterm = re.sub("Dar 'a", "Dar'a", searchterm)
            feature_class = extract_feature_class(searchterm, entity['text'], entity['context'])
            # The country filter is part of the key: the same name looked up for a
            # different country must not reuse a cached response.
            cache_term = '___'.join([searchterm, ''.join(feature_class), ''.join(country_filter)])
            print(cache_term)
            if cache_term not in place_cache:
                place_cache[cache_term] = utilities.query_geonames_featureclass(searchterm, country_filter, feature_class)
            geonames_response = place_cache[cache_term]
            for hit in geonames_response['hits']['hits']:
                print(hit['_source'][u'name'])
            print(feature_class)
            loc = pick_best_result2(geonames_response, entity['text'], entity['context'])
            # loc is a nice format for debugging and looks like [35.13179, 36.75783, 'searchterm', u'matchname', u'feature_class', u'country_code3']:
            if loc:
                formatted_loc = {"lat": loc[0], "lon": loc[1], "searchterm": loc[2], "placename": loc[3], "countrycode": loc[5]}
                locations.append(formatted_loc)
        except Exception as e:
            # Best effort: a failure on one entity must not lose the others.
            print(e)
    print("Place cache is  %d" % len(place_cache))
    return json.dumps(locations)
#print place_cache
In [55]:
# Demo: a single place name (Zagreb) embedded in otherwise non-geographic prose.
places("When you travel alone, you are completely on your own schedule, which means you are free to do what you want to do—like follow a series of clues and piece together your father’s murder, which transpired 15 years ago. When you finally track the killer down in Zagreb, it will be your decision whether or not you want to give him mercy—not your travel buddy’s!")
Out[55]:
In [56]:
# Demo: two cities (Derna, Misrata) introduced by the word "cities", next to
# nationality adjectives (Egyptian, Libyan, Ethiopians).
places("Security sources told Reuters that Egyptian intelligence services had provided Libyan authorities with information that helped them free the Ethiopians who had been held by armed groups in the cities of Derna and Misrata.")
Out[56]:
In [57]:
# Demo: Aleppo described as a "city", with a country name (Syria) that is on the
# country_names blacklist.
places("Another day and another government airstrike on insurgent-held districts in the benighted historic city of Aleppo. What was once the proud commercial capital of Syria is now suffering an intensifying blitz from the Syrian regime, and according to a report released today by Amnesty International, the attacks amount to war crimes and crimes against humanity.")
Out[57]:
In [58]:
# Demo: a city name directly followed by the lower-case word "district"
# ("central Damascus district").
places("(Reuters) - A senior Syrian army officer was wounded in a suicide bombing in a central Damascus district on Monday, a monitoring group said, though the military denied the report.")
Out[58]:
In [47]:
# Demo: a longer report naming a village (Langtang), a capital (Kathmandu) and a
# climbing site (Manaslu). The JSON string is kept in `t` and echoed as the cell output.
t = places("(Reuters) - Nepalese soldiers and villagers dug through snow mounds in a remote hamlet on Wednesday in search of scores of bodies of villagers and trekkers believed to be buried in an avalanche set off by last month's devastating earthquake, officials said. The death toll from the April 25 quake in the Himalayan mountain nation has reached 7,675, with more than 16,300 people injured, the government said. As rescuers hunted for more than 180 bodies in the village of Langtang, 60 km (37 miles) north of the capital Kathmandu, seven bodies including of that of a German trekker were recovered at Manaslu, another climbing site.")
t
Out[47]:
In [59]:
# Demo: a district inside a city ("the Nyakabiga district of Bujumbura"), plus a
# blacklisted country name (Burundi).
places("Burundi protesters burned a man to death in the capital on Thursday, accusing him of being a member of the ruling party's Imbonerakure youth wing and saying the group had launched attacks on them, a witness and local media reported. They put tires around his neck and then burned him,' a witness told Reuters after seeing the incident in the Nyakabiga district of Bujumbura, one of the flashpoint areas during protests against the president's bid for a third term.")
Out[59]:
In [50]:
# Demo: a non-settlement feature (Mosul Dam) alongside a city (Baghdad).
places("U.S. airstrikes helped Kurdish and Iraqi forces take control of Mosul Dam on Monday, fighting back ISIS militants who had seized the dam, President Obama told reporters. The stakes were huge for the millions of Iraqis who live downstream from the dam, the largest in the country. 'If that dam was breached it could have proven catastrophic, with floods that would have threatened the lives of thousands of civilians and endangered our embassy compound in Baghdad,' the President said.")
Out[50]:
In [104]:
# utilities is already imported in the first cell; this repeats the import.
import utilities
# Direct geonames lookup for "Aleppo", passing the country codes IRQ and SYR and the
# feature classes S, A and P.
utilities.query_geonames_featureclass("Aleppo", ["IRQ", "SYR"], ["S", "A", "P"])
Out[104]:
In [52]:
# Hand-written Elasticsearch request for "Mosul Dam", posted straight to the geonames
# index on localhost. NOTE(review): presumably this mirrors the query that
# utilities.query_geonames_featureclass builds -- confirm against that helper.
payload = {
"query": {
"filtered": {
"query": {
# Full-text part: search both name fields; "^5" boosts asciiname matches.
"query_string": {
"query": "Mosul Dam",
"fields": ["asciiname^5", "alternativenames"]
}
},
# Filter part: both terms clauses must hold (country AND feature class).
"filter": {
"and" : [
{
"terms" : {
"country_code3": ["IRQ"]
}
},{
"terms" : {
"feature_class": ["P", "A", "S"]
}
}
]
}
}
}
}
out = requests.post("http://localhost:9200/geonames/_search?pretty", data=json.dumps(payload))
# Decoded JSON response, shown as the cell output.
out.json()
Out[52]:
In [8]:
# Direct geonames lookup for "Kathmandu", passing country code NPL and feature class P.
utilities.query_geonames_featureclass("Kathmandu", ["NPL"], ["P"])
Out[8]:
In [ ]:
In [ ]: