In [1]:
import csv, re, requests, json
In [2]:
# Count rows with SQL COUNT(*) via QuerySet.count() instead of
# len(Place.objects.all()), which fetches every row just to count them.
Place.objects.count()
Out[2]:
In [3]:
# Wipe existing Place and Institution rows so the import below starts from
# a clean slate. Deletes are issued per instance, matching the original
# (so any model-level delete() behavior still applies).
for place in Place.objects.all():
    place.delete()
for institution in Institution.objects.all():
    institution.delete()
In [4]:
# Fetch the spatial CSV directly from GitHub and parse it into a list of rows.
# see http://stackoverflow.com/questions/35371043/use-python-requests-to-download-csv
url = 'https://raw.githubusercontent.com/gfranzini/digEds_cat/master/digEds_cat_spatial.csv'
with requests.Session() as s:
    download = s.get(url)
    # Fail loudly on HTTP errors (404/500...) instead of silently feeding
    # an HTML error page into the CSV parser.
    download.raise_for_status()
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    datalist = list(cr)
In [5]:
# Alternative: load a local copy of the data instead of fetching from GitHub
# (uncomment to use, and skip the download cell above).
# file = "data/digEds_cat_spatial.csv"
# with open(file, 'r', encoding='utf-8') as data:
#     reader = csv.reader(data)
#     datalist = list(reader)
In [6]:
# Iterates over rows in the CSV. For each lat/lng pair it sends a request to
# the GeoNames API to fetch the nearest matching place (and the country it is
# located in), then creates Place and Institution objects, relates them, and
# stores them in the DB.
root = "http://api.geonames.org/findNearbyPlaceNameJSON?lat="
geoname_jsons = []  # NOTE(review): never appended to below — kept for compatibility
for row in datalist:
    # Skip the header row, rows without coordinates, and malformed short rows.
    if len(row) > 2 and row[1] != "" and row[1] != "Latitude":
        lat, lng = format(float(row[1]), '.3f'), format(float(row[2]), '.3f')
        params = "{}&lng={}&username=digitalarchiv".format(lat, lng)
        url = root + params
        r = requests.get(url)
        x = r.json()
        if len(x["geonames"]) >= 1:
            place_dict = x["geonames"][0]
            temp_country, _ = Place.objects.get_or_create(geonames_id=place_dict["countryId"],
                                                          name=place_dict["countryName"])
            # Set the type before saving — the original saved once before
            # setting place_type and then again after, doubling the DB writes.
            temp_country.place_type = "country"
            temp_country.save()
            temp_place, _ = Place.objects.get_or_create(geonames_id=place_dict["geonameId"],
                                                        name=place_dict["name"],
                                                        place_type="city",
                                                        lat=place_dict["lat"],
                                                        lng=place_dict["lng"])
            temp_place.part_of = temp_country
            temp_place.save()
            temp_inst, _ = Institution.objects.get_or_create(name=row[0])
            temp_inst.lat = row[1]
            temp_inst.lng = row[2]
            temp_inst.place = temp_place
            # Single save after all fields are assigned (the original saved twice).
            temp_inst.save()
In [7]:
# Summary of what the import created. QuerySet.count() issues SQL COUNT(*)
# instead of len(), which would pull every row into memory just to count it.
print("Countries created: ", Place.objects.filter(place_type="country").count())
print("Places created: ", Place.objects.filter(place_type="city").count())
print("Institutions created: ", Institution.objects.count())
In [8]:
# Iterates over all institutions, sends a POST request to the Stanbol
# GND corporate-body endpoint, and saves the first result (if there is one).
good = []  # matches: {'gnd-id': <entity id>, 'inst': <Institution>}
bad = []   # institution names with no usable result
for x in Institution.objects.all():
    data = {'limit': 20,
            'constraints': [
                {'type': 'text',
                 'field': 'http://www.w3.org/2000/01/rdf-schema#label',
                 'text': x.name
                 }, ]}
    data = json.dumps(data)
    try:
        r = requests.post('http://stanbol.herkules.arz.oeaw.ac.at/entityhub/site/gndCorporateBody/query',
                          data=data, headers={'content-type': 'application/json'})
        hit = r.json()
        result = {'gnd-id': hit['results'][0]['id'], 'inst': x}
        good.append(result)
    # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt /
    # SystemExit). Still deliberately best-effort: network errors, invalid
    # JSON, and empty result lists all land the institution in `bad`.
    except Exception:
        bad.append(x.name)
In [9]:
# Write the GND id found above back onto each matched Institution object.
for match in good:
    institution = match['inst']
    institution.gnd_id = match['gnd-id']
    institution.save()
In [ ]: