In [1]:
import csv, re, requests, json

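All cells below assume two Django models, Place and Institution, defined elsewhere in the project. Their actual definitions are not part of this notebook; a minimal sketch, inferred only from the fields used here (field types and options are guesses, not the project's real code), might look like this:

# Minimal sketch of the assumed models; field names are taken from their usage
# in the cells below, but types and options are assumptions.
from django.db import models

class Place(models.Model):
    name = models.CharField(max_length=250, blank=True)
    geonames_id = models.CharField(max_length=50, blank=True)
    place_type = models.CharField(max_length=50, blank=True)  # e.g. "country" or "city"
    lat = models.CharField(max_length=50, blank=True)
    lng = models.CharField(max_length=50, blank=True)
    part_of = models.ForeignKey('self', blank=True, null=True, on_delete=models.SET_NULL)

class Institution(models.Model):
    name = models.CharField(max_length=250, blank=True)
    lat = models.CharField(max_length=50, blank=True)
    lng = models.CharField(max_length=50, blank=True)
    place = models.ForeignKey(Place, blank=True, null=True, on_delete=models.SET_NULL)
    gnd_id = models.CharField(max_length=250, blank=True)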
In [2]:
len(Place.objects.all())


Out[2]:
171

In [3]:
# Wipe any existing Place and Institution records before re-importing
for x in Place.objects.all():
    x.delete()
for x in Institution.objects.all():
    x.delete()

In [4]:
# Fetch the data directly from GitHub
# see http://stackoverflow.com/questions/35371043/use-python-requests-to-download-csv
url = 'https://raw.githubusercontent.com/gfranzini/digEds_cat/master/digEds_cat_spatial.csv'
with requests.Session() as s:
    download = s.get(url)
    decoded_content = download.content.decode('utf-8')
    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    datalist = list(cr)

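Before running the import it can help to confirm the parsing worked; the loop further down uses column 0 as the Institution name and columns 1 and 2 as latitude and longitude. These two lines are illustrative and not part of the original run:

# Illustrative sanity check of the parsed CSV (not in the original notebook)
print(len(datalist), "rows parsed")
print(datalist[0][:3])  # header row; columns 1 and 2 should read 'Latitude' and 'Longitude'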
In [5]:
# Alternative: load a local copy of the data
#file = "data/digEds_cat_spatial.csv"
#with open(file, 'r', encoding='utf-8') as data:
    #reader = csv.reader(data)
    #datalist = list(reader)

In [5]:
# Iterates over the rows in the CSV. For each lat/lng pair it queries the GeoNames
# API to fetch the matching place (and the country it is located in), then creates
# Place and Institution objects, relates them and stores them in the DB.
root = "http://api.geonames.org/findNearbyPlaceNameJSON?lat="
geoname_jsons = []
for row in datalist:
    if row[1] != "" and row[1] != "Latitude":
        lat, lng = format(float(row[1]), '.3f'), format(float(row[2]), '.3f')
        params = "{}&lng={}&username=digitalarchiv".format(lat, lng)
        url = root + params
        r = requests.get(url)
        x = r.json()
        if len(x["geonames"]) >= 1:
            place_dict = x["geonames"][0]
            temp_country, _ = Place.objects.get_or_create(geonames_id=place_dict["countryId"],
                                                          name=place_dict["countryName"])
            temp_country.place_type = "country"
            temp_country.save()
            temp_place, _ = Place.objects.get_or_create(geonames_id=place_dict["geonameId"],
                                                        name=place_dict["name"],
                                                        place_type="city",
                                                        lat=place_dict["lat"],
                                                        lng=place_dict["lng"])
            temp_place.part_of = temp_country
            temp_place.save()
            temp_inst, _ = Institution.objects.get_or_create(name=row[0])
            temp_inst.lat = row[1]
            temp_inst.lng = row[2]
            temp_inst.place = temp_place
            temp_inst.save()

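The loop above relies on only a handful of keys from GeoNames' findNearbyPlaceNameJSON response (geonames, geonameId, name, countryId, countryName, lat, lng). A hypothetical helper isolating that lookup, using the same endpoint and username as above but with minimal error handling added, might look like this:

# Hypothetical helper wrapping the GeoNames lookup used above (not part of the
# original notebook); returns the first match or None.
def find_nearby_place(lat, lng, username="digitalarchiv"):
    url = "http://api.geonames.org/findNearbyPlaceNameJSON"
    try:
        r = requests.get(url, params={"lat": lat, "lng": lng, "username": username})
        matches = r.json().get("geonames", [])
    except (requests.RequestException, ValueError):
        return None
    return matches[0] if matches else None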
In [7]:
print("Countries created: ", len(Place.objects.filter(place_type="country")))
print("Places created: ", len(Place.objects.filter(place_type="city")))
print("Institutions created: ", len(Institution.objects.all()) )


Countries created:  25
Places created:  146
Institutions created:  161

In [8]:
# Iterates over all institutions, sends a POST request to the Stanbol entityhub endpoint
# and saves the result (if there is one)
good = []
bad = []
for x in Institution.objects.all():
    data = {'limit': 20,
        'constraints': [
            {'type': 'text',
            'field': 'http://www.w3.org/2000/01/rdf-schema#label',
            'text': x.name
            },]}
    data = json.dumps(data)
    try:
        r = requests.post('http://stanbol.herkules.arz.oeaw.ac.at/entityhub/site/gndCorporateBody/query',
                          data=data, headers={'content-type': 'application/json'})
        hit = r.json()
        result = {'gnd-id': hit['results'][0]['id'], 'inst': x}
        good.append(result)
    except (requests.RequestException, ValueError, KeyError, IndexError):
        # request failed, response was not JSON, or no matching entity was found
        bad.append(x.name)

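Before writing anything back it is worth checking how many institutions actually received a GND match. These two lines are illustrative and not part of the original run:

print("GND matches found:", len(good))
print("No match:", len(bad))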
In [9]:
# Iterates over the previously created list of results and updates each Institution object with its GND ID
for x in good:
    x['inst'].gnd_id = x['gnd-id']
    x['inst'].save()

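As a final check, one could count how many institutions now carry a GND ID. This query is illustrative, not part of the original output, and assumes unmatched institutions keep an empty or NULL gnd_id:

# Illustrative verification query (assumes gnd_id is empty/NULL for unmatched records)
print("Institutions with GND ID:",
      Institution.objects.exclude(gnd_id__isnull=True).exclude(gnd_id='').count())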