In [13]:
import csv
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')
In [2]:
# Parse the GeoJSON source into a plain Python dict. The encoding is pinned to
# UTF-8 (the encoding JSON is exchanged in) so that non-ASCII text does not
# depend on the platform's default locale encoding.
with open("roman-amphitheaters.geojson", encoding="utf-8") as f:
    j = json.load(f)
In [3]:
# Flatten each GeoJSON feature into one row of a table. JSON accommodates a
# variable data model, so optional properties are looked up with dict.get()
# and fall back to '' when a feature does not carry them.
d = []
for feature in j['features']:
    props = feature['properties']
    dimensions = props.get('dimensions', {})
    coords = feature['geometry']['coordinates']

    # The capacity figure is nested one level down, under 'quantity'.
    capacity = props['capacity']['quantity'] if 'capacity' in props else ''

    # Prefer 'province'; fall back to 'region' when there is no province.
    romanregion = props.get('province', props.get('region', ''))

    # GeoJSON positions are ordered longitude, latitude[, elevation]. The
    # third element is optional, so it is not indexed unconditionally.
    elevation = coords[2] if len(coords) > 2 else ''

    d.append((feature['id'],
              props['title'],
              props['label'],
              props.get('latintoponym', ''),
              props['pleiades'],
              props.get('welchid', ''),
              props.get('golvinid', ''),
              props.get('buildingtype', ''),
              props.get('chronogroup', ''),
              # The mere presence of 'exclude' clears the flag; its value is not read.
              'exclude' not in props,
              capacity,
              props['moderncountry'],
              romanregion,
              dimensions.get('arenamajor', ''),
              dimensions.get('arenaminor', ''),
              dimensions.get('exteriormajor', ''),
              dimensions.get('exteriorminor', ''),
              dimensions.get('exteriorheight', ''),
              coords[0],
              coords[1],
              elevation))

ramphs_df = pd.DataFrame(d, columns=(
    'id',              # short id
    'title',           # longer title
    'label',           # short label
    'latintoponym',    # latin toponym
    'pleiades',        # pleiades https uri
    'welchid',         # id in Welch
    'golvinid',        # id in Golvin
    'buildingtype',    # usually 'amphitheater'
    'chronogroup',     # label for the chronological group
    'secondcentury',   # is this an amphitheater that was in use in 2nd century
    'capacity',        # capacity (converted to a numeric column below)
    'modcountry',      # modern country
    'romanregion',     # province or augustan region of italy
    'arenamajor',      # long axis of arena in meters
    'arenaminor',      # short axis of arena in meters
    'extmajor',        # long axis of exterior
    'extminor',        # short axis of exterior
    'exteriorheight',  # height of exterior wall if known
    'longitude',       # longitude (first element of the GeoJSON position)
    'latitude',        # latitude (second element of the GeoJSON position)
    'elevation'        # elevation in meters.
    ))

# Convert the measurement columns to numeric dtypes. The '' placeholders used
# for missing values are expected to come out as NaN, which is what the
# null checks on these columns elsewhere in the notebook rely on.
numeric_columns = ['capacity', 'elevation', 'arenamajor', 'arenaminor',
                   'extmajor', 'extminor', 'exteriorheight']
ramphs_df[numeric_columns] = ramphs_df[numeric_columns].apply(pd.to_numeric)
In [4]:
# Persist the complete table as CSV, without the index column.
# QUOTE_NONNUMERIC wraps every non-numeric field in quotes.
ramphs_df.to_csv(
    "roman-amphitheaters.csv",
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)
In [5]:
# Export a reduced selection of columns to a scratch CSV file.
tmp_columns = [
    'id', 'title', 'chronogroup', 'latintoponym', 'romanregion',
    'modcountry', 'capacity', 'extmajor', 'extminor', 'arenamajor',
    'arenaminor', 'latitude', 'longitude',
]
ramphs_df[tmp_columns].to_csv(
    'tmp.csv',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)
In [6]:
# Preview the first two rows of the flattened table.
ramphs_df.head(2)
Out[6]:
In [7]:
# Summary statistics for the whole table.
ramphs_df.describe()
Out[7]:
In [8]:
# Summary statistics restricted to the rows whose 'secondcentury' flag is True.
in_second_century = ramphs_df['secondcentury']
ramphs_df.loc[in_second_century].describe()
Out[8]:
In [9]:
# Sanity check: the CSV written above can be read back in.
# It would be nice if strings that merely look numeric survived as strings.
csv_roundtrip = pd.read_csv("roman-amphitheaters.csv", quoting=csv.QUOTE_NONNUMERIC)
csv_roundtrip.describe()
Out[9]:
In [10]:
# Rows that record a positive exterior wall height.
ramphs_df.query("exteriorheight > 0")
Out[10]:
In [11]:
# Rows with no exterior major axis recorded, ordered by longitude.
missing_extmajor = ramphs_df['extmajor'].isnull()
ramphs_df.loc[missing_extmajor].sort_values(by='longitude')[[
    'id', 'modcountry', 'latintoponym', 'golvinid',
    'extmajor', 'arenamajor', 'latitude', 'longitude',
]]
Out[11]:
In [ ]:
# Rows whose 'golvinid' is the empty string.
ramphs_df.loc[
    ramphs_df['golvinid'] == '',
    ['id', 'latintoponym', 'golvinid', 'extmajor', 'arenamajor', 'latitude', 'longitude'],
]
In [ ]:
# Rows whose 'latintoponym' is the empty string.
ramphs_df.loc[
    ramphs_df['latintoponym'] == '',
    ['id', 'latintoponym', 'golvinid', 'extmajor', 'arenamajor', 'latitude', 'longitude'],
]
In [ ]:
# Collect every row whose 'label' occurs more than once (all occurrences are
# kept); the final expression is True when there are no such rows.
label_repeated = ramphs_df['label'].duplicated(keep=False)
dups = ramphs_df.loc[
    label_repeated,
    ['id', 'pleiades', 'latintoponym', 'latitude', 'longitude'],
].sort_values('pleiades')
dups.shape[0] == 0
In [ ]:
# Collect every row whose 'id' occurs more than once (all occurrences are
# kept); the final expression is True when there are no such rows.
id_repeated = ramphs_df['id'].duplicated(keep=False)
dups = ramphs_df.loc[
    id_repeated,
    ['id', 'pleiades', 'latintoponym', 'latitude', 'longitude'],
].sort_values('pleiades')
dups.shape[0] == 0
In [ ]:
# Collect every row whose 'pleiades' value occurs more than once (all
# occurrences are kept); the final expression compares the row count with 15.
pleiades_repeated = ramphs_df['pleiades'].duplicated(keep=False)
dups = ramphs_df.loc[
    pleiades_repeated,
    ['id', 'pleiades', 'latintoponym', 'latitude', 'longitude'],
].sort_values('pleiades')
dups.shape[0] == 15
In [ ]:
# Collect every row whose 'latintoponym' occurs more than once (all
# occurrences are kept). Rows with an empty toponym are left out of the
# final count, which is compared with 11.
toponym_repeated = ramphs_df['latintoponym'].duplicated(keep=False)
dups = ramphs_df.loc[
    toponym_repeated,
    ['id', 'pleiades', 'latintoponym', 'latitude', 'longitude'],
].sort_values('pleiades')
len(dups[dups['latintoponym'] != '']) == 11
In [14]:
# Load the same GeoJSON file with geopandas for spatial work.
rgdf = gpd.read_file("roman-amphitheaters.geojson")
In [15]:
# Show the coordinate reference system geopandas attached to the layer.
rgdf.crs
Out[15]:
In [16]:
# Quick-look plot of every feature, drawn in black.
rgdf.plot(color = 'black')
Out[16]:
In [ ]:
# Small enough to do in a single cell: tabulate the chronological groups
# listed in the GeoJSON, save them as CSV, and preview the first two rows.
c = [
    (cgrp['id'], cgrp['startdate'], cgrp['enddate'])
    for cgrp in j['romanamphitheaterschronogroups']
]
chrono_df = pd.DataFrame(c, columns=('chronogroup', 'startdate', 'enddate'))
chrono_df.to_csv(
    "chronogrps.csv",
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)
chrono_df.head(2)
In [ ]:
# Attach the start/end dates of each chronological group to the amphitheater
# rows. 'chronogroup' is the only column the two tables share, and the join
# is an inner join, so rows without a matching group are dropped.
pd.merge(ramphs_df, chrono_df, how='inner', on='chronogroup')