Python for Creating CSV and Preliminary Reporting


In [13]:
import csv
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import pandas as pd
from shapely.geometry import Point

%matplotlib inline
plt.style.use('ggplot')

Load Data and Write to CSV


In [2]:
with open("roman-amphitheaters.geojson") as f:
    j = json.load(f)

In [3]:
# If there is one, I'd welcome a more pythonic approach. One that 
# accomodates the variable data model supported by JSON.

d = []
for feature in j['features']:
    
    # Check for optional properties

    if 'latintoponym' in feature['properties'].keys():
        latintoponym = feature['properties']['latintoponym']
    else:
        latintoponym = ''    

    if 'welchid' in feature['properties'].keys():
        welchid = feature['properties']['welchid']
    else:
        welchid = ''

    if 'golvinid' in feature['properties'].keys():
        golvinid = feature['properties']['golvinid']
    else:
        golvinid = ''

    if 'buildingtype' in feature['properties'].keys():
        buildingtype = feature['properties']['buildingtype']
    else:
        buildingtype = ''
        
    if 'buildingtype' in feature['properties'].keys():
        buildingtype = feature['properties']['buildingtype']
    else:
        buildingtype = ''
 
    if 'chronogroup' in feature['properties'].keys():
        chronogroup = feature['properties']['chronogroup']
    else:
        chronogroup = ''

    secondcentury = True
    if 'exclude' in feature['properties'].keys():
        secondcentury = False

    if 'capacity' in feature['properties'].keys():
        capacity = feature['properties']['capacity']['quantity']
    else:
        capacity = ''

    if 'province' in feature['properties'].keys():
        romanregion = feature['properties']['province']
    elif 'region' in feature['properties'].keys():
        romanregion = feature['properties']['region']
    else:
        romanregion = ''
        
    arenamajor = ''
    arenaminor = ''
    extmajor = ''
    extminor = ''
    exteriorheight = ''
    if 'dimensions' in feature['properties'].keys():
        dimensions = feature['properties']['dimensions']
        
        if 'arenamajor' in dimensions.keys():
            arenamajor = dimensions['arenamajor']

        if 'arenaminor' in dimensions.keys():
            arenaminor = dimensions['arenaminor']
            
        if 'exteriormajor' in dimensions.keys():
            extmajor = dimensions['exteriormajor']

        if 'exteriorminor' in dimensions.keys():
            extminor = dimensions['exteriorminor']
            
        if 'exteriorheight' in dimensions.keys():
            exteriorheight = dimensions['exteriorheight']
            
    d.append((feature['id'],
              feature['properties']['title'],
              feature['properties']['label'],
              latintoponym,
              feature['properties']['pleiades'],
              welchid,
              golvinid,
              buildingtype,
              chronogroup,
              secondcentury,
              capacity,
              feature['properties']['moderncountry'],
              romanregion,
              arenamajor,
              arenaminor,
              extmajor,
              extminor,
              exteriorheight,
              feature['geometry']['coordinates'][0],
              feature['geometry']['coordinates'][1],
              feature['geometry']['coordinates'][2]))

ramphs_df = pd.DataFrame(d, columns=(
 'id',    # short id
 'title', # longer title
 'label', # short label
 'latintoponym', # latin toponym
 'pleiades', # pleiades https uri
 'welchid',  # id in Welch
 'golvinid', # id in Golvin
 'buildingtype',  # usually 'amphitheater'
 'chronogroup',   # label for the chronological group
 'secondcentury', # is this an amphitheater that was in use in 2nd century
 'capacity',    # capacity as integer
 'modcountry',  # modern country
 'romanregion', # province or augustan region of italy
 'arenamajor', # long axis of arena in meters
 'arenaminor', # short axis of arena in meters
 'extmajor',   # long axis of exterior
 'extminor', # short axis of exterior
 'exteriorheight',   # height of exterior wall if known
 'longitude', # latitude
 'latitude', # longitude
 'elevation'  # elevation in meters.
 )) 

ramphs_df[['capacity','elevation','arenamajor','arenaminor',
        'extmajor','extminor','exteriorheight']] = ramphs_df[['capacity','elevation','arenamajor',
        'arenaminor','extmajor','extminor','exteriorheight']].apply(pd.to_numeric)

In [4]:
ramphs_df.to_csv("roman-amphitheaters.csv", index = False, quoting = csv.QUOTE_NONNUMERIC)

In [5]:
ramphs_df[['id','title','chronogroup','latintoponym','romanregion','modcountry','capacity',
           'extmajor','extminor','arenamajor','arenaminor','latitude','longitude']].to_csv('tmp.csv', index = False, quoting = csv.QUOTE_NONNUMERIC)

Basic Reporting


In [6]:
ramphs_df.head(2)


Out[6]:
id title label latintoponym pleiades welchid golvinid buildingtype chronogroup secondcentury ... modcountry romanregion arenamajor arenaminor extmajor extminor exteriorheight longitude latitude elevation
0 duraEuroposAmphitheater Amphitheater at Dura Europos Dura Dura Europus https://pleiades.stoa.org/places/893989 129 amphitheater severan False ... Syria syria 31.0 25.0 50.0 44.0 NaN 40.728926 34.749855 223
1 arlesAmphitheater Amphitheater at Arles Arles Arelate https://pleiades.stoa.org/places/148217 154 amphitheater flavian True ... France narbonensis 47.0 32.0 136.0 107.0 NaN 4.631111 43.677778 21

2 rows × 21 columns


In [7]:
ramphs_df.describe()


Out[7]:
capacity arenamajor arenaminor extmajor extminor exteriorheight longitude latitude elevation
count 124.000000 150.000000 149.000000 181.000000 167.000000 3.000000 261.000000 261.000000 261.000000
mean 12095.806452 57.176667 38.089933 97.437182 77.107844 41.483333 10.607393 42.206039 192.628352
std 9200.198900 14.263028 8.499440 29.751924 24.859736 9.859048 9.039066 4.966858 210.522363
min 1000.000000 25.000000 19.000000 39.600000 34.000000 32.450000 -8.493330 31.608189 -121.000000
25% 5112.500000 47.125000 33.000000 76.000000 59.200000 36.225000 5.490960 37.983696 32.000000
50% 9200.000000 58.000000 39.000000 95.000000 75.000000 40.000000 10.913907 42.077200 119.000000
75% 15662.500000 67.000000 43.000000 117.720000 94.250000 46.000000 14.250089 45.467767 283.000000
max 50000.000000 101.000000 62.000000 189.000000 156.000000 52.000000 40.728926 55.602600 1170.000000

In [8]:
ramphs_df[ramphs_df.secondcentury].describe()


Out[8]:
capacity arenamajor arenaminor extmajor extminor exteriorheight longitude latitude elevation
count 115.000000 141.000000 140.000000 172.000000 160.000000 2.000000 243.000000 243.000000 243.000000
mean 11859.826087 57.281915 38.249286 96.765291 76.610688 42.225000 10.119319 42.329709 195.041152
std 9071.861611 14.358440 8.561603 28.713826 24.773195 13.823938 8.855243 5.005590 211.581565
min 1200.000000 25.000000 19.000000 39.600000 34.000000 32.450000 -8.493330 31.608189 1.000000
25% 5075.000000 47.000000 33.000000 75.750000 58.950000 37.337500 4.164815 38.314765 34.000000
50% 9000.000000 58.000000 39.000000 93.400000 74.000000 42.225000 10.583180 42.239312 120.000000
75% 15095.000000 67.000000 43.000000 115.000000 93.400000 47.112500 14.095206 45.644955 283.000000
max 50000.000000 101.000000 62.000000 189.000000 156.000000 52.000000 38.273763 55.602600 1170.000000

In [9]:
# Confirm that CSV is readable
# It would be nice if the "numeric pattern" string survived as strings.
pd.read_csv("roman-amphitheaters.csv", quoting = 2).describe()


Out[9]:
welchid golvinid capacity arenamajor arenaminor extmajor extminor exteriorheight longitude latitude elevation
count 18.000000 82.000000 124.000000 150.000000 149.000000 181.000000 167.000000 3.000000 261.000000 261.000000 261.000000
mean 9.777778 109.524390 12095.806452 57.176667 38.089933 97.437182 77.107844 41.483333 10.607393 42.206039 192.628352
std 5.704029 63.329089 9200.198900 14.263028 8.499440 29.751924 24.859736 9.859048 9.039066 4.966858 210.522363
min 1.000000 12.000000 1000.000000 25.000000 19.000000 39.600000 34.000000 32.450000 -8.493330 31.608189 -121.000000
25% 5.250000 64.250000 5112.500000 47.125000 33.000000 76.000000 59.200000 36.225000 5.490960 37.983696 32.000000
50% 9.500000 107.000000 9200.000000 58.000000 39.000000 95.000000 75.000000 40.000000 10.913907 42.077200 119.000000
75% 14.500000 145.250000 15662.500000 67.000000 43.000000 117.720000 94.250000 46.000000 14.250089 45.467767 283.000000
max 19.000000 298.000000 50000.000000 101.000000 62.000000 189.000000 156.000000 52.000000 40.728926 55.602600 1170.000000

In [10]:
# which have heights
ramphs_df[ramphs_df.exteriorheight > 0]


Out[10]:
id title label latintoponym pleiades welchid golvinid buildingtype chronogroup secondcentury ... modcountry romanregion arenamajor arenaminor extmajor extminor exteriorheight longitude latitude elevation
4 romeFlavianAmphitheater Flavian Amphitheater at Rome Colosseum https://pleiades.stoa.org/places/423025 152 amphitheater flavian True ... Italy regio-i 83.00 48.00 189.00 156.0 52.00 12.492269 41.890169 22
90 thysdrusAmphitheater Amphitheater at Thysdrus Thysdrus (lg.) https://pleiades.stoa.org/places/324835 amphitheater post-severan False ... Tunisia proconsularis 65.00 39.00 148.00 122.0 40.00 10.706939 35.296390 111
97 pulaAmphitheater Amphitheater at Pula Pula Colonia Pietas Iulia Pola Pollentia Herculanea https://pleiades.stoa.org/places/197448 amphitheater julio-claudian True ... Croatia regio-x 67.95 41.65 132.45 105.1 32.45 13.850243 44.873229 16

3 rows × 21 columns


In [11]:
# which don't have exteriormajor
ramphs_df[pd.isnull(ramphs_df.extmajor)].sort_values(by = 'longitude')\
[['id','modcountry','latintoponym','golvinid','extmajor','arenamajor','latitude','longitude']]


Out[11]:
id modcountry latintoponym golvinid extmajor arenamajor latitude longitude
157 bragaAmphitheater Portugal Bracara Augusta NaN NaN 41.546669 -8.430075
166 bobadelaAmphitheater Portugal Elbocoris NaN 50.0 40.361088 -7.893572
56 lixusAmphitheater Morocco NaN NaN 35.199900 -6.108468
121 caparraAmphitheater Spain Municipium Flavium Caparense NaN 30.0 40.164159 -6.100049
98 carmonaAmphitheater Spain Carmo NaN NaN 37.469674 -5.650907
... ... ... ... ... ... ... ... ...
105 salamisAmphitheater Northern Cyprus NaN NaN 35.185517 33.902435
47 scythopolisNysaAmphitheater Israel NaN NaN 32.498395 35.501631
102 antiochAmphitheater Turkey NaN NaN 36.202624 36.160437
174 bostraAmphitheater Syria Nova Trajana Bostra NaN NaN 32.517923 36.479844
175 palmyraAmphitheater Syria NaN NaN 34.553789 38.273763

80 rows × 8 columns


In [ ]:
ramphs_df[ramphs_df.golvinid == '' ][['id','latintoponym','golvinid','extmajor','arenamajor','latitude','longitude']]

In [ ]:
ramphs_df[ramphs_df.latintoponym == '' ][['id','latintoponym','golvinid','extmajor','arenamajor','latitude','longitude']]

Duplicate Checking


In [ ]:
dups = ramphs_df[ramphs_df.label.duplicated(keep = False)]\
[['id','pleiades','latintoponym','latitude','longitude']].sort_values('pleiades')

len(dups) == 0

In [ ]:
dups = ramphs_df[ramphs_df.id.duplicated(keep = False)]\
[['id','pleiades','latintoponym','latitude','longitude']].sort_values('pleiades')

len(dups) == 0

In [ ]:
dups = ramphs_df[ramphs_df.pleiades.duplicated(keep = False)]\
[['id','pleiades','latintoponym','latitude','longitude']].sort_values('pleiades')

len(dups) == 15

In [ ]:
dups = ramphs_df[ramphs_df.latintoponym.duplicated(keep = False)]\
[['id','pleiades','latintoponym',
  'latitude','longitude']].sort_values('pleiades')

len(dups.query("latintoponym != ''")) == 11

Basic Mapping


In [14]:
rgdf = gpd.read_file("roman-amphitheaters.geojson")

In [15]:
rgdf.crs


Out[15]:
<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [16]:
rgdf.plot(color = 'black')


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x120521710>

In [ ]:
# this is simple enough that all steps are in one cell
c = []
for cgrp in j['romanamphitheaterschronogroups']:
    c.append((cgrp['id'],
    cgrp['startdate'],
    cgrp['enddate']))
    
chrono_df  = pd.DataFrame(c, columns=('chronogroup','startdate','enddate'))

chrono_df.to_csv("chronogrps.csv", index = False, quoting = csv.QUOTE_NONNUMERIC)
chrono_df.head(2)

In [ ]:
ramphs_df.merge(chrono_df)