In [1]:
from iSDM.species import GBIFSpecies
In [2]:
# Create the species object for "Etheostoma_blennioides"; all later cells work on it.
my_species = GBIFSpecies(name_species="Etheostoma_blennioides")
In [3]:
# Show the species name stored on the object.
my_species.name_species
Out[3]:
In [4]:
%matplotlib inline
import logging
root = logging.getLogger()
# Attach the stream handler only once: re-running this cell would otherwise add
# another handler each time, and every log message would be printed repeatedly.
if not any(type(handler) is logging.StreamHandler for handler in root.handlers):
    root.addHandler(logging.StreamHandler())
In [5]:
# Call find_species_occurrences() and preview the first rows of its result.
# NOTE(review): presumably this fetches the occurrence records from GBIF — confirm in iSDM.
my_species.find_species_occurrences().head()
Out[5]:
In [6]:
my_species.ID # taxon key derived from GBIF; it acts as a unique ID per species
Out[6]:
In [7]:
# Save the current data; no file name is passed, so the default naming applies.
my_species.save_data()
In [8]:
# Show the name of the source attached to this species object.
my_species.source.name
Out[8]:
In [9]:
# Plot the occurrence data currently held by my_species.
my_species.plot_species_occurrence()
In [10]:
# Run polygonize() with its default arguments and keep the result for the overlay below.
polygonized_species = my_species.polygonize()
In [11]:
# Overlay the polygon geometries on my_species, then show the shape of data_full.
# NOTE(review): overlay() appears to update my_species.data_full — confirm in iSDM.
my_species.overlay(polygonized_species.geometry)
my_species.data_full.shape
Out[11]:
In [12]:
# Keep only the geometries from position 7 onward (this modifies polygonized_species).
# NOTE(review): assigning the shorter series back presumably leaves the first rows
# without a geometry, which is what the dropna() calls in the next cells remove — confirm.
polygonized_species.geometry = polygonized_species.geometry[7:]
In [13]:
# Preview the frame without incomplete rows; this call returns a new object
# and leaves polygonized_species itself unchanged.
polygonized_species.dropna()
Out[13]:
In [14]:
# Now drop the incomplete rows in place, so polygonized_species itself is changed.
polygonized_species.dropna(inplace=True)
In [15]:
# Repeat the overlay with the reduced set of geometries and check the shape of data_full again.
my_species.overlay(polygonized_species.geometry)
my_species.data_full.shape
Out[15]:
In [16]:
# Plot the occurrences again after the second overlay.
my_species.plot_species_occurrence()
In [17]:
# NOTE(review): the .pkl extension suggests a pickle file — only load files from a trusted source.
data = my_species.load_data("./Etheostoma_blennioides2382397.pkl") # alternatively, load previously saved data into the species object
In [18]:
data.columns # names of all the columns available per observation
Out[18]:
In [19]:
# Preview the first rows of the loaded data.
data.head()
Out[19]:
In [20]:
# List the distinct values of the 'country' column.
data['country'].unique().tolist()
Out[20]:
In [21]:
data.shape # (rows, columns): there are 7226 observations, with 138 parameters per observation
Out[21]:
In [22]:
data['vernacularname'].unique().tolist() # distinct values of the 'vernacularname' column
Out[22]:
In [23]:
# Show the last 10 values of the latitude column.
data['decimallatitude'].tail(10)
Out[23]:
In [24]:
import numpy as np
# Keep only the records that have both coordinates: a row is dropped when
# either 'decimallatitude' or 'decimallongitude' is missing.
data_cleaned = data.dropna(
    axis=0,
    how='any',
    subset=['decimallatitude', 'decimallongitude'],
)
In [25]:
data_cleaned.shape # fewer occurrence records now: 5226
Out[25]:
In [26]:
# Distinct values of 'basisofrecord' among the cleaned records.
data_cleaned['basisofrecord'].unique()
Out[26]:
In [27]:
# Number of records that have neither a decimalLatitude nor a decimalLongitude.
# Rows are counted with shape[0]; DataFrame.size would return rows * columns instead.
import numpy as np
data[data['decimallatitude'].isnull() & data['decimallongitude'].isnull()].shape[0]
Out[27]:
In [28]:
# Records with no usable location at all: both coordinates and both
# locality descriptions are missing.
data[
    data[['decimallatitude', 'decimallongitude', 'locality', 'verbatimlocality']]
    .isnull()
    .all(axis=1)
]
Out[28]:
In [29]:
# Compare the date-related columns for the first cleaned records.
data_cleaned[['dateidentified', 'day', 'month', 'year']].head()
Out[29]:
It seems that not all records have a 'dateidentified' value, but the 'day', 'month' and 'year' fields are present for many (possibly all) records. TODO: what about 'verbatimdate'?
Suppose that only the latitude, longitude, rightsHolder and datasetName columns are of interest for our selection, and only for records with a year later than 2010.
In [30]:
# Select the records with a year after 2010 and keep only the four columns of interest.
data_selected = data_cleaned.loc[
    data_cleaned['year'] > 2010,
    ['decimallatitude', 'decimallongitude', 'rightsholder', 'datasetname'],
]
In [31]:
# Preview up to 10 of the selected records that have a dataset name.
data_selected[data_selected['datasetname'].notnull()].head(10)
Out[31]:
In [32]:
my_species.set_data(data_selected) # update the "my_species" object so that it holds the filtered data
In [33]:
# Save the filtered data under an explicit file name.
my_species.save_data(file_name="updated_dataset.pkl")
In [34]:
# Plot the occurrences again, after the data was replaced with the filtered selection.
my_species.plot_species_occurrence()
In [35]:
my_species.get_data().shape # 119 records remain after the filtering
Out[35]:
In [36]:
# Load occurrence data from a CSV file; the path is relative to the working directory.
csv_data = my_species.load_csv('../data/GBIF.csv')
In [37]:
csv_data.head() # let's peek into the data
Out[37]:
In [38]:
# Distinct values of the 'specieskey' column in the CSV data.
csv_data['specieskey'].unique()
Out[38]:
In [39]:
my_species.save_data() # by default the 'speciesKey' is used; an alternative name can be provided
In [40]:
csv_data.columns.size # the CSV data has, for some reason, far fewer columns
Out[40]:
In [41]:
data.columns.size # number of columns when the data comes from the GBIF API directly
Out[41]:
In [42]:
# Columns present in the API data but missing from the CSV data
# (hmm, 'decimalLatitude' vs 'decimallatitude').
# Index.difference returns a sorted result, so the output is the same on every run;
# the order of a plain set difference can change between kernel sessions.
data.columns.difference(csv_data.columns).tolist()
Out[42]:
In [43]:
# Columns present in the CSV data but missing from the API data (hmm, not many).
# Index.difference returns a sorted result, so the output is the same on every run;
# the order of a plain set difference can change between kernel sessions.
csv_data.columns.difference(data.columns).tolist()
Out[43]:
One way of converting point-records (lat/lon) to geometric shapes is by expanding each sample point into a buffer (or "polygon of influence"), and simplifying + merging the overlapping buffers into a cascaded union.
In [44]:
geometrized_species = my_species.polygonize() # returns a geopandas GeoDataFrame with a geometry column
In [45]:
# Display the polygonized result.
geometrized_species
Out[45]:
In [46]:
geometrized_species.plot() # each isolated polygon is a separate record (TODO: decide whether that is what we want)
Out[46]:
In [47]:
# We can tweak the parameters of the polygonize function: here the buffer distance
# and the simplification tolerance are passed explicitly before plotting the result.
geometrized_species = my_species.polygonize(buffer_distance=0.2, simplify_tolerance=0.02)
geometrized_species.plot()
Out[47]:
In [48]:
# Shape of the data currently held by my_species.
my_species.get_data().shape
Out[48]:
In [49]:
# with_envelope=True means "pixelized": an envelope is placed around each buffer region.
geometrized_species = my_species.polygonize(buffer_distance=0.3, simplify_tolerance=0.03, with_envelope=True)
geometrized_species.plot()
Out[49]:
Define a "zoom-in" polygon that we use for selecting a subset of the data.
In [50]:
from shapely.geometry import Point, Polygon
In [51]:
# The rectangle we want to crop to, given by its four (x, y) corners.
# NOTE(review): presumably x is longitude and y is latitude — confirm.
overlay_polygon = Polygon([(-100, 30), (-100, 50), (-70, 50), (-70, 30)])
In [52]:
# Beware: this replaces the original my_species data (the "data_full" field)
# with only the records whose geometry lies within the crop polygon.
inside_overlay = my_species.data_full.geometry.within(overlay_polygon)
my_species.data_full = my_species.data_full[inside_overlay]
In [53]:
# Polygonize the cropped data with the default arguments and plot the result.
my_species.polygonize().plot()
Out[53]:
In [54]:
# Same plot with explicit buffer_distance and simplify_tolerance values.
my_species.polygonize(buffer_distance=0.5, simplify_tolerance=0.05).plot() # more fine-grained
Out[54]:
In [58]:
# Another parameter combination, with a smaller buffer and tolerance than the previous cell.
my_species.polygonize(buffer_distance=0.3, simplify_tolerance=0.03).plot() # etc
Out[58]:
In [59]:
# Same parameters as the previous cell, now with with_envelope=True.
my_species.polygonize(buffer_distance=0.3, simplify_tolerance=0.03, with_envelope=True).plot() # with_envelope means pixelized
Out[59]:
In [60]:
# We can simplify further by taking the convex hull of each polygon and plotting those.
my_species.polygonize().geometry.convex_hull.plot()
Out[60]:
In [61]:
# Polygonize the cropped data again; note that this reuses the name polygonized_species.
polygonized_species = my_species.polygonize()
In [62]:
# Display the polygonized result.
polygonized_species
Out[62]:
In [63]:
# Plot the individual polygons before they are merged below.
polygonized_species.plot()
Out[63]:
In [64]:
# We can make a union of all polygons into one "multipolygon" (Do we need this? I can make a wrapper if needed)
# unary_union is used because shapely deprecated cascaded_union in its favour.
import shapely.ops
my_multipolygon = shapely.ops.unary_union(polygonized_species.geometry.tolist())
my_multipolygon
Out[64]:
In [65]:
from geopandas import GeoDataFrame, GeoSeries
# Wrap the merged geometry in a GeoSeries so that the geopandas helpers
# (plot, convex_hull, ...) can be used on it.
# unary_union is used because shapely deprecated cascaded_union in its favour.
new_series = GeoSeries(shapely.ops.unary_union(polygonized_species.geometry.tolist()))
new_series
Out[65]:
In [66]:
# Plot the merged geometry.
new_series.plot()
Out[66]:
In [67]:
# Plot the convex hull of the merged geometry.
new_series.convex_hull.plot()
Out[67]:
In [68]:
# Bounding box of all geometries in data_full, as (minx, miny, maxx, maxy).
my_species.data_full.geometry.total_bounds
Out[68]:
In [69]:
# Smallest x over the per-geometry bounds; should match the first entry of total_bounds.
my_species.data_full.geometry.bounds.minx.min()
Out[69]:
In [70]:
# Smallest y over the per-geometry bounds; should match the second entry of total_bounds.
my_species.data_full.geometry.bounds.miny.min()
Out[70]:
In [71]:
# Largest x over the per-geometry bounds; should match the third entry of total_bounds.
my_species.data_full.geometry.bounds.maxx.max()
Out[71]:
In [72]:
# Largest y over the per-geometry bounds; should match the fourth entry of total_bounds.
my_species.data_full.geometry.bounds.maxy.max()
Out[72]:
In [ ]: