In [1]:
    
from iSDM.species import GBIFSpecies
    
In [2]:
    
my_species = GBIFSpecies(name_species="Etheostoma_blennioides")
    
In [3]:
    
my_species.name_species
    
    Out[3]:
In [4]:
    
%matplotlib inline
import logging
root = logging.getLogger()
root.addHandler(logging.StreamHandler())
    
In [5]:
    
my_species.find_species_occurrences().head()
    
    
    Out[5]:
In [6]:
    
my_species.ID # taxon key derived from GBIF; effectively a unique ID per species
    
    Out[6]:
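For reference, this key corresponds to the species' entry in the GBIF backbone taxonomy. A minimal sketch of looking it up directly, assuming the pygbif package is installed (illustration only, not necessarily what iSDM does internally):
In [ ]:
    
# Sketch: query the GBIF backbone directly for the taxon key (assumes pygbif is installed).
from pygbif import species as gbif_species
gbif_species.name_backbone(name="Etheostoma blennioides")["usageKey"]  # should match my_species.ID
    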
In [7]:
    
my_species.save_data()
    
    
In [8]:
    
my_species.source.name
    
    Out[8]:
In [9]:
    
my_species.plot_species_occurrence()
    
    
    
In [10]:
    
polygonized_species = my_species.polygonize()
    
    
In [11]:
    
my_species.overlay(polygonized_species.geometry)
my_species.data_full.shape
    
    
    Out[11]:
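The overlay step presumably keeps only the occurrence records whose points fall inside the supplied geometry. A rough sketch of that idea with plain shapely/geopandas, assuming data_full is the underlying GeoDataFrame (it is used directly later in this notebook); this is not the actual iSDM implementation:
In [ ]:
    
# Sketch: roughly what overlay() appears to do, expressed with plain shapely operations.
import shapely.ops
region = shapely.ops.cascaded_union(polygonized_species.geometry.tolist())   # merge the polygons into one shape
inside = my_species.data_full[my_species.data_full.geometry.within(region)]  # keep points inside that shape
inside.shape
    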
In [12]:
    
polygonized_species.geometry = polygonized_species.geometry[7:] # keep only polygons from index 7 onward; the earlier rows become NaN geometries
    
In [13]:
    
polygonized_species.dropna()
    
    Out[13]:
In [14]:
    
polygonized_species.dropna(inplace=True)
    
In [15]:
    
my_species.overlay(polygonized_species.geometry)
my_species.data_full.shape
    
    
    Out[15]:
In [16]:
    
my_species.plot_species_occurrence()
    
    
In [17]:
    
data = my_species.load_data("./Etheostoma_blennioides2382397.pkl") # or just load existing data into Species object
    
    
In [18]:
    
data.columns # all the columns available per observation
    
    Out[18]:
In [19]:
    
data.head()
    
    Out[19]:
In [20]:
    
data['country'].unique().tolist()
    
    Out[20]:
In [21]:
    
data.shape # there are 7226 occurrence records, with 138 columns per record
    
    Out[21]:
In [22]:
    
data['vernacularname'].unique().tolist() # self-explanatory
    
    Out[22]:
In [23]:
    
data['decimallatitude'].tail(10)
    
    Out[23]:
In [24]:
    
import numpy as np
data_cleaned = data.dropna(subset = ['decimallatitude', 'decimallongitude']) # drop records with missing coordinates
    
In [25]:
    
data_cleaned.shape # fewer occurrence records now: 5226
    
    Out[25]:
In [26]:
    
data_cleaned['basisofrecord'].unique()
    
    Out[26]:
In [27]:
    
# number of records with neither decimalLatitude nor decimalLongitude
data[data['decimallatitude'].isnull() & data['decimallongitude'].isnull()].shape[0]
    
    Out[27]:
In [28]:
    
data[data['decimallatitude'].isnull() & 
     data['decimallongitude'].isnull() & 
     data['locality'].isnull() & 
     data['verbatimlocality'].isnull()]
    
    Out[28]:
In [29]:
    
data_cleaned[['dateidentified', 'day', 'month', 'year']].head()
    
    Out[29]:
It seems that not all records have a 'dateidentified', but the 'day', 'month', and 'year' fields are present for many (all?) records; a quick check follows below. TODO: what about verbatimdate?
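A quick way to check how complete these date fields actually are (using the lowercase column names of the API data):
In [ ]:
    
# How many cleaned records are missing 'dateidentified' versus 'year'?
print(data_cleaned['dateidentified'].isnull().sum(), "records without 'dateidentified'")
print(data_cleaned['year'].isnull().sum(), "records without 'year'")
    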
Say that only the latitude, longitude, rightsHolder, and datasetName columns are of interest for our selection.
In [30]:
    
data_selected = data_cleaned[data_cleaned['year']>2010][['decimallatitude','decimallongitude', 'rightsholder', 'datasetname']]
    
In [31]:
    
data_selected[~data_selected.datasetname.isnull()].head(10)
    
    Out[31]:
In [32]:
    
my_species.set_data(data_selected) # update the object "my_species" to contain the filtered data
    
In [33]:
    
my_species.save_data(file_name="updated_dataset.pkl")
    
    
In [34]:
    
my_species.plot_species_occurrence()
    
    
    
In [35]:
    
my_species.get_data().shape # there are 119 records now
    
    Out[35]:
In [36]:
    
csv_data = my_species.load_csv('../data/GBIF.csv')
    
    
In [37]:
    
csv_data.head() # let's peek into the data
    
    Out[37]:
In [38]:
    
csv_data['specieskey'].unique()
    
    Out[38]:
In [39]:
    
my_species.save_data() # by default the 'speciesKey' is used for the file name; an alternative name can be provided
    
    
In [40]:
    
csv_data.columns.size # the CSV data has, for some reason, far fewer columns
    
    Out[40]:
In [41]:
    
data.columns.size # data from using GBIF API directly
    
    Out[41]:
In [42]:
    
list(set(data.columns.tolist()) - set(csv_data.columns.tolist())) # hmm, 'decimalLatitude' vs 'decimallatitude'
    
    Out[42]:
In [43]:
    
list(set(csv_data.columns.tolist()) - set(data.columns.tolist())) # hmm, not many
    
    Out[43]:
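Much of the apparent difference seems to be naming convention (camelCase in the CSV export vs. all-lowercase from the API). One way to reconcile the two before comparing or merging, as a sketch (not something iSDM requires):
In [ ]:
    
# Normalize the CSV column names to lowercase so they line up with the API column names.
csv_data_normalized = csv_data.rename(columns=str.lower)
list(set(data.columns) - set(csv_data_normalized.columns))  # the remaining, genuinely different columns
    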
One way of converting point records (lat/lon) into geometric shapes is to expand each sample point into a buffer (a "polygon of influence"), then simplify and merge the overlapping buffers into a cascaded union.
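A rough sketch of that idea with plain shapely/geopandas (toy points and arbitrary buffer/tolerance values; not the actual polygonize() implementation):
In [ ]:
    
# Sketch: buffer each point, simplify the buffers, and merge the overlapping ones.
from shapely.geometry import Point
from geopandas import GeoSeries
import shapely.ops

points = [Point(-85.0, 38.0), Point(-85.1, 38.05), Point(-80.0, 36.0)]    # toy sample points
buffers = [p.buffer(0.2).simplify(0.02) for p in points]                  # "polygons of influence"
merged = shapely.ops.cascaded_union(buffers)                              # dissolve overlapping buffers
GeoSeries(list(merged.geoms) if hasattr(merged, "geoms") else [merged])   # one row per resulting polygon
    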
In [44]:
    
geometrized_species = my_species.polygonize()  # returns a GeoDataFrame with a geometry column
    
    
In [45]:
    
geometrized_species
    
    Out[45]:
In [46]:
    
geometrized_species.plot()  # each isolated polygon is a separate record (do we want that, or should they be merged?)
    
    Out[46]:
    
In [47]:
    
# we can tweak the parameters for the polygonize function
geometrized_species = my_species.polygonize(buffer_distance=0.2, simplify_tolerance=0.02)
geometrized_species.plot()
    
    
    Out[47]:
    
In [48]:
    
my_species.get_data().shape
    
    Out[48]:
In [49]:
    
# with_envelope means "pixelized" (envelope around each buffer region)
geometrized_species = my_species.polygonize(buffer_distance=0.3, simplify_tolerance=0.03, with_envelope=True)
geometrized_species.plot()
    
    
    Out[49]:
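To see what the envelope does: shapely's .envelope is the axis-aligned bounding rectangle of a geometry, so each simplified buffer is replaced by a box, which produces the "pixelized" look:
In [ ]:
    
# A single point's buffer vs. its envelope (the axis-aligned bounding box around it).
from shapely.geometry import Point
circle = Point(-85.0, 38.0).buffer(0.3)
circle.envelope.bounds  # (minx, miny, maxx, maxy) of the rectangle that replaces the round buffer
    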
    
Define a "zoom-in" polygon that we use for selecting a subset of the data.
In [50]:
    
from shapely.geometry import Point, Polygon
    
In [51]:
    
# say we want to crop to this polygon area only
overlay_polygon = Polygon(((-100,30), (-100, 50), (-70, 50),(-70, 30)))
    
In [52]:
    
# Beware, this overwrites the original my_species data ("data_full" field)
my_species.data_full = my_species.data_full[my_species.data_full.geometry.within(overlay_polygon)]
    
In [53]:
    
my_species.polygonize().plot()
    
    
    Out[53]:
    
In [54]:
    
my_species.polygonize(buffer_distance=0.5, simplify_tolerance=0.05).plot() # more fine-grained
    
    
    Out[54]:
    
In [58]:
    
my_species.polygonize(buffer_distance=0.3, simplify_tolerance=0.03).plot()  # etc
    
    
    Out[58]:
    
In [59]:
    
my_species.polygonize(buffer_distance=0.3, simplify_tolerance=0.03, with_envelope=True).plot() # with_envelope means pixelized
    
    
    Out[59]:
    
In [60]:
    
# we can further simplify with a "convex hull" around each polygon
my_species.polygonize().geometry.convex_hull.plot()
    
    
    Out[60]:
    
In [61]:
    
polygonized_species = my_species.polygonize()
    
    
In [62]:
    
polygonized_species
    
    Out[62]:
In [63]:
    
polygonized_species.plot()
    
    Out[63]:
    
In [64]:
    
# We can make a union of all polygons into one "multipolygon" (Do we need this? I can make a wrapper if needed)
import shapely.ops
my_multipolygon = shapely.ops.cascaded_union(polygonized_species.geometry.tolist())
my_multipolygon
    
    Out[64]:
In [65]:
    
from geopandas import GeoDataFrame, GeoSeries
new_series = GeoSeries(shapely.ops.cascaded_union(polygonized_species.geometry.tolist()))
new_series
    
    Out[65]:
In [66]:
    
new_series.plot()
    
    Out[66]:
    
In [67]:
    
new_series.convex_hull.plot()
    
    Out[67]:
    
In [68]:
    
my_species.data_full.geometry.total_bounds
    
    Out[68]:
In [69]:
    
my_species.data_full.geometry.bounds.minx.min()
    
    Out[69]:
In [70]:
    
my_species.data_full.geometry.bounds.miny.min()
    
    Out[70]:
In [71]:
    
my_species.data_full.geometry.bounds.maxx.max()
    
    Out[71]:
In [72]:
    
my_species.data_full.geometry.bounds.maxy.max()
    
    Out[72]:
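Note that total_bounds above already returns (minx, miny, maxx, maxy) in a single call, so the four separate min/max cells can be collapsed into one unpacking:
In [ ]:
    
# Equivalent to the four cells above.
minx, miny, maxx, maxy = my_species.data_full.geometry.total_bounds
minx, miny, maxx, maxy
    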
In [ ]: