In [1]:
    
from iSDM.species import GBIFSpecies
    
In [2]:
    
# Create a GBIF-backed Species object for this species name (the underscore
# form is presumably normalized by iSDM before querying — TODO confirm).
my_species = GBIFSpecies(name_species="Etheostoma_blennioides")
    
In [3]:
    
# Echo the species name stored on the object.
my_species.name_species
    
    Out[3]:
In [4]:
    
%matplotlib inline
import logging
root = logging.getLogger()
root.addHandler(logging.StreamHandler())
    
In [5]:
    
# Fetch occurrence records for this species (presumably via the GBIF web API,
# so this may be slow — TODO consider caching) and preview the first rows.
my_species.find_species_occurrences().head()
    
    
    Out[5]:
In [6]:
    
my_species.ID # GBIF taxon key — acts as a unique identifier for the species
    
    Out[6]:
In [8]:
    
# Persist the occurrence data to disk (default file name is presumably derived
# from the species name / taxon key — see the load_data call below).
my_species.save_data()
    
    
In [9]:
    
# Name of the data source backing this object (presumably "GBIF").
my_species.source.name
    
    Out[9]:
In [10]:
    
# Plot the geographic locations of the occurrence records.
my_species.plot_species_occurrence()
    
    
In [11]:
    
# Reload previously saved data into the Species object instead of re-querying.
# NOTE(review): hardcoded relative path; "2382397" is presumably the GBIF
# taxon key shown by In [6] — confirm it matches my_species.ID.
data = my_species.load_data("./Etheostoma_blennioides2382397.pkl") # or just load existing data into Species object
    
    
In [12]:
    
data.columns # all the columns available per observation record
    
    Out[12]:
In [13]:
    
# Distinct countries in which this species has been observed, in order of
# first appearance (drop_duplicates preserves the same order as unique()).
data['country'].drop_duplicates().tolist()
    
    Out[13]:
In [14]:
    
data.shape # (rows, columns): 7226 observations, 138 fields per observation
    
    Out[14]:
In [15]:
    
data['vernacularName'].unique().tolist() # distinct common (vernacular) names recorded for this species
    
    Out[15]:
In [16]:
    
# Peek at the last 10 latitude values; records with missing coordinates show
# up as NaN and are dropped in the next cell.
data['decimalLatitude'].tail(10)
    
    Out[16]:
In [17]:
    
import numpy as np  # NOTE(review): numpy appears unused in this cell (and notebook) — consider removing
# Drop records lacking either coordinate; the raw `data` frame stays intact.
data_cleaned = data.dropna(subset = ['decimalLatitude', 'decimalLongitude']) # drop records where coordinates are not available
    
In [18]:
    
data_cleaned.shape # fewer occurrence records now: 5223
    
    Out[18]:
In [19]:
    
# Distinct values of 'basisOfRecord' among the cleaned records (how each
# occurrence was documented).
data_cleaned['basisOfRecord'].unique()
    
    Out[19]:
In [20]:
    
# Number of records with neither decimalLatitude nor decimalLongitude.
# Fixes two issues in the original cell:
#  * `import numpy as np` was a duplicate of In [17] and unused here;
#  * `.size` returns rows * columns, so it overstated the record count by a
#    factor of the column count — len() counts the records themselves.
len(data[data['decimalLatitude'].isnull() & data['decimalLongitude'].isnull()])
    
    Out[20]:
In [21]:
    
# Records with no coordinates AND no textual locality information — these
# cannot be georeferenced at all. all(axis=1) over the four columns is
# equivalent to chaining the four isnull() conditions with &.
data[data[['decimalLatitude', 'decimalLongitude',
           'locality', 'verbatimLocality']].isnull().all(axis=1)]
    
    Out[21]:
In [22]:
    
# Inspect the date-related fields of the cleaned records.
data_cleaned[['dateIdentified', 'day', 'month', 'year']].head()
    
    Out[22]:
It seems that not all records have a 'dateIdentified' value, but the 'day', 'month', and 'year' fields are present for many (all?) records. TODO: check whether the 'verbatimDate' field could fill in the missing dates.
Suppose that only the latitude, longitude, 'rightsHolder', and 'datasetName' columns are of interest for our selection.
In [23]:
    
# Keep post-2010 records and only the columns of interest. A single .loc
# indexer replaces the original chained indexing (df[cond][cols]), which
# scans the frame twice and risks SettingWithCopyWarning on later assignment.
data_selected = data_cleaned.loc[
    data_cleaned['year'] > 2010,
    ['decimalLatitude', 'decimalLongitude', 'rightsHolder', 'datasetName'],
]
    
In [24]:
    
# First 10 selected records that actually carry a dataset name
# (notnull() is the direct equivalent of ~isnull()).
data_selected[data_selected['datasetName'].notnull()].head(10)
    
    Out[24]:
In [25]:
    
# Replace the object's stored occurrences with the filtered selection.
my_species.set_data(data_selected) # update the object "my_species" to contain the filtered data
    
In [26]:
    
# Save the filtered dataset under an explicit file name.
my_species.save_data(file_name="updated_dataset.pkl")
    
    
In [27]:
    
# Re-plot: only the filtered (post-2010) occurrences are shown now.
my_species.plot_species_occurrence()
    
    
In [28]:
    
my_species.get_data().shape # only 119 records remain after filtering
    
    Out[28]:
In [29]:
    
# Load occurrence data from a CSV download (as opposed to the API query above).
# NOTE(review): hardcoded relative path — verify ../data/GBIF.csv exists.
csv_data = my_species.load_csv('../data/GBIF.csv')
    
    
In [30]:
    
csv_data.head() # let's peek into the data
    
    Out[30]:
In [31]:
    
# Distinct species keys present in the CSV (expected: a single key,
# matching my_species.ID — TODO confirm).
csv_data['specieskey'].unique()
    
    Out[31]:
In [32]:
    
my_species.save_data() # by default the 'speciesKey' is used for the file name; an alternative name can be provided
    
    
In [33]:
    
csv_data.columns.size # the CSV export has far fewer columns than the API download
    
    Out[33]:
In [34]:
    
data.columns.size # number of columns when using the GBIF API directly
    
    Out[34]:
In [35]:
    
# Columns present in the API data but missing from the CSV export — note the
# casing difference: 'decimalLatitude' (API) vs 'decimallatitude' (CSV).
# (A pandas Index is directly iterable; the intermediate .tolist() is not needed.)
list(set(data.columns) - set(csv_data.columns))
    
    Out[35]:
In [36]:
    
# And the reverse: columns that only appear in the CSV export.
list(set(csv_data.columns) - set(data.columns))
    
    Out[36]:
In [ ]: