In [1]:
# Import the GBIF-backed Species wrapper from the iSDM library.
from iSDM.species import GBIFSpecies
In [2]:
# Create a Species object for Etheostoma blennioides; occurrence records
# will be fetched from GBIF. The "Genus_species" underscore form is
# presumably what the library expects — TODO confirm.
my_species = GBIFSpecies(name_species="Etheostoma_blennioides")
In [3]:
# Echo the configured species name back to verify the object was set up.
my_species.name_species
Out[3]:
In [4]:
%matplotlib inline
import logging

# Make library log output visible in the notebook by attaching a
# console handler to the root logger.
root = logging.getLogger()
# Guard against cell re-execution: calling addHandler() unconditionally
# attaches a new handler every time this cell runs, which duplicates
# every log line once per extra run.
if not any(isinstance(h, logging.StreamHandler) for h in root.handlers):
    root.addHandler(logging.StreamHandler())
In [5]:
# Query GBIF for occurrence records of this species and preview the
# first few rows of the resulting table.
my_species.find_species_occurrences().head()
Out[5]:
In [6]:
my_species.ID # taxon key assigned by GBIF; serves as a unique per-species identifier
Out[6]:
In [8]:
# Persist the fetched occurrence records to disk (default file name —
# presumably derived from the species name and GBIF key; see the
# load_data() call below).
my_species.save_data()
In [9]:
# Name of the data source backing this Species object.
my_species.source.name
Out[9]:
In [10]:
# Plot the raw occurrence records on a map.
my_species.plot_species_occurrence()
In [11]:
# Load previously saved occurrence data back into the Species object.
# NOTE(review): this is a pickle file — only load pickles you created
# yourself; unpickling untrusted data can execute arbitrary code.
data = my_species.load_data("./Etheostoma_blennioides2382397.pkl")
In [12]:
data.columns # all the columns available per observation
Out[12]:
In [13]:
# Which countries the observations come from.
data['country'].unique().tolist()
Out[13]:
In [14]:
data.shape # there are 7226 observations, 138 parameters per observation
Out[14]:
In [15]:
data['vernacularName'].unique().tolist() # common (vernacular) names recorded for this species
Out[15]:
In [16]:
# Peek at the last few latitude values — some records may lack coordinates.
data['decimalLatitude'].tail(10)
Out[16]:
In [17]:
import numpy as np  # NOTE(review): np appears unused in this notebook — candidate for removal
# Drop records with missing coordinates; they cannot be placed on a map.
data_cleaned = data.dropna(subset = ['decimalLatitude', 'decimalLongitude']) # drop records where coordinates are not available
In [18]:
data_cleaned.shape # fewer occurrence records now: 5223
Out[18]:
In [19]:
# The kinds of evidence behind the remaining records (GBIF 'basisOfRecord' field).
data_cleaned['basisOfRecord'].unique()
Out[19]:
In [20]:
# Count of records that have neither decimalLatitude nor decimalLongitude.
# Fixed: the original used DataFrame.size, which counts cells
# (rows x columns) and so inflated the "record" count ~138x;
# len() counts rows, matching the comment's intent.
# (Also dropped the duplicate `import numpy as np` — already imported
# in cell 17 and unused here.)
len(data[data['decimalLatitude'].isnull() & data['decimalLongitude'].isnull()])
Out[20]:
In [21]:
# Records with no coordinates AND no textual locality information at all —
# these carry no usable location signal.
data[data['decimalLatitude'].isnull() &
data['decimalLongitude'].isnull() &
data['locality'].isnull() &
data['verbatimLocality'].isnull()]
Out[21]:
In [22]:
# Compare the available date fields on the cleaned records.
data_cleaned[['dateIdentified', 'day', 'month', 'year']].head()
Out[22]:
It seems that not all records have a 'dateIdentified' value, but the 'day', 'month', and 'year' fields are present for many (all?) records. TODO: check whether 'verbatimDate' could fill the gaps.
Suppose that only the latitude, longitude, rightsHolder, and datasetName columns are of interest for our selection.
In [23]:
# Keep only post-2010 records and the four columns of interest.
# Use a single .loc[row_mask, column_list] lookup instead of chained
# indexing (frame[mask][cols]), which builds an intermediate copy and
# can trigger SettingWithCopyWarning on later assignment.
data_selected = data_cleaned.loc[data_cleaned['year'] > 2010,
                                 ['decimalLatitude', 'decimalLongitude', 'rightsHolder', 'datasetName']]
In [24]:
# Preview filtered records that actually carry a dataset name.
data_selected[~data_selected.datasetName.isnull()].head(10)
Out[24]:
In [25]:
my_species.set_data(data_selected) # update the object "my_species" to contain the filtered data
In [26]:
# Save the filtered dataset under an explicit file name.
my_species.save_data(file_name="updated_dataset.pkl")
In [27]:
# Re-plot: now only the filtered (post-2010) occurrences are shown.
my_species.plot_species_occurrence()
In [28]:
my_species.get_data().shape # there are 119 records now
Out[28]:
In [29]:
# Alternative input path: load occurrence data from a GBIF CSV export.
csv_data = my_species.load_csv('../data/GBIF.csv')
In [30]:
csv_data.head() # let's peek into the data
Out[30]:
In [31]:
# Species key(s) present in the CSV (note the all-lowercase column name).
csv_data['specieskey'].unique()
Out[31]:
In [32]:
my_species.save_data() # by default the 'speciesKey' is used (presumably for the file name — TODO confirm); an alternative name can be provided
In [33]:
csv_data.columns.size # the CSV export has far fewer columns than the API response
Out[33]:
In [34]:
data.columns.size # data obtained via the GBIF API directly
Out[34]:
In [35]:
# Columns present in the API data but not in the CSV export.
list(set(data.columns.tolist()) - set(csv_data.columns.tolist())) # note: 'decimalLatitude' (API) vs 'decimallatitude' (CSV)
Out[35]:
In [36]:
# And the reverse: columns only the CSV export has.
list(set(csv_data.columns.tolist()) - set(data.columns.tolist()))
Out[36]:
In [ ]: