In [1]:
import logging
import numpy as np
import pandas as pd
root = logging.getLogger()
root.addHandler(logging.StreamHandler())
%matplotlib inline
In [2]:
# download from Google Drive: https://drive.google.com/open?id=0B9cazFzBtPuCSFp3YWE1V2JGdnc
from iSDM.species import IUCNSpecies
fish = IUCNSpecies(name_species='All')
fish.load_shapefile('../data/fish/FW_FISH.shp') # warning, 2GB of data will be loaded, may take a while!!
In [3]:
fish.get_data().columns
Out[3]:
How many unique binomials are there?
In [4]:
fish_data = fish.get_data()
fish_data['binomial'].unique().size
Out[4]:
Get their names in an array:
In [5]:
unique_binomials = fish_data['binomial'].unique()
unique_binomials
Out[5]:
In [6]:
fish_data.head(10) # peek at the first 10 records
Out[6]:
Sort them by the "shape_area" column to get the smallest and biggest areas.
In [7]:
fish_data.sort_values(by='shape_area', inplace=True)
Peek at the first 5 records (the smallest areas):
In [8]:
fish_data.head(5)
Out[8]:
The last 5 records (biggest area)
In [9]:
fish_data.tail(5)
Out[9]:
Decision: "We will include all species except those that are completely extinct. For the (possibly) extant species, we will include their entire range (including polygons from which they are currently extinct) in order to cover their full potential distribution."
From IUCN, these are the codes for presence: 1 = Extant, 2 = Probably Extant, 3 = Possibly Extant, 4 = Possibly Extinct, 5 = Extinct, 6 = Presence Uncertain.
Are there species for which the presence category isn't any of the above? (The "~" is negation, so the expression below selects all fish data for which the "presence" column is NOT in [1, 2, 3, 4, 5, 6].)
In [10]:
fish_data[~fish_data.presence.isin([1,2,3,4,5,6])]
Out[10]:
Apparently there are 28 records for which the "presence" is set to 0 in some regions. Their binomials are:
In [11]:
fish_data[~fish_data.presence.isin([1,2,3,4,5,6])]['binomial']
Out[11]:
Actually that's 26 unique species, since "Acipenser fulvescens" has 3 separate records (polygons).
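We can double-check by counting unique binomials instead of raw records:
fish_data[~fish_data.presence.isin([1,2,3,4,5,6])]['binomial'].unique().size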
"For the (possibly) extant species, we will include their entire range (including polygons from which they are currently extinct)"
As an example, here is a species with both extant and extinct areas
In [12]:
fish_data[fish_data.binomial=='Acantharchus pomotis']
Out[12]:
There are plenty of columns; let's select just 'binomial', 'presence', 'geometry', and 'shape_area' for a clearer overview.
In [13]:
fish_data[fish_data.binomial=='Acantharchus pomotis'][['binomial','presence', 'geometry','shape_area']]
Out[13]:
As agreed, we want to keep the first polygon (presence==5, i.e., extinct) because there are other regions (the next two records) where the species is not extinct. There is a function that does this filtering in a simple way.
In [14]:
fish.drop_extinct_species()
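For reference, a minimal pandas sketch of what this filtering amounts to, assuming presence == 5 marks an extinct polygon (the actual drop_extinct_species implementation may differ):
# Keep every species with at least one non-extinct polygon,
# and keep all polygons (extinct ones included) of those species.
extant_binomials = fish_data[fish_data.presence != 5]['binomial'].unique()
non_extinct = fish_data[fish_data.binomial.isin(extant_binomials)]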
In [15]:
fish.save_data("../data/fish/selection/non_extinct.pkl", method="pickle")
In [16]:
non_extinct_fish = fish.get_data()
In [17]:
total_records = 0
# download the file "merged.msg" from here: (My Google Drive)
# https://drive.google.com/open?id=0B9cazFzBtPuCTHcyTmVXV0pNT0k
# and place it in the subfolder "selection".
# The "merged.msg" file contains the dataframes for all individual species.
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    total_records += df.shape[0]
In [18]:
total_records
Out[18]:
In [19]:
important_columns = ['species', 'dateidentified','basisofrecord', 'verbatimlocality', 'day', 'month', 'year' ]
result_no_lat_long = pd.DataFrame(columns=important_columns)
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    if "decimallatitude" not in df.columns.tolist() or "decimallongitude" not in df.columns.tolist():
        common_columns = list(set(important_columns).intersection(set(df.columns.tolist())))
        result_no_lat_long = result_no_lat_long.append(df[common_columns], ignore_index=True)
In [20]:
result_no_lat_long['species'].unique().size
Out[20]:
In [21]:
(result_no_lat_long.shape[0]/total_records) * 100 # not that bad, only about 0.2%
Out[21]:
In [22]:
result_no_lat_long[['species', 'verbatimlocality']][result_no_lat_long.verbatimlocality.notnull()]
Out[22]:
How many records have a verbatim locality?
In [23]:
result_no_lat_long[['species', 'verbatimlocality']][result_no_lat_long.verbatimlocality.notnull()].shape[0]
Out[23]:
In [24]:
grouped_no_lat_lon = pd.DataFrame()
grouped_no_lat_lon['count'] = result_no_lat_long.groupby(['species', 'basisofrecord']).apply(lambda x: x['basisofrecord'].count())
grouped_no_lat_lon.head(30)
Out[24]:
In [25]:
grouped_no_lat_lon[grouped_no_lat_lon['count']>50]
Out[25]:
According to http://gbif.github.io/gbif-api/apidocs/org/gbif/api/vocabulary/BasisOfRecord.html these are different from "real" human observations. But I'm not sure what PRESERVED_SPECIMEN means, exactly.
In [26]:
result_no_lat_long_years = result_no_lat_long[['species','year']][result_no_lat_long.year > 1960].groupby('species')['year'].apply(lambda x:x.tolist())
pd.DataFrame(result_no_lat_long_years).head(30)
Out[26]:
In [27]:
pd.DataFrame(result_no_lat_long_years).shape[0]
Out[27]:
228 unique species with records after 1960, but no useful locality info about them anyway. I think all records without lat/lon will need to be discarded, for the 600 species.
Further selection criteria...
In [28]:
import pandas as pd
important_columns1 = ['species', 'dateidentified', 'eventdate', 'basisofrecord', 'decimallatitude','decimallongitude', 'day', 'month', 'year' ]
result_with_lat_long = pd.DataFrame(columns=important_columns1)
counter = 0
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    counter += 1
    if (counter%100==0):
        print("Processing.. ", counter)
    if "decimallatitude" in df.columns.tolist() and "decimallongitude" in df.columns.tolist():
        common_columns = list(set(important_columns1).intersection(set(df.columns.tolist())))
        result_with_lat_long = result_with_lat_long.append(df[common_columns], ignore_index=True)
In [29]:
result_with_lat_long = result_with_lat_long[result_with_lat_long.decimallatitude.notnull() & result_with_lat_long.decimallongitude.notnull()]
In [30]:
result_with_lat_long.shape[0] # this-many occurrence records
Out[30]:
In [31]:
result_with_lat_long.shape[0]/ total_records * 100 # percentage of records out of *ALL* species records
Out[31]:
In [32]:
result_with_lat_long['species'].unique().size
Out[32]:
In [33]:
result_with_lat_long_no_date = result_with_lat_long[(result_with_lat_long.eventdate.isnull()) & (result_with_lat_long.year.isnull())]
In [34]:
result_with_lat_long_no_date.shape[0]/result_with_lat_long.shape[0] * 100 # 12%
Out[34]:
Best to take into account all observations that have either "year" or "eventdate" present (or both). Let's group them by species name and count the number of observation records.
In [35]:
grouped_lat_long_year_or_eventdate = pd.DataFrame()
grouped_lat_long_year_or_eventdate['count'] = result_with_lat_long[result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()].groupby(['species']).apply(lambda x: x['species'].count())
grouped_lat_long_year_or_eventdate.head(10) # peek at the top 10 only
Out[35]:
In [36]:
grouped_lat_long_year_or_eventdate.shape[0]
Out[36]:
In [37]:
100 * result_with_lat_long[result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()].shape[0] / result_with_lat_long.shape[0]
Out[37]:
In [38]:
year_or_eventdate_1960 = result_with_lat_long[['species', 'year', 'eventdate']][(result_with_lat_long.year>1960) | (result_with_lat_long.eventdate>"1960")] # eventdate is an ISO-format string here, so comparing with "1960" is lexicographic
grouped_year_or_eventdate_1960 = pd.DataFrame()
grouped_year_or_eventdate_1960['numobservations'] = year_or_eventdate_1960.groupby(['species']).apply(lambda x: x['species'].count())
grouped_year_or_eventdate_1960.shape[0]
Out[38]:
In [39]:
year_or_eventdate_1960.shape[0]/result_with_lat_long.shape[0] * 100
Out[39]:
In [40]:
grouped_year_or_eventdate_1960[grouped_year_or_eventdate_1960.numobservations>=50]
Out[40]:
In [41]:
(grouped_year_or_eventdate_1960[grouped_year_or_eventdate_1960.numobservations>50].numobservations.sum() / result_with_lat_long.shape[0]) * 100
Out[41]:
So 80% of all records with latitude and longitude are from after 1960 and belong to species with more than 50 occurrences. But note that this still covers only 1281 unique species (out of the 6280 we started with).
In [42]:
100 * grouped_year_or_eventdate_1960[grouped_year_or_eventdate_1960.numobservations>50].numobservations.sum() / grouped_year_or_eventdate_1960.numobservations.sum()
Out[42]:
OK, that means most (98%) of the post-1960 records with latitude/longitude belong to species with more than 50 records.
In [43]:
year_or_eventdate_1990 = result_with_lat_long[['species', 'year', 'eventdate', 'basisofrecord']][(result_with_lat_long.year>=1990) | (result_with_lat_long.eventdate>="1990")]
grouped_year_or_eventdate_1990 = pd.DataFrame()
grouped_year_or_eventdate_1990['numobservations'] = year_or_eventdate_1990.groupby(['species']).apply(lambda x: x['species'].count())
grouped_year_or_eventdate_1990.shape[0] # number of unique species
Out[43]:
In [44]:
year_or_eventdate_1990.shape[0]/result_with_lat_long.shape[0] * 100
Out[44]:
In [45]:
grouped_year_or_eventdate_1990[grouped_year_or_eventdate_1990.numobservations>=50]
Out[45]:
From the meeting: the test set should include at least (1) the species for which lab test data on thermal tolerance are available, and (2) species with very small and very large ranges, in order to find out whether the point sampling and modelling also work for these extremes.
In [46]:
non_extinct_fish.tail(1).binomial
Out[46]:
See all the records (areas) for this binomial:
In [47]:
non_extinct_fish[non_extinct_fish.binomial=="Esox lucius"]
Out[47]:
In [48]:
esox_lucius = IUCNSpecies(name_species="Esox lucius")
In [49]:
esox_lucius.set_data(non_extinct_fish[non_extinct_fish.binomial=="Esox lucius"])
In [50]:
esox_lucius.save_shapefile("../data/fish/selection/esox_lucius")
In [51]:
esox_lucius.plot_species_occurrence()
In [52]:
non_extinct_fish.head(1).binomial
Out[52]:
See all the records (areas) for this binomial:
In [53]:
non_extinct_fish[non_extinct_fish.binomial=="Astatotilapia burtoni"]
Out[53]:
In [54]:
astatotilapia_burtoni = IUCNSpecies(name_species="Astatotilapia burtoni")
In [55]:
astatotilapia_burtoni.set_data(non_extinct_fish[non_extinct_fish.binomial=="Astatotilapia burtoni"])
In [56]:
astatotilapia_burtoni.save_shapefile("../data/fish/selection/astatotilapia_burtoni")
In [57]:
astatotilapia_burtoni.plot_species_occurrence()
Below is just another way of saving the data: not as a shapefile (which produces multiple files in a separate folder), but as pickled data in a single, compressed file. This is usually a lot faster than saving a shapefile when there are large amounts of data.
In [58]:
astatotilapia_burtoni.save_data(dir_name="../data/fish/selection/", method="pickle")
In [59]:
esox_lucius.save_data(dir_name="../data/fish/selection/", method="pickle")
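Reading such a pickle back later should be symmetric via load_data (used further below for the GBIF data); note that the exact file name save_data writes under dir_name is an assumption here:
esox_lucius_reloaded = IUCNSpecies(name_species="Esox lucius")
esox_lucius_reloaded.load_data("../data/fish/selection/Esox lucius.pkl") # hypothetical file name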
Since the records are individual shapes, and there can be multiple per species (binomial), let's group them by species name and sum the shape_area column. This gives the total area (sum of all polygons) per species. We want this in order to decide what our "average" species will be, for testing the workflow.
In [60]:
species_area_sum = non_extinct_fish.groupby('binomial')['shape_area'].apply(np.sum)
In [61]:
species_area_sum.sort_values(inplace=True)
In [62]:
species_area_sum.head(10)
Out[62]:
In [63]:
species_area_sum.tail(10)
Out[63]:
In [64]:
species_area_sum.hist(bins=100)
Out[64]:
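The heavy tail is easier to see with a logarithmic count axis; hist passes log=True through to matplotlib:
species_area_sum.hist(bins=100, log=True)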
The distribution of areas (in km^2?) is exponential. The median value is rather low:
In [65]:
species_area_sum.median()
Out[65]:
In [66]:
np.average(species_area_sum)
Out[66]:
In [67]:
species_area_sum[(species_area_sum>45) & (species_area_sum<46)]
Out[67]:
In [68]:
non_extinct_fish[non_extinct_fish.binomial=="Acrocheilus alutaceus"]
Out[68]:
In [69]:
acrocheilus_alutaceus = IUCNSpecies(name_species="Acrocheilus alutaceus")
In [70]:
acrocheilus_alutaceus.set_data(non_extinct_fish[non_extinct_fish.binomial=="Acrocheilus alutaceus"])
In [71]:
acrocheilus_alutaceus.plot_species_occurrence()
In [72]:
acrocheilus_alutaceus.save_data(dir_name="../data/fish/selection/", method="pickle")
In [73]:
acrocheilus_alutaceus.save_shapefile("../data/fish/selection/acrocheilus_alutaceus")
The individual polygons are shown below. They are also "not valid" geometries, which makes them an interesting case study.
In [74]:
non_extinct_fish.loc[10322].geometry
Out[74]:
In [75]:
non_extinct_fish.loc[10320].geometry
Out[75]:
In [76]:
non_extinct_fish.loc[10321].geometry
Out[76]:
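A quick way to see why such a geometry is flagged as invalid is shapely's validity API (the geometries here are shapely objects):
from shapely.validation import explain_validity
geom = non_extinct_fish.loc[10322].geometry
geom.is_valid # False for an invalid polygon
print(explain_validity(geom)) # e.g. a self-intersection message with coordinates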
In [77]:
non_extinct_binomials = non_extinct_fish.binomial.unique().tolist()
Persist the list of non-extinct binomials we got from IUCN, so we don't have to filter again.
In [78]:
import pickle
pickle.dump(non_extinct_binomials, open("../data/fish/selection/non_extinct_binomials.pkl","wb"))
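Loading it back later is symmetric:
with open("../data/fish/selection/non_extinct_binomials.pkl", "rb") as f:
    non_extinct_binomials = pickle.load(f)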
In [79]:
non_extinct_binomials.index("Perca fluviatilis")
Out[79]:
Let's read Aafke's Excel file with lab test data on thermal tolerance per species:
In [80]:
thermal_tolerance_df = pd.read_excel("../data/fish/selection/Lethal_temperature_freshwater_fish_July2016.xlsx")
thermal_tolerance_df.head(10)
Out[80]:
In [81]:
thermal_tolerance_df.columns
Out[81]:
In [82]:
lab_data_list = thermal_tolerance_df['Species '].tolist() # note the trailing space in the column name
In [83]:
species_with_lat_lon_after_1960 = year_or_eventdate_1960['species'].unique().tolist()
For every species in the lab data list, select it if it is also in the IUCN list of non-extinct binomials AND in the filtered GBIF list of species with latitude/longitude information and event year > 1960:
In [84]:
filtered_list = []
for species_name in lab_data_list:
    if species_name.strip() in non_extinct_binomials and species_name.strip() in species_with_lat_lon_after_1960:
        filtered_list.append(species_name.strip()) # strip() just removes stray spaces in the names
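Equivalently, as a set intersection (same logic, more compact, though a set loses the original ordering):
filtered_set = ({s.strip() for s in lab_data_list}
                & set(non_extinct_binomials)
                & set(species_with_lat_lon_after_1960))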
Now get the number of point records for this list of lab species (cross-checked with filtered GBIF records)
In [85]:
year_or_eventdate_1960[year_or_eventdate_1960.species.isin(filtered_list)].groupby('species')['species'].apply(lambda x: x.count())
Out[85]:
In [86]:
tmp_series = year_or_eventdate_1960[year_or_eventdate_1960.species.isin(filtered_list)].groupby('species')['species'].apply(lambda x: x.count())
In [87]:
tmp_series[tmp_series>100] # species with more than 100 records
Out[87]:
So this is our potential list to choose a representative example from
In [88]:
tmp_series[tmp_series>100].keys()
Out[88]:
In [89]:
len(tmp_series[tmp_series>100].keys())
Out[89]:
In [90]:
len(filtered_list) # so 20 species from this list fall out for not having enough point records
Out[90]:
In [91]:
tmp_series[tmp_series>100].sort_values()
Out[91]:
Get the GBIF/IUCN data on the first and the last species in the sorted list. That will be our representative choice for testing the workflow.
In [93]:
lepidomeda_mollispinis = IUCNSpecies(name_species="Lepidomeda mollispinis")
lepidomeda_mollispinis.set_data(non_extinct_fish[non_extinct_fish.binomial=="Lepidomeda mollispinis"])
lepidomeda_mollispinis.save_shapefile("../data/fish/selection/lepidomeda_mollispinis")
lepidomeda_mollispinis.plot_species_occurrence()
from iSDM.species import GBIFSpecies
my_species = GBIFSpecies(name_species="Lepidomeda mollispinis")
# download point-records on these species from: https://drive.google.com/open?id=0B9cazFzBtPuCMGlLMnQzb3hsQm8
data = my_species.load_data("../data/fish/selection/Lepidomeda mollispinis2361895.pkl")
my_species.plot_species_occurrence()
In [94]:
salmo_trutta = IUCNSpecies(name_species="Salmo trutta")
salmo_trutta.set_data(non_extinct_fish[non_extinct_fish.binomial=="Salmo trutta"])
salmo_trutta.save_shapefile("../data/fish/selection/salmo_trutta")
salmo_trutta.plot_species_occurrence()
my_species = GBIFSpecies(name_species="Salmo trutta")
# download point-records on these species from: https://drive.google.com/open?id=0B9cazFzBtPuCWEIySU5oU2ZfT2M
data = my_species.load_data("../data/fish/selection/Salmo trutta8215487.pkl")
my_species.plot_species_occurrence() # warning: a lot of point-records, may take a while!!
Interesting: lots of point records fall outside the expert range, in particular in the US.
In [95]:
my_species.overlay(salmo_trutta)
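For reference, a rough geopandas sketch of what this overlay presumably does, keeping only the point records that fall within the expert-range polygons (the actual iSDM implementation may differ):
import geopandas as gpd
from shapely.geometry import Point

points = my_species.get_data()
point_geoms = [Point(xy) for xy in zip(points.decimallongitude, points.decimallatitude)]
points_gdf = gpd.GeoDataFrame(points, geometry=point_geoms)
range_union = salmo_trutta.get_data().geometry.unary_union # merge all range polygons
points_within = points_gdf[points_gdf.geometry.within(range_union)]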
In [ ]:
my_species.get_data()
In [96]:
my_species.plot_species_occurrence()
In [110]:
result_with_lat_long[(result_with_lat_long.species=='Esox lucius') & (result_with_lat_long.year>1960)]
Out[110]:
In [114]:
result_with_lat_long[(result_with_lat_long.species=='Astatotilapia burtoni') & (result_with_lat_long.year>1960)]
Out[114]:
In [115]:
result_with_lat_long[(result_with_lat_long.species=='Astatotilapia burtoni') & (result_with_lat_long.year>1960)].shape
Out[115]:
In [116]:
result_with_lat_long[(result_with_lat_long.species=='Acrocheilus alutaceus') & (result_with_lat_long.year>1960)].shape
Out[116]:
In [ ]: