In [1]:
import logging
import numpy as np
import pandas as pd
root = logging.getLogger()
root.addHandler(logging.StreamHandler())
import datetime
%matplotlib inline
from shapely.prepared import prep
from shapely import speedups
speedups.enable()
In [3]:
import pandas as pd
important_columns1 = ['species', 'dateidentified', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude', 'day', 'month', 'year']
result_with_lat_long = pd.DataFrame(columns=important_columns1)
counter = 0
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    counter += 1
    if counter % 100 == 0:
        print("%s Processing.. %s " % (datetime.datetime.now().time().isoformat(), counter))
    if "decimallatitude" in df.columns.tolist() and "decimallongitude" in df.columns.tolist():
        common_columns = list(set(important_columns1).intersection(set(df.columns.tolist())))
        result_with_lat_long = result_with_lat_long.append(df[common_columns], ignore_index=True)
In [4]:
result_with_lat_long = result_with_lat_long[result_with_lat_long.decimallatitude.notnull() & result_with_lat_long.decimallongitude.notnull()]
In [5]:
result_with_lat_long['species'].unique().size
Out[5]:
It's best to take into account all observations that have either "year" or "eventdate" (or both) present. Let's group them by species name and count the number of observation records.
In [6]:
grouped_lat_long_year_or_eventdate = pd.DataFrame()
grouped_lat_long_year_or_eventdate['count'] = result_with_lat_long[result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()].groupby(['species']).apply(lambda x: x['species'].count())
grouped_lat_long_year_or_eventdate.head(10) # peek at the top 10 only
Out[6]:
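The per-species count above (groupby plus apply) is equivalent to groupby(...).size(); a minimal sketch of the simpler form, under the same year/eventdate filter:
# simpler equivalent of the count above
counts_per_species = (result_with_lat_long[result_with_lat_long.eventdate.notnull()
                                           | result_with_lat_long.year.notnull()]
                      .groupby('species').size())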
In [7]:
result_with_lat_long['species'].unique().size
Out[7]:
In [8]:
year_or_eventdate_1990 = result_with_lat_long[['species', 'year', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude']][(result_with_lat_long.year>1990) | (result_with_lat_long.eventdate>"1990")]
grouped_year_or_eventdate_1990 = pd.DataFrame()
grouped_year_or_eventdate_1990['numobservations'] = year_or_eventdate_1990.groupby(['species']).apply(lambda x: x['species'].count())
grouped_year_or_eventdate_1990.shape[0]
Out[8]:
In [9]:
year_or_eventdate_1990.basisofrecord.unique()
Out[9]:
I guess we should keep only observations of type 'OBSERVATION', 'MACHINE_OBSERVATION' and 'HUMAN_OBSERVATION'?
In [10]:
final_selection = year_or_eventdate_1990[(year_or_eventdate_1990.basisofrecord=='OBSERVATION') | (year_or_eventdate_1990.basisofrecord=='HUMAN_OBSERVATION') | (year_or_eventdate_1990.basisofrecord=='MACHINE_OBSERVATION')]
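The same filter can be written more compactly with isin; an equivalent sketch:
# equivalent to the chained | comparisons above
final_selection = year_or_eventdate_1990[
    year_or_eventdate_1990.basisofrecord.isin(['OBSERVATION', 'HUMAN_OBSERVATION', 'MACHINE_OBSERVATION'])]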
In [11]:
final_selection.species.unique().shape
Out[11]:
In [12]:
final_selection
Out[12]:
In [13]:
from iSDM.species import GBIFSpecies
In [14]:
all_species = GBIFSpecies(name_species='All')
In [15]:
all_species.set_data(final_selection)
In [16]:
all_species.get_data().species.unique().shape # these many different species
Out[16]:
In [17]:
all_species.get_data().shape[0] # 1939675? this many observations satisfying our criteria (after 1990, with the correct observation type)
Out[17]:
In [18]:
year_or_eventdate_1990.shape[0] # total number of observations before filtering by basisofrecord
Out[18]:
In [19]:
all_species.geometrize()
In [20]:
all_species.get_data().species.unique().shape
Out[20]:
In [22]:
final_observations = all_species.get_data()[['species', 'year','eventdate', 'basisofrecord','geometry']]
In [32]:
final_observations.to_file("../data/bias_grid/final_observations", driver="ESRI Shapefile")
In [21]:
import gc
gc.collect()
Out[21]:
In [2]:
from geopandas import GeoDataFrame
final_observations = GeoDataFrame.from_file("../data/bias_grid/final_observations/")
In [4]:
final_observations.head()
Out[4]:
5 arcmin = 5/60 degrees per pixel ≈ 0.083333333 => 'height': 2160, 'width': 4320 for a global map:
(180 / 0.083333333) ≈ 2160
(360 / 0.083333333) ≈ 4320
Generate a 2D array and use it as the basis for the bias grid.
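The code below actually uses a 30-arcsec pixel (0.0083333333 degrees), which scales both dimensions by a factor of 10; a quick sanity check of the arithmetic:
# grid dimensions at 5 arcmin vs. 30 arcsec (the resolution used in the next cell)
print(int(180 / 0.083333333), int(360 / 0.083333333))    # 2160 4320
print(int(180 / 0.0083333333), int(360 / 0.0083333333))  # 21600 43200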
In [5]:
x_min, y_min, x_max, y_max = -180, -90, 180, 90
pixel_size = 0.0083333333 # changed from 0.083333333 to 30arcsec
x_res = int((x_max - x_min) / pixel_size)
y_res = int((y_max - y_min) / pixel_size)
In [6]:
bias_grid=np.zeros(shape=(y_res, x_res)).astype('int32')
In [7]:
def increase_pixel_value(row):
    # map the (lon, lat) of an observation point to a (row, col) cell and increment its count
    bias_grid[np.abs(int((row.y - 90) / pixel_size)),
              np.abs(int((row.x + 180) / pixel_size))] += 1
In [8]:
here = final_observations.geometry.apply(lambda row: increase_pixel_value(row))
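If the row-by-row apply above is too slow, the same per-cell counting could in principle be done in one vectorized call; a minimal sketch assuming point geometries (numpy's histogram binning handles bin edges slightly differently from the int() truncation above):
# vectorized alternative: bin all observation points into the grid at once
lons = final_observations.geometry.x.values
lats = final_observations.geometry.y.values
counts, _, _ = np.histogram2d(lats, lons,
                              bins=(y_res, x_res),
                              range=[[y_min, y_max], [x_min, x_max]])
bias_grid_vec = np.flipud(counts).astype('int32')  # row 0 corresponds to latitude +90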
In [22]:
bias_grid.max()
Out[22]:
In [22]:
bias_grid.std()
In [10]:
bias_grid.sum()
Out[10]:
In [11]:
# is the sum of the bias grid equal to the total number of observations?
bias_grid.sum() == final_observations.shape[0]
Out[11]:
In [12]:
import gc
gc.collect()
Out[12]:
In [13]:
# bias_grid_plus_1 = bias_grid + 1
bias_grid_log = np.log10(bias_grid + 1)
# bias_grid_log[np.isneginf(bias_grid_log)] = 0 # is this a good idea, setting to 0? log10(0) = -inf otherwise
bias_grid_log.max()
Out[13]:
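Adding 1 before the log keeps empty cells at exactly 0 (log10(1) = 0) instead of -inf; np.log1p is the natural-log equivalent of the same trick, shown here only as a hedged alternative:
# natural-log variant of the same +1 trick; cells with zero observations stay 0
bias_grid_log_e = np.log1p(bias_grid)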
In [12]:
bias_grid_log.min()
Out[12]:
In [13]:
bias_grid_log.std()
In [ ]:
bias_grid_standardized = (bias_grid - bias_grid.mean()) / bias_grid.std()
bias_grid_standardized.max()
In [ ]:
bias_grid_standardized.min()
In [ ]:
bias_grid_standardized.std()
In [ ]:
bias_grid_minmax_scale = (bias_grid - bias_grid.min()) / (bias_grid.max() - bias_grid.min())
bias_grid_minmax_scale.max()
In [ ]:
bias_grid_minmax_scale.min()
In [ ]:
bias_grid_minmax_scale.std()
In [42]:
import matplotlib.pyplot as plt
plt.figure(figsize=(25,20))
plt.imshow(bias_grid_log, cmap="hot", interpolation="none")
Out[42]:
In [17]:
import pickle
pickle.dump(bias_grid, open("../data/bias_grid/bias_grid_30arcsec.pkl", "wb"))
In [ ]:
np.count_nonzero(bias_grid)
In [ ]:
np.product(bias_grid.shape)
In [49]:
41881/9331200
Out[49]:
In [26]:
bias_grid.shape
Out[26]:
In [21]:
import rasterio
from rasterio.transform import Affine
x_min, y_min, x_max, y_max = -180, -90, 180, 90
pixel_size = 0.083333333
x_res = int((x_max - x_min) / pixel_size)
y_res = int((y_max - y_min) / pixel_size)
crs = {'init': "EPSG:4326"}
transform = Affine.translation(x_min, y_max) * Affine.scale(pixel_size, -pixel_size)
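The affine transform maps column/row indices to map coordinates; a quick check that pixel (0, 0) corresponds to the upper-left corner:
# pixel (col=0, row=0) should map to the upper-left corner (-180, 90)
print(transform * (0, 0))          # (-180.0, 90.0)
print(transform * (x_res, y_res))  # approximately (180.0, -90.0)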
In [37]:
with rasterio.open("../data/bias_grid/bias_grid_minmax_scale.tif", 'w', driver='GTiff', width=x_res, height=y_res,
                   count=1,
                   dtype=np.uint16,
                   nodata=0,
                   transform=transform,
                   crs=crs) as out:
    out.write(bias_grid_minmax_scale.astype(np.uint16), indexes=1)
In [27]:
pixel_size
Out[27]:
In [39]:
bias_grid_minmax_scale.std()
Out[39]:
In [19]:
bias_grid_log.shape
Out[19]:
In [20]:
import pickle
pickle.dump(bias_grid_log, open("../data/bias_grid/bias_grid_log_30arcsec.pkl", "wb"), protocol=4)
In [43]:
pickle.dump(bias_grid_standardized, open("../data/bias_grid/bias_grid_standardized.pkl", "wb"))
In [44]:
pickle.dump(bias_grid_minmax_scale, open("../data/bias_grid/bias_grid_minmax_scale.pkl", "wb"))
In [26]:
# bias_grid_mm=np.zeros(shape=(y_res, x_res)).astype('int32')
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm.dat", dtype='int32', mode='w+', shape=(y_res,x_res))
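np.memmap keeps the grid backed by a file on disk rather than fully in RAM; a rough size estimate for the int32 grid, at the two resolutions used in this notebook:
# approximate on-disk size of an int32 grid at each resolution
print(2160 * 4320 * 4 / 1e6, "MB")    # 5 arcmin grid  -> ~37 MB
print(21600 * 43200 * 4 / 1e9, "GB")  # 30 arcsec grid -> ~3.7 GB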
In [27]:
def increase_pixel_value(row):
    # same per-point counting as above, but writing into the memory-mapped grid
    bias_grid_mm[np.abs(int((row.y - 90) / pixel_size)),
                 np.abs(int((row.x + 180) / pixel_size))] += 1
In [28]:
here = final_observations.geometry.apply(lambda row: increase_pixel_value(row))
In [33]:
bias_grid_mm.flush()
In [35]:
bias_grid_mm.max()
Out[35]:
In [36]:
bias_grid_mm.std()
Out[36]:
In [40]:
del bias_grid_mm
In [41]:
gc.collect()
Out[41]:
In [42]:
fpr = np.memmap("../data/bias_grid/bias_grid_mm.dat", dtype='int32', mode='r', shape=(y_res, x_res))
In [44]:
fpr.max()
Out[44]:
In [47]:
fpr.flags.writeable
Out[47]:
In [81]:
anything = np.memmap("/home/daniela/git/iSDM/data/GLWD/downscaled/original_corrected.tif", dtype='uint8', mode='r', shape=(y_res,x_res))
In [82]:
np.unique(anything)
Out[82]:
In [67]:
type(anything)
Out[67]:
In [78]:
gc.collect()
Out[78]:
In [77]:
del anything
In [101]:
some_data = np.memmap("../data/bias_grid/some_data.tif", dtype='float64', mode='w+', shape=(y_res,x_res))
In [97]:
type(some_data)
Out[97]:
In [93]:
some_data.flags.writeable
Out[93]:
In [99]:
isinstance(some_data, np.ndarray)
Out[99]:
In [106]:
some_data.shape
Out[106]:
In [109]:
some_data[:,:] = 1
In [110]:
some_data._mmap
Out[110]:
In [112]:
some_data
Out[112]:
In [ ]: