In [1]:
    
import logging
import numpy as np
import pandas as pd
root = logging.getLogger()
root.addHandler(logging.StreamHandler())
import datetime
%matplotlib inline
from shapely.prepared import prep
from shapely import speedups
speedups.enable()  # use shapely's C-accelerated geometry operations where available
    
In [3]:
    
important_columns1 = ['species', 'dateidentified', 'eventdate', 'basisofrecord',
                      'decimallatitude', 'decimallongitude', 'day', 'month', 'year']
chunks = []
counter = 0
for df in pd.read_msgpack("../data/fish/selection/merged.msg", iterator=True):
    counter += 1
    if counter % 100 == 0:
        print("%s Processing.. %s " % (datetime.datetime.now().time().isoformat(), counter))
    # keep only chunks that carry coordinates
    if "decimallatitude" in df.columns and "decimallongitude" in df.columns:
        common_columns = list(set(important_columns1).intersection(df.columns))
        chunks.append(df[common_columns])
# one concat at the end is much cheaper than appending to the DataFrame inside the loop
result_with_lat_long = pd.concat([pd.DataFrame(columns=important_columns1)] + chunks,
                                 ignore_index=True)
    
    
In [4]:
    
result_with_lat_long = result_with_lat_long[result_with_lat_long.decimallatitude.notnull() & result_with_lat_long.decimallongitude.notnull()]
    
In [5]:
    
result_with_lat_long['species'].unique().size
    
    Out[5]:
It is best to take into account all observations that have either "year" or "eventdate" present (or both). Let's group them by species name and count the number of observation records.
In [6]:
    
grouped_lat_long_year_or_eventdate = pd.DataFrame()
grouped_lat_long_year_or_eventdate['count'] = result_with_lat_long[result_with_lat_long.eventdate.notnull() | result_with_lat_long.year.notnull()].groupby(['species']).apply(lambda x: x['species'].count())
grouped_lat_long_year_or_eventdate.head(10) # peek at the top 10 only
    
    Out[6]:
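The same counts can be computed without the Python-level lambda: since we group by species, counting rows per group with size() is equivalent (a minimal sketch):
In [ ]:
    
# equivalent to the groupby/apply above, but using the built-in per-group row count
grouped_alt = (result_with_lat_long[result_with_lat_long.eventdate.notnull()
                                    | result_with_lat_long.year.notnull()]
               .groupby('species').size().to_frame('count'))
grouped_alt.head(10)
    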
In [7]:
    
result_with_lat_long['species'].unique().size
    
    Out[7]:
In [8]:
    
# note: eventdate is an ISO-formatted string here, so the comparison with "1990" is lexicographic
year_or_eventdate_1990 = result_with_lat_long[['species', 'year', 'eventdate', 'basisofrecord', 'decimallatitude', 'decimallongitude']][(result_with_lat_long.year > 1990) | (result_with_lat_long.eventdate > "1990")]
grouped_year_or_eventdate_1990 = pd.DataFrame()
grouped_year_or_eventdate_1990['numobservations'] = year_or_eventdate_1990.groupby(['species']).apply(lambda x: x['species'].count())
grouped_year_or_eventdate_1990.shape[0]
    
    Out[8]:
In [9]:
    
year_or_eventdate_1990.basisofrecord.unique()
    
    Out[9]:
We keep only observations of type 'OBSERVATION', 'MACHINE_OBSERVATION', and 'HUMAN_OBSERVATION'; other basisofrecord values (such as preserved or fossil specimens) are less suitable evidence of where a species currently occurs.
In [10]:
    
final_selection = year_or_eventdate_1990[year_or_eventdate_1990.basisofrecord.isin(['OBSERVATION', 'HUMAN_OBSERVATION', 'MACHINE_OBSERVATION'])]
    
In [11]:
    
final_selection.species.unique().shape
    
    Out[11]:
In [12]:
    
final_selection
    
    Out[12]:
In [13]:
    
from iSDM.species import GBIFSpecies
    
In [14]:
    
all_species = GBIFSpecies(name_species='All')
    
    
In [15]:
    
all_species.set_data(final_selection)
    
In [16]:
    
all_species.get_data().species.unique().shape # this many different species
    
    Out[16]:
In [17]:
    
all_species.get_data().shape[0] # 1939675: this many observations satisfy our criteria (after 1990, with an accepted observation type)
    
    Out[17]:
In [18]:
    
year_or_eventdate_1990.shape[0] # total number of observations before filtering on basisofrecord
    
    Out[18]:
In [19]:
    
all_species.geometrize()
    
    
In [20]:
    
all_species.get_data().species.unique().shape
    
    Out[20]:
In [22]:
    
final_observations = all_species.get_data()[['species', 'year','eventdate', 'basisofrecord','geometry']]
    
In [32]:
    
final_observations.to_file("../data/bias_grid/final_observations", driver="ESRI Shapefile")
    
In [21]:
    
import gc
gc.collect()
    
    Out[21]:
In [2]:
    
from geopandas import GeoDataFrame
final_observations = GeoDataFrame.from_file("../data/bias_grid/final_observations/")
    
In [4]:
    
final_observations.head()
    
    Out[4]:
5 arcmin = 5/60 degrees ~ 0.083333333 degrees per pixel => 'height': 2160, 'width': 4320 for a global map:
(180/0.083333333) ~ 2160
(360/0.083333333) ~ 4320
At 30 arcsec per pixel (0.0083333333 degrees, used below) the same arithmetic gives a 21600 x 43200 grid. Generate a 2D array and use it as the basis for the bias grid.
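A quick check of this arithmetic (a minimal sketch, computing the grid shape at both resolutions):
In [ ]:
    
# degrees per pixel for 5 arcmin and 30 arcsec
for label, ps in [("5 arcmin", 5 / 60.0), ("30 arcsec", 30 / 3600.0)]:
    print(label, round(180 / ps), "x", round(360 / ps))  # height x width
    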
In [5]:
    
x_min, y_min, x_max, y_max = -180, -90, 180, 90
pixel_size = 0.0083333333  # 30 arcsec per pixel; changed from 0.083333333 (5 arcmin)
x_res = int((x_max - x_min) / pixel_size)
y_res = int((y_max - y_min) / pixel_size)
    
In [6]:
    
bias_grid = np.zeros(shape=(y_res, x_res), dtype='int32')
    
In [7]:
    
def increase_pixel_value(row):
    # map a point's (lat, lon) to a (row, col) grid cell: row 0 is at 90N, column 0 at 180W
    bias_grid[np.abs(int((row.y - 90) / pixel_size)),
              np.abs(int((row.x + 180) / pixel_size))] += 1
    
In [8]:
    
here = final_observations.geometry.apply(lambda row: increase_pixel_value(row))  # apply is used only for its side effect on bias_grid
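    
As an aside, the same per-pixel counts can be computed without a Python-level loop, e.g. with numpy's 2D histogram (a sketch, assuming the geometries are shapely Points; bin-edge handling may differ slightly for points exactly on a grid boundary):
In [ ]:
    
# longitude/latitude arrays extracted from the point geometries
xs = final_observations.geometry.apply(lambda p: p.x).values
ys = final_observations.geometry.apply(lambda p: p.y).values
# histogram2d bins latitudes in ascending order, so flip the rows to put 90N at row 0
counts, _, _ = np.histogram2d(ys, xs, bins=[y_res, x_res],
                              range=[[-90, 90], [-180, 180]])
bias_grid_alt = np.flipud(counts).astype('int32')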
    
In [22]:
    
bias_grid.max()
    
    Out[22]:
In [22]:
    
bias_grid.std()
    
    
In [10]:
    
bias_grid.sum()
    
    Out[10]:
In [11]:
    
# is the sum of the bias grid equal to the total number of observations?
bias_grid.sum() == final_observations.shape[0]
    
    Out[11]:
In [12]:
    
import gc
gc.collect()
    
    Out[12]:
In [13]:
    
bias_grid_log = np.log10(bias_grid + 1)  # +1 keeps empty cells at log10(1) = 0; log10(0) would be -inf
bias_grid_log.max()
    
    Out[13]:
In [12]:
    
bias_grid_log.min()
    
    Out[12]:
In [13]:
    
bias_grid_log.std()
    
    
In [ ]:
    
bias_grid_standardized = (bias_grid - bias_grid.mean()) / bias_grid.std()
bias_grid_standardized.max()
    
In [ ]:
    
bias_grid_standardized.min()
    
In [ ]:
    
bias_grid_standardized.std()
    
In [ ]:
    
bias_grid_minmax_scale = (bias_grid - bias_grid.min()) / (bias_grid.max() - bias_grid.min())
bias_grid_minmax_scale.max()
    
In [ ]:
    
bias_grid_minmax_scale.min()
    
In [ ]:
    
bias_grid_minmax_scale.std()
    
In [42]:
    
import matplotlib.pyplot as plt
plt.figure(figsize=(25,20))
plt.imshow(bias_grid_log, cmap="hot", interpolation="none")
    
    Out[42]:
    
In [17]:
    
import pickle
pickle.dump(bias_grid, open("../data/bias_grid/bias_grid_30arcsec.pkl", "wb"))
    
    
In [ ]:
    
np.count_nonzero(bias_grid)  # number of cells with at least one observation
    
In [ ]:
    
np.prod(bias_grid.shape)  # total number of cells in the grid
    
In [49]:
    
41881/9331200  # nonzero cells / total cells: the fraction of the grid with at least one observation
    
    Out[49]:
In [26]:
    
bias_grid.shape
    
    Out[26]:
In [21]:
    
import rasterio
from rasterio.transform import Affine
x_min, y_min, x_max, y_max = -180, -90, 180, 90
pixel_size = 0.083333333
x_res = int((x_max - x_min) / pixel_size)
y_res = int((y_max - y_min) / pixel_size)
crs = {'init': "EPSG:4326"}
transform = Affine.translation(x_min, y_max) * Affine.scale(pixel_size, -pixel_size)
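    
A quick sanity check on the transform (a minimal sketch): the affine transform maps (col, row) pixel coordinates to (lon, lat), so the grid corners should land on the map corners.
In [ ]:
    
print(transform * (0, 0))          # upper-left corner -> (-180.0, 90.0)
print(transform * (x_res, y_res))  # lower-right corner -> approximately (180.0, -90.0)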
    
In [37]:
    
with rasterio.open("../data/bias_grid/bias_grid_minmax_scale.tif", 'w', driver='GTiff', width=x_res, height=y_res,
                   count=1,
                   dtype=np.uint16,
                   nodata=0,
                   transform=transform,
                   crs=crs) as out:
    # note: the min-max scaled values lie in [0, 1], so this uint16 cast truncates them to 0 or 1
    out.write(bias_grid_minmax_scale.astype(np.uint16), indexes=1)
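    
Since the min-max scaled grid holds floats in [0, 1], a float32 band preserves the values that the uint16 cast above truncates away (a sketch; the _f32 filename is just illustrative):
In [ ]:
    
with rasterio.open("../data/bias_grid/bias_grid_minmax_scale_f32.tif", 'w', driver='GTiff',
                   width=x_res, height=y_res, count=1,
                   dtype='float32', transform=transform, crs=crs) as out:
    out.write(bias_grid_minmax_scale.astype('float32'), indexes=1)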
    
In [27]:
    
pixel_size
    
    Out[27]:
In [39]:
    
bias_grid_minmax_scale.std()
    
    Out[39]:
In [19]:
    
bias_grid_log.shape
    
    Out[19]:
In [20]:
    
import pickle
pickle.dump(bias_grid_log, open("../data/bias_grid/bias_grid_log_30arcsec.pkl", "wb"), protocol=4)  # protocol 4 supports objects larger than 4 GB
    
    
In [43]:
    
pickle.dump(bias_grid_standardized, open("../data/bias_grid/bias_grid_standardized.pkl", "wb"))
    
In [44]:
    
pickle.dump(bias_grid_minmax_scale, open("../data/bias_grid/bias_grid_minmax_scale.pkl", "wb"))
    
In [26]:
    
# disk-backed alternative to the in-memory bias_grid: the array lives in a file rather than in RAM
bias_grid_mm = np.memmap("../data/bias_grid/bias_grid_mm.dat", dtype='int32', mode='w+', shape=(y_res, x_res))
    
In [27]:
    
def increase_pixel_value(row):
    # same (lat, lon) -> (row, col) indexing as before, but incrementing the disk-backed memmap
    bias_grid_mm[np.abs(int((row.y - 90) / pixel_size)),
                 np.abs(int((row.x + 180) / pixel_size))] += 1
    
In [28]:
    
here = final_observations.geometry.apply(lambda row: increase_pixel_value(row))
    
In [33]:
    
bias_grid_mm.flush()  # write any in-memory changes through to the backing .dat file
    
In [35]:
    
bias_grid_mm.max()
    
    Out[35]:
In [36]:
    
bias_grid_mm.std()
    
    Out[36]:
In [40]:
    
del bias_grid_mm
    
In [41]:
    
gc.collect()
    
    Out[41]:
In [42]:
    
fpr = np.memmap("../data/bias_grid/bias_grid_mm.dat", dtype='int32', mode='r', shape=(y_res, x_res))  # reopen read-only
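    
A quick sanity check on the round trip: the counts read back from disk should still sum to the number of observations (a minimal sketch):
In [ ]:
    
# use an int64 accumulator to be safe when summing many int32 counts
fpr.sum(dtype=np.int64) == final_observations.shape[0]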
    
In [44]:
    
fpr.max()
    
    Out[44]:
In [47]:
    
fpr.flags.writeable
    
    Out[47]:
In [81]:
    
# note: this maps the raw bytes of the file (GeoTIFF header included), not the decoded raster
anything = np.memmap("/home/daniela/git/iSDM/data/GLWD/downscaled/original_corrected.tif", dtype='uint8', mode='r', shape=(y_res, x_res))
    
In [82]:
    
np.unique(anything)
    
    Out[82]:
In [67]:
    
type(anything)
    
    Out[67]:
In [78]:
    
gc.collect()
    
    Out[78]:
In [77]:
    
del anything
    
In [101]:
    
some_data = np.memmap("../data/bias_grid/some_data.tif", dtype='float64', mode='w+', shape=(y_res,x_res))
    
In [97]:
    
type(some_data)
    
    Out[97]:
In [93]:
    
some_data.flags.writeable
    
    Out[93]:
In [99]:
    
isinstance(some_data, np.ndarray)
    
    Out[99]:
In [106]:
    
some_data.shape
    
    Out[106]:
In [109]:
    
some_data[:,:] = 1
    
In [110]:
    
some_data._mmap
    
    Out[110]:
In [112]:
    
some_data
    
    Out[112]: