Table of Contents

    
    
    In [1]:
    # import reader module from sparkxarray
    from sparkxarray import reader
    from pyspark.sql import SparkSession
    
    
    
    In [2]:
    # Create sparksession
    spark = SparkSession.builder.appName("bias").getOrCreate()
    sc = spark.sparkContext
    
    
    
    In [3]:
    FILE_1 = "/home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_KNMI-RACMO2.2b_CTL_ERAINT_MM_50km_1989-2008_tasmax.nc"
    FILE_2 = "/home/abanihi/Documents/Github/spark-xarray/datasets/AFRICA_UC-WRF311_CTL_ERAINT_MM_50km-rg_1989-2008_tasmax.nc"
    
    
    
    In [4]:
    knmi = reader.ncread(sc, FILE_1, mode='single', partition_on=['rlat', 'rlon'], partitions=500, decode_times=False)
    
    
    
    In [5]:
    knmi.first()
    
    
    
    
    Out[5]:
    <xarray.Dataset>
    Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
    Coordinates:
      * time          (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...
      * rlon          (rlon) float32 -24.64
        lon           (rlat, rlon) float32 -24.64
      * rlat          (rlat) float32 -45.76
        lat           (rlat, rlon) float32 -45.76
      * height        (height) float32 2.0
    Dimensions without coordinates: bnds
    Data variables:
        rotated_pole  |S1 b''
        time_bnds     (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...
        tasmax        (time, height, rlat, rlon) float64 283.4 284.2 284.2 284.6 ...
    Attributes:
        institution:     KNMI
        Conventions:     CF-1.0
        conventionsURL:  http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html
        source:          RACMO2.2b
        project_id:      ENSEMBLES
        experiment_id:   ERAIN CORDEX-Africa-50km
        realization:     1
        comments:        beta-version RACMO2.2 with default physics from ECMWF CY...
        creation_date:   2010-04-09 13:53:22
    
    
    In [6]:
    wrf = reader.ncread(sc, FILE_2, mode='single', partition_on=['rlat', 'rlon'], partitions=500, decode_times=False)
    
    
    
    In [7]:
    wrf.first()
    
    
    
    
    Out[7]:
    <xarray.Dataset>
    Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
    Coordinates:
        lon           (rlat, rlon) float64 -24.64
        lat           (rlat, rlon) float64 -45.76
      * height        (height) float32 2.0
      * time          (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...
      * rlat          (rlat) float64 -45.76
      * rlon          (rlon) float64 -24.64
    Dimensions without coordinates: bnds
    Data variables:
        tasmax        (time, height, rlat, rlon) float64 283.4 284.2 284.3 284.6 ...
        rotated_pole  |S1 b''
        time_bnds     (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...
    Attributes:
        Conventions:               CF-1.4
        institution:               Universidad de Cantabria (Spain)
        title:                     CORDEX Africa Sensitivity Run
        comment:                   The simulation was forced with ERA-Interim 2x2...
        nco_openmp_thread_number:  1
    
    
    In [8]:
    %time wrf.count()
    
    
    
    
    CPU times: user 72 ms, sys: 28 ms, total: 100 ms
    Wall time: 35.5 s
    
    Out[8]:
    38994
    
    
    In [12]:
    def create_indices(element):
        lat = round(float(element.rlat.data), 1)
        lon = round(float(element.rlon.data), 1)
        key = (lat, lon)
        return (key, element)
    
    
    
    In [13]:
    knmi2 = knmi.map(create_indices)
    knmi2.first()
    
    
    
    
    Out[13]:
    ((-45.8, -24.6), <xarray.Dataset>
     Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
     Coordinates:
       * time          (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...
       * rlon          (rlon) float32 -24.64
         lon           (rlat, rlon) float32 -24.64
       * rlat          (rlat) float32 -45.76
         lat           (rlat, rlon) float32 -45.76
       * height        (height) float32 2.0
     Dimensions without coordinates: bnds
     Data variables:
         rotated_pole  |S1 b''
         time_bnds     (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...
         tasmax        (time, height, rlat, rlon) float64 283.4 284.2 284.2 284.6 ...
     Attributes:
         institution:     KNMI
         Conventions:     CF-1.0
         conventionsURL:  http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html
         source:          RACMO2.2b
         project_id:      ENSEMBLES
         experiment_id:   ERAIN CORDEX-Africa-50km
         realization:     1
         comments:        beta-version RACMO2.2 with default physics from ECMWF CY...
         creation_date:   2010-04-09 13:53:22)
    
    
    In [14]:
    wrf2 = wrf.map(create_indices)
    wrf2.first()
    
    
    
    
    Out[14]:
    ((-45.8, -24.6), <xarray.Dataset>
     Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
     Coordinates:
         lon           (rlat, rlon) float64 -24.64
         lat           (rlat, rlon) float64 -45.76
       * height        (height) float32 2.0
       * time          (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...
       * rlat          (rlat) float64 -45.76
       * rlon          (rlon) float64 -24.64
     Dimensions without coordinates: bnds
     Data variables:
         tasmax        (time, height, rlat, rlon) float64 283.4 284.2 284.3 284.6 ...
         rotated_pole  |S1 b''
         time_bnds     (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...
     Attributes:
         Conventions:               CF-1.4
         institution:               Universidad de Cantabria (Spain)
         title:                     CORDEX Africa Sensitivity Run
         comment:                   The simulation was forced with ERA-Interim 2x2...
         nco_openmp_thread_number:  1)
    
    
    In [15]:
    rdd = wrf2.join(knmi2, numPartitions=500)
    rdd.first()
    
    
    
    
    Out[15]:
    ((-44.4, -14.1), (<xarray.Dataset>
      Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
      Coordinates:
          lon           (rlat, rlon) float64 -14.08
          lat           (rlat, rlon) float64 -44.44
        * height        (height) float32 2.0
        * time          (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...
        * rlat          (rlat) float64 -44.44
        * rlon          (rlon) float64 -14.08
      Dimensions without coordinates: bnds
      Data variables:
          tasmax        (time, height, rlat, rlon) float64 284.4 285.3 284.2 285.0 ...
          rotated_pole  |S1 b''
          time_bnds     (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...
      Attributes:
          Conventions:               CF-1.4
          institution:               Universidad de Cantabria (Spain)
          title:                     CORDEX Africa Sensitivity Run
          comment:                   The simulation was forced with ERA-Interim 2x2...
          nco_openmp_thread_number:  1, <xarray.Dataset>
      Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
      Coordinates:
        * time          (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...
        * rlon          (rlon) float32 -14.08
          lon           (rlat, rlon) float32 -14.08
        * rlat          (rlat) float32 -44.44
          lat           (rlat, rlon) float32 -44.44
        * height        (height) float32 2.0
      Dimensions without coordinates: bnds
      Data variables:
          rotated_pole  |S1 b''
          time_bnds     (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...
          tasmax        (time, height, rlat, rlon) float64 284.1 285.2 284.2 285.0 ...
      Attributes:
          institution:     KNMI
          Conventions:     CF-1.0
          conventionsURL:  http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html
          source:          RACMO2.2b
          project_id:      ENSEMBLES
          experiment_id:   ERAIN CORDEX-Africa-50km
          realization:     1
          comments:        beta-version RACMO2.2 with default physics from ECMWF CY...
          creation_date:   2010-04-09 13:53:22))
    
    
    In [16]:
    rdd.getNumPartitions()
    
    
    
    
    Out[16]:
    500
    
    
    In [17]:
    rdd.count()
    
    
    
    
    Out[17]:
    38994
    
    
    In [18]:
    a = rdd.first()
    a
    
    
    
    
    Out[18]:
    ((-44.4, -14.1), (<xarray.Dataset>
      Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
      Coordinates:
          lon           (rlat, rlon) float64 -14.08
          lat           (rlat, rlon) float64 -44.44
        * height        (height) float32 2.0
        * time          (time) float64 1.426e+04 1.429e+04 1.432e+04 1.435e+04 ...
        * rlat          (rlat) float64 -44.44
        * rlon          (rlon) float64 -14.08
      Dimensions without coordinates: bnds
      Data variables:
          tasmax        (time, height, rlat, rlon) float64 284.4 285.3 284.2 285.0 ...
          rotated_pole  |S1 b''
          time_bnds     (time, bnds) float64 1.424e+04 1.428e+04 1.428e+04 ...
      Attributes:
          Conventions:               CF-1.4
          institution:               Universidad de Cantabria (Spain)
          title:                     CORDEX Africa Sensitivity Run
          comment:                   The simulation was forced with ERA-Interim 2x2...
          nco_openmp_thread_number:  1, <xarray.Dataset>
      Dimensions:       (bnds: 2, height: 1, rlat: 1, rlon: 1, time: 240)
      Coordinates:
        * time          (time) float64 468.0 469.0 470.0 471.0 472.0 473.0 474.0 ...
        * rlon          (rlon) float32 -14.08
          lon           (rlat, rlon) float32 -14.08
        * rlat          (rlat) float32 -44.44
          lat           (rlat, rlon) float32 -44.44
        * height        (height) float32 2.0
      Dimensions without coordinates: bnds
      Data variables:
          rotated_pole  |S1 b''
          time_bnds     (time, bnds) float64 468.0 469.0 469.0 470.0 470.0 471.0 ...
          tasmax        (time, height, rlat, rlon) float64 284.1 285.2 284.2 285.0 ...
      Attributes:
          institution:     KNMI
          Conventions:     CF-1.0
          conventionsURL:  http://www.cgd.ucar.edu/cms/eaton/cf-metadata/index.html
          source:          RACMO2.2b
          project_id:      ENSEMBLES
          experiment_id:   ERAIN CORDEX-Africa-50km
          realization:     1
          comments:        beta-version RACMO2.2 with default physics from ECMWF CY...
          creation_date:   2010-04-09 13:53:22))
    
    
    In [139]:
    def bias_correct(element):
        import numpy as np
        obs = element[1][1].tasmax.values.ravel()
        mod = element[1][0].tasmax.values.ravel()
        
        cdfn = 30.0
        
        obs = np.sort(obs)
        mod = np.sort(mod)
        
        global_max = max(np.amax(obs), np.amax(mod))
        
        wide = global_max / cdfn
        
        xbins = np.arange(0.0, global_max+wide, wide)
        
        pdfobs, bins = np.histogram(obs, bins=xbins)
        pdfmod, bins = np.histogram(mod, bins=xbins)
        
        cdfobs = np.insert(np.cumsum(pdfobs), 0, 0.0)
        cdfmod = np.insert(np.cumsum(pdfmod), 0, 0.0) 
        
        vals = [150., 256.6, 100000]
        
        def bias_map(vals, xbins, cdfmod, cdfobs):
            xbins = xbins
            cdfmod = cdfmod
            cdfobs = cdfobs
            
            cdf1 = np.interp(vals, xbins, cdfmod)
            
            corrected = np.interp(cdf1, cdfobs, xbins)
            
            return corrected 
    
        results = bias_map(vals, xbins, cdfmod, cdfobs)
            
        return results
    
    
    
    In [140]:
    bias_corrected = rdd.map(bias_correct)
    
    
    
    In [142]:
    bias_corrected.take(10)
    
    
    
    
    Out[142]:
    [array([ 276.68960063,  276.68960063,  286.23062134]),
     array([ 277.6843516 ,  277.6843516 ,  287.25967407]),
     array([ 280.57859904,  280.57859904,  290.25372314]),
     array([ 280.19751383,  280.19751383,  299.52148031]),
     array([ 283.49803975,  283.49803975,  293.27383423]),
     array([ 285.22876485,  285.22876485,  304.89971415]),
     array([ 285.08433126,  285.08433126,  294.91482544]),
     array([ 283.94258016,  283.94258016,  293.73370361]),
     array([ 283.80044759,  283.80044759,  293.58666992]),
     array([ 288.61755575,  288.61755575,  298.56988525])]
    
    
    In [102]:
    bias_corrected.first().mean()
    
    
    
    
    Out[102]:
    282.32231330871582
    
    
    In [ ]: