Calibration by Simulated Annealing


In [ ]:
%pylab notebook
from __future__ import print_function 
import charistools.find_best_model as fbm
import charistools.meltModels as mM
from charistools.modelEnv import ModelEnv
import glob
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import pandas as pd
import pdb
import re
import sys

In [ ]:
# Chooose starting DDFs
ddfs_start = list(mM.RandomNewDDFs(initialize=True))
print("Begin DDFs here:")
print(ddfs_start)

In [ ]:
%cd /work/charis/ti_model/calibrations_correct_cost
%ls

In [ ]:
# Simulated annealing parameters
# Number of cycles
numCycles = 100
# Number of trials per cycle
numTrials = 50
# Number of accepted solutions
na = 0.0
# Probability of accepting worse solution at the start
p1 = 0.7
# Probability of accepting worse solution at the end
p50 = 0.001
# Initial temperature
t1 = -1.0 / math.log(p1)
# Final temperature
t50 = -1.0 / math.log(p50)
# Fraction reduction every cycle
frac = (t50/t1)**(1.0/(numCycles-1.0))
# Initialize DDFs
x = np.zeros((numCycles + 1, 4))
x[0] = ddfs_start
xi = np.zeros(4)
xi = ddfs_start
na = na + 1.0

In [ ]:
# Melt model parameters for a given basin
#drainageid = "IN_Hunza_at_DainyorBridge"
#years = [2001, 2002, 2003]
#drainageid = "AM_Vakhsh_at_Komsomolabad"
#years = [2004, 2005, 2007]
#drainageid = "SY_Naryn_at_NarynTown"
#years = [2003, 2005, 2007]
#drainageid = "GA_Karnali_at_Benighat"
#years = [2001, 2003, 2004]
drainageid = "GA_Narayani_at_Devghat"
years = [2001, 2004, 2005]
#drainageid = "GA_SaptaKosi_at_Chatara"
#years = [2003, 2004, 2005]

nstrikes = 3
ablation_method='grsize_scag'
use_daily_threshold_file=True
rainfall_col='diff_km3'
runoff_col='runoff'
config_file='/projects/brodzik/charis_ti_melt/calibration_modelEnv_config.ini'
myEnv = ModelEnv(tileConfigFile=config_file)

# How far away to look from the previous set of DDFs
neighborhood_mm = 10

# label will be used for plot titles and output filenames
label = "%1dstr DDFnbr=%dmm N%03d M%03d" % ( nstrikes, neighborhood_mm, numCycles, numTrials )
print(label)

In [ ]:
# Calculate the first calibration cost,
# this is "Current best results so far"
xc = np.zeros(4)
xc[:] = x[0]                                                        
result = mM.CalibrationStats(myEnv, drainageid, years, nstrikes,                                                                
                             xi[0], xi[1], xi[2], xi[3], 
                             rainfall_col=rainfall_col, 
                             runoff_col=runoff_col,
                             ablation_method=ablation_method, 
                             use_daily_threshold_file=use_daily_threshold_file)
df, stats_df = mM.CalibrationCost(result)
fc = stats_df.z.iloc[0]
fs = np.zeros(numCycles+1)
fs[0] = fc
# Current temperature
t = t1
# DeltaE Average
DeltaE_avg = 0.0
print("Cost of initial DDFs: %.3f" % fs[0])

In [ ]:
%cd /work/charis/ti_model/calibrations_correct_cost
%ls

In [ ]:
logFile = "%s.%s.stats.txt" % ( drainageid, label.replace(" ", "_") )
logf = open(logFile, "w")
print(logFile)

In [ ]:
mM.SaveCalibrationStats(myEnv, drainageid, nstrikes,result,ablation_method=ablation_method, 
                        use_daily_threshold_file=use_daily_threshold_file, file=sys.stderr,header=True) 
mM.SaveCalibrationStats(myEnv, drainageid, nstrikes,result,ablation_method=ablation_method, 
                        use_daily_threshold_file=use_daily_threshold_file, file=logf,header=True)

In [ ]:
for i in range(numCycles):
    print("Cycle: %d with Temperature: %f" % (i, t))
    
    for j in range(numTrials):
        # Generate new trial DDFs, relative to xc "current best results so far"
        ddfs = list(mM.RandomNewDDFs(min_snow_ddf=xc[0], max_snow_ddf=xc[1], 
                                     min_ice_ddf=xc[2], max_ice_ddf=xc[3],
                                     neighborhood_mm=neighborhood_mm))
        xi[0] = ddfs[0]
        xi[1] = ddfs[1]
        xi[2] = ddfs[2]
        xi[3] = ddfs[3]
        
        # Calculate the cost at the new DDFs
        result = mM.CalibrationStats(
            myEnv, drainageid, years, nstrikes,                                                                
            xi[0], xi[1], xi[2], xi[3], 
            rainfall_col=rainfall_col, 
            runoff_col=runoff_col,
            ablation_method=ablation_method, 
            use_daily_threshold_file=use_daily_threshold_file)
        df, stats_df = mM.CalibrationCost(result)
        
        # Save the calibration stats 
        mM.SaveCalibrationStats(myEnv, drainageid, nstrikes,result,ablation_method=ablation_method, 
                                use_daily_threshold_file=use_daily_threshold_file, file=logf)
        
        trial_cost = stats_df.z.iloc[0]
        print("cycle=%d, trial=%d, ddfs=%.2f, %.2f, %.2f, %.2f, best_cost_so_far=%.3f, trial_cost=%.3f" % (
            i, j, xi[0], xi[1], xi[2], xi[3], fc, trial_cost), file=sys.stderr)
        
        DeltaE = abs(trial_cost - fc)
        if (trial_cost > fc):
            
            # Initialize DeltaE_avg if a worse solution was found
            # on the first iteration
            if (i==0 and j==0): DeltaE_avg = DeltaE
                
            # objective function is worse
            # generate prob of acceptance
            p = math.exp(-DeltaE/(DeltaE_avg * t))
            
            # determine whether to accept worse point
            if (random.random() < p):
                # accept the worse solution
                accept = True
            else:
                # don't accept the worse solution
                accept = False
        else:
            
            # objective function is lower, automatically accept
            accept = True
        
        if (accept == True):
            
            # update the currently accepted solution
            xc[0] = xi[0]
            xc[1] = xi[1]
            xc[2] = xi[2]
            xc[3] = xi[3]
            fc = trial_cost
            
            # increment number of accepted solutions
            na = na + 1.0
            # update DeltaE_avg
            DeltaE_avg = (DeltaE_avg * (na - 1.0) + DeltaE) / na
            
            print("==> Updating best_cost_so_far", file=sys.stderr)
            
    # Record the best x values at the end of every cycle
    x[i+1][0] = xc[0]
    x[i+1][1] = xc[1]
    x[i+1][2] = xc[2]
    x[i+1][3] = xc[3]
    fs[i+1] = fc
    
    # Lower the temperature for the next cycle
    t = frac * t

# Close the logfile
logf.close()

# print solution
print("Best solution: %s" % xc, file=sys.stderr)
print("Best cost    : %f" % fc, file=sys.stderr)

# Coerce answers to DataFrame
caldf = pd.DataFrame(x, columns=['winter_snow_ddf', 'summer_snow_ddf', 'winter_ice_ddf', 'summer_ice_ddf'])
caldf["min_cycle_z"] = fs
caldf

In [ ]:
datafile = "%s.%s.SA_summary.dat" % (drainageid, label.replace(" ", "_"))
caldf.to_pickle(datafile)
print("caldf pickled to %s" % datafile)

In [ ]:
%pwd

plot the progress made by SA


In [ ]:
# summary file is like this: AM_Vakhsh_at_Komsomolabad.3str_DDFnbr=10mm_N050_M050.SA_summary.dat
# filename is parsed for drainageid and label
# output will be written to : drainageid_label.SA_summary.png
def plot_sa_summary(file):
    caldf = pd.read_pickle(file)
    
    best_model = "%.2f_%.2f_%.2f_%.2f" % (
        caldf.iloc[-1].winter_snow_ddf,
        caldf.iloc[-1].summer_snow_ddf,
        caldf.iloc[-1].winter_ice_ddf,
        caldf.iloc[-1].summer_ice_ddf)
    
    # Parse the filename for drainageid and label
    parts = file.split('.')
    drainageid = parts[0]
    label = parts[1]
    
    # Parse the filename for nCycles and nTrials
    p = re.compile(r"_N(\d*)_M(\d*)")
    m = p.search(file)
    numCycles = int(m.group(1))
    numTrials = int(m.group(2))
    
    fig, ax = plt.subplots(2, 1, figsize=(8,6))

    fig.suptitle(
                "%s Best by SA=%s\n(%s)" % (
                drainageid, 
                best_model,
                label),
            fontsize=12)

    caldf["min_cycle_z"].plot(ax=ax[0], style='k.-')
    ax[0].legend(["Cost"])
    ax[0].set_ylabel('Cost')

    ddfs = caldf.drop(labels='min_cycle_z', axis=1)
    ddfs.plot(ax=ax[1], style=['b.-', 'b--', 'r.-', 'r--'])
    handles, labels = ax[1].get_legend_handles_labels()
    ax[1].legend(handles[::-1], labels[::-1])
    ax[1].set_xlabel('Calibration Cycle (%d trials/cycle)' % numTrials)
    ax[1].set_ylabel('DDF ($mm$)')
    
    fig.tight_layout()
    fig.subplots_adjust(top=0.86)

    outfile = "%s.%s.z%.3f_Best%s.SA_summary.png" % (
        drainageid, label.replace(" ", "_"), caldf.iloc[-1].min_cycle_z, best_model)
    plt.savefig(outfile)
    print("Plot saved to %s" % outfile)
    outlink = "%s.%s.SA_summary.png" % (drainageid, label.replace(" ", "_"))
    if os.path.exists( outlink):
        os.remove( outlink )
    os.symlink( outfile, outlink )

In [ ]:
plot_sa_summary(datafile)

plot the complete solution space that was investigated


In [ ]:
#caldf = fbm.get_calibration_stats(logFile, verbose=1)
#caldf["stats"]
%pwd

In [ ]:
#def plot_solution_space(caldf, drainageid, nstrikes, label):
def plot_solution_space(file):
    
    # Parse the filename for drainageid and label
    parts = file.split('.')
    drainageid = parts[0]
    label = parts[1]
    
    # Parse the label for nstrikes
    parts = label.split('str')
    nstrikes=int(parts[0])
    
    # Read the data 
    caldf = fbm.get_calibration_stats(file)
    
    # Reverse the order of rows, so smallest-ordered values are plotted last
    # Best model is now last in data frame
    df = caldf["stats"][::-1]
    best_i = -1

    fig, ax = plt.subplots(2, 3, figsize=(9,6))

    fig.suptitle(
        "%s Best=%s \n(%d strikes, RMSE=%.3f $km^3$, Voldiff=%.3f%%, numModels=%d)" % (
            drainageid, df.index[-1], 
            nstrikes,
            df["monthly_rmse_km3"].iloc[-1], 
            df["annual_voldiff_pcent"].iloc[-1],
            len(df.index)),
        fontsize=12)

    surfaces = ["ice", "snow"]
    markers =  ["^", "o"]

    for i, surface in enumerate(surfaces):
    
        row = i
        min_ddf = "min_%s_ddf" % surface
        max_ddf = "max_%s_ddf" % surface
        min_ddf_label = "winter_%s_ddf" % surface
        max_ddf_label = "summer_%s_ddf" % surface
        
        # Upper Left: RMSE as function of min/max snow ddf 
        col = 0
        rmse_plot = ax[row, col].scatter(df[min_ddf],
                                 df[max_ddf],
                                 c=df["monthly_rmse_km3"],
                                 cmap="Greens",
                                 edgecolor="",
                                 marker=markers[i])
        ax[row, col].plot([df[min_ddf].iloc[best_i]],
                          [df[max_ddf].iloc[best_i]],
                          marker="x",
                          color="red")
        ax[row, col].set_title('Monthly RMSE ($km^3$)')
        ax[row, col].set_xlabel(min_ddf_label)
        ax[row, col].set_ylabel(max_ddf_label)
        ax[row, col].axis([0., 60., 0., 60.])
        ax[row, col].set_aspect('equal', 'box')
        fig.colorbar(rmse_plot, ax=ax[row,col])

        # Middle: Annual volume difference
        col = 1
        vmax = np.max(df["abs_voldiff_pcent"])
        vmax = 10.0
        vmin = -1.0 * vmax
        voldiff_plot = ax[row, col].scatter(df[min_ddf],
                                            df[max_ddf],
                                            c=df["annual_voldiff_pcent"],
                                            cmap="BrBG_r",
                                            vmin=vmin,
                                            vmax=vmax,
                                            edgecolor="",
                                            marker=markers[i])
        ax[row, col].plot([df[min_ddf].iloc[best_i]],
                          [df[max_ddf].iloc[best_i]],
                          marker="x",
                          color="red")
        
        ax[row, col].set_title('Annual vol diff (%)')
        ax[row, col].set_xlabel(min_ddf_label)
        ax[row, col].set_ylabel(max_ddf_label)
        ax[row, col].axis([0., 60., 0., 60.])
        ax[row, col].set_aspect('equal', 'box')
        fig.colorbar(voldiff_plot, ax=ax[row,col])
        
        # Right: Combined Z score
        col = 2
        vmax = np.max(df["z"])
        vmax = 200.0;
        vmin = 0.0
        z_plot = ax[row, col].scatter(df[min_ddf],
                                      df[max_ddf],
                                      c=df["z"],
                                      cmap="Greys",
                                      vmin=vmin,
                                      vmax=vmax,
                                      edgecolor="",
                                      marker=markers[i])
        ax[row, col].plot([df[min_ddf].iloc[best_i]],
                          [df[max_ddf].iloc[best_i]],
                          marker="x",
                          color="red")
        ax[row, col].set_title('Z')
        ax[row, col].set_xlabel(min_ddf_label)
        ax[row, col].set_ylabel(max_ddf_label)
        ax[row, col].axis([0., 60., 0., 60.])
        ax[row, col].set_aspect('equal', 'box')
        fig.colorbar(z_plot, ax=ax[row,col])


    fig.tight_layout()
    fig.subplots_adjust(top=0.86)

    outfile = "%s.%s.z%.3f_Best%s.stats.png" % (
        drainageid, label.replace(" ", "_"), df["z"].iloc[best_i], df.index[-1])
    plt.savefig(outfile)
    print("Image saved to %s" % outfile)
    outlink = "%s.%s.stats.png" % (drainageid, label.replace(" ", "_"))
    if os.path.exists( outlink):
        os.remove( outlink )
    os.symlink( outfile, outlink )

In [ ]:
plot_solution_space(logFile)

In [ ]:
%pwd

Redo the whole batch of SA_summary plots


In [ ]:
list = glob.glob("GA_Sap*2str*SA_summary.dat")
print(list)
for file in list:
    plot_sa_summary(file)

Redo the whole batch of solution space plots


In [ ]:
%cd /work/charis/ti_model/calibrations_correct_cost
%ls

In [ ]:
list = glob.glob("GA_Sap*2str*stats.txt")
list

In [ ]:
for file in list:
    plot_solution_space(file)

In [ ]: