In [2]:
%%writefile helper_functions.py
import smtplib
import pandas as pd
import math
import numpy as np
import operator
import string


import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import time
import datetime
import os
import sys
import threading
from functools import wraps

import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, roc_auc_score,log_loss
from sklearn.model_selection import GridSearchCV

from IPython.core.debugger import Tracer


from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.mime.image import MIMEImage

from os.path import basename


import cProfile
import pstats
from io import StringIO
import marshal
import tempfile
import pprint
import psutil
import re
import seaborn as sns

import objgraph

import pickle
from os.path import exists


# Bit-serializes any object. Creates a new file if none exists, else appends.
# Returns the unpickled dictionary if only a path is supplied.
# Stores the object in a dictionary under obj_key.
def pickler(path, obj_to_pickle = None, obj_key = None):
    """Read (and optionally extend) a dict of pickled objects stored at *path*.

    If the file exists, its dict is loaded first. When *obj_to_pickle* is
    supplied it is stored under *obj_key* and the whole dict is written back
    with the highest pickle protocol.

    :param path: file holding the pickled dictionary
    :param obj_to_pickle: object to add/overwrite; None means read-only
    :param obj_key: dictionary key to store the object under
    :return: the (possibly updated) dictionary of all pickled objects
    :raises Exception: re-raises any read/write failure after printing it
    """
    save = {}

    if exists(path):
        try:
            # context managers guarantee the handle closes even on error
            with open(path, 'rb') as f:
                save = pickle.load(f)
        except Exception as e:
            # bug fix: the original printed context['pickle'] here, but
            # `context` is undefined in this scope -- the resulting NameError
            # masked the real failure
            print('Unable to read data from', path, ':', e)
            raise

    # bug fix: `if obj_to_pickle:` skipped falsy-but-valid objects
    # (0, '', [], {}); only None means "read-only call"
    if obj_to_pickle is not None:
        save.update({obj_key: obj_to_pickle})

        try:
            with open(path, 'wb') as f:
                pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
        except Exception as e:
            print('Unable to save data to', path, ':', e)
            raise

    return save




# Writes a dictionary to the file at supplied path.
# Optional description text describing the dictionary

def write_dict(d, path, description =''):
    """Append dictionary *d* to the text file at *path*, framed by divider
    lines, with an optional *description* heading.

    Nested dict values are written recursively (without a description);
    everything else is written as 'key : value'. Returns *d* unchanged.
    """
    divider = '-------------------------------\n'
    with open(path, "a") as out:
        out.write(divider)
        out.write(description + '\n')
        for key, value in d.items():
            if isinstance(value, dict):
                # recurse for nested dictionaries
                write_dict(value, path)
            else:
                out.write(str(key) + ' : ' + str(value) + '\n')
        out.write(divider)
    return d



def get_new_context(version_list):
    """Build a fresh run context: create the per-run paths, pickle the
    context, snapshot the heap, and record installed package versions plus
    the context itself in the run summary file. Returns the context dict."""
    ctx = fetch_paths()
    pickler(ctx['pickle'], ctx, 'run context')
    objects_growth(ctx['summary'], 'Beginning Heap')
    versions = version_list.__dict__['packages']
    write_dict({'Installed Versions': versions}, ctx['summary'], 'Software Versions')
    write_dict(ctx, ctx['summary'], 'Run time Context')
    return ctx

def load_dataset(name, context):
    """Load a notMNIST-style pickle named *name* and flatten each split to
    2-D (rows x 784) arrays.

    The pickle is expected to contain 'train_dataset', 'valid_dataset',
    'test_dataset' and matching '*_labels' entries (28x28 images).
    Shapes of every split are logged to the run summary file.

    :param name: pickle filename under the hard-coded udacity examples dir
    :param context: run-context dict (needs 'summary')
    :return: ([train, valid, test] datasets, [train, valid, test] labels)
    """
    load_stats = {}
    image_size = 28  # images are 28x28 pixels

    data_pickle_path = '../../../../tensorflow/tensorflow/examples/udacity/' + name

    with open(data_pickle_path, 'rb') as f:
        data = pickle.load(f)

    train_dataset = data['train_dataset']
    train_dataset = train_dataset.reshape(train_dataset.shape[0], image_size * image_size)

    valid_dataset = data['valid_dataset']
    valid_dataset = valid_dataset.reshape(valid_dataset.shape[0], image_size * image_size)

    test_dataset = data['test_dataset']
    # bug fix: the original reshaped the test set using the VALIDATION set's
    # row count (length = valid_dataset.shape[0]), which crashes or corrupts
    # the data whenever the two splits differ in size
    test_dataset = test_dataset.reshape(test_dataset.shape[0], image_size * image_size)

    valid_labels = data['valid_labels']
    train_labels = data['train_labels']
    test_labels = data['test_labels']

    # be nice to your RAM
    del data

    load_stats.update({'training dataset': train_dataset.shape})
    load_stats.update({'training labels': train_labels.shape})
    load_stats.update({'validations dataset': valid_dataset.shape})
    load_stats.update({'validation labels': valid_labels.shape})
    load_stats.update({'test dataset': test_dataset.shape})
    load_stats.update({'test labels': test_labels.shape})

    ############## WRITE TO SUMMARY FILE
    write_dict(load_stats, context['summary'], 'Dataset Details')

    datasets = [train_dataset, valid_dataset, test_dataset]
    labels = [train_labels, valid_labels, test_labels]
    return datasets, labels



#computes max_memory and cpu usage from dictionary of measured results 
def max_stats(profile_results, context):
    """Reduce a dict of polled system profiles to peak CPU and memory
    figures, log them to the summary file, pickle them, and return them.

    Every value except the special 'max_memory' entry is expected to be a
    sample dict from poll_system_profile; the numeric indexes below address
    fields of psutil's virtual_memory / swap_memory tuples.
    """
    samples = [v for k, v in profile_results.items() if k != 'max_memory']

    cpu = [s['all_cpu'] for s in samples]
    total_mem = [s['memory'][0] for s in samples]
    used_mem = [s['memory'][3] for s in samples]
    active_mem = [s['memory'][5] for s in samples]
    buffered_mem = [s['memory'][7] for s in samples]
    cached_mem = [s['memory'][8] for s in samples]
    shared_mem = [s['memory'][9] for s in samples]
    swap_mem = [s['swap'][0] for s in samples]

    # insertion order matters: write_dict emits keys in this order
    return_dict = {
        'max_cpu': np.max(cpu),
        'total_memory': convert_size(np.max(total_mem)),
        'max_used_memory': convert_size(np.max(used_mem)),
        'max_active_memory': convert_size(np.max(active_mem)),
        'max_buffered_memory': convert_size(np.max(buffered_mem)),
        'max_cached_memory': convert_size(np.max(cached_mem)),
        'max_shared_memory': convert_size(np.max(shared_mem)),
        'max_swapped_memory': convert_size(np.max(swap_mem)),
        'max_thread_memory': profile_results['max_memory'],
    }

    write_dict(return_dict, context['summary'], 'Maximum Usage Stats')
    pickler(context['pickle'], return_dict, 'max stats')

    return return_dict



# Sends an email to self, from self, with the given subject and body.
# Files to attach can be passed as a list via the 'files' argument.
def send_email(subject, body, version_list_html='', files=None, context = None):
    """Email *body* (HTML) to self over SMTP/TLS, with an optional software
    version footer and optional file attachments.

    :param subject: message subject line
    :param body: HTML message body
    :param version_list_html: HTML fragment appended below a horizontal rule
    :param files: optional list of file paths to attach
    :param context: unused here; kept for interface compatibility
    """
    fromaddr = 'abhijeet.jha@gmail.com'
    toaddr = 'abhijeet.jha@gmail.com'

    msg = MIMEMultipart()
    msg['From'] = fromaddr
    msg['To'] = toaddr
    msg['Subject'] = subject

    msg.attach(MIMEText(body, 'html'))

    # bug fix: footer read "<br>><hr>" -- the stray '>' rendered as a literal
    # character in the delivered message
    footer = "<br><hr>" + version_list_html
    msg.attach(MIMEText(footer, 'html'))

    # attach each requested file as an application part
    for f in files or []:
        with open(f, "rb") as fil:
            part = MIMEApplication(
                fil.read(),
                Name=basename(f)
            )
            part['Content-Disposition'] = 'attachment; filename="%s"' % basename(f)
            msg.attach(part)

    smtp_server = 'email-smtp.us-east-1.amazonaws.com'
    smtp_port = '587'
    # bug fix: the original hard-coded (since-redacted) credentials on one
    # garbled, syntactically invalid line; read them from the environment
    # instead of committing secrets to source control
    smtp_username = os.environ.get('SMTP_USERNAME', '')
    smtp_password = os.environ.get('SMTP_PASSWORD', '')

    # Also removed here: an unused inner `prompt` helper that called the
    # Python-2-only `raw_input` (a NameError on Python 3).
    server = smtplib.SMTP(
        host = smtp_server,
        port = smtp_port,
        timeout = 10
        )
    try:
        server.starttls()
        server.ehlo()
        server.login(smtp_username, smtp_password)
        server.sendmail(fromaddr, toaddr, msg.as_string())
    finally:
        # resource fix: the original never closed the SMTP connection
        server.quit()

    
# Create HTML markup for a dictionary.
# Note - doesn't work with nested dictionaries.
# TODO - write an iterator
def dict_to_html(dict):
    """Render a flat dict of equal-length list values as a borderless,
    full-width HTML table with hidden header cells.

    Does not handle nested dictionaries. The parameter name shadows the
    builtin ``dict``; kept unchanged for interface compatibility.
    """
    df = pd.DataFrame(dict)
    outhtml = df.to_html(na_rep = "", index = True).replace('border="1"','border="0"')
    outhtml = outhtml.replace('<th>','<th style = "display: none">')
    outhtml = outhtml.replace('<td>','<td style= "padding: 8px;text-align: left;border-bottom: 1px solid #ddd;;">')
    # bug fix: replacing bare 'table' also mangled the closing tag into
    # '</table width = "100%">'; anchor the replacement on the opening tag
    outhtml = outhtml.replace('<table','<table width = "100%"')
    return outhtml


def convert_size(size_bytes):
    """Render a byte count as a human-readable string, e.g. 1536 -> '1.5 KB'."""
    if size_bytes == 0:
        return "0B"
    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # pick the largest power of 1024 that fits
    exponent = int(math.floor(math.log(size_bytes, 1024)))
    scaled = round(size_bytes / math.pow(1024, exponent), 2)
    return "%s %s" % (scaled, units[exponent])

# dataset and labels are of type np.ndarray, returned by merge_dataset()
def randomize(dataset, labels):
    """Shuffle the rows of a 3-D dataset and its labels with one shared
    random permutation, keeping each row paired with its label."""
    order = np.random.permutation(labels.shape[0])
    return dataset[order, :, :], labels[order]


def fetch_paths():
    """Create (if needed) the per-day output directories and return a
    context dict with all paths and per-run filenames.

    Directory names are keyed by today's date; filenames additionally carry
    the current UTC time so each run gets unique summary/pickle/stats files.

    :return: dict mapping logical names ('summary', 'pickle', ...) to paths
    """
    today = datetime.date.today().strftime("%Y%m%d")
    now = time.strftime("%H%M%S", time.gmtime())

    model_root = os.path.join('savedmodels/' + today + '/')
    log_root = os.path.join('logs/' + today + '/')
    stats_root = os.path.join('stats/' + today + '/')
    runprofiles_root = os.path.join('runprofiles/' + today + '/')
    pickled_root = os.path.join('runpickles/' + today + '/')
    plot_root = os.path.join('plots/' + today + '/' + 'run_' + now + '/')

    # robustness fix: exist_ok avoids the check-then-create race of the
    # original `if not exists: makedirs` pairs
    for root in (model_root, log_root, stats_root,
                 runprofiles_root, pickled_root, plot_root):
        os.makedirs(root, exist_ok=True)

    return {
        'log path': log_root,
        'plot path': plot_root,
        'model path': model_root,
        'stats path': stats_root,
        'runprofiles path': runprofiles_root,
        'run date': today,
        'run time': now,
        'summary': runprofiles_root + 'summary_' + today + now + '.txt',
        'pickle': pickled_root + 'run_' + today + now,
        'modelpickles': model_root + 'pickled_' + today + now,
        'statsfile': stats_root + 'stats_' + today + now,
    }


def html_class_name(class_name):
    """Strip angle brackets, quotes, spaces, dots and colons from a class
    repr (e.g. "<class 'foo.Bar'>" -> "classfooBar") so the result is safe
    to use as an HTML identifier."""
    # one C-level pass instead of six chained .replace() calls
    return class_name.translate(str.maketrans('', '', "<>' .:"))



# Routine to add commas to a float string
def commify3(amount):
    """Insert thousands separators into the integer part of a number string,
    e.g. 1234567 -> '1,234,567'."""
    # work on the reversed string so groups count from the right;
    # the lookaheads skip digits that belong to a decimal fraction
    reversed_digits = str(amount)[::-1]
    grouped = re.sub(r"(\d\d\d)(?=\d)(?!\d*\.)", r"\1,", reversed_digits)
    return grouped[::-1]



def save_summary(context, stats_file_path):
    """Append the top-15 cumulative-time entries of a cProfile stats dump at
    *stats_file_path* to the run summary file named in *context*."""
    with open(os.path.join(context['summary']), 'a') as stream:
        profile = pstats.Stats(stats_file_path, stream=stream)
        # print_stats writes into `stream`; pformat of its return value is a
        # no-op kept from the original
        pprint.pformat(profile.strip_dirs().sort_stats('cumtime').print_stats(15))




def poll_system_profile(context, interval=0.0):
    """Sample system-wide CPU, memory and swap usage via psutil, append the
    sample to the run summary file, and return it as a dict.

    With interval > 0 the psutil percentage calls block for that many
    seconds while measuring; the default 0.0 returns immediately.
    """
    # per-CPU utilisation percentages, and their mean for the whole box
    per_cpu = psutil.cpu_percent(interval, percpu=True)
    overall_cpu = np.mean(list(per_cpu))

    # kept for its measurement side effect (and the original's timing when
    # interval > 0); the detailed breakdown itself was never used
    psutil.cpu_times_percent(interval, percpu=False)

    mem = psutil.virtual_memory()
    swap = psutil.swap_memory()

    stamp = datetime.date.today().strftime("%Y%m%d") + time.strftime("%H%M%S", time.gmtime())
    sample = {
        'profile_time': stamp,
        'all_cpu': overall_cpu,
        'per_cpu': per_cpu,
        'memory': mem,
        'swap': swap,
    }

    write_dict(sample, context['summary'], 'Usage Logging')

    return sample

   

def measure_memory_usage(context, target_call, target_args, log_interval=30, log_filename=None, memory_usage_refresh=0.01):
    """Run target_call(*target_args) on a worker thread while the main
    thread polls this process's memory usage.

    Process RSS is sampled every *memory_usage_refresh* seconds and a full
    system profile (poll_system_profile) is logged roughly every
    *log_interval* seconds. When the worker finishes, peak-usage stats are
    written to the summary/pickle files via max_stats and returned.

    :param context: run-context dict (needs 'summary' and 'pickle' paths)
    :param target_call: function to measure
    :param target_args: tuple of arguments for target_call
    :param log_interval: seconds between full system-profile samples
    :param log_filename: unused  # NOTE(review): dead parameter, kept for interface compatibility
    :param memory_usage_refresh: seconds between RSS polls
    :return: dict of peak usage stats from max_stats
    """


    # Thread subclass with startup/cleanup hooks and a shutdown flag the
    # polling loop below can watch.
    class StoppableThread(threading.Thread):
        def __init__(self, target, args):
            super(StoppableThread, self).__init__(target=target, args=args)
            # daemon thread: won't keep the interpreter alive on its own
            self.daemon = True
            self.__monitor = threading.Event()
            self.__monitor.set()
            self.__has_shutdown = False

        def run(self):
            '''Overloads the threading.Thread.run'''
            # Call the User's Startup functions
            self.startup()

            # use the run method from Superclass threading.Thread
            super(StoppableThread, self).run()

            # Clean up
            self.cleanup()

            # Flag to the outside world that the thread has exited
            # AND that the cleanup is complete
            self.__has_shutdown = True

        def stop(self):
            self.__monitor.clear()

        def isRunning(self):
            return self.__monitor.isSet()

        def isShutdown(self):
            return self.__has_shutdown

        def mainloop(self):
            '''
            Expected to be overwritten in a subclass!!
            Note that Stoppable while(1) is handled in the built in "run".
            '''
            pass

        def startup(self):
            '''Expected to be overwritten in a subclass!!'''
            pass

        def cleanup(self):
            '''Expected to be overwritten in a subclass!!'''
            pass

    # Concrete worker: just announces start/completion around the target call.
    class MyLibrarySniffingClass(StoppableThread):
        def __init__(self, target, args):
            super(MyLibrarySniffingClass, self).__init__(target=target, args=args)
            self.target_function = target
            self.results = None

        def startup(self):
            # Overload the startup function
            print ("Calling the Target Library Function...")

        def cleanup(self):
            # Overload the cleanup function
            print ("Library Call Complete")

        #process = psutil.Process(os.getpid())


    process = psutil.Process(os.getpid())
    my_thread = MyLibrarySniffingClass(target_call, target_args)

    run_profile ={}
    # baseline taken with USS (memory unique to this process)
    start_mem = process.memory_full_info().uss  #uss

    # one full system profile before the worker starts
    sys_profile = poll_system_profile(context, interval=0.1)
    print ("Written to summary File")

    run_profile.update({time.strftime("%H:%M:%S",time.gmtime()): sys_profile})

    my_thread.start()
    delta_mem = 0
    max_memory = 0
    last_run=time.time()

    # polling loop: track peak RSS growth; take a full profile every
    # ~log_interval seconds, until the worker thread signals shutdown
    while(True):
        time.sleep(memory_usage_refresh)
        cur_time = time.time()
        del_time = cur_time - last_run

        if round(del_time) > log_interval:
            sys_profile = poll_system_profile(context)
            print ("Written to summary File")
            last_run = cur_time
            run_profile.update({time.strftime("%H:%M:%S",time.gmtime()): sys_profile})
            #print(run_profile)

        # NOTE(review): growth is measured as RSS minus the USS baseline --
        # mixing the two metrics; confirm this is intentional
        current_mem = process.memory_info().rss
        delta_mem = current_mem - start_mem
        if delta_mem > max_memory:
            max_memory = delta_mem

        if my_thread.isShutdown():
            print ("Memory measurement complete!")
            break

    # final USS sample after the worker has finished
    current_mem = process.memory_full_info().uss  #uss
    delta_mem = current_mem - start_mem
    if delta_mem > max_memory:
        max_memory = delta_mem

    # NOTE(review): message says MB but convert_size picks its own unit
    print ("MAX Memory Usage in MB: {}".format( convert_size(max_memory)))

    run_profile.update({time.strftime("%H:%M:%S",time.gmtime()): sys_profile})
    run_profile.update({'max_memory': convert_size(max_memory)})

    # reduce all samples to peak stats; also logs and pickles them
    written = max_stats(run_profile, context)

    return written



def objects_growth(path, description = ''):
    """Append an objgraph object-growth snapshot (plus the Python version)
    to the file at *path*.

    objgraph.show_growth prints directly to stdout, so stdout is redirected
    to the target file for the duration of the call.

    :param path: file to append the snapshot to
    :param description: heading line written before the snapshot
    """
    orig_stdout = sys.stdout
    try:
        with open(path, 'a') as f:
            sys.stdout = f
            print(description)
            print(sys.version)
            print("---------------")
            print("Object Growth")
            # show_growth prints its table itself and returns None
            print(objgraph.show_growth())
            f.flush()
    finally:
        # bug fix: the original restored sys.stdout only on the success
        # path; a failed write left stdout pointing at a closed file
        sys.stdout = orig_stdout


    
# parameter_ranges = {
#     'colsample_bylevel': [0.4, 1.0],
#     'colsample_bytree': [0.4, 1.0],
#     'subsample': [0.4, 1.0],

#     'learning_rate': [0, 1],
#     'n_estimators': [15, 1000],
    
#     'max_depth': [1,15],
#     'min_child_weight': [1,15],
#     'gamma': [0, 1],

#     'reg_alpha': [-3,2],   #powers of 10
#     'reg_lambda': [-3,2]}  #powers of 10



def _rand_in(left, right):
    """Return *left* when the bounds coincide, else a random integer drawn
    from [left, right) via np.random.randint (mirrors the original's
    repeated `if left == right` pattern, including float bounds)."""
    if left == right:
        return left
    return np.random.randint(left, right)


def tuner(diagnosis, in_parameters, parameter_ranges, context):
    """Randomly nudge XGBoost hyper-parameters toward more regularization
    ('High Variance') or more capacity ('High Bias').

    Fractional parameters are sampled on a 0-100 integer grid and divided
    back by 100; reg_alpha/reg_lambda are sampled as powers of 10.

    :param diagnosis: 'High Variance', 'High Bias', or anything else (no-op)
    :param in_parameters: current parameter dict (not modified)
    :param parameter_ranges: dict of [low, high] bounds per parameter
    :param context: unused; kept for interface compatibility
    :return: a new parameter dict
    """
    out_parameters = in_parameters.copy()

    if diagnosis == 'High Variance':

        # sampling fractions: move toward the range minimum (more regularization)
        for key in ('colsample_bylevel', 'colsample_bytree', 'subsample'):
            new_value = _rand_in(min(parameter_ranges[key]) * 100,
                                 in_parameters[key] * 100)
            out_parameters[key] = new_value / 100

        # shallower trees -> less model complexity
        out_parameters['max_depth'] = _rand_in(min(parameter_ranges['max_depth']),
                                               in_parameters['max_depth'])

        # larger min_child_weight -> less model complexity
        out_parameters['min_child_weight'] = _rand_in(
            in_parameters['min_child_weight'],
            max(parameter_ranges['min_child_weight']))

        # larger gamma -> less model complexity
        out_parameters['gamma'] = _rand_in(in_parameters['gamma'] * 100,
                                           max(parameter_ranges['gamma']) * 100) / 100

        # L1/L2 strength sampled as powers of 10, moving up from the current
        # exponent (log10(1/x) * -1 == log10(x), kept as in the original)
        for key in ('reg_alpha', 'reg_lambda'):
            left = np.log10(1.0 / in_parameters[key]) * -1
            right = max(parameter_ranges[key])
            if left == right:
                new_value = np.power(10, float(left))
            else:
                new_value = np.power(10, float(np.random.randint(left, right)))
            out_parameters[key] = round(new_value, 4)

    if diagnosis == 'High Bias':

        # sampling fractions: move toward the range maximum (less
        # regularization); +1 makes the top of the range reachable
        for key in ('colsample_bylevel', 'colsample_bytree', 'subsample'):
            new_value = _rand_in(in_parameters[key] * 100,
                                 max(parameter_ranges[key]) * 100 + 1)
            out_parameters[key] = new_value / 100

        # deeper trees -> more model complexity
        out_parameters['max_depth'] = _rand_in(in_parameters['max_depth'],
                                               max(parameter_ranges['max_depth']) + 1)

        # smaller min_child_weight -> more model complexity
        out_parameters['min_child_weight'] = _rand_in(
            min(parameter_ranges['min_child_weight']),
            in_parameters['min_child_weight'])

        # smaller gamma -> more model complexity
        out_parameters['gamma'] = _rand_in(min(parameter_ranges['gamma']) * 100,
                                           in_parameters['gamma'] * 100) / 100

        # L1/L2 strength: sample an exponent below the current one
        for key in ('reg_alpha', 'reg_lambda'):
            right = np.log10(1.0 / in_parameters[key]) * -1 + 1
            left = min(parameter_ranges[key])
            if left == right:
                new_value = np.power(10, left)
            else:
                new_value = np.power(10, float(np.random.randint(left, right)))
            out_parameters[key] = round(new_value, 4)

    return out_parameters



    
    
    
    
def meter(result, context, threshold, parameter_ranges, human_accuracy = 1.0):
    """Diagnose a model from its (train, valid, test) error rates.

    Bias is the gap between human_accuracy and training accuracy; variance
    is the gap between validation and training accuracy. Returns 'tuned',
    'High Variance', or 'High Bias'. (context and parameter_ranges are
    accepted for interface compatibility but unused.)
    """
    train_accuracy = 1.0 - result[0]
    valid_accuracy = 1.0 - result[1]

    bias = abs(human_accuracy - train_accuracy)
    variance = abs(valid_accuracy - train_accuracy)

    # guard clauses: bias dominates the diagnosis, then variance
    if bias >= threshold:
        return 'High Bias'
    if variance >= threshold:
        return 'High Variance'
    return 'tuned'
    

    
    
    
def ngtuner( datasets, context, threshold, parameter_ranges, training_results, interval = 0 ):
    """Build an xgboost training callback implementing Andrew Ng's
    bias/variance tuning recipe.

    Every *interval* boosting rounds the callback reads the latest
    train/valid/test merror from *training_results*, diagnoses the model
    via meter(), draws new hyper-parameters via tuner(), persists them in
    the run pickle, logs everything to the summary file, and applies them
    to the live booster with set_param.

    :param datasets: unused here; kept for interface compatibility
    :param context: run-context dict (needs 'pickle' and 'summary')
    :param threshold: bias/variance threshold forwarded to meter()
    :param parameter_ranges: bounds dict forwarded to tuner()
    :param training_results: evals_result dict that xgb.train fills in-place
    :param interval: tuning period in iterations; 0 disables tuning
    :return: callback suitable for xgb.train(callbacks=[...])
    """
    # record of the iterations at which tuning fired (0 = start)
    intervals =[0]
    pickled = pickler(context['pickle'], intervals, 'tuner intervals')
    def callback(env):

        if interval > 0 and env.iteration > 0 and env.iteration % interval == 0:


            intervals.append(int(env.iteration))
            pickled = pickler(context['pickle'], intervals, 'tuner intervals')
            bst, i, n = env.model, env.iteration, env.end_iteration
            # latest multi-class error per split (filled in by xgb.train)
            train_err= training_results['Train']['merror'][env.iteration-1]
            valid_err= training_results['Valid']['merror'][env.iteration-1]
            test_err= training_results['Test']['merror'][env.iteration-1]



            avg = [train_err, valid_err, test_err]

            write_dict({'train, valid, test': avg}, context['summary'],' merrors')

            # diagnose, re-draw parameters, persist and apply them
            diag = meter(avg, context, threshold, parameter_ranges, human_accuracy = 1.0)
            write_dict({'diagnosis': diag}, context['summary'],' Diagnosis')

            pickled = pickler(context['pickle'])
            parameters = pickled['optimal parameters']
            new_params = tuner(diag, parameters, parameter_ranges, context)
            pickled = pickler(context['pickle'], new_params, 'optimal parameters')

            write_dict({'iteration': env.iteration}, context['summary'],' Tuning Iteration')
            write_dict(new_params, context['summary'],' Updated Parameters')

            # push the new parameters into the live booster
            bst.set_param(new_params)


    # ask xgboost to invoke the callback before each iteration
    callback.before_iteration = True

    return callback


    
    
    
def modelfit(params, datasets, labels, context, title, parameter_ranges, interval = 0, threshold = 0.10, useTrainCV=True, early_stopping_rounds=20):
    """Train an XGBoost model on (train, valid, test) splits, optionally
    self-tuning via the ngtuner callback, and record results.

    Logs parameters, timings and results to the summary file, pickles the
    trained booster and the final 'optimal parameters' (with n_estimators
    set to the best iteration), and plots the accuracy curves via plotCV.

    NOTE(review): with useTrainCV=False the later code still references
    cvresult / cv_time / training_results and the 'tuner intervals' pickle
    entry, none of which exist on that path -- it would raise NameError/
    KeyError. Only the useTrainCV=True path appears to be exercised.

    :param params: initial XGBoost parameter dict (must hold 'n_estimators')
    :param datasets: [train, valid, test] feature arrays
    :param labels: [train, valid, test] label arrays
    :param context: run-context dict ('pickle', 'summary', 'modelpickles')
    :param title: run title written to the summary file
    :param parameter_ranges: bounds dict forwarded to ngtuner/tuner
    :param interval: tuning period in rounds; 0 trains without tuning
    :param threshold: bias/variance threshold for the tuner
    :param useTrainCV: whether to run the training/tuning phase
    :param early_stopping_rounds: xgb.train early-stopping patience
    :return: the updated run pickle dict
    """

    try:
        train_dataset= datasets[0]
        train_labels = labels[0]

    except Exception as e:
        print('Unable to save data to load training samples', e)
        raise

    valid_dataset = datasets[1]
    test_dataset = datasets[2]


    valid_labels = labels[1]
    test_labels = labels[2]


    run_stats={}
    optimal_boosters = 0
    #num_class = num_labels


    if useTrainCV:


        xgb_param = params
        # seed the 'tuner intervals' entry the tuner callback appends to
        intervals =[interval]
        pickled = pickler(context['pickle'], intervals, 'tuner intervals')

        write_dict({'        ' : title}, context['summary'],' ')
        write_dict(xgb_param, context['summary'],' Initial Parameters')
        #xgb_param.update({'num_class': num_class})
        updated_pickle = pickler(context['pickle'], xgb_param, 'optimal parameters')


        # wrap splits as DMatrix; eval names feed training_results keys
        xgtrain = xgb.DMatrix(train_dataset,label=train_labels)
        xgvalid = xgb.DMatrix(valid_dataset,label=valid_labels)
        xgtest = xgb.DMatrix(test_dataset,label=test_labels)

        xgdataset = [(xgtrain, 'Train'), (xgtest, 'Test'), (xgvalid, 'Valid') ]
        training_results ={}



        cv_start_time = time.time()
        # interval == 0: plain training; otherwise attach the self-tuning
        # ngtuner callback that adjusts parameters every `interval` rounds
        if interval ==0:
            cvresult = xgb.train(xgb_param, xgtrain, num_boost_round=params['n_estimators'], evals = xgdataset,
                          evals_result = training_results, early_stopping_rounds=early_stopping_rounds,verbose_eval=10)



        else:
            cvresult = xgb.train(xgb_param, xgtrain, num_boost_round=params['n_estimators'], evals = xgdataset,
                          evals_result = training_results, early_stopping_rounds=early_stopping_rounds,verbose_eval=interval,
                          callbacks=[ngtuner( xgdataset, context, threshold, parameter_ranges, training_results, interval)])

        cv_end_time = time.time()



        cv_time_raw = cv_end_time - cv_start_time
        cv_time = time.strftime("%H:%M:%S s",time.gmtime(cv_time_raw))




        #alg.set_params(n_estimators = cvresult.shape[0])
        # best round found by early stopping (string attribute on booster)
        optimal_boosters = cvresult.attr('best_iteration')
        #optimal_boosters = 10


    #Fit the algorithm on the data
    # NOTE(review): the fit/predict phases below are entirely commented out;
    # their timers are kept so the summary format stays stable
    fit_start_time =time.time()
    #alg.fit(train_dataset, train_labels,eval_metric=metrics)
    fit_end_time =time.time()

    fit_time_raw = fit_end_time - fit_start_time
    fit_time = time.strftime("%H:%M:%S s",time.gmtime(fit_time_raw))


    #Predict training and validation set:
    predict_start_time = time.time()
    #dtrain_predictions = alg.predict(train_dataset)
    #dvalid_predictions = alg.predict(valid_dataset)
    #dtest_predictions = alg.predict(test_dataset)
    predict_end_time = time.time()

    predict_time_raw = predict_end_time - predict_start_time
    predict_time = time.strftime("%H:%M:%S s",time.gmtime(predict_time_raw))


    write_dict({'train time': cv_time, 'fit time': fit_time, 'predict time': predict_time}, context['summary'],'Run Times')


     #Print model report:
    #acc_score_train = accuracy_score(train_labels, dtrain_predictions)
    #acc_score_valid = accuracy_score(valid_labels, dvalid_predictions)
    #acc_score_test = accuracy_score(test_labels, dtest_predictions)
    #print ("\nModel Report")
    #print ("Accuracy : {0:.5f}".format(acc_score_test))
    #print ("Optimal Boosters : {}".format(optimal_boosters))

    #run_stats.update({'Train Accuracy': acc_score_train})
    #if acc_score_valid: run_stats.update({'Validation Accuracy': acc_score_valid})
    #if acc_score_test: run_stats.update({'Test Accuracy': acc_score_test})

    # persist the trained booster itself
    pickled = pickler(context['modelpickles'], cvresult, 'model')


    feat_imp_ser = pd.Series(cvresult.get_fscore()).head(10).sort_values(ascending=False)
    feat_dict = feat_imp_ser.to_dict()
    # run_stats.update({'Feature Importance Score': feat_dict})

    write_dict(run_stats, context['summary'], 'Results')
    # note: `pickled` below now refers to the run pickle (not modelpickles)
    pickled = pickler(context['pickle'], run_stats, 'model results')


    intervals = pickled['tuner intervals']
    #intervals.append(int(optimal_boosters))
    #print(intervals)

    plotCV(cvresult, int(optimal_boosters), context, intervals, training_results, title)


    ##########Book keeping - update optimal parameters in dictionary with new boosters

    pickled = pickler(context['pickle'])
    parameters = pickled['optimal parameters']
    parameters['n_estimators'] = int(optimal_boosters)

    #native xgboost requires num_class, scikit_learn doesnt like it
    #del parameters['num_class']

    #update with results
    #parameters.update({'n_estimators': optimal_boosters})

    updated_pickle = pickler(context['pickle'], parameters, 'optimal parameters')
    updated_pickle = pickler(context['pickle'], run_stats, 'run results')
    write_dict(parameters, context['summary'],' Final Parameters')

    return updated_pickle
    ########## End Book Keeping
 


def plotCV(cvresult, optimal_boosters, context, intervals, training_results, title ='accuracy score by #estimators', ylim=(0.5,1)):
    """Plot train/test/validation accuracy (1 - merror) versus boosting round.

    Marks perfect accuracy, the chosen optimal booster count, and each tuner
    search-interval boundary; stretches a watermark image behind the curves;
    annotates, shows and saves the figure, then pickles the saved file path
    under the key 'accuracy plot' in ``context['pickle']``.
    """
    plt.rcParams['figure.figsize'] = (20, 10)
    plt.style.use('seaborn-colorblind')
    sns.set_style("whitegrid")
    watermark = mpimg.imread('../images/current_logo_gray.png')

    # Convert per-round multiclass error into accuracy series.
    n_rounds = len(training_results['Train']['merror'])
    rounds = list(range(n_rounds))
    train_acc = [1 - float(e) for e in training_results['Train']['merror']]
    test_acc = [1 - float(e) for e in training_results['Test']['merror']]
    valid_acc = [1 - float(e) for e in training_results['Valid']['merror']]

    fig = plt.figure()
    plt.xlabel('number of estimators')
    plt.ylabel('accuracy')

    plt.plot(rounds, train_acc, label='Training Score', color='r')
    plt.plot(rounds, test_acc, label='Test Score', color='g')
    plt.plot(rounds, valid_acc, label='Validation Score', color='b')

    # Reference lines: perfect accuracy, and the CV-selected optimum.
    best = int(optimal_boosters)
    plt.axhline(y=1, color='k', linewidth=1, ls='dashed')
    plt.axhline(y=valid_acc[best], color='k', linewidth=2, ls='dashed')
    plt.axvline(x=best, color='k', linewidth=2, ls='dashed')

    # Mark each tuner search-interval boundary.
    for boundary in intervals:
        plt.axvline(x=boundary, linewidth=1, ls='dashed')

    plt.legend(loc='best')
    if ylim:
        plt.ylim(ylim)
    plt.title(title)

    x_range = plt.xlim()
    y_range = plt.ylim()

    # Watermark spans the whole axes, drawn behind the curves.
    plt.imshow(watermark, aspect='auto',
               extent=(x_range[0], x_range[1], y_range[0], y_range[1]),
               zorder=-1, alpha=0.1)

    stamp = time.strftime("%H%M%S", time.gmtime())
    save_file = context['plot path'] + stamp + '.png'
    plt.text(x_range[0], y_range[0], save_file, color='gray', fontsize=8)

    plt.show()

    fig.savefig(save_file, bbox_inches='tight')
    pickler(context['pickle'], save_file, 'accuracy plot')





def remove_duplicates(inlist):
    """Return the unique values of *inlist* as floats, preserving first-seen order.

    Bug fix: the original tested ``i not in outlist`` but appended ``float(i)``,
    comparing the raw item against already-converted floats. Duplicates whose
    raw form differs from their float form (e.g. the string '1' vs 1.0) were
    therefore never detected. Convert first, then test membership.
    """
    outlist = []
    for item in inlist:
        value = float(item)
        if value not in outlist:
            outlist.append(value)
    return outlist
    

  
    
    

def show_random_samples(image_size, dataset, labels, description, context, rows = 1, cols=10):
    """Show a grid of random samples, one class per column.

    Draws ``rows`` bands of random samples titled with their class letter,
    a second band (grayscale) titled with each class's total count, then a
    watermark footer and a description footer, and saves the figure to
    ``context['plot path']`` with a timestamped filename.
    """
    unique, counts = np.unique(labels, return_counts=True)
    font = {'family': 'monospace',
            'color': '#351c4d',
            'weight': 'normal',
            'size': 20,
            }

    label_list = list(string.ascii_uppercase)
    plt.rcParams['figure.figsize'] = (20, 14)
    plt.style.use('seaborn-colorblind')
    watermark = mpimg.imread('../images/current_logo_gray.png')
    footer_height = 2
    footer_width = 4

    fig = plt.figure()
    cell = 1

    # Sample bands: each column shows a random example of class ``col``.
    for _ in range(rows):
        for col in range(cols):
            candidates = np.where(labels == col)[0]
            sample_idx = candidates[np.random.randint(len(candidates))]
            sample_label = labels[sample_idx]
            image = dataset[sample_idx, :].reshape(image_size, image_size)
            ax = fig.add_subplot(rows + footer_height, cols, cell)
            plt.imshow(image)
            ax.set_title(label_list[sample_label], fontsize=12, weight='bold', color='r')
            cell += 1

    # Count band: one more random sample per class, titled with class counts.
    for col in range(cols):
        candidates = np.where(labels == col)[0]
        sample_idx = candidates[np.random.randint(len(candidates))]
        sample_label = labels[sample_idx]
        image = dataset[sample_idx, :].reshape(image_size, image_size)
        ax = fig.add_subplot(rows + footer_height, cols, cell)
        plt.imshow(image, cmap='Greys_r')
        plt.tight_layout()
        ax.set_title(counts[sample_label], fontsize=15, weight='bold', color='#351c4d')
        cell += 1

    # Watermark footer occupies the last cell of a footer_width-wide grid.
    logo_footer = fig.add_subplot(rows + footer_height, footer_width,
                                  (rows + footer_height) * footer_width)
    x_range = plt.xlim()
    y_range = plt.ylim()

    # Hide all ticks and tick labels on the watermark axes.
    plt.tick_params(
        axis='x',
        which='both',
        bottom='off',
        top='off',
        labelbottom='off')

    plt.tick_params(
        axis='y',
        which='both',
        left='off',
        right='off',
        labelleft='off')

    plt.imshow(watermark, aspect='auto',
               extent=(x_range[0], x_range[1], y_range[0], y_range[1]),
               zorder=-1, alpha=0.3)

    # Description footer: dataset description plus its shape, in large text.
    title_footer = fig.add_subplot(rows + footer_height, 2,
                                   (rows + footer_height) * 2 - 1)
    plt.axis('off')
    x_range = plt.xlim()
    y_range = plt.ylim()
    plt.text(0, 0, description + str(dataset.shape), va='center', fontdict=font, fontsize=40)

    stamp = time.strftime("%H%M%S", time.gmtime())
    save_file = context['plot path'] + stamp + '.png'

    plt.text(x_range[0], y_range[0], save_file, color='gray', fontsize=8)

    plt.show()
    fig.savefig(save_file, bbox_inches='tight')


Overwriting helper_functions.py

In [ ]: