In [139]:
%reset


Once deleted, variables cannot be recovered. Proceed (y/[n])? y

In [140]:
# standard libraries and classes
import os
import sys
import time
import string
import logging
import cProfile
import pstats
import pprint
import collections

from os import walk
from io import StringIO

# third party imports
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns


# frequently used classes

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, roc_auc_score
from IPython.core.debugger import Tracer
from IPython.display import display, Image
from functools import wraps
from datetime import datetime as dt
from pandas import DataFrame
from xgboost.sklearn import XGBClassifier
from scipy.stats import randint, uniform
from six.moves import cPickle as pickle
#from __future__ import print_function


# custom imports and settings

import helper_functions as hf
%reload_ext autoreload
%autoreload 2
%reload_ext version_information
%matplotlib inline
version_list = %version_information numpy, scipy, matplotlib, pandas, scikit-learn, xgboost, tensorflow
version_list_html = hf.dict_to_html(version_list.__dict__['packages'])
plt.rcParams['figure.figsize'] = (20,10)
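The `hf.*` helpers used throughout come from the project's `helper_functions.py`, which is not shown in this notebook. As an illustration of the kind of helper involved, here is a minimal sketch of what `dict_to_html` plausibly does, assuming it renders name/version pairs as an HTML table for the summary email; the real signature and behavior may differ.

# Hypothetical sketch of helper_functions.dict_to_html -- not the actual
# implementation. Accepts a dict or an iterable of (name, version) pairs.
def dict_to_html(d):
    items = d.items() if hasattr(d, 'items') else d
    rows = ''.join('<tr><td>{}</td><td>{}</td></tr>'.format(k, v)
                   for k, v in items)
    return '<table>{}</table>'.format(rows)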

In [141]:
# run_pickle = hf.pickler('runpickles/20170914/run_20170914225536')
# #pprint.pprint(run_pickle)
# context = run_pickle['run context']

In [142]:
context = hf.fetch_paths()
pickled = hf.pickler(context['pickle'], context, 'run context')
hf.objects_growth(context['summary'], 'Beginning Heap')
hf.write_dict({'Installed Versions':version_list.__dict__['packages']}, context['summary'], 'Software Versions')
hf.write_dict(context, context['summary'], 'Run time Context')


Out[142]:
{'log path': 'logs/20170915/',
 'model path': 'savedmodels/20170915/',
 'modelpickles': 'savedmodels/20170915/pickled_20170915091004',
 'pickle': 'runpickles/20170915/run_20170915091004',
 'plot path': 'plots/20170915/run_091004/',
 'run date': '20170915',
 'run time': '091004',
 'runprofiles path': 'runprofiles/20170915/',
 'stats path': 'stats/20170915/',
 'statsfile': 'stats/20170915/stats_20170915091004',
 'summary': 'runprofiles/20170915/summary_20170915091004.txt'}
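`hf.pickler` behaves as a small keyed store throughout this notebook: called with only a path it returns the whole pickled dict, and called with `(path, obj, key)` it upserts one entry and writes the store back. A minimal sketch consistent with those call sites (the real helper may also create directories or log):

# Hypothetical sketch of helper_functions.pickler, inferred from its call
# sites in this notebook; not the actual implementation.
import os
import pickle

def pickler(path, obj=None, key=None):
    store = {}
    if os.path.exists(path):
        with open(path, 'rb') as f:
            store = pickle.load(f)   # read the existing keyed store
    if key is not None:              # write/update mode
        store[key] = obj
        with open(path, 'wb') as f:
            pickle.dump(store, f)
    return store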

In [143]:
load_stats = {}

size = 50
image_size = 28
num_labels = 10
pickle_file = 'notMNIST20-10-10.pickle'

data_pickle_path = '../../../tensorflow/tensorflow/examples/udacity/' + pickle_file

with open(data_pickle_path, 'rb') as f:
    data = pickle.load(f)

train_dataset = data['train_dataset']
length = train_dataset.shape[0]
#print(train_dataset.shape)
train_dataset = train_dataset.reshape(length, image_size*image_size)

valid_dataset = data['valid_dataset']
length = valid_dataset.shape[0]
valid_dataset = valid_dataset.reshape(length, image_size*image_size)

test_dataset = data['test_dataset']
length = test_dataset.shape[0]
test_dataset = test_dataset.reshape(length, image_size*image_size)

valid_labels = data['valid_labels']
train_labels = data['train_labels']
test_labels = data['test_labels']

#be nice to your RAM
del data

load_stats.update({'training dataset': train_dataset.shape})
load_stats.update({'training labels': train_labels.shape})
load_stats.update({'validation dataset': valid_dataset.shape})
load_stats.update({'validation labels': valid_labels.shape})
load_stats.update({'test dataset': test_dataset.shape})
load_stats.update({'test labels': test_labels.shape})

############## WRITE TO SUMMARY FILE
hf.write_dict(load_stats, context['summary'],'Dataset Details')

datasets = [train_dataset, valid_dataset, test_dataset]
labels = [train_labels, valid_labels, test_labels]
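Each 28x28 image is flattened to a 784-element row so the gradient-boosted trees can treat pixels as plain features. An optional sanity check, using the globals defined above, that each split is 2-D and lines up with its label vector:

# Optional check: every flattened dataset should be 2-D with
# image_size**2 columns and as many rows as its label vector.
for d, l in zip(datasets, labels):
    assert d.shape == (l.shape[0], image_size * image_size)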

In [145]:
data_description = ['Training Set', 'Validation Set', 'Test Set']
for i in range(len(datasets)):
    hf.show_random_samples(datasets[i], labels[i], data_description[i], context, 5, num_labels)
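`hf.show_random_samples` gives a visual spot check of each split. A minimal sketch of the idea, assuming the helper reshapes the flattened rows back to 28x28, titles each sample with its A-J class letter, and saves the figure under the run's plot path; the argument order mirrors the call above (`num_labels` is kept only to match it), and the plotting details are assumptions.

# Hypothetical sketch of helper_functions.show_random_samples, relying on
# the notebook's np/plt/string imports and the image_size global above.
def show_random_samples(dataset, labels, title, context, n, num_labels):
    idx = np.random.choice(dataset.shape[0], n, replace=False)
    fig, axes = plt.subplots(1, n, figsize=(2 * n, 2))
    for ax, i in zip(axes, idx):
        ax.imshow(dataset[i].reshape(image_size, image_size), cmap='gray')
        ax.set_title(string.ascii_uppercase[labels[i]])  # labels 0-9 -> A-J
        ax.axis('off')
    fig.suptitle(title)
    fig.savefig(context['plot path'] + title.replace(' ', '_') + '.png')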



In [146]:
estimator = XGBClassifier(
                        learning_rate = 0.3,
                        n_estimators = 600,
                        max_depth = 3,
                        min_child_weight=1,
                        gamma = 0,
                        subsample = 0.7,
                        colsample_bytree = 0.7,
                        objective= 'multi:softmax',
                        seed=27)

In [147]:
initial_result = hf.modelfit(estimator, datasets, labels, context, 'merror', num_labels = num_labels, cv_folds=5)


Model Report
Accuracy : 0.99175
Optimal Boosters : 153
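The report format suggests `hf.modelfit` follows the common recipe of running `xgb.cv` with early stopping on the training split to find the booster count, then refitting with that count and scoring. A sketch of that core logic under those assumptions (the early-stopping window and report details are guesses):

# Hypothetical sketch of helper_functions.modelfit, assuming the classic
# xgb.cv early-stopping recipe; not the actual implementation.
def modelfit(estimator, datasets, labels, context, metric, num_labels, cv_folds=5):
    params = estimator.get_xgb_params()
    params['num_class'] = num_labels          # required by multi:softmax
    dtrain = xgb.DMatrix(datasets[0], label=labels[0])
    cv_result = xgb.cv(params, dtrain,
                       num_boost_round=estimator.get_params()['n_estimators'],
                       nfold=cv_folds, metrics=metric,
                       early_stopping_rounds=50)    # assumed window
    estimator.set_params(n_estimators=cv_result.shape[0])
    estimator.fit(datasets[0], labels[0])
    predictions = estimator.predict(datasets[0])
    print('\nModel Report')
    print('Accuracy : {}'.format(accuracy_score(labels[0], predictions)))
    print('Optimal Boosters : {}'.format(cv_result.shape[0]))
    return estimator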

In [148]:
## TUNER PARAMETERS

# allowed [low, high] ranges for the parameters tuned by GridSearch (upper limit inclusive)
allowed_ranges = {
    'colsample_bylevel': [0,1],
    'colsample_bytree': [0,1],
    'gamma': [0,1],
    'learning_rate': [0,1],
    #'max_delta_step': 0,
    'max_depth': [1,10],
    'min_child_weight': [1,10],
    #'missing': None,
    #'reg_alpha': [0,100],
    'reg_lambda': [1,10],
    'scale_pos_weight': [1,5],
    'subsample': [0,1]}

# step sizes used to extend the grid around the current best value while fine-tuning
steps = {
    'colsample_bylevel': 0.05,
    'colsample_bytree': 0.05,
    'gamma': 0.1,
    'learning_rate': 0.01,
    #'max_delta_step': 0,
    'max_depth': 1,
    'min_child_weight': 1,
    #'missing': None,
    #'reg_alpha': [0,100],
    'reg_lambda': 0.25,
    'scale_pos_weight': 1,
    'subsample': 0.05}
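`hf.tuner_cv` below alternates GridSearchCV with a grid-extension pass: after each search it proposes the best value plus and minus one `step`, drops anything already tried or outside `allowed_ranges`, and stops a parameter once no new neighbors remain. A sketch of that extension step, reverse-engineered from the traces printed below (which imply the lower bound is treated as exclusive):

# Hypothetical sketch of the grid-extension step inside hf.tuner_cv,
# inferred from the printed traces; not the actual implementation.
def extend_grid(result, step, allowed_range, seen):
    lo, hi = allowed_range
    candidates = [round(result - step, 4), round(result + step, 4)]
    # min_child_weight=2 with range [1, 10] yields no downward neighbor
    # in the traces below, so the lower bound looks exclusive
    return [c for c in candidates if lo < c <= hi and c not in seen]

For example, extend_grid(6, 1, [1, 10], [3.0, 6.0, 9.0]) returns [5, 7], matching the first max_depth trace.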

In [149]:
tuning_rounds = 3
tuner_params = [{'max_depth':[3,6,9], 'min_child_weight':[3,6,9]},
            {'gamma':[.2,.5,.8]},
            {'subsample':[.3, .6, .9], 'colsample_bytree':[.3,.6,.9]},
            {'reg_lambda':[3, 6, 9] }]

for i in range(len(tuner_params)):
    pickled = hf.pickler(context['pickle'])
    parameters = pickled['optimal parameters']
    estimator = XGBClassifier(**parameters)
    
    final_result = hf.tuner_cv(estimator, datasets[0], labels[0], datasets[1], labels[1], 
                               tuner_params[i], tuning_rounds, steps, 
                               allowed_ranges, context, cv=3)


Current Iteration  {'max_depth': 6, 'min_child_weight': 3}  CV Accuracy  0.8639  Validation Accuracy  0.8669
parameter max_depth
result: 6  step: 1  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0]
[5, 7]
parameter min_child_weight
result: 3  step: 1  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0]
[2, 4]
Extended List : {'max_depth': [5, 7], 'min_child_weight': [2, 4]}
-------------------------------
Current Iteration  {'max_depth': 7, 'min_child_weight': 2}  CV Accuracy  0.8645  Validation Accuracy  0.8666
parameter max_depth
result: 7  step: 1  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0, 5.0, 7.0]
[8]
parameter min_child_weight
result: 2  step: 1  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0, 2.0, 4.0]
[]
Extended List : {'max_depth': [8], 'min_child_weight': [2]}
-------------------------------
Current Iteration  {'max_depth': 8, 'min_child_weight': 2}  CV Accuracy  0.8633  Validation Accuracy  0.8654
parameter max_depth
result: 8  step: 1  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0, 5.0, 7.0, 8.0]
[]
parameter min_child_weight
result: 2  step: 1  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0, 2.0, 4.0]
[]
Extended List : {'max_depth': [8], 'min_child_weight': [2]}
-------------------------------
Current Iteration  {'gamma': 0.2}  CV Accuracy  0.86245  Validation Accuracy  0.8626
parameter gamma
result: 0.2  step: 0.1  allowed_range: [0, 1]  seen: [0.2, 0.5, 0.8]
[0.1, 0.3]
Extended List : {'gamma': [0.1, 0.3]}
-------------------------------
Current Iteration  {'gamma': 0.1}  CV Accuracy  0.86235  Validation Accuracy  0.8688
parameter gamma
result: 0.1  step: 0.1  allowed_range: [0, 1]  seen: [0.2, 0.5, 0.8, 0.1, 0.3]
[]
Extended List : {'gamma': [0.1]}
-------------------------------
Current Iteration  {'colsample_bytree': 0.6, 'subsample': 0.9}  CV Accuracy  0.863  Validation Accuracy  0.8683
parameter colsample_bytree
result: 0.6  step: 0.05  allowed_range: [0, 1]  seen: [0.3, 0.6, 0.9]
[0.55, 0.65]
parameter subsample
result: 0.9  step: 0.05  allowed_range: [0, 1]  seen: [0.3, 0.6, 0.9]
[0.85, 0.95]
Extended List : {'colsample_bytree': [0.55, 0.65], 'subsample': [0.85, 0.95]}
-------------------------------
Current Iteration  {'colsample_bytree': 0.65, 'subsample': 0.95}  CV Accuracy  0.8642  Validation Accuracy  0.8653
parameter colsample_bytree
result: 0.65  step: 0.05  allowed_range: [0, 1]  seen: [0.3, 0.6, 0.9, 0.55, 0.65]
[0.7]
parameter subsample
result: 0.95  step: 0.05  allowed_range: [0, 1]  seen: [0.3, 0.6, 0.9, 0.85, 0.95]
[1.0]
Extended List : {'colsample_bytree': [0.7], 'subsample': [1.0]}
-------------------------------
Current Iteration  {'colsample_bytree': 0.7, 'subsample': 1.0}  CV Accuracy  0.86315  Validation Accuracy  0.8686
parameter colsample_bytree
result: 0.7  step: 0.05  allowed_range: [0, 1]  seen: [0.3, 0.6, 0.9, 0.55, 0.65, 0.7]
[0.75]
parameter subsample
result: 1.0  step: 0.05  allowed_range: [0, 1]  seen: [0.3, 0.6, 0.9, 0.85, 0.95, 1.0]
[]
Extended List : {'colsample_bytree': [0.75], 'subsample': [1.0]}
-------------------------------
Current Iteration  {'reg_lambda': 3}  CV Accuracy  0.86445  Validation Accuracy  0.8689
parameter reg_lambda
result: 3  step: 0.25  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0]
[2.75, 3.25]
Extended List : {'reg_lambda': [2.75, 3.25]}
-------------------------------
Current Iteration  {'reg_lambda': 2.75}  CV Accuracy  0.86395  Validation Accuracy  0.8683
parameter reg_lambda
result: 2.75  step: 0.25  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0, 2.75, 3.25]
[2.5]
Extended List : {'reg_lambda': [2.5]}
-------------------------------
Current Iteration  {'reg_lambda': 2.5}  CV Accuracy  0.86375  Validation Accuracy  0.8696
parameter reg_lambda
result: 2.5  step: 0.25  allowed_range: [1, 10]  seen: [3.0, 6.0, 9.0, 2.75, 3.25, 2.5]
[2.25]
Extended List : {'reg_lambda': [2.25]}
-------------------------------

In [150]:
# Now tune reg_alpha
best_alpha = {}
pickled = hf.pickler(context['pickle'])
parameters = pickled['optimal parameters']
param_test = {'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]}

estimator = XGBClassifier(**parameters)

gsearch_alpha = GridSearchCV(estimator = estimator, 
                        param_grid = param_test, 
                        scoring= 'accuracy',
                        n_jobs= -1,
                        cv= 3)
optimal_alpha = gsearch_alpha.fit(datasets[0],labels[0])
gs_plot = hf.plot_grid_search(optimal_alpha, param_test, context)

#print('Best parameter', optimal_alpha.best_params_, 'CV Accuracy: ', optimal_alpha.best_score_)
# Update the parameters list with best scoring parameter
parameters.update(optimal_alpha.best_params_)


# Update the pickle
updated_pickle = hf.pickler(context['pickle'], parameters, 'optimal parameters')

# Write to summary file
best_alpha.update({'Chosen:':str(optimal_alpha.best_params_) + ' CV Score:' + str(optimal_alpha.best_score_)})   
hf.write_dict(best_alpha, context['summary'],'alpha Grid Search Result')


Out[150]:
{'Chosen:': "{'reg_alpha': 0.001} CV Score:0.8654"}
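`hf.plot_grid_search` presumably charts the mean CV score against each tried value; a minimal sketch under that assumption, pulling scores from the fitted search's cv_results_ (the file naming is a guess):

# Hypothetical sketch of helper_functions.plot_grid_search for a
# single-parameter grid, relying on the notebook's plt import.
def plot_grid_search(gsearch, param_test, context):
    name = list(param_test.keys())[0]
    scores = gsearch.cv_results_['mean_test_score']
    fig, ax = plt.subplots()
    ax.plot(param_test[name], scores, marker='o')
    ax.set_xlabel(name)
    ax.set_ylabel('mean CV accuracy')
    fig.savefig(context['plot path'] + 'gridsearch_' + name + '.png')
    return fig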

In [151]:
# Now lower learning rate and find optimal number of boosters
pickled = hf.pickler(context['pickle'])
parameters = pickled['optimal parameters']
parameters.update({'n_estimators': 5000})
parameters.update({'learning_rate': 0.01})

final_tuner = XGBClassifier(**parameters)

optimal_model = hf.modelfit(final_tuner, datasets, labels, context, 'merror', num_labels = num_labels, cv_folds=5)


Model Report
Accuracy : 0.99525
Optimal Boosters : 935

In [152]:
#End Heap
hf.objects_growth(context['summary'], 'End Heap')

pickled = hf.pickler(context['pickle'])

attachments = []
for (dirpath, dirnames, filenames) in walk(pickled['run context']['plot path']):
    for f in filenames:
        file_path = str(pickled['run context']['plot path'] + f)
        attachments.append(file_path)
    break

# add summary file to list of attachments
attachments.append(context['summary'])


# compose summary and send via email
subject = 'With Charts'

body = pprint.pformat(pickled)


hf.send_email(subject, '<pre>' + body + '</pre>', version_list_html, attachments, context)
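`hf.send_email` mails the run summary with the plots attached. A minimal standard-library sketch of such a helper; the SMTP host and the addresses are placeholders, and the signature details are assumptions.

# Hypothetical sketch of helper_functions.send_email; host and addresses
# below are placeholders, not values from this project.
import os
import smtplib
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

def send_email(subject, body_html, versions_html, attachments, context):
    msg = MIMEMultipart()
    msg['Subject'] = '{} {}{}'.format(subject, context['run date'], context['run time'])
    msg['From'] = 'runs@example.com'           # placeholder address
    msg['To'] = 'me@example.com'               # placeholder address
    msg.attach(MIMEText(body_html + versions_html, 'html'))
    for path in attachments:
        with open(path, 'rb') as f:
            part = MIMEApplication(f.read(), Name=os.path.basename(path))
        part['Content-Disposition'] = 'attachment; filename="{}"'.format(
            os.path.basename(path))
        msg.attach(part)
    with smtplib.SMTP('localhost') as server:  # placeholder SMTP host
        server.send_message(msg)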

In [ ]: