In [1]:
%reset


Once deleted, variables cannot be recovered. Proceed (y/[n])? y

In [3]:
# standard libraries and classes
import os
import sys
import time
import string
import logging
import cProfile
import pstats
import pprint
import collections

from os import walk
from io import StringIO

# third party imports
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns


# frequently used classes

from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, accuracy_score, precision_score, recall_score, roc_auc_score
from IPython.core.debugger import Tracer
from IPython.display import display, Image, FileLink
from functools import wraps
from datetime import datetime as dt
from pandas import DataFrame
from xgboost.sklearn import XGBClassifier
from scipy.stats import randint, uniform
from six.moves import cPickle as pickle
#from __future__ import print_function


# custom imports and settings

import helper_functions as hf
%reload_ext autoreload
%autoreload 2
%reload_ext version_information
%matplotlib inline
version_list = %version_information numpy, scipy, matplotlib, pandas, scikit-learn, xgboost, tensorflow
version_list_html= hf.dict_to_html(version_list.__dict__['packages'])
plt.rcParams['figure.figsize'] = (20,10)

In [4]:
# Build the run context that is passed to the helper_functions calls in the
# cells below (load_dataset, show_random_samples, modelfit).
# NOTE(review): the context's contents are defined in helper_functions; the
# tuning cell reads context['summary'] from a context created the same way.
context = hf.get_new_context(version_list)

In [5]:
# Candidate notMNIST pickles; only the first entry is loaded in this cell.
data_pickles = [
    'sample-notMNIST.pickle',
    'notMNIST20-10-10.pickle',
    'notMNIST50-15-15.pickle',
    'notMNIST75-18-18.pickle',
    'notMNIST90-30-18.pickle',
]
size = 50          # NOTE(review): not used in this cell; kept in case other cells read it
image_size = 28    # passed to the sample viewer as the image size
num_labels = 10    # passed to the sample viewer and, later, to hf.modelfit

datasets, labels = hf.load_dataset(data_pickles[0], context)
data_description = ['Training Set', 'Validation Set', 'Test Set']

# Every dataset split needs a matching label array and caption. Check this up
# front so a mismatch fails before any plotting, not part-way through the loop.
assert len(datasets) <= min(len(labels), len(data_description)), \
    'each dataset split needs a matching labels entry and description'

# Show 5 random samples from each split.
for split_data, split_labels, split_name in zip(datasets, labels, data_description):
    hf.show_random_samples(image_size, split_data, split_labels, split_name, context, 5, num_labels)



In [6]:
# Baseline multi-class XGBoost classifier; this is the `alg` handed to
# hf.modelfit in the tuning cell.
estimator = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='multi:softmax',
    reg_alpha=0.001,
    seed=27,
)

# Two-element entry per hyper-parameter, passed to hf.modelfit as
# `parameter_ranges`.
# NOTE(review): presumably [low, high] bounds - how they are interpreted is
# decided inside helper_functions.
parameter_ranges = dict(
    # sampling ratios
    colsample_bylevel=[0.4, 1.0],
    colsample_bytree=[0.4, 1.0],
    subsample=[0.4, 1.0],

    # boosting schedule
    learning_rate=[0, 1],
    n_estimators=[15, 1000],

    # tree shape
    max_depth=[1, 15],
    min_child_weight=[1, 15],
    gamma=[0, 1],

    # regularisation
    reg_alpha=[-3, 2],    # powers of 10
    reg_lambda=[-3, 2],   # powers of 10
)

In [8]:
data_pickles = ['notMNIST20-10-10.pickle','notMNIST50-15-15.pickle','notMNIST75-18-18.pickle','notMNIST90-30-18.pickle']

base_early_stopping =20
intervals=[0,25,50,75,100]
thresholds =[0.1, 0.05]
for dataset in data_pickles:

    context = hf.get_new_context(version_list)
    datasets, labels = hf.load_dataset(dataset, context)
    
    for interval in intervals:
        for threshold in thresholds:
            %time tuned = hf.modelfit(alg = estimator, datasets = datasets, labels = labels, context = context, metrics = 'merror',title = dataset+ '::: interval: '+ str(interval) + ' threshold: ' +str(threshold),parameter_ranges= parameter_ranges,interval = interval,threshold = threshold, early_stopping_rounds = base_early_stopping +interval, num_labels = num_labels, cv_folds = 3)
    file = FileLink(context['summary'])
    display('Run Details:',file)


Model Report
Accuracy : 0.91160
Optimal Boosters : 1165
CPU times: user 4h 29min 58s, sys: 1min 47s, total: 4h 31min 45s
Wall time: 7min 35s

Model Report
Accuracy : 0.91160
Optimal Boosters : 1165
CPU times: user 4h 29min 54s, sys: 1min 52s, total: 4h 31min 46s
Wall time: 7min 35s
Iteration  25
Average Error:  [[ 0.22778333  0.247       0.16893333]]
High Bias
Iteration  50
Average Error:  [[ 0.11125     0.187       0.11456667]]
High Bias
Iteration  75
Average Error:  [[ 0.0686      0.16926667  0.1003    ]]
High Variance
Iteration  100
Average Error:  [[ 0.07016667  0.1665      0.09733333]]
tuned
Iteration  125
Average Error:  [[ 0.07085     0.16436667  0.0955    ]]
tuned
Iteration  150
Average Error:  [[ 0.0708      0.16293333  0.0938    ]]
tuned
Iteration  175
Average Error:  [[ 0.07091667  0.16163333  0.0924    ]]
tuned
Iteration  200
Average Error:  [[ 0.07046667  0.16036667  0.09143333]]
tuned
Iteration  225
Average Error:  [[ 0.06988333  0.159       0.09026667]]
tuned
Iteration  250
Average Error:  [[ 0.06906667  0.15746667  0.0888    ]]
tuned
Iteration  275
Average Error:  [[ 0.06795     0.15666667  0.08793333]]
tuned
Iteration  300
Average Error:  [[ 0.06698333  0.15563333  0.08633333]]
tuned
Iteration  325
Average Error:  [[ 0.06591667  0.15473333  0.08576667]]
tuned
Iteration  350
Average Error:  [[ 0.065       0.15393333  0.08503333]]
tuned
Iteration  375
Average Error:  [[ 0.06385  0.1527   0.0847 ]]
tuned
Iteration  400
Average Error:  [[ 0.06288333  0.15236667  0.08403333]]
tuned
Iteration  425
Average Error:  [[ 0.06193333  0.1512      0.0838    ]]
tuned
Iteration  450
Average Error:  [[ 0.06088333  0.15013333  0.083     ]]
tuned
Iteration  475
Average Error:  [[ 0.06006667  0.14926667  0.08233333]]
tuned
Iteration  500
Average Error:  [[ 0.05926667  0.14823333  0.08196667]]
tuned
Iteration  525
Average Error:  [[ 0.05878333  0.14773333  0.08146667]]
tuned
Iteration  550
Average Error:  [[ 0.05786667  0.14663333  0.0809    ]]
tuned
Iteration  575
Average Error:  [[ 0.05683333  0.14576667  0.08016667]]
tuned
Iteration  600
Average Error:  [[ 0.05608333  0.14523333  0.07996667]]
tuned
Iteration  625
Average Error:  [[ 0.05548333  0.14463333  0.07983333]]
tuned
Iteration  650
Average Error:  [[ 0.05481667  0.14456667  0.07953333]]
tuned
Iteration  675
Average Error:  [[ 0.05408333  0.14423333  0.07926667]]
tuned
Iteration  700
Average Error:  [[ 0.05346667  0.144       0.079     ]]
tuned
Iteration  725
Average Error:  [[ 0.053       0.14346667  0.07893333]]
tuned
Iteration  750
Average Error:  [[ 0.05231667  0.14286667  0.07856667]]
tuned
Iteration  775
Average Error:  [[ 0.05183333  0.14226667  0.07826667]]
tuned
Iteration  800
Average Error:  [[ 0.0513      0.14213333  0.07816667]]
tuned
Iteration  825
Average Error:  [[ 0.05086667  0.14183333  0.0777    ]]
tuned
Iteration  850
Average Error:  [[ 0.05053333  0.14166667  0.07763333]]
tuned
Iteration  875
Average Error:  [[ 0.05016667  0.14133333  0.0774    ]]
tuned
Iteration  900
Average Error:  [[ 0.04976667  0.14086667  0.07723333]]
tuned
Iteration  925
Average Error:  [[ 0.04933333  0.14086667  0.07723333]]
tuned
Iteration  950
Average Error:  [[ 0.04905     0.1406      0.07703333]]
tuned
Iteration  975
Average Error:  [[ 0.04891667  0.1404      0.07663333]]
tuned

Model Report
Accuracy : 0.90760
Optimal Boosters : 951
CPU times: user 5h 59min 41s, sys: 2min 46s, total: 6h 2min 27s
Wall time: 10min 6s
Iteration  25
Average Error:  [[ 0.22778333  0.247       0.16893333]]
High Bias
Iteration  50
Average Error:  [[ 0.11125     0.187       0.11456667]]
High Bias
Iteration  75
Average Error:  [[ 0.0686      0.16926667  0.1003    ]]
High Bias
Iteration  100
Average Error:  [[ 0.05861667  0.16283333  0.0938    ]]
High Bias
Iteration  125
Average Error:  [[ 0.05461667  0.15913333  0.0902    ]]
High Bias
Iteration  150
Average Error:  [[ 0.05278333  0.15746667  0.08896667]]
High Bias
Iteration  175
Average Error:  [[ 0.05165     0.15746667  0.0876    ]]
High Bias
Iteration  200
Average Error:  [[ 0.05148333  0.15746667  0.0887    ]]
High Bias
Iteration  225
Average Error:  [[ 0.05116667  0.15743333  0.08856667]]
High Bias

Model Report
Accuracy : 0.86230
Optimal Boosters : 184
CPU times: user 3h 5min 22s, sys: 50.1 s, total: 3h 6min 12s
Wall time: 5min 12s
Iteration  50
Average Error:  [[ 0.2182      0.24016667  0.16223333]]
High Bias
Iteration  100
Average Error:  [[ 0.09921667  0.1815      0.10843333]]
tuned
Iteration  150
Average Error:  [[ 0.0743  0.1673  0.0969]]
tuned

Model Report
Accuracy : 0.86230
Optimal Boosters : 184
CPU times: user 1h 33min 31s, sys: 30.7 s, total: 1h 34min 2s
Wall time: 2min 38s
Iteration  50
Average Error:  [[ 0.2182      0.24016667  0.16223333]]
High Bias
Iteration  100
Average Error:  [[ 0.09921667  0.1815      0.10843333]]
High Bias
Iteration  150
Average Error:  [[ 0.06121667  0.16406667  0.0939    ]]
High Bias

Model Report
Accuracy : 0.86230
Optimal Boosters : 184
CPU times: user 1h 51min 46s, sys: 34.1 s, total: 1h 52min 20s
Wall time: 3min 9s
Iteration  75
Average Error:  [[ 0.20981667  0.23406667  0.15626667]]
High Bias
Iteration  150
Average Error:  [[ 0.08946667  0.17626667  0.10406667]]
tuned

Model Report
Accuracy : 0.86230
Optimal Boosters : 184
CPU times: user 1h 23min 44s, sys: 28.8 s, total: 1h 24min 13s
Wall time: 2min 22s
Iteration  75
Average Error:  [[ 0.20981667  0.23406667  0.15626667]]
High Bias
Iteration  150
Average Error:  [[ 0.08946667  0.17626667  0.10406667]]
High Bias

Model Report
Accuracy : 0.86230
Optimal Boosters : 184
CPU times: user 1h 30min 53s, sys: 31.6 s, total: 1h 31min 24s
Wall time: 2min 34s
Iteration  100
Average Error:  [[ 0.20365     0.22853333  0.15116667]]
High Bias

Model Report
Accuracy : 0.86230
Optimal Boosters : 184
CPU times: user 1h 14min 8s, sys: 26.6 s, total: 1h 14min 35s
Wall time: 2min 6s
Iteration  100
Average Error:  [[ 0.20365     0.22853333  0.15116667]]
High Bias

Model Report
Accuracy : 0.86230
Optimal Boosters : 184
CPU times: user 1h 14min 9s, sys: 28.2 s, total: 1h 14min 37s
Wall time: 2min 6s
'Run Details:'
Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 1h 58min 33s, sys: 46.8 s, total: 1h 59min 20s
Wall time: 3min 23s

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 1h 59min 19s, sys: 48.5 s, total: 2h 7s
Wall time: 3min 24s
Iteration  25
Average Error:  [[ 0.23346   0.246467  0.1718  ]]
High Bias
Iteration  50
Average Error:  [[ 0.18274     0.20528867  0.13024467]]
High Bias
Iteration  75
Average Error:  [[ 0.08182     0.16293333  0.09417767]]
tuned
Iteration  100
Average Error:  [[ 0.05874667  0.14866667  0.082511  ]]
tuned
Iteration  125
Average Error:  [[ 0.052       0.14273367  0.07835567]]
tuned
Iteration  150
Average Error:  [[ 0.04939333  0.13926667  0.07564433]]
tuned
Iteration  175
Average Error:  [[ 0.04766     0.13657767  0.07386667]]
tuned

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 6h 49min 54s, sys: 1min 26s, total: 6h 51min 21s
Wall time: 11min 30s
Iteration  25
Average Error:  [[ 0.23346   0.246467  0.1718  ]]
High Bias
Iteration  50
Average Error:  [[ 0.18274     0.20528867  0.13024467]]
High Bias
Iteration  75
Average Error:  [[ 0.08182     0.16293333  0.09417767]]
High Bias
Iteration  100
Average Error:  [[ 0.05682667  0.14893333  0.08335567]]
High Bias
Iteration  125
Average Error:  [[ 0.05107333  0.144222    0.07864433]]
High Bias
Iteration  150
Average Error:  [[ 0.04938667  0.142889    0.078022  ]]
High Variance
Iteration  175
Average Error:  [[ 0.04957333  0.142289    0.076822  ]]
High Variance

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 6h 19min 31s, sys: 1min 29s, total: 6h 21min 1s
Wall time: 10min 39s
Iteration  50
Average Error:  [[ 0.22328   0.237267  0.163711]]
High Bias
Iteration  100
Average Error:  [[ 0.17365333  0.196711    0.123022  ]]
High Bias
Iteration  150
Average Error:  [[ 0.06784     0.15444433  0.08675567]]
tuned

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 5h 17min 1s, sys: 1min 19s, total: 5h 18min 20s
Wall time: 8min 55s
Iteration  50
Average Error:  [[ 0.22328   0.237267  0.163711]]
High Bias
Iteration  100
Average Error:  [[ 0.17365333  0.196711    0.123022  ]]
High Bias
Iteration  150
Average Error:  [[ 0.06784     0.15444433  0.08675567]]
High Bias

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 5h 32min 9s, sys: 1min 20s, total: 5h 33min 29s
Wall time: 9min 20s
Iteration  75
Average Error:  [[ 0.21519333  0.229667    0.15613333]]
High Bias
Iteration  150
Average Error:  [[ 0.16636     0.18973333  0.11717767]]
High Bias

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 3h 39min 42s, sys: 1min 6s, total: 3h 40min 48s
Wall time: 6min 12s
Iteration  75
Average Error:  [[ 0.21519333  0.229667    0.15613333]]
High Bias
Iteration  150
Average Error:  [[ 0.16636     0.18973333  0.11717767]]
High Bias

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 3h 40min 23s, sys: 1min 4s, total: 3h 41min 28s
Wall time: 6min 13s
Iteration  100
Average Error:  [[ 0.20971333  0.224822    0.15113333]]
High Bias

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 2h 27min, sys: 52.7 s, total: 2h 27min 53s
Wall time: 4min 10s
Iteration  100
Average Error:  [[ 0.20971333  0.224822    0.15113333]]
High Bias

Model Report
Accuracy : 0.86207
Optimal Boosters : 184
CPU times: user 2h 25min 45s, sys: 52.7 s, total: 2h 26min 37s
Wall time: 4min 9s
'Run Details:'
Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 2h 56min 46s, sys: 1min 9s, total: 2h 57min 55s
Wall time: 5min 2s

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 2h 58min 1s, sys: 1min 9s, total: 2h 59min 10s
Wall time: 5min 4s
Iteration  25
Average Error:  [[ 0.24047567  0.246278    0.17492567]]
High Bias
Iteration  50
Average Error:  [[ 0.17092     0.18666667  0.117926  ]]
High Bias
Iteration  75
Average Error:  [[ 0.15246667  0.173426    0.10427767]]
High Bias
Iteration  100
Average Error:  [[ 0.08254233  0.151537    0.08542567]]
tuned
Iteration  125
Average Error:  [[ 0.06225767  0.14461133  0.07994467]]
tuned
Iteration  150
Average Error:  [[ 0.0548      0.14068533  0.07705567]]
tuned
Iteration  175
Average Error:  [[ 0.050969  0.137278  0.074963]]
tuned

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 9h 53min 39s, sys: 2min 4s, total: 9h 55min 43s
Wall time: 16min 39s
Iteration  25
Average Error:  [[ 0.24047567  0.246278    0.17492567]]
High Bias
Iteration  50
Average Error:  [[ 0.17092     0.18666667  0.117926  ]]
High Bias
Iteration  75
Average Error:  [[ 0.15246667  0.173426    0.10427767]]
High Bias
Iteration  100
Average Error:  [[ 0.08254233  0.151537    0.08542567]]
High Bias
Iteration  125
Average Error:  [[ 0.05948     0.14381467  0.07922233]]
High Bias
Iteration  150
Average Error:  [[ 0.05193767  0.13940733  0.07624067]]
High Bias
Iteration  175
Average Error:  [[ 0.04895967  0.13725933  0.074185  ]]
High Variance

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 10h 24min 16s, sys: 2min 12s, total: 10h 26min 29s
Wall time: 17min 30s
Iteration  50
Average Error:  [[ 0.22959533  0.235722    0.16487033]]
High Bias
Iteration  100
Average Error:  [[ 0.15915133  0.17692633  0.10970367]]
High Bias
Iteration  150
Average Error:  [[ 0.139382    0.16331467  0.096037  ]]
High Bias

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 6h 38min 58s, sys: 1min 34s, total: 6h 40min 32s
Wall time: 11min 14s
Iteration  50
Average Error:  [[ 0.22959533  0.235722    0.16487033]]
High Bias
Iteration  100
Average Error:  [[ 0.15915133  0.17692633  0.10970367]]
High Bias
Iteration  150
Average Error:  [[ 0.139382    0.16331467  0.096037  ]]
High Bias

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 6h 38min 6s, sys: 1min 31s, total: 6h 39min 38s
Wall time: 11min 12s
Iteration  75
Average Error:  [[ 0.221782    0.22794433  0.15774067]]
High Bias
Iteration  150
Average Error:  [[ 0.15099567  0.17118533  0.10390767]]
High Bias

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 4h 52min 52s, sys: 1min 14s, total: 4h 54min 6s
Wall time: 8min 16s
Iteration  75
Average Error:  [[ 0.221782    0.22794433  0.15774067]]
High Bias
Iteration  150
Average Error:  [[ 0.15099567  0.17118533  0.10390767]]
High Bias

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 4h 52min 8s, sys: 1min 15s, total: 4h 53min 24s
Wall time: 8min 15s
Iteration  100
Average Error:  [[ 0.21593333  0.22127767  0.15194467]]
High Bias

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 4h 26min 10s, sys: 1min 16s, total: 4h 27min 26s
Wall time: 7min 32s
Iteration  100
Average Error:  [[ 0.21593333  0.22127767  0.15194467]]
High Bias

Model Report
Accuracy : 0.86144
Optimal Boosters : 184
CPU times: user 4h 25min 59s, sys: 1min 20s, total: 4h 27min 20s
Wall time: 7min 31s
'Run Details:'
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-eaee1388d775> in <module>()
      7 
      8     context = hf.get_new_context(version_list)
----> 9     datasets, labels = hf.load_dataset(dataset, context)
     10 
     11     for interval in intervals:

/datadisk/public/predkt/tuner/ngtuner/helper_functions.py in load_dataset(name, context)
    139     test_dataset = data['test_dataset']
    140     length = valid_dataset.shape[0]
--> 141     test_dataset = test_dataset.reshape(length, image_size*image_size)
    142 
    143     valid_labels = data['valid_labels']

ValueError: cannot reshape array of size 14112000 into shape (30000,784)