In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64

import time
from datetime import datetime

Scikit-Learn Model Deployment Use Case

In this use case, we build a very large ensemble model (here, a Random Forest with 512 trees) on the digits dataset (not very original!) and generate SQL code for deployment using the web service.

We then execute the SQL code on a local PostgreSQL database and compare the SQL execution results with the scikit-learn predict / predict_proba / predict_log_proba results.

Both results are stored in pandas dataframes.

Build a scikit-learn model


In [2]:
from sklearn import datasets

digits = datasets.load_digits()
X = digits.data
n_classes = len(digits.target_names)
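
A quick sanity check (illustrative, not an original cell): the digits dataset contains 1797 images of 8x8 pixels, i.e. 64 features, and 10 classes.

print(X.shape)              # (1797, 64)
print(digits.target_names)  # [0 1 2 3 4 5 6 7 8 9]
print(n_classes)            # 10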

In [3]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=512, max_depth=7, min_samples_leaf=30, random_state=1960)
clf.fit(digits.data, digits.target)


Out[3]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=512, n_jobs=None,
            oob_score=False, random_state=1960, verbose=0,
            warm_start=False)

In [4]:
#clf.__dict__
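
Rather than dumping the full clf.__dict__, a lighter inspection of the fitted ensemble could look like this (an illustrative sketch, not an original cell):

# Report the size of the fitted ensemble.
print(len(clf.estimators_), "trees")   # 512
print(clf.classes_)                    # [0 1 ... 9]
print(sum(est.tree_.node_count for est in clf.estimators_), "tree nodes in total")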

Generate SQL Code from the Model


In [5]:
def test_ws_sql_gen(pickle_data):
    # Send the pickled model to the SQL generation web service and return the generated SQL.
    WS_URL = "http://localhost:1888/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data = {"Name": "model1", "PickleData": b64_data, "SQLDialect": "postgresql"}
    r = requests.post(WS_URL, json=data)
    # print(r.__dict__)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL
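
The helper above assumes the web service at http://localhost:1888/model is up and always answers with well-formed JSON. A hedged variant with basic error handling (same endpoint and payload, only the checks are added) could be:

def test_ws_sql_gen_checked(pickle_data):
    WS_URL = "http://localhost:1888/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data = {"Name": "model1", "PickleData": b64_data, "SQLDialect": "postgresql"}
    r = requests.post(WS_URL, json=data)
    r.raise_for_status()  # fail early on 4xx/5xx instead of on a missing JSON key later
    content = r.json()
    return content["model"]["SQLGenrationResult"][0]["SQL"]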

In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
# The generated SQL is very long; print only the beginning, the middle and the end.
N = len(lSQL)
P = 4000
print(lSQL[0:P] + "..." + lSQL[N//2:(N//2 + P)] + "..." + lSQL[-P:])


WITH "RF_0" AS 
(WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_43" <= 2.5) THEN CASE WHEN ("ADS"."Feature_26" <= 3.5) THEN CASE WHEN ("ADS"."Feature_19" <= 6.5) THEN CASE WHEN ("ADS"."Feature_28" <= 15.5) THEN 4 ELSE 5 END ELSE 6 END ELSE CASE WHEN ("ADS"."Feature_22" <= 0.5) THEN CASE WHEN ("ADS"."Feature_54" <= 2.5) THEN CASE WHEN ("ADS"."Feature_36" <= 7.5) THEN 10 ELSE 11 END ELSE CASE WHEN ("ADS"."Feature_33" <= 0.5) THEN 13 ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_34" <= 7.5) THEN CASE WHEN ("ADS"."Feature_44" <= 1.5) THEN 17 ELSE 18 END ELSE CASE WHEN ("ADS"."Feature_35" <= 2.5) THEN CASE WHEN ("ADS"."Feature_37" <= 8.5) THEN 21 ELSE 22 END ELSE 23 END END END END ELSE CASE WHEN ("ADS"."Feature_54" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 13.5) THEN CASE WHEN ("ADS"."Feature_61" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 0.5) THEN 29 ELSE 30 END ELSE 31 END ELSE CASE WHEN ("ADS"."Feature_35" <= 15.5) THEN 33 ELSE CASE WHEN ("ADS"."Feature_50" <= 9.5) THEN 35 ELSE 36 END END END ELSE CASE WHEN ("ADS"."Feature_53" <= 0.5) THEN CASE WHEN ("ADS"."Feature_26" <= 12.5) THEN 39 ELSE 40 END ELSE CASE WHEN ("ADS"."Feature_21" <= 6.5) THEN 42 ELSE 43 END END END ELSE CASE WHEN ("ADS"."Feature_26" <= 7.5) THEN CASE WHEN ("ADS"."Feature_9" <= 2.5) THEN 46 ELSE CASE WHEN ("ADS"."Feature_44" <= 6.5) THEN 48 ELSE 49 END END ELSE CASE WHEN ("ADS"."Feature_53" <= 12.5) THEN CASE WHEN ("ADS"."Feature_54" <= 12.5) THEN 52 ELSE 53 END ELSE CASE WHEN ("ADS"."Feature_62" <= 1.5) THEN 55 ELSE 56 END END END END END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".nid AS nid, "Values"."P_0.0" AS "P_0.0", "Values"."P_1.0" AS "P_1.0", "Values"."P_2.0" AS "P_2.0", "Values"."P_3.0" AS "P_3.0", "Values"."P_4.0" AS "P_4.0", "Values"."P_5.0" AS "P_5.0", "Values"."P_6.0" AS "P_6.0", "Values"."P_7.0" AS "P_7.0", "Values"."P_8.0" AS "P_8.0", "Values"."P_9.0" AS "P_9.0", "Values"."D" AS "D", "Values"."DP" AS "DP" 
FROM (SELECT 4 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.05 AS "P_2.0", 0.8333333333333334 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.06666666666666667 AS "P_7.0", 0.0 AS "P_8.0", 0.05 AS "P_9.0", 3.0 AS "D", 0.8333333333333334 AS "DP" UNION ALL SELECT 5 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.9875 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.0 AS "P_8.0", 0.0125 AS "P_9.0", 3.0 AS "D", 0.9875 AS "DP" UNION ALL SELECT 6 AS nid, 0.0 AS "P_0.0", 0.18 AS "P_1.0", 0.02 AS "P_2.0", 0.3 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.14 AS "P_8.0", 0.36 AS "P_9.0", 9.0 AS "D", 0.36 AS "DP" UNION ALL SELECT 10 AS nid, 0.2037037037037037 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.5555555555555556 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.018518518518518517 AS "P_8.0", 0.2222222222222222 AS "P_9.0", 5.0 AS "D", 0.5555555555555556 AS "DP" UNION ALL SELECT 11 AS nid, 0.0 AS "P_0.0", 0.14102564102564102 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.7692307692307693 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.0641025641025641 AS "P_8.0", 0.02564102564102564 AS "P_9.0", 5.0 AS "D", 0.7692307692307693 AS "DP" UNION ALL SELECT 13 AS nid, 0.0 AS "P_0.0", 0.05970149253731343 AS "P_1.0", 0.014925373134328358 AS "P_2.0", 0.3582089552238806 AS "P_3.0", 0.0 AS "P_4.0", 0.11940298507462686 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.04477611940298507 AS "P_8.0", 0.40298507462686567 AS "P_9.0", 9.0 AS "D", 0.40298507462686567 AS "DP" UNION ALL SELECT 14 AS nid, 0.2807017543859649 AS "P_0.0", 0.03508771929824561 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.22807017543859648 AS "P_5.0", 0.2807017543859649 AS "P_6.0", 0.0 AS "P_7.0", 0.03508771929824561 AS "P_8.0", 0.14035087719298245 AS "P_9.0", 0.0 AS "D", 0.2807017543859649 AS "DP" UNION ALL SELECT 17 AS nid, 0.107692307692...a_4", CAST("RF_Model_259"."Score_5" AS FLOAT) AS "Score_5", CAST("RF_Model_259"."Proba_5" AS FLOAT) AS "Proba_5", CAST("RF_Model_259"."LogProba_5" AS FLOAT) AS "LogProba_5", CAST("RF_Model_259"."Score_6" AS FLOAT) AS "Score_6", CAST("RF_Model_259"."Proba_6" AS FLOAT) AS "Proba_6", CAST("RF_Model_259"."LogProba_6" AS FLOAT) AS "LogProba_6", CAST("RF_Model_259"."Score_7" AS FLOAT) AS "Score_7", CAST("RF_Model_259"."Proba_7" AS FLOAT) AS "Proba_7", CAST("RF_Model_259"."LogProba_7" AS FLOAT) AS "LogProba_7", CAST("RF_Model_259"."Score_8" AS FLOAT) AS "Score_8", CAST("RF_Model_259"."Proba_8" AS FLOAT) AS "Proba_8", CAST("RF_Model_259"."LogProba_8" AS FLOAT) AS "LogProba_8", CAST("RF_Model_259"."Score_9" AS FLOAT) AS "Score_9", CAST("RF_Model_259"."Proba_9" AS FLOAT) AS "Proba_9", CAST("RF_Model_259"."LogProba_9" AS FLOAT) AS "LogProba_9" 
FROM "RF_Model_259") AS "RF_esu_25"), 
"RF_26" AS 
(WITH "DT_node_lookup_260" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_33" <= 2.5) THEN CASE WHEN ("ADS"."Feature_43" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 3.5) THEN CASE WHEN ("ADS"."Feature_36" <= 7.5) THEN 4 ELSE CASE WHEN ("ADS"."Feature_53" <= 9.5) THEN 6 ELSE CASE WHEN ("ADS"."Feature_46" <= 4.5) THEN 8 ELSE 9 END END END ELSE CASE WHEN ("ADS"."Feature_58" <= 8.5) THEN CASE WHEN ("ADS"."Feature_37" <= 13.5) THEN 12 ELSE 13 END ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_53" <= 0.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN CASE WHEN ("ADS"."Feature_29" <= 4.5) THEN 18 ELSE 19 END ELSE CASE WHEN ("ADS"."Feature_3" <= 13.5) THEN 21 ELSE 22 END END ELSE CASE WHEN ("ADS"."Feature_27" <= 7.5) THEN CASE WHEN ("ADS"."Feature_12" <= 9.5) THEN 25 ELSE CASE WHEN ("ADS"."Feature_43" <= 9.5) THEN 27 ELSE 28 END END ELSE CASE WHEN ("ADS"."Feature_19" <= 14.5) THEN CASE WHEN ("ADS"."Feature_21" <= 5.5) THEN 31 ELSE CASE WHEN ("ADS"."Feature_29" <= 6.5) THEN 33 ELSE 34 END END ELSE CASE WHEN ("ADS"."Feature_34" <= 7.5) THEN 36 ELSE 37 END END END END END ELSE CASE WHEN ("ADS"."Feature_52" <= 5.5) THEN CASE WHEN ("ADS"."Feature_61" <= 7.5) THEN 40 ELSE CASE WHEN ("ADS"."Feature_51" <= 10.5) THEN 42 ELSE 43 END END ELSE CASE WHEN ("ADS"."Feature_35" <= 1.5) THEN CASE WHEN ("ADS"."Feature_44" <= 4.5) THEN 46 ELSE 47 END ELSE CASE WHEN ("ADS"."Feature_2" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN 50 ELSE CASE WHEN ("ADS"."Feature_44" <= 14.5) THEN 52 ELSE 53 END END ELSE CASE WHEN ("ADS"."Feature_42" <= 9.5) THEN 55 ELSE 56 END END END END END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data_260" AS 
(SELECT "Values".nid AS nid, "Values"."P_0.0" AS "P_0.0", "Values"."P_1.0" AS "P_1.0", "Values"."P_2.0" AS "P_2.0", "Values"."P_3.0" AS "P_3.0", "Values"."P_4.0" AS "P_4.0", "Values"."P_5.0" AS "P_5.0", "Values"."P_6.0" AS "P_6.0", "Values"."P_7.0" AS "P_7.0", "Values"."P_8.0" AS "P_8.0", "Values"."P_9.0" AS "P_9.0", "Values"."D" AS "D", "Values"."DP" AS "DP" 
FROM (SELECT 4 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.06557377049180328 AS "P_2.0", 0.09836065573770492 AS "P_3.0", 0.0 AS "P_4.0", 0.3114754098360656 AS "P_5.0", 0.0 AS "P_6.0", 0.09836065573770492 AS "P_7.0", 0.01639344262295082 AS "P_8.0", 0.4098360655737705 AS "P_9.0", 9.0 AS "D", 0.4098360655737705 AS "DP" UNION ALL SELECT 6 AS nid, 0.0 AS "P_0.0", 0.038461538461538464 AS "P_1.0", 0.038461538461538464 AS "P_2.0", 0.2692307692307692 AS "P_3.0", 0.0 AS "P_4.0", 0.2692307692307692 AS "P_5.0", 0.0 AS "P_6.0", 0.019230769230769232 AS "P_7.0", 0.17307692307692307 AS "P_8.0", 0.19230769230769232 AS "P_9.0", 3.0 AS "D", 0.2692307692307692 AS "DP" UNION ALL SELECT 8 AS nid, 0.0 AS "P_0.0", 0.2111111111111111 AS "P_1.0", 0.0 AS "P_2.0", 0.4666666666666667 AS "P_3.0", 0.0 AS "P_4.0", 0.17777777777777778 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.03333333333333333 AS "P_8.0", 0.1111111111111111 AS "P_9.0", 3.0 AS "D", 0.4666666666666667 AS "DP" UNION ALL SELECT 9 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.9193548387096774 AS "P_3.0", 0.0 AS...core_0", score_max."Score_1" AS "Score_1", score_max."Score_2" AS "Score_2", score_max."Score_3" AS "Score_3", score_max."Score_4" AS "Score_4", score_max."Score_5" AS "Score_5", score_max."Score_6" AS "Score_6", score_max."Score_7" AS "Score_7", score_max."Score_8" AS "Score_8", score_max."Score_9" AS "Score_9", score_max."Proba_0" AS "Proba_0", score_max."Proba_1" AS "Proba_1", score_max."Proba_2" AS "Proba_2", score_max."Proba_3" AS "Proba_3", score_max."Proba_4" AS "Proba_4", score_max."Proba_5" AS "Proba_5", score_max."Proba_6" AS "Proba_6", score_max."Proba_7" AS "Proba_7", score_max."Proba_8" AS "Proba_8", score_max."Proba_9" AS "Proba_9", score_max."LogProba_0" AS "LogProba_0", score_max."LogProba_1" AS "LogProba_1", score_max."LogProba_2" AS "LogProba_2", score_max."LogProba_3" AS "LogProba_3", score_max."LogProba_4" AS "LogProba_4", score_max."LogProba_5" AS "LogProba_5", score_max."LogProba_6" AS "LogProba_6", score_max."LogProba_7" AS "LogProba_7", score_max."LogProba_8" AS "LogProba_8", score_max."LogProba_9" AS "LogProba_9", score_max."Decision" AS "Decision", score_max."DecisionProba" AS "DecisionProba", score_max."KEY_m" AS "KEY_m", score_max."max_Proba" AS "max_Proba", "arg_max_t_Proba"."KEY_Proba" AS "KEY_Proba", "arg_max_t_Proba"."arg_max_Proba" AS "arg_max_Proba" 
FROM score_max LEFT OUTER JOIN (SELECT union_with_max."KEY" AS "KEY_Proba", max(union_with_max.class) AS "arg_max_Proba" 
FROM union_with_max 
WHERE union_with_max."max_Proba" <= union_with_max."Proba" GROUP BY union_with_max."KEY") AS "arg_max_t_Proba" ON score_max."KEY" = "arg_max_t_Proba"."KEY_Proba")
 SELECT arg_max_cte."KEY" AS "KEY", arg_max_cte."Score_0" AS "Score_0", arg_max_cte."Score_1" AS "Score_1", arg_max_cte."Score_2" AS "Score_2", arg_max_cte."Score_3" AS "Score_3", arg_max_cte."Score_4" AS "Score_4", arg_max_cte."Score_5" AS "Score_5", arg_max_cte."Score_6" AS "Score_6", arg_max_cte."Score_7" AS "Score_7", arg_max_cte."Score_8" AS "Score_8", arg_max_cte."Score_9" AS "Score_9", arg_max_cte."Proba_0" AS "Proba_0", arg_max_cte."Proba_1" AS "Proba_1", arg_max_cte."Proba_2" AS "Proba_2", arg_max_cte."Proba_3" AS "Proba_3", arg_max_cte."Proba_4" AS "Proba_4", arg_max_cte."Proba_5" AS "Proba_5", arg_max_cte."Proba_6" AS "Proba_6", arg_max_cte."Proba_7" AS "Proba_7", arg_max_cte."Proba_8" AS "Proba_8", arg_max_cte."Proba_9" AS "Proba_9", CASE WHEN (arg_max_cte."Proba_0" IS NULL OR arg_max_cte."Proba_0" > 0.0) THEN ln(arg_max_cte."Proba_0") ELSE -1.79769313486231e+308 END AS "LogProba_0", CASE WHEN (arg_max_cte."Proba_1" IS NULL OR arg_max_cte."Proba_1" > 0.0) THEN ln(arg_max_cte."Proba_1") ELSE -1.79769313486231e+308 END AS "LogProba_1", CASE WHEN (arg_max_cte."Proba_2" IS NULL OR arg_max_cte."Proba_2" > 0.0) THEN ln(arg_max_cte."Proba_2") ELSE -1.79769313486231e+308 END AS "LogProba_2", CASE WHEN (arg_max_cte."Proba_3" IS NULL OR arg_max_cte."Proba_3" > 0.0) THEN ln(arg_max_cte."Proba_3") ELSE -1.79769313486231e+308 END AS "LogProba_3", CASE WHEN (arg_max_cte."Proba_4" IS NULL OR arg_max_cte."Proba_4" > 0.0) THEN ln(arg_max_cte."Proba_4") ELSE -1.79769313486231e+308 END AS "LogProba_4", CASE WHEN (arg_max_cte."Proba_5" IS NULL OR arg_max_cte."Proba_5" > 0.0) THEN ln(arg_max_cte."Proba_5") ELSE -1.79769313486231e+308 END AS "LogProba_5", CASE WHEN (arg_max_cte."Proba_6" IS NULL OR arg_max_cte."Proba_6" > 0.0) THEN ln(arg_max_cte."Proba_6") ELSE -1.79769313486231e+308 END AS "LogProba_6", CASE WHEN (arg_max_cte."Proba_7" IS NULL OR arg_max_cte."Proba_7" > 0.0) THEN ln(arg_max_cte."Proba_7") ELSE -1.79769313486231e+308 END AS "LogProba_7", CASE WHEN (arg_max_cte."Proba_8" IS NULL OR arg_max_cte."Proba_8" > 0.0) THEN ln(arg_max_cte."Proba_8") ELSE -1.79769313486231e+308 END AS "LogProba_8", CASE WHEN (arg_max_cte."Proba_9" IS NULL OR arg_max_cte."Proba_9" > 0.0) THEN ln(arg_max_cte."Proba_9") ELSE -1.79769313486231e+308 END AS "LogProba_9", arg_max_cte."arg_max_Proba" AS "Decision", arg_max_cte."max_Proba" AS "DecisionProba" 
FROM arg_max_cte
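
The generated SQL is long (hundreds of kilobytes for 512 trees), so it can be convenient to persist it before execution, for review or for deployment outside the notebook. A minimal sketch (the file name is arbitrary):

with open("digits_rf_512.sql", "w") as f:
    f.write(lSQL)
print(len(lSQL), "characters of SQL saved")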

Execute the SQL Code


In [7]:
# save the dataset in a database table

engine = sa.create_engine('postgresql://db:db@localhost/db?port=5432' , echo=False)
conn = engine.connect()

lTable = pd.DataFrame(digits.data)
lTable.columns = ['Feature_' + str(c) for c in range(digits.data.shape[1])]
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)
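
Before running the scoring query, it is worth checking that the table landed correctly, for example by reading the row count back (a small sketch, reusing the open connection):

row_count = pd.read_sql('SELECT COUNT(*) AS n FROM "INPUT_DATA"', conn)
print(row_count)  # expected: 1797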

In [8]:
# Run the generated SQL on PostgreSQL and fetch the predictions as a dataframe.
sql_output = pd.read_sql(lSQL, conn)
sql_output = sql_output.sort_values(by='KEY').reset_index(drop=True)
conn.close()
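
The time module imported at the top comes in handy if one wants to measure the cost of scoring entirely in the database; a sketch of how the query could have been timed before closing the connection (figures depend on the machine and the PostgreSQL setup):

start = time.time()
timed_output = pd.read_sql(lSQL, conn)
print("SQL scoring took %.2f seconds for %d rows" % (time.time() - start, timed_output.shape[0]))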

In [9]:
sql_output.sample(12, random_state=1960)


Out[9]:
KEY Score_0 Score_1 Score_2 Score_3 Score_4 Score_5 Score_6 Score_7 Score_8 ... LogProba_2 LogProba_3 LogProba_4 LogProba_5 LogProba_6 LogProba_7 LogProba_8 LogProba_9 Decision DecisionProba
1197 1197 None None None None None None None None None ... -3.271224 -2.185342 -3.074713 -1.532286 -3.913458 -2.400581 -1.469503 -2.246424 8 0.230040
913 913 None None None None None None None None None ... -2.496320 -2.097181 -3.883939 -2.617019 -3.047992 -3.786127 -0.820853 -2.183164 8 0.440056
893 893 None None None None None None None None None ... -2.764154 -2.486125 -4.001539 -0.718490 -4.412884 -3.060126 -2.164634 -2.968205 5 0.487488
604 604 None None None None None None None None None ... -3.325013 -4.182968 -2.009558 -1.538349 -1.123425 -3.581279 -2.308723 -4.636519 6 0.325164
743 743 None None None None None None None None None ... -5.167743 -5.425719 -0.615005 -3.479554 -4.037963 -1.379316 -3.507207 -3.300613 4 0.540638
556 556 None None None None None None None None None ... -1.689932 -2.412699 -3.587378 -4.484135 -2.869010 -3.602473 -0.967115 -3.418996 8 0.380178
664 664 None None None None None None None None None ... -3.111373 -2.387851 -3.237153 -2.289746 -2.790281 -3.369948 -0.969122 -2.358257 8 0.379416
195 195 None None None None None None None None None ... -3.639786 -4.971824 -3.079982 -4.108785 -0.226434 -5.803139 -3.301036 -5.121011 6 0.797372
692 692 None None None None None None None None None ... -3.625768 -3.617052 -3.603747 -0.359531 -3.612624 -3.345042 -2.542779 -3.533635 5 0.698004
1589 1589 None None None None None None None None None ... -3.479113 -4.513059 -3.068721 -0.702351 -2.655061 -2.595765 -2.939838 -3.168150 5 0.495419
1194 1194 None None None None None None None None None ... -3.633719 -2.567681 -4.120463 -2.611717 -4.634110 -3.696028 -2.798969 -0.443228 9 0.641961
822 822 None None None None None None None None None ... -3.526294 -3.807369 -3.652604 -0.363653 -4.745843 -2.786068 -2.546733 -3.290512 5 0.695132

12 rows × 33 columns


In [10]:
sql_output.Decision.value_counts()


Out[10]:
7    197
9    186
1    184
4    180
6    179
0    178
5    176
3    175
2    175
8    167
Name: Decision, dtype: int64
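
Since the model is scored on its own training data, the distribution of SQL decisions can be put side by side with the true label distribution of the digits dataset (roughly 180 images per class); an illustrative sketch:

comparison = pd.DataFrame({
    'sql_decision': sql_output.Decision.value_counts().sort_index(),
    'true_label': pd.Series(digits.target).value_counts().sort_index(),
})
print(comparison)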

Scikit-learn Prediction


In [11]:
# Build a dataframe with the same layout as the SQL output (the Score_* columns stay empty placeholders).
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY'])
skl_output_score = pd.DataFrame(columns=['Score_' + str(c) for c in range(n_classes)])
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_' + str(c) for c in range(n_classes)])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_' + str(c) for c in range(n_classes)])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision], axis=1)
skl_output.sample(12, random_state=1960)


Out[11]:
KEY Score_0 Score_1 Score_2 Score_3 Score_4 Score_5 Score_6 Score_7 Score_8 ... LogProba_1 LogProba_2 LogProba_3 LogProba_4 LogProba_5 LogProba_6 LogProba_7 LogProba_8 LogProba_9 Decision
1197 1197 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.239128 -3.271224 -2.185342 -3.074713 -1.532286 -3.913458 -2.400581 -1.469503 -2.246424 8
913 913 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.128394 -2.496320 -2.097181 -3.883939 -2.617019 -3.047992 -3.786127 -0.820853 -2.183164 8
893 893 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.129655 -2.764154 -2.486125 -4.001539 -0.718490 -4.412884 -3.060126 -2.164634 -2.968205 5
604 604 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.750753 -3.325013 -4.182968 -2.009558 -1.538349 -1.123425 -3.581279 -2.308723 -4.636519 6
743 743 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.187762 -5.167743 -5.425719 -0.615005 -3.479554 -4.037963 -1.379316 -3.507207 -3.300613 4
556 556 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -1.724560 -1.689932 -2.412699 -3.587378 -4.484135 -2.869010 -3.602473 -0.967115 -3.418996 8
664 664 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.673481 -3.111373 -2.387851 -3.237153 -2.289746 -2.790281 -3.369948 -0.969122 -2.358257 8
195 195 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.013571 -3.639786 -4.971824 -3.079982 -4.108785 -0.226434 -5.803139 -3.301036 -5.121011 6
692 692 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.312214 -3.625768 -3.617052 -3.603747 -0.359531 -3.612624 -3.345042 -2.542779 -3.533635 5
1589 1589 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.708405 -3.479113 -4.513059 -3.068721 -0.702351 -2.655061 -2.595765 -2.939838 -3.168150 5
1194 1194 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.715642 -3.633719 -2.567681 -4.120463 -2.611717 -4.634110 -3.696028 -2.798969 -0.443228 9
822 822 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.242409 -3.526294 -3.807369 -3.652604 -0.363653 -4.745843 -2.786068 -2.546733 -3.290512 5

12 rows × 32 columns

Comparing the SQL and Scikit-learn Predictions


In [12]:
sql_skl_join = skl_output.join(sql_output, how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')
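
Beyond the final decisions, the per-class probabilities of the two backends can be compared numerically on the joined dataframe; an illustrative sketch:

proba_skl = sql_skl_join[['Proba_' + str(c) + '_skl' for c in range(n_classes)]].astype(float).values
proba_sql = sql_skl_join[['Proba_' + str(c) + '_sql' for c in range(n_classes)]].astype(float).values
print("max |proba_skl - proba_sql| =", np.abs(proba_skl - proba_sql).max())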

In [13]:
sql_skl_join.sample(12, random_state=1960)


Out[13]:
KEY_skl Score_0_skl Score_1_skl Score_2_skl Score_3_skl Score_4_skl Score_5_skl Score_6_skl Score_7_skl Score_8_skl ... LogProba_2_sql LogProba_3_sql LogProba_4_sql LogProba_5_sql LogProba_6_sql LogProba_7_sql LogProba_8_sql LogProba_9_sql Decision_sql DecisionProba
1197 1197 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.271224 -2.185342 -3.074713 -1.532286 -3.913458 -2.400581 -1.469503 -2.246424 8 0.230040
913 913 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.496320 -2.097181 -3.883939 -2.617019 -3.047992 -3.786127 -0.820853 -2.183164 8 0.440056
893 893 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.764154 -2.486125 -4.001539 -0.718490 -4.412884 -3.060126 -2.164634 -2.968205 5 0.487488
604 604 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.325013 -4.182968 -2.009558 -1.538349 -1.123425 -3.581279 -2.308723 -4.636519 6 0.325164
743 743 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -5.167743 -5.425719 -0.615005 -3.479554 -4.037963 -1.379316 -3.507207 -3.300613 4 0.540638
556 556 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -1.689932 -2.412699 -3.587378 -4.484135 -2.869010 -3.602473 -0.967115 -3.418996 8 0.380178
664 664 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.111373 -2.387851 -3.237153 -2.289746 -2.790281 -3.369948 -0.969122 -2.358257 8 0.379416
195 195 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.639786 -4.971824 -3.079982 -4.108785 -0.226434 -5.803139 -3.301036 -5.121011 6 0.797372
692 692 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.625768 -3.617052 -3.603747 -0.359531 -3.612624 -3.345042 -2.542779 -3.533635 5 0.698004
1589 1589 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.479113 -4.513059 -3.068721 -0.702351 -2.655061 -2.595765 -2.939838 -3.168150 5 0.495419
1194 1194 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.633719 -2.567681 -4.120463 -2.611717 -4.634110 -3.696028 -2.798969 -0.443228 9 0.641961
822 822 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.526294 -3.807369 -3.652604 -0.363653 -4.745843 -2.786068 -2.546733 -3.290512 5 0.695132

12 rows × 65 columns


In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]


Out[14]:
KEY_skl Score_0_skl Score_1_skl Score_2_skl Score_3_skl Score_4_skl Score_5_skl Score_6_skl Score_7_skl Score_8_skl ... LogProba_2_sql LogProba_3_sql LogProba_4_sql LogProba_5_sql LogProba_6_sql LogProba_7_sql LogProba_8_sql LogProba_9_sql Decision_sql DecisionProba

0 rows × 65 columns
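
The empty result above means that PostgreSQL and scikit-learn return the same decision for every one of the 1797 rows. The same conclusion can be stated as an assertion (illustrative):

assert (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl).sum() == 0
print("Decisions agree on all", sql_skl_join.shape[0], "rows")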


In [ ]: