In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64

import time
from datetime import datetime

Scikit-Learn Model Deployment Use Case

In this use case, we build a very large ensemble model (here, a Random Forest with 512 trees) on the digits dataset (not very original!) and generate SQL code for deployment using the web service.

We then execute the SQL code on a local PostgreSQL database and compare the SQL execution results with the scikit-learn predict / predict_proba / predict_log_proba results.

Both results are stored in pandas dataframes.

Build a scikit-learn model


In [2]:
from sklearn import datasets

digits = datasets.load_digits()
X = digits.data
n_classes = len(digits.target_names)
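
A quick sanity check (illustrative, not an original cell): the digits dataset contains 1797 images of 8x8 pixels, i.e. 64 features, and 10 classes.

print(X.shape)              # (1797, 64)
print(digits.target_names)  # [0 1 2 3 4 5 6 7 8 9]
print(n_classes)            # 10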

In [3]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=512, max_depth=7, min_samples_leaf=30, random_state=1960)
clf.fit(digits.data, digits.target)


Out[3]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=512, n_jobs=None,
            oob_score=False, random_state=1960, verbose=0,
            warm_start=False)

In [4]:
#clf.__dict__
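
Rather than dumping the full clf.__dict__, a lighter inspection of the fitted ensemble could look like this (an illustrative sketch, not an original cell):

# Report the size of the fitted ensemble.
print(len(clf.estimators_), "trees")   # 512
print(clf.classes_)                    # [0 1 ... 9]
print(sum(est.tree_.node_count for est in clf.estimators_), "tree nodes in total")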

Generate SQL Code from the Model


In [5]:
def test_ws_sql_gen(pickle_data):
    # Send the pickled model to the SQL generation web service and return the generated SQL.
    WS_URL = "http://localhost:1888/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data = {"Name": "model1", "PickleData": b64_data, "SQLDialect": "postgresql"}
    r = requests.post(WS_URL, json=data)
    # print(r.__dict__)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL
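
The helper above assumes the web service at http://localhost:1888/model is up and always answers with well-formed JSON. A hedged variant with basic error handling (same endpoint and payload, only the checks are added) could be:

def test_ws_sql_gen_checked(pickle_data):
    WS_URL = "http://localhost:1888/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data = {"Name": "model1", "PickleData": b64_data, "SQLDialect": "postgresql"}
    r = requests.post(WS_URL, json=data)
    r.raise_for_status()  # fail early on 4xx/5xx instead of on a missing JSON key later
    content = r.json()
    return content["model"]["SQLGenrationResult"][0]["SQL"]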

In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
# The generated SQL is very long; print only the beginning, the middle and the end.
N = len(lSQL)
P = 4000
print(lSQL[0:P] + "..." + lSQL[N//2:(N//2 + P)] + "..." + lSQL[-P:])


WITH "RF_0" AS 
(WITH "DT_node_lookup" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_43" <= 2.5) THEN CASE WHEN ("ADS"."Feature_26" <= 3.5) THEN CASE WHEN ("ADS"."Feature_19" <= 6.5) THEN CASE WHEN ("ADS"."Feature_28" <= 15.5) THEN 4 ELSE 5 END ELSE 6 END ELSE CASE WHEN ("ADS"."Feature_22" <= 0.5) THEN CASE WHEN ("ADS"."Feature_54" <= 2.5) THEN CASE WHEN ("ADS"."Feature_36" <= 7.5) THEN 10 ELSE 11 END ELSE CASE WHEN ("ADS"."Feature_33" <= 0.5) THEN 13 ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_34" <= 7.5) THEN CASE WHEN ("ADS"."Feature_44" <= 1.5) THEN 17 ELSE 18 END ELSE CASE WHEN ("ADS"."Feature_35" <= 2.5) THEN CASE WHEN ("ADS"."Feature_37" <= 8.5) THEN 21 ELSE 22 END ELSE 23 END END END END ELSE CASE WHEN ("ADS"."Feature_54" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 13.5) THEN CASE WHEN ("ADS"."Feature_61" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 0.5) THEN 29 ELSE 30 END ELSE 31 END ELSE CASE WHEN ("ADS"."Feature_35" <= 15.5) THEN 33 ELSE CASE WHEN ("ADS"."Feature_50" <= 9.5) THEN 35 ELSE 36 END END END ELSE CASE WHEN ("ADS"."Feature_53" <= 0.5) THEN CASE WHEN ("ADS"."Feature_26" <= 12.5) THEN 39 ELSE 40 END ELSE CASE WHEN ("ADS"."Feature_21" <= 6.5) THEN 42 ELSE 43 END END END ELSE CASE WHEN ("ADS"."Feature_26" <= 7.5) THEN CASE WHEN ("ADS"."Feature_9" <= 2.5) THEN 46 ELSE CASE WHEN ("ADS"."Feature_44" <= 6.5) THEN 48 ELSE 49 END END ELSE CASE WHEN ("ADS"."Feature_53" <= 12.5) THEN CASE WHEN ("ADS"."Feature_54" <= 12.5) THEN 52 ELSE 53 END ELSE CASE WHEN ("ADS"."Feature_62" <= 1.5) THEN 55 ELSE 56 END END END END END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data" AS 
(SELECT "Values".nid AS nid, "Values"."P_0.0" AS "P_0.0", "Values"."P_1.0" AS "P_1.0", "Values"."P_2.0" AS "P_2.0", "Values"."P_3.0" AS "P_3.0", "Values"."P_4.0" AS "P_4.0", "Values"."P_5.0" AS "P_5.0", "Values"."P_6.0" AS "P_6.0", "Values"."P_7.0" AS "P_7.0", "Values"."P_8.0" AS "P_8.0", "Values"."P_9.0" AS "P_9.0", "Values"."D" AS "D", "Values"."DP" AS "DP" 
FROM (SELECT 4 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.05 AS "P_2.0", 0.8333333333333334 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.06666666666666667 AS "P_7.0", 0.0 AS "P_8.0", 0.05 AS "P_9.0", 3.0 AS "D", 0.8333333333333334 AS "DP" UNION ALL SELECT 5 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.9875 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.0 AS "P_8.0", 0.0125 AS "P_9.0", 3.0 AS "D", 0.9875 AS "DP" UNION ALL SELECT 6 AS nid, 0.0 AS "P_0.0", 0.18 AS "P_1.0", 0.02 AS "P_2.0", 0.3 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.14 AS "P_8.0", 0.36 AS "P_9.0", 9.0 AS "D", 0.36 AS "DP" UNION ALL SELECT 10 AS nid, 0.2037037037037037 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.5555555555555556 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.018518518518518517 AS "P_8.0", 0.2222222222222222 AS "P_9.0", 5.0 AS "D", 0.5555555555555556 AS "DP" UNION ALL SELECT 11 AS nid, 0.0 AS "P_0.0", 0.14102564102564102 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.7692307692307693 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.0641025641025641 AS "P_8.0", 0.02564102564102564 AS "P_9.0", 5.0 AS "D", 0.7692307692307693 AS "DP" UNION ALL SELECT 13 AS nid, 0.0 AS "P_0.0", 0.05970149253731343 AS "P_1.0", 0.014925373134328358 AS "P_2.0", 0.3582089552238806 AS "P_3.0", 0.0 AS "P_4.0", 0.11940298507462686 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.04477611940298507 AS "P_8.0", 0.40298507462686567 AS "P_9.0", 9.0 AS "D", 0.40298507462686567 AS "DP" UNION ALL SELECT 14 AS nid, 0.2807017543859649 AS "P_0.0", 0.03508771929824561 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.22807017543859648 AS "P_5.0", 0.2807017543859649 AS "P_6.0", 0.0 AS "P_7.0", 0.03508771929824561 AS "P_8.0", 0.14035087719298245 AS "P_9.0", 0.0 AS "D", 0.2807017543859649 AS "DP" UNION ALL SELECT 17 AS nid, 0.107692307692...a_4", CAST("RF_Model_259"."Score_5" AS FLOAT) AS "Score_5", CAST("RF_Model_259"."Proba_5" AS FLOAT) AS "Proba_5", CAST("RF_Model_259"."LogProba_5" AS FLOAT) AS "LogProba_5", CAST("RF_Model_259"."Score_6" AS FLOAT) AS "Score_6", CAST("RF_Model_259"."Proba_6" AS FLOAT) AS "Proba_6", CAST("RF_Model_259"."LogProba_6" AS FLOAT) AS "LogProba_6", CAST("RF_Model_259"."Score_7" AS FLOAT) AS "Score_7", CAST("RF_Model_259"."Proba_7" AS FLOAT) AS "Proba_7", CAST("RF_Model_259"."LogProba_7" AS FLOAT) AS "LogProba_7", CAST("RF_Model_259"."Score_8" AS FLOAT) AS "Score_8", CAST("RF_Model_259"."Proba_8" AS FLOAT) AS "Proba_8", CAST("RF_Model_259"."LogProba_8" AS FLOAT) AS "LogProba_8", CAST("RF_Model_259"."Score_9" AS FLOAT) AS "Score_9", CAST("RF_Model_259"."Proba_9" AS FLOAT) AS "Proba_9", CAST("RF_Model_259"."LogProba_9" AS FLOAT) AS "LogProba_9" 
FROM "RF_Model_259") AS "RF_esu_25"), 
"RF_26" AS 
(WITH "DT_node_lookup_260" AS 
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_33" <= 2.5) THEN CASE WHEN ("ADS"."Feature_43" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 3.5) THEN CASE WHEN ("ADS"."Feature_36" <= 7.5) THEN 4 ELSE CASE WHEN ("ADS"."Feature_53" <= 9.5) THEN 6 ELSE CASE WHEN ("ADS"."Feature_46" <= 4.5) THEN 8 ELSE 9 END END END ELSE CASE WHEN ("ADS"."Feature_58" <= 8.5) THEN CASE WHEN ("ADS"."Feature_37" <= 13.5) THEN 12 ELSE 13 END ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_53" <= 0.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN CASE WHEN ("ADS"."Feature_29" <= 4.5) THEN 18 ELSE 19 END ELSE CASE WHEN ("ADS"."Feature_3" <= 13.5) THEN 21 ELSE 22 END END ELSE CASE WHEN ("ADS"."Feature_27" <= 7.5) THEN CASE WHEN ("ADS"."Feature_12" <= 9.5) THEN 25 ELSE CASE WHEN ("ADS"."Feature_43" <= 9.5) THEN 27 ELSE 28 END END ELSE CASE WHEN ("ADS"."Feature_19" <= 14.5) THEN CASE WHEN ("ADS"."Feature_21" <= 5.5) THEN 31 ELSE CASE WHEN ("ADS"."Feature_29" <= 6.5) THEN 33 ELSE 34 END END ELSE CASE WHEN ("ADS"."Feature_34" <= 7.5) THEN 36 ELSE 37 END END END END END ELSE CASE WHEN ("ADS"."Feature_52" <= 5.5) THEN CASE WHEN ("ADS"."Feature_61" <= 7.5) THEN 40 ELSE CASE WHEN ("ADS"."Feature_51" <= 10.5) THEN 42 ELSE 43 END END ELSE CASE WHEN ("ADS"."Feature_35" <= 1.5) THEN CASE WHEN ("ADS"."Feature_44" <= 4.5) THEN 46 ELSE 47 END ELSE CASE WHEN ("ADS"."Feature_2" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN 50 ELSE CASE WHEN ("ADS"."Feature_44" <= 14.5) THEN 52 ELSE 53 END END ELSE CASE WHEN ("ADS"."Feature_42" <= 9.5) THEN 55 ELSE 56 END END END END END AS node_id_2 
FROM "INPUT_DATA" AS "ADS"), 
"DT_node_data_260" AS 
(SELECT "Values".nid AS nid, "Values"."P_0.0" AS "P_0.0", "Values"."P_1.0" AS "P_1.0", "Values"."P_2.0" AS "P_2.0", "Values"."P_3.0" AS "P_3.0", "Values"."P_4.0" AS "P_4.0", "Values"."P_5.0" AS "P_5.0", "Values"."P_6.0" AS "P_6.0", "Values"."P_7.0" AS "P_7.0", "Values"."P_8.0" AS "P_8.0", "Values"."P_9.0" AS "P_9.0", "Values"."D" AS "D", "Values"."DP" AS "DP" 
FROM (SELECT 4 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.06557377049180328 AS "P_2.0", 0.09836065573770492 AS "P_3.0", 0.0 AS "P_4.0", 0.3114754098360656 AS "P_5.0", 0.0 AS "P_6.0", 0.09836065573770492 AS "P_7.0", 0.01639344262295082 AS "P_8.0", 0.4098360655737705 AS "P_9.0", 9.0 AS "D", 0.4098360655737705 AS "DP" UNION ALL SELECT 6 AS nid, 0.0 AS "P_0.0", 0.038461538461538464 AS "P_1.0", 0.038461538461538464 AS "P_2.0", 0.2692307692307692 AS "P_3.0", 0.0 AS "P_4.0", 0.2692307692307692 AS "P_5.0", 0.0 AS "P_6.0", 0.019230769230769232 AS "P_7.0", 0.17307692307692307 AS "P_8.0", 0.19230769230769232 AS "P_9.0", 3.0 AS "D", 0.2692307692307692 AS "DP" UNION ALL SELECT 8 AS nid, 0.0 AS "P_0.0", 0.2111111111111111 AS "P_1.0", 0.0 AS "P_2.0", 0.4666666666666667 AS "P_3.0", 0.0 AS "P_4.0", 0.17777777777777778 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.03333333333333333 AS "P_8.0", 0.1111111111111111 AS "P_9.0", 3.0 AS "D", 0.4666666666666667 AS "DP" UNION ALL SELECT 9 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.9193548387096774 AS "P_3.0", 0.0 AS...core_0", score_max."Score_1" AS "Score_1", score_max."Score_2" AS "Score_2", score_max."Score_3" AS "Score_3", score_max."Score_4" AS "Score_4", score_max."Score_5" AS "Score_5", score_max."Score_6" AS "Score_6", score_max."Score_7" AS "Score_7", score_max."Score_8" AS "Score_8", score_max."Score_9" AS "Score_9", score_max."Proba_0" AS "Proba_0", score_max."Proba_1" AS "Proba_1", score_max."Proba_2" AS "Proba_2", score_max."Proba_3" AS "Proba_3", score_max."Proba_4" AS "Proba_4", score_max."Proba_5" AS "Proba_5", score_max."Proba_6" AS "Proba_6", score_max."Proba_7" AS "Proba_7", score_max."Proba_8" AS "Proba_8", score_max."Proba_9" AS "Proba_9", score_max."LogProba_0" AS "LogProba_0", score_max."LogProba_1" AS "LogProba_1", score_max."LogProba_2" AS "LogProba_2", score_max."LogProba_3" AS "LogProba_3", score_max."LogProba_4" AS "LogProba_4", score_max."LogProba_5" AS "LogProba_5", score_max."LogProba_6" AS "LogProba_6", score_max."LogProba_7" AS "LogProba_7", score_max."LogProba_8" AS "LogProba_8", score_max."LogProba_9" AS "LogProba_9", score_max."Decision" AS "Decision", score_max."DecisionProba" AS "DecisionProba", score_max."KEY_m" AS "KEY_m", score_max."max_Proba" AS "max_Proba", "arg_max_t_Proba"."KEY_Proba" AS "KEY_Proba", "arg_max_t_Proba"."arg_max_Proba" AS "arg_max_Proba" 
FROM score_max LEFT OUTER JOIN (SELECT union_with_max."KEY" AS "KEY_Proba", max(union_with_max.class) AS "arg_max_Proba" 
FROM union_with_max 
WHERE union_with_max."max_Proba" <= union_with_max."Proba" GROUP BY union_with_max."KEY") AS "arg_max_t_Proba" ON score_max."KEY" = "arg_max_t_Proba"."KEY_Proba")
 SELECT arg_max_cte."KEY" AS "KEY", arg_max_cte."Score_0" AS "Score_0", arg_max_cte."Score_1" AS "Score_1", arg_max_cte."Score_2" AS "Score_2", arg_max_cte."Score_3" AS "Score_3", arg_max_cte."Score_4" AS "Score_4", arg_max_cte."Score_5" AS "Score_5", arg_max_cte."Score_6" AS "Score_6", arg_max_cte."Score_7" AS "Score_7", arg_max_cte."Score_8" AS "Score_8", arg_max_cte."Score_9" AS "Score_9", arg_max_cte."Proba_0" AS "Proba_0", arg_max_cte."Proba_1" AS "Proba_1", arg_max_cte."Proba_2" AS "Proba_2", arg_max_cte."Proba_3" AS "Proba_3", arg_max_cte."Proba_4" AS "Proba_4", arg_max_cte."Proba_5" AS "Proba_5", arg_max_cte."Proba_6" AS "Proba_6", arg_max_cte."Proba_7" AS "Proba_7", arg_max_cte."Proba_8" AS "Proba_8", arg_max_cte."Proba_9" AS "Proba_9", CASE WHEN (arg_max_cte."Proba_0" IS NULL OR arg_max_cte."Proba_0" > 0.0) THEN ln(arg_max_cte."Proba_0") ELSE -1.79769313486231e+308 END AS "LogProba_0", CASE WHEN (arg_max_cte."Proba_1" IS NULL OR arg_max_cte."Proba_1" > 0.0) THEN ln(arg_max_cte."Proba_1") ELSE -1.79769313486231e+308 END AS "LogProba_1", CASE WHEN (arg_max_cte."Proba_2" IS NULL OR arg_max_cte."Proba_2" > 0.0) THEN ln(arg_max_cte."Proba_2") ELSE -1.79769313486231e+308 END AS "LogProba_2", CASE WHEN (arg_max_cte."Proba_3" IS NULL OR arg_max_cte."Proba_3" > 0.0) THEN ln(arg_max_cte."Proba_3") ELSE -1.79769313486231e+308 END AS "LogProba_3", CASE WHEN (arg_max_cte."Proba_4" IS NULL OR arg_max_cte."Proba_4" > 0.0) THEN ln(arg_max_cte."Proba_4") ELSE -1.79769313486231e+308 END AS "LogProba_4", CASE WHEN (arg_max_cte."Proba_5" IS NULL OR arg_max_cte."Proba_5" > 0.0) THEN ln(arg_max_cte."Proba_5") ELSE -1.79769313486231e+308 END AS "LogProba_5", CASE WHEN (arg_max_cte."Proba_6" IS NULL OR arg_max_cte."Proba_6" > 0.0) THEN ln(arg_max_cte."Proba_6") ELSE -1.79769313486231e+308 END AS "LogProba_6", CASE WHEN (arg_max_cte."Proba_7" IS NULL OR arg_max_cte."Proba_7" > 0.0) THEN ln(arg_max_cte."Proba_7") ELSE -1.79769313486231e+308 END AS "LogProba_7", CASE WHEN (arg_max_cte."Proba_8" IS NULL OR arg_max_cte."Proba_8" > 0.0) THEN ln(arg_max_cte."Proba_8") ELSE -1.79769313486231e+308 END AS "LogProba_8", CASE WHEN (arg_max_cte."Proba_9" IS NULL OR arg_max_cte."Proba_9" > 0.0) THEN ln(arg_max_cte."Proba_9") ELSE -1.79769313486231e+308 END AS "LogProba_9", arg_max_cte."arg_max_Proba" AS "Decision", arg_max_cte."max_Proba" AS "DecisionProba" 
FROM arg_max_cte
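
The generated SQL is long (hundreds of kilobytes for 512 trees), so it can be convenient to persist it before execution, for review or for deployment outside the notebook. A minimal sketch (the file name is arbitrary):

with open("digits_rf_512.sql", "w") as f:
    f.write(lSQL)
print(len(lSQL), "characters of SQL saved")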

Execute the SQL Code


In [7]:
# save the dataset in a database table

engine = sa.create_engine('postgresql://db:db@localhost/db?port=5432' , echo=False)
conn = engine.connect()

lTable = pd.DataFrame(digits.data)
lTable.columns = ['Feature_' + str(c) for c in range(digits.data.shape[1])]
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA", conn, if_exists='replace', index=False)
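
Before running the scoring query, it is worth checking that the table landed correctly, for example by reading the row count back (a small sketch, reusing the open connection):

row_count = pd.read_sql('SELECT COUNT(*) AS n FROM "INPUT_DATA"', conn)
print(row_count)  # expected: 1797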

In [8]:
# Run the generated SQL on PostgreSQL and fetch the predictions as a dataframe.
sql_output = pd.read_sql(lSQL, conn)
sql_output = sql_output.sort_values(by='KEY').reset_index(drop=True)
conn.close()
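
The time module imported at the top comes in handy if one wants to measure the cost of scoring entirely in the database; a sketch of how the query could have been timed before closing the connection (figures depend on the machine and the PostgreSQL setup):

start = time.time()
timed_output = pd.read_sql(lSQL, conn)
print("SQL scoring took %.2f seconds for %d rows" % (time.time() - start, timed_output.shape[0]))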

In [9]:
sql_output.sample(12, random_state=1960)


Out[9]:
KEY Score_0 Score_1 Score_2 Score_3 Score_4 Score_5 Score_6 Score_7 Score_8 ... LogProba_2 LogProba_3 LogProba_4 LogProba_5 LogProba_6 LogProba_7 LogProba_8 LogProba_9 Decision DecisionProba
1197 1197 None None None None None None None None None ... -3.271224 -2.185342 -3.074713 -1.532286 -3.913458 -2.400581 -1.469503 -2.246424 8 0.230040
913 913 None None None None None None None None None ... -2.496320 -2.097181 -3.883939 -2.617019 -3.047992 -3.786127 -0.820853 -2.183164 8 0.440056
893 893 None None None None None None None None None ... -2.764154 -2.486125 -4.001539 -0.718490 -4.412884 -3.060126 -2.164634 -2.968205 5 0.487488
604 604 None None None None None None None None None ... -3.325013 -4.182968 -2.009558 -1.538349 -1.123425 -3.581279 -2.308723 -4.636519 6 0.325164
743 743 None None None None None None None None None ... -5.167743 -5.425719 -0.615005 -3.479554 -4.037963 -1.379316 -3.507207 -3.300613 4 0.540638
556 556 None None None None None None None None None ... -1.689932 -2.412699 -3.587378 -4.484135 -2.869010 -3.602473 -0.967115 -3.418996 8 0.380178
664 664 None None None None None None None None None ... -3.111373 -2.387851 -3.237153 -2.289746 -2.790281 -3.369948 -0.969122 -2.358257 8 0.379416
195 195 None None None None None None None None None ... -3.639786 -4.971824 -3.079982 -4.108785 -0.226434 -5.803139 -3.301036 -5.121011 6 0.797372
692 692 None None None None None None None None None ... -3.625768 -3.617052 -3.603747 -0.359531 -3.612624 -3.345042 -2.542779 -3.533635 5 0.698004
1589 1589 None None None None None None None None None ... -3.479113 -4.513059 -3.068721 -0.702351 -2.655061 -2.595765 -2.939838 -3.168150 5 0.495419
1194 1194 None None None None None None None None None ... -3.633719 -2.567681 -4.120463 -2.611717 -4.634110 -3.696028 -2.798969 -0.443228 9 0.641961
822 822 None None None None None None None None None ... -3.526294 -3.807369 -3.652604 -0.363653 -4.745843 -2.786068 -2.546733 -3.290512 5 0.695132

12 rows × 33 columns


In [10]:
sql_output.Decision.value_counts()


Out[10]:
7    197
9    186
1    184
4    180
6    179
0    178
5    176
3    175
2    175
8    167
Name: Decision, dtype: int64
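
Since the model is scored on its own training data, the distribution of SQL decisions can be put side by side with the true label distribution of the digits dataset (roughly 180 images per class); an illustrative sketch:

comparison = pd.DataFrame({
    'sql_decision': sql_output.Decision.value_counts().sort_index(),
    'true_label': pd.Series(digits.target).value_counts().sort_index(),
})
print(comparison)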

Scikit-learn Prediction


In [11]:
# Build a dataframe with the same layout as the SQL output (the Score_* columns stay empty placeholders).
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY'])
skl_output_score = pd.DataFrame(columns=['Score_' + str(c) for c in range(n_classes)])
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_' + str(c) for c in range(n_classes)])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_' + str(c) for c in range(n_classes)])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision], axis=1)
skl_output.sample(12, random_state=1960)


Out[11]:
KEY Score_0 Score_1 Score_2 Score_3 Score_4 Score_5 Score_6 Score_7 Score_8 ... LogProba_1 LogProba_2 LogProba_3 LogProba_4 LogProba_5 LogProba_6 LogProba_7 LogProba_8 LogProba_9 Decision
1197 1197 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.239128 -3.271224 -2.185342 -3.074713 -1.532286 -3.913458 -2.400581 -1.469503 -2.246424 8
913 913 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.128394 -2.496320 -2.097181 -3.883939 -2.617019 -3.047992 -3.786127 -0.820853 -2.183164 8
893 893 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.129655 -2.764154 -2.486125 -4.001539 -0.718490 -4.412884 -3.060126 -2.164634 -2.968205 5
604 604 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.750753 -3.325013 -4.182968 -2.009558 -1.538349 -1.123425 -3.581279 -2.308723 -4.636519 6
743 743 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.187762 -5.167743 -5.425719 -0.615005 -3.479554 -4.037963 -1.379316 -3.507207 -3.300613 4
556 556 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -1.724560 -1.689932 -2.412699 -3.587378 -4.484135 -2.869010 -3.602473 -0.967115 -3.418996 8
664 664 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.673481 -3.111373 -2.387851 -3.237153 -2.289746 -2.790281 -3.369948 -0.969122 -2.358257 8
195 195 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.013571 -3.639786 -4.971824 -3.079982 -4.108785 -0.226434 -5.803139 -3.301036 -5.121011 6
692 692 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.312214 -3.625768 -3.617052 -3.603747 -0.359531 -3.612624 -3.345042 -2.542779 -3.533635 5
1589 1589 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.708405 -3.479113 -4.513059 -3.068721 -0.702351 -2.655061 -2.595765 -2.939838 -3.168150 5
1194 1194 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.715642 -3.633719 -2.567681 -4.120463 -2.611717 -4.634110 -3.696028 -2.798969 -0.443228 9
822 822 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.242409 -3.526294 -3.807369 -3.652604 -0.363653 -4.745843 -2.786068 -2.546733 -3.290512 5

12 rows × 32 columns

Comparing the SQL and Scikit-learn Predictions


In [12]:
sql_skl_join = skl_output.join(sql_output, how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')
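
Beyond the final decisions, the per-class probabilities of the two backends can be compared numerically on the joined dataframe; an illustrative sketch:

proba_skl = sql_skl_join[['Proba_' + str(c) + '_skl' for c in range(n_classes)]].astype(float).values
proba_sql = sql_skl_join[['Proba_' + str(c) + '_sql' for c in range(n_classes)]].astype(float).values
print("max |proba_skl - proba_sql| =", np.abs(proba_skl - proba_sql).max())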

In [13]:
sql_skl_join.sample(12, random_state=1960)


Out[13]:
KEY_skl Score_0_skl Score_1_skl Score_2_skl Score_3_skl Score_4_skl Score_5_skl Score_6_skl Score_7_skl Score_8_skl ... LogProba_2_sql LogProba_3_sql LogProba_4_sql LogProba_5_sql LogProba_6_sql LogProba_7_sql LogProba_8_sql LogProba_9_sql Decision_sql DecisionProba
1197 1197 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.271224 -2.185342 -3.074713 -1.532286 -3.913458 -2.400581 -1.469503 -2.246424 8 0.230040
913 913 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.496320 -2.097181 -3.883939 -2.617019 -3.047992 -3.786127 -0.820853 -2.183164 8 0.440056
893 893 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -2.764154 -2.486125 -4.001539 -0.718490 -4.412884 -3.060126 -2.164634 -2.968205 5 0.487488
604 604 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.325013 -4.182968 -2.009558 -1.538349 -1.123425 -3.581279 -2.308723 -4.636519 6 0.325164
743 743 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -5.167743 -5.425719 -0.615005 -3.479554 -4.037963 -1.379316 -3.507207 -3.300613 4 0.540638
556 556 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -1.689932 -2.412699 -3.587378 -4.484135 -2.869010 -3.602473 -0.967115 -3.418996 8 0.380178
664 664 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.111373 -2.387851 -3.237153 -2.289746 -2.790281 -3.369948 -0.969122 -2.358257 8 0.379416
195 195 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.639786 -4.971824 -3.079982 -4.108785 -0.226434 -5.803139 -3.301036 -5.121011 6 0.797372
692 692 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.625768 -3.617052 -3.603747 -0.359531 -3.612624 -3.345042 -2.542779 -3.533635 5 0.698004
1589 1589 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.479113 -4.513059 -3.068721 -0.702351 -2.655061 -2.595765 -2.939838 -3.168150 5 0.495419
1194 1194 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.633719 -2.567681 -4.120463 -2.611717 -4.634110 -3.696028 -2.798969 -0.443228 9 0.641961
822 822 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... -3.526294 -3.807369 -3.652604 -0.363653 -4.745843 -2.786068 -2.546733 -3.290512 5 0.695132

12 rows × 65 columns


In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]


Out[14]:
KEY_skl Score_0_skl Score_1_skl Score_2_skl Score_3_skl Score_4_skl Score_5_skl Score_6_skl Score_7_skl Score_8_skl ... LogProba_2_sql LogProba_3_sql LogProba_4_sql LogProba_5_sql LogProba_6_sql LogProba_7_sql LogProba_8_sql LogProba_9_sql Decision_sql DecisionProba

0 rows × 65 columns
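
The empty result above means that PostgreSQL and scikit-learn return the same decision for every one of the 1797 rows. The same conclusion can be stated as an assertion (illustrative):

assert (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl).sum() == 0
print("Decisions agree on all", sql_skl_join.shape[0], "rows")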


In [ ]: