In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64
import time
from datetime import datetime
In this case, we will build a very large ensemble model (here, Random Foreast with 512 trees) on a digits dataset (not very original !!!) and generate a SQL code for deployment using the web service.
We then execute the SQL code on a local database (postgresql) and compare the SQL execution result with scikit-learn predict/predict_proba/.predict_log_proba result.
Both results are stored in pandas dataframes.
In [2]:
from sklearn import datasets
digits = datasets.load_digits()
X = digits.data
n_classes = len(digits.target_names)
In [3]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=512, max_depth=7, min_samples_leaf=30, random_state = 1960)
clf.fit(digits.data, digits.target)
Out[3]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=7, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=30, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=512, n_jobs=None,
oob_score=False, random_state=1960, verbose=0,
warm_start=False)
In [4]:
#clf.__dict__
In [5]:
def test_ws_sql_gen(pickle_data):
WS_URL="http://localhost:1888/model"
b64_data = base64.b64encode(pickle_data).decode('utf-8')
data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
r = requests.post(WS_URL, json=data)
#print(r.__dict__)
content = r.json()
# print(content)
lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
return lSQL;
In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
N = len(lSQL)
P = 4000
print(lSQL[0:P] + "..." + lSQL[N//2:(N//2 + P)] + "..." + lSQL[-P:])
WITH "RF_0" AS
(WITH "DT_node_lookup" AS
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_43" <= 2.5) THEN CASE WHEN ("ADS"."Feature_26" <= 3.5) THEN CASE WHEN ("ADS"."Feature_19" <= 6.5) THEN CASE WHEN ("ADS"."Feature_28" <= 15.5) THEN 4 ELSE 5 END ELSE 6 END ELSE CASE WHEN ("ADS"."Feature_22" <= 0.5) THEN CASE WHEN ("ADS"."Feature_54" <= 2.5) THEN CASE WHEN ("ADS"."Feature_36" <= 7.5) THEN 10 ELSE 11 END ELSE CASE WHEN ("ADS"."Feature_33" <= 0.5) THEN 13 ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_34" <= 7.5) THEN CASE WHEN ("ADS"."Feature_44" <= 1.5) THEN 17 ELSE 18 END ELSE CASE WHEN ("ADS"."Feature_35" <= 2.5) THEN CASE WHEN ("ADS"."Feature_37" <= 8.5) THEN 21 ELSE 22 END ELSE 23 END END END END ELSE CASE WHEN ("ADS"."Feature_54" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 13.5) THEN CASE WHEN ("ADS"."Feature_61" <= 0.5) THEN CASE WHEN ("ADS"."Feature_20" <= 0.5) THEN 29 ELSE 30 END ELSE 31 END ELSE CASE WHEN ("ADS"."Feature_35" <= 15.5) THEN 33 ELSE CASE WHEN ("ADS"."Feature_50" <= 9.5) THEN 35 ELSE 36 END END END ELSE CASE WHEN ("ADS"."Feature_53" <= 0.5) THEN CASE WHEN ("ADS"."Feature_26" <= 12.5) THEN 39 ELSE 40 END ELSE CASE WHEN ("ADS"."Feature_21" <= 6.5) THEN 42 ELSE 43 END END END ELSE CASE WHEN ("ADS"."Feature_26" <= 7.5) THEN CASE WHEN ("ADS"."Feature_9" <= 2.5) THEN 46 ELSE CASE WHEN ("ADS"."Feature_44" <= 6.5) THEN 48 ELSE 49 END END ELSE CASE WHEN ("ADS"."Feature_53" <= 12.5) THEN CASE WHEN ("ADS"."Feature_54" <= 12.5) THEN 52 ELSE 53 END ELSE CASE WHEN ("ADS"."Feature_62" <= 1.5) THEN 55 ELSE 56 END END END END END AS node_id_2
FROM "INPUT_DATA" AS "ADS"),
"DT_node_data" AS
(SELECT "Values".nid AS nid, "Values"."P_0.0" AS "P_0.0", "Values"."P_1.0" AS "P_1.0", "Values"."P_2.0" AS "P_2.0", "Values"."P_3.0" AS "P_3.0", "Values"."P_4.0" AS "P_4.0", "Values"."P_5.0" AS "P_5.0", "Values"."P_6.0" AS "P_6.0", "Values"."P_7.0" AS "P_7.0", "Values"."P_8.0" AS "P_8.0", "Values"."P_9.0" AS "P_9.0", "Values"."D" AS "D", "Values"."DP" AS "DP"
FROM (SELECT 4 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.05 AS "P_2.0", 0.8333333333333334 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.06666666666666667 AS "P_7.0", 0.0 AS "P_8.0", 0.05 AS "P_9.0", 3.0 AS "D", 0.8333333333333334 AS "DP" UNION ALL SELECT 5 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.9875 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.0 AS "P_8.0", 0.0125 AS "P_9.0", 3.0 AS "D", 0.9875 AS "DP" UNION ALL SELECT 6 AS nid, 0.0 AS "P_0.0", 0.18 AS "P_1.0", 0.02 AS "P_2.0", 0.3 AS "P_3.0", 0.0 AS "P_4.0", 0.0 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.14 AS "P_8.0", 0.36 AS "P_9.0", 9.0 AS "D", 0.36 AS "DP" UNION ALL SELECT 10 AS nid, 0.2037037037037037 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.5555555555555556 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.018518518518518517 AS "P_8.0", 0.2222222222222222 AS "P_9.0", 5.0 AS "D", 0.5555555555555556 AS "DP" UNION ALL SELECT 11 AS nid, 0.0 AS "P_0.0", 0.14102564102564102 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.7692307692307693 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.0641025641025641 AS "P_8.0", 0.02564102564102564 AS "P_9.0", 5.0 AS "D", 0.7692307692307693 AS "DP" UNION ALL SELECT 13 AS nid, 0.0 AS "P_0.0", 0.05970149253731343 AS "P_1.0", 0.014925373134328358 AS "P_2.0", 0.3582089552238806 AS "P_3.0", 0.0 AS "P_4.0", 0.11940298507462686 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.04477611940298507 AS "P_8.0", 0.40298507462686567 AS "P_9.0", 9.0 AS "D", 0.40298507462686567 AS "DP" UNION ALL SELECT 14 AS nid, 0.2807017543859649 AS "P_0.0", 0.03508771929824561 AS "P_1.0", 0.0 AS "P_2.0", 0.0 AS "P_3.0", 0.0 AS "P_4.0", 0.22807017543859648 AS "P_5.0", 0.2807017543859649 AS "P_6.0", 0.0 AS "P_7.0", 0.03508771929824561 AS "P_8.0", 0.14035087719298245 AS "P_9.0", 0.0 AS "D", 0.2807017543859649 AS "DP" UNION ALL SELECT 17 AS nid, 0.107692307692...a_4", CAST("RF_Model_259"."Score_5" AS FLOAT) AS "Score_5", CAST("RF_Model_259"."Proba_5" AS FLOAT) AS "Proba_5", CAST("RF_Model_259"."LogProba_5" AS FLOAT) AS "LogProba_5", CAST("RF_Model_259"."Score_6" AS FLOAT) AS "Score_6", CAST("RF_Model_259"."Proba_6" AS FLOAT) AS "Proba_6", CAST("RF_Model_259"."LogProba_6" AS FLOAT) AS "LogProba_6", CAST("RF_Model_259"."Score_7" AS FLOAT) AS "Score_7", CAST("RF_Model_259"."Proba_7" AS FLOAT) AS "Proba_7", CAST("RF_Model_259"."LogProba_7" AS FLOAT) AS "LogProba_7", CAST("RF_Model_259"."Score_8" AS FLOAT) AS "Score_8", CAST("RF_Model_259"."Proba_8" AS FLOAT) AS "Proba_8", CAST("RF_Model_259"."LogProba_8" AS FLOAT) AS "LogProba_8", CAST("RF_Model_259"."Score_9" AS FLOAT) AS "Score_9", CAST("RF_Model_259"."Proba_9" AS FLOAT) AS "Proba_9", CAST("RF_Model_259"."LogProba_9" AS FLOAT) AS "LogProba_9"
FROM "RF_Model_259") AS "RF_esu_25"),
"RF_26" AS
(WITH "DT_node_lookup_260" AS
(SELECT "ADS"."KEY" AS "KEY", CASE WHEN ("ADS"."Feature_33" <= 2.5) THEN CASE WHEN ("ADS"."Feature_43" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 3.5) THEN CASE WHEN ("ADS"."Feature_36" <= 7.5) THEN 4 ELSE CASE WHEN ("ADS"."Feature_53" <= 9.5) THEN 6 ELSE CASE WHEN ("ADS"."Feature_46" <= 4.5) THEN 8 ELSE 9 END END END ELSE CASE WHEN ("ADS"."Feature_58" <= 8.5) THEN CASE WHEN ("ADS"."Feature_37" <= 13.5) THEN 12 ELSE 13 END ELSE 14 END END ELSE CASE WHEN ("ADS"."Feature_53" <= 0.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN CASE WHEN ("ADS"."Feature_29" <= 4.5) THEN 18 ELSE 19 END ELSE CASE WHEN ("ADS"."Feature_3" <= 13.5) THEN 21 ELSE 22 END END ELSE CASE WHEN ("ADS"."Feature_27" <= 7.5) THEN CASE WHEN ("ADS"."Feature_12" <= 9.5) THEN 25 ELSE CASE WHEN ("ADS"."Feature_43" <= 9.5) THEN 27 ELSE 28 END END ELSE CASE WHEN ("ADS"."Feature_19" <= 14.5) THEN CASE WHEN ("ADS"."Feature_21" <= 5.5) THEN 31 ELSE CASE WHEN ("ADS"."Feature_29" <= 6.5) THEN 33 ELSE 34 END END ELSE CASE WHEN ("ADS"."Feature_34" <= 7.5) THEN 36 ELSE 37 END END END END END ELSE CASE WHEN ("ADS"."Feature_52" <= 5.5) THEN CASE WHEN ("ADS"."Feature_61" <= 7.5) THEN 40 ELSE CASE WHEN ("ADS"."Feature_51" <= 10.5) THEN 42 ELSE 43 END END ELSE CASE WHEN ("ADS"."Feature_35" <= 1.5) THEN CASE WHEN ("ADS"."Feature_44" <= 4.5) THEN 46 ELSE 47 END ELSE CASE WHEN ("ADS"."Feature_2" <= 1.5) THEN CASE WHEN ("ADS"."Feature_38" <= 0.5) THEN 50 ELSE CASE WHEN ("ADS"."Feature_44" <= 14.5) THEN 52 ELSE 53 END END ELSE CASE WHEN ("ADS"."Feature_42" <= 9.5) THEN 55 ELSE 56 END END END END END AS node_id_2
FROM "INPUT_DATA" AS "ADS"),
"DT_node_data_260" AS
(SELECT "Values".nid AS nid, "Values"."P_0.0" AS "P_0.0", "Values"."P_1.0" AS "P_1.0", "Values"."P_2.0" AS "P_2.0", "Values"."P_3.0" AS "P_3.0", "Values"."P_4.0" AS "P_4.0", "Values"."P_5.0" AS "P_5.0", "Values"."P_6.0" AS "P_6.0", "Values"."P_7.0" AS "P_7.0", "Values"."P_8.0" AS "P_8.0", "Values"."P_9.0" AS "P_9.0", "Values"."D" AS "D", "Values"."DP" AS "DP"
FROM (SELECT 4 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.06557377049180328 AS "P_2.0", 0.09836065573770492 AS "P_3.0", 0.0 AS "P_4.0", 0.3114754098360656 AS "P_5.0", 0.0 AS "P_6.0", 0.09836065573770492 AS "P_7.0", 0.01639344262295082 AS "P_8.0", 0.4098360655737705 AS "P_9.0", 9.0 AS "D", 0.4098360655737705 AS "DP" UNION ALL SELECT 6 AS nid, 0.0 AS "P_0.0", 0.038461538461538464 AS "P_1.0", 0.038461538461538464 AS "P_2.0", 0.2692307692307692 AS "P_3.0", 0.0 AS "P_4.0", 0.2692307692307692 AS "P_5.0", 0.0 AS "P_6.0", 0.019230769230769232 AS "P_7.0", 0.17307692307692307 AS "P_8.0", 0.19230769230769232 AS "P_9.0", 3.0 AS "D", 0.2692307692307692 AS "DP" UNION ALL SELECT 8 AS nid, 0.0 AS "P_0.0", 0.2111111111111111 AS "P_1.0", 0.0 AS "P_2.0", 0.4666666666666667 AS "P_3.0", 0.0 AS "P_4.0", 0.17777777777777778 AS "P_5.0", 0.0 AS "P_6.0", 0.0 AS "P_7.0", 0.03333333333333333 AS "P_8.0", 0.1111111111111111 AS "P_9.0", 3.0 AS "D", 0.4666666666666667 AS "DP" UNION ALL SELECT 9 AS nid, 0.0 AS "P_0.0", 0.0 AS "P_1.0", 0.0 AS "P_2.0", 0.9193548387096774 AS "P_3.0", 0.0 AS...core_0", score_max."Score_1" AS "Score_1", score_max."Score_2" AS "Score_2", score_max."Score_3" AS "Score_3", score_max."Score_4" AS "Score_4", score_max."Score_5" AS "Score_5", score_max."Score_6" AS "Score_6", score_max."Score_7" AS "Score_7", score_max."Score_8" AS "Score_8", score_max."Score_9" AS "Score_9", score_max."Proba_0" AS "Proba_0", score_max."Proba_1" AS "Proba_1", score_max."Proba_2" AS "Proba_2", score_max."Proba_3" AS "Proba_3", score_max."Proba_4" AS "Proba_4", score_max."Proba_5" AS "Proba_5", score_max."Proba_6" AS "Proba_6", score_max."Proba_7" AS "Proba_7", score_max."Proba_8" AS "Proba_8", score_max."Proba_9" AS "Proba_9", score_max."LogProba_0" AS "LogProba_0", score_max."LogProba_1" AS "LogProba_1", score_max."LogProba_2" AS "LogProba_2", score_max."LogProba_3" AS "LogProba_3", score_max."LogProba_4" AS "LogProba_4", score_max."LogProba_5" AS "LogProba_5", score_max."LogProba_6" AS "LogProba_6", score_max."LogProba_7" AS "LogProba_7", score_max."LogProba_8" AS "LogProba_8", score_max."LogProba_9" AS "LogProba_9", score_max."Decision" AS "Decision", score_max."DecisionProba" AS "DecisionProba", score_max."KEY_m" AS "KEY_m", score_max."max_Proba" AS "max_Proba", "arg_max_t_Proba"."KEY_Proba" AS "KEY_Proba", "arg_max_t_Proba"."arg_max_Proba" AS "arg_max_Proba"
FROM score_max LEFT OUTER JOIN (SELECT union_with_max."KEY" AS "KEY_Proba", max(union_with_max.class) AS "arg_max_Proba"
FROM union_with_max
WHERE union_with_max."max_Proba" <= union_with_max."Proba" GROUP BY union_with_max."KEY") AS "arg_max_t_Proba" ON score_max."KEY" = "arg_max_t_Proba"."KEY_Proba")
SELECT arg_max_cte."KEY" AS "KEY", arg_max_cte."Score_0" AS "Score_0", arg_max_cte."Score_1" AS "Score_1", arg_max_cte."Score_2" AS "Score_2", arg_max_cte."Score_3" AS "Score_3", arg_max_cte."Score_4" AS "Score_4", arg_max_cte."Score_5" AS "Score_5", arg_max_cte."Score_6" AS "Score_6", arg_max_cte."Score_7" AS "Score_7", arg_max_cte."Score_8" AS "Score_8", arg_max_cte."Score_9" AS "Score_9", arg_max_cte."Proba_0" AS "Proba_0", arg_max_cte."Proba_1" AS "Proba_1", arg_max_cte."Proba_2" AS "Proba_2", arg_max_cte."Proba_3" AS "Proba_3", arg_max_cte."Proba_4" AS "Proba_4", arg_max_cte."Proba_5" AS "Proba_5", arg_max_cte."Proba_6" AS "Proba_6", arg_max_cte."Proba_7" AS "Proba_7", arg_max_cte."Proba_8" AS "Proba_8", arg_max_cte."Proba_9" AS "Proba_9", CASE WHEN (arg_max_cte."Proba_0" IS NULL OR arg_max_cte."Proba_0" > 0.0) THEN ln(arg_max_cte."Proba_0") ELSE -1.79769313486231e+308 END AS "LogProba_0", CASE WHEN (arg_max_cte."Proba_1" IS NULL OR arg_max_cte."Proba_1" > 0.0) THEN ln(arg_max_cte."Proba_1") ELSE -1.79769313486231e+308 END AS "LogProba_1", CASE WHEN (arg_max_cte."Proba_2" IS NULL OR arg_max_cte."Proba_2" > 0.0) THEN ln(arg_max_cte."Proba_2") ELSE -1.79769313486231e+308 END AS "LogProba_2", CASE WHEN (arg_max_cte."Proba_3" IS NULL OR arg_max_cte."Proba_3" > 0.0) THEN ln(arg_max_cte."Proba_3") ELSE -1.79769313486231e+308 END AS "LogProba_3", CASE WHEN (arg_max_cte."Proba_4" IS NULL OR arg_max_cte."Proba_4" > 0.0) THEN ln(arg_max_cte."Proba_4") ELSE -1.79769313486231e+308 END AS "LogProba_4", CASE WHEN (arg_max_cte."Proba_5" IS NULL OR arg_max_cte."Proba_5" > 0.0) THEN ln(arg_max_cte."Proba_5") ELSE -1.79769313486231e+308 END AS "LogProba_5", CASE WHEN (arg_max_cte."Proba_6" IS NULL OR arg_max_cte."Proba_6" > 0.0) THEN ln(arg_max_cte."Proba_6") ELSE -1.79769313486231e+308 END AS "LogProba_6", CASE WHEN (arg_max_cte."Proba_7" IS NULL OR arg_max_cte."Proba_7" > 0.0) THEN ln(arg_max_cte."Proba_7") ELSE -1.79769313486231e+308 END AS "LogProba_7", CASE WHEN (arg_max_cte."Proba_8" IS NULL OR arg_max_cte."Proba_8" > 0.0) THEN ln(arg_max_cte."Proba_8") ELSE -1.79769313486231e+308 END AS "LogProba_8", CASE WHEN (arg_max_cte."Proba_9" IS NULL OR arg_max_cte."Proba_9" > 0.0) THEN ln(arg_max_cte."Proba_9") ELSE -1.79769313486231e+308 END AS "LogProba_9", arg_max_cte."arg_max_Proba" AS "Decision", arg_max_cte."max_Proba" AS "DecisionProba"
FROM arg_max_cte
In [7]:
# save the dataset in a database table
engine = sa.create_engine('postgresql://db:db@localhost/db?port=5432' , echo=False)
conn = engine.connect()
lTable = pd.DataFrame(digits.data);
lTable.columns = ['Feature_' + str(c) for c in range(digits.data.shape[1])]
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn, if_exists='replace', index=False)
In [8]:
sql_output = pd.read_sql(lSQL , conn);
sql_output = sql_output.sort_values(by='KEY').reset_index(drop=True)
conn.close()
In [9]:
sql_output.sample(12, random_state=1960)
Out[9]:
KEY
Score_0
Score_1
Score_2
Score_3
Score_4
Score_5
Score_6
Score_7
Score_8
...
LogProba_2
LogProba_3
LogProba_4
LogProba_5
LogProba_6
LogProba_7
LogProba_8
LogProba_9
Decision
DecisionProba
1197
1197
None
None
None
None
None
None
None
None
None
...
-3.271224
-2.185342
-3.074713
-1.532286
-3.913458
-2.400581
-1.469503
-2.246424
8
0.230040
913
913
None
None
None
None
None
None
None
None
None
...
-2.496320
-2.097181
-3.883939
-2.617019
-3.047992
-3.786127
-0.820853
-2.183164
8
0.440056
893
893
None
None
None
None
None
None
None
None
None
...
-2.764154
-2.486125
-4.001539
-0.718490
-4.412884
-3.060126
-2.164634
-2.968205
5
0.487488
604
604
None
None
None
None
None
None
None
None
None
...
-3.325013
-4.182968
-2.009558
-1.538349
-1.123425
-3.581279
-2.308723
-4.636519
6
0.325164
743
743
None
None
None
None
None
None
None
None
None
...
-5.167743
-5.425719
-0.615005
-3.479554
-4.037963
-1.379316
-3.507207
-3.300613
4
0.540638
556
556
None
None
None
None
None
None
None
None
None
...
-1.689932
-2.412699
-3.587378
-4.484135
-2.869010
-3.602473
-0.967115
-3.418996
8
0.380178
664
664
None
None
None
None
None
None
None
None
None
...
-3.111373
-2.387851
-3.237153
-2.289746
-2.790281
-3.369948
-0.969122
-2.358257
8
0.379416
195
195
None
None
None
None
None
None
None
None
None
...
-3.639786
-4.971824
-3.079982
-4.108785
-0.226434
-5.803139
-3.301036
-5.121011
6
0.797372
692
692
None
None
None
None
None
None
None
None
None
...
-3.625768
-3.617052
-3.603747
-0.359531
-3.612624
-3.345042
-2.542779
-3.533635
5
0.698004
1589
1589
None
None
None
None
None
None
None
None
None
...
-3.479113
-4.513059
-3.068721
-0.702351
-2.655061
-2.595765
-2.939838
-3.168150
5
0.495419
1194
1194
None
None
None
None
None
None
None
None
None
...
-3.633719
-2.567681
-4.120463
-2.611717
-4.634110
-3.696028
-2.798969
-0.443228
9
0.641961
822
822
None
None
None
None
None
None
None
None
None
...
-3.526294
-3.807369
-3.652604
-0.363653
-4.745843
-2.786068
-2.546733
-3.290512
5
0.695132
12 rows × 33 columns
In [10]:
sql_output.Decision.value_counts()
Out[10]:
7 197
9 186
1 184
4 180
6 179
0 178
5 176
3 175
2 175
8 167
Name: Decision, dtype: int64
In [11]:
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(columns=['Score_' + str(c) for c in range(n_classes)]);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_' + str(c) for c in range(n_classes)])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_' + str(c) for c in range(n_classes)])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)
Out[11]:
KEY
Score_0
Score_1
Score_2
Score_3
Score_4
Score_5
Score_6
Score_7
Score_8
...
LogProba_1
LogProba_2
LogProba_3
LogProba_4
LogProba_5
LogProba_6
LogProba_7
LogProba_8
LogProba_9
Decision
1197
1197
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-2.239128
-3.271224
-2.185342
-3.074713
-1.532286
-3.913458
-2.400581
-1.469503
-2.246424
8
913
913
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.128394
-2.496320
-2.097181
-3.883939
-2.617019
-3.047992
-3.786127
-0.820853
-2.183164
8
893
893
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-2.129655
-2.764154
-2.486125
-4.001539
-0.718490
-4.412884
-3.060126
-2.164634
-2.968205
5
604
604
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-2.750753
-3.325013
-4.182968
-2.009558
-1.538349
-1.123425
-3.581279
-2.308723
-4.636519
6
743
743
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.187762
-5.167743
-5.425719
-0.615005
-3.479554
-4.037963
-1.379316
-3.507207
-3.300613
4
556
556
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-1.724560
-1.689932
-2.412699
-3.587378
-4.484135
-2.869010
-3.602473
-0.967115
-3.418996
8
664
664
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-2.673481
-3.111373
-2.387851
-3.237153
-2.289746
-2.790281
-3.369948
-0.969122
-2.358257
8
195
195
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.013571
-3.639786
-4.971824
-3.079982
-4.108785
-0.226434
-5.803139
-3.301036
-5.121011
6
692
692
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.312214
-3.625768
-3.617052
-3.603747
-0.359531
-3.612624
-3.345042
-2.542779
-3.533635
5
1589
1589
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.708405
-3.479113
-4.513059
-3.068721
-0.702351
-2.655061
-2.595765
-2.939838
-3.168150
5
1194
1194
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.715642
-3.633719
-2.567681
-4.120463
-2.611717
-4.634110
-3.696028
-2.798969
-0.443228
9
822
822
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.242409
-3.526294
-3.807369
-3.652604
-0.363653
-4.745843
-2.786068
-2.546733
-3.290512
5
12 rows × 32 columns
In [12]:
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')
In [13]:
sql_skl_join.sample(12, random_state=1960)
Out[13]:
KEY_skl
Score_0_skl
Score_1_skl
Score_2_skl
Score_3_skl
Score_4_skl
Score_5_skl
Score_6_skl
Score_7_skl
Score_8_skl
...
LogProba_2_sql
LogProba_3_sql
LogProba_4_sql
LogProba_5_sql
LogProba_6_sql
LogProba_7_sql
LogProba_8_sql
LogProba_9_sql
Decision_sql
DecisionProba
1197
1197
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.271224
-2.185342
-3.074713
-1.532286
-3.913458
-2.400581
-1.469503
-2.246424
8
0.230040
913
913
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-2.496320
-2.097181
-3.883939
-2.617019
-3.047992
-3.786127
-0.820853
-2.183164
8
0.440056
893
893
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-2.764154
-2.486125
-4.001539
-0.718490
-4.412884
-3.060126
-2.164634
-2.968205
5
0.487488
604
604
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.325013
-4.182968
-2.009558
-1.538349
-1.123425
-3.581279
-2.308723
-4.636519
6
0.325164
743
743
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-5.167743
-5.425719
-0.615005
-3.479554
-4.037963
-1.379316
-3.507207
-3.300613
4
0.540638
556
556
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-1.689932
-2.412699
-3.587378
-4.484135
-2.869010
-3.602473
-0.967115
-3.418996
8
0.380178
664
664
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.111373
-2.387851
-3.237153
-2.289746
-2.790281
-3.369948
-0.969122
-2.358257
8
0.379416
195
195
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.639786
-4.971824
-3.079982
-4.108785
-0.226434
-5.803139
-3.301036
-5.121011
6
0.797372
692
692
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.625768
-3.617052
-3.603747
-0.359531
-3.612624
-3.345042
-2.542779
-3.533635
5
0.698004
1589
1589
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.479113
-4.513059
-3.068721
-0.702351
-2.655061
-2.595765
-2.939838
-3.168150
5
0.495419
1194
1194
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.633719
-2.567681
-4.120463
-2.611717
-4.634110
-3.696028
-2.798969
-0.443228
9
0.641961
822
822
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
...
-3.526294
-3.807369
-3.652604
-0.363653
-4.745843
-2.786068
-2.546733
-3.290512
5
0.695132
12 rows × 65 columns
In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]
Out[14]:
KEY_skl
Score_0_skl
Score_1_skl
Score_2_skl
Score_3_skl
Score_4_skl
Score_5_skl
Score_6_skl
Score_7_skl
Score_8_skl
...
LogProba_2_sql
LogProba_3_sql
LogProba_4_sql
LogProba_5_sql
LogProba_6_sql
LogProba_7_sql
LogProba_8_sql
LogProba_9_sql
Decision_sql
DecisionProba
0 rows × 65 columns
In [ ]:
Content source: antoinecarme/sklearn2sql_heroku
Similar notebooks: