In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64

Scikit-Learn Model Deployment Use Case

In this case, we will build a model (here, MLP model) on a IRIS dataset (not very original !!!) and generate a SQL code for deployment using the web service.

We then execute the SQL code on a local database (postgresql) and compare the SQL execution result with scikit-learn predict/predict_proba/.predict_log_proba result.

Both results are stored in pandas dataframes.

Build a scikit-learn model


In [2]:
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data  
Y = iris.target
# print(iris.DESCR)

In [3]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1960)
clf.fit(X, Y)


/home/antoine/.local/lib/python3.6/site-packages/sklearn/neural_network/multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)
Out[3]:
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1960,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [4]:
#clf.__dict__

Generate SQL Code from the Model


In [5]:
def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    #print(r.__dict__)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;

In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[0:2000])


WITH "IL" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST("ADS"."Feature_0" AS FLOAT) AS "Feature_0", CAST("ADS"."Feature_1" AS FLOAT) AS "Feature_1", CAST("ADS"."Feature_2" AS FLOAT) AS "Feature_2", CAST("ADS"."Feature_3" AS FLOAT) AS "Feature_3" 
FROM "INPUT_DATA" AS "ADS"), 
"HL_BA_1" AS 
(SELECT "IL"."KEY" AS "KEY", -0.00157287498516 * "IL"."Feature_0" + 0.0077384928365 * "IL"."Feature_1" + -0.108342302544 * "IL"."Feature_2" + 0.0161288513631 * "IL"."Feature_3" + -0.0839798413234 AS "NEUR_1_1", 0.00311192859931 * "IL"."Feature_0" + -0.0239251031949 * "IL"."Feature_1" + -0.0245801148495 * "IL"."Feature_2" + -0.0817595256442 * "IL"."Feature_3" + 0.0275445187367 AS "NEUR_1_2", 0.248912741243 * "IL"."Feature_0" + -0.0552821531517 * "IL"."Feature_1" + 0.212159693381 * "IL"."Feature_2" + 0.137156022862 * "IL"."Feature_3" + -0.134948900283 AS "NEUR_1_3", -0.0471112122355 * "IL"."Feature_0" + -0.0558486707687 * "IL"."Feature_1" + 0.0254497024161 * "IL"."Feature_2" + -0.000331389841263 * "IL"."Feature_3" + -0.118351485716 AS "NEUR_1_4", -0.00692595810497 * "IL"."Feature_0" + -0.0440941170096 * "IL"."Feature_1" + -0.0560036165265 * "IL"."Feature_2" + -0.0315257105931 * "IL"."Feature_3" + 0.138809954137 AS "NEUR_1_5", -0.0486515679848 * "IL"."Feature_0" + -0.10643246192 * "IL"."Feature_1" + -0.0102961495528 * "IL"."Feature_2" + -0.0445269682379 * "IL"."Feature_3" + -0.216774857326 AS "NEUR_1_6", -0.0713261790304 * "IL"."Feature_0" + 0.00859019784703 * "IL"."Feature_1" + 0.00452850038773 * "IL"."Feature_2" + -0.0728581424756 * "IL"."Feature_3" + -0.00884230955846 AS "NEUR_1_7", -0.0725012359029 * "IL"."Feature_0" + -0.0972087837458 * "IL"."Feature_1" + 0.117611313404 * "IL"."Feature_2" + -0.0862775532873 * "IL"."Feature_3" + -0.0544282553946 AS "NEUR_1_8", 0.0763579919383 * "IL"."Feature_0" + -0.120668185977 * "IL"."Feature_1" + 0.0351149154507 * "IL"."Feature_2" + 0.253296739551 * "IL"."Feature_3" + -0.0651191018497 AS "NEUR_1_9", -0.172511083231 * "IL"."Feature_0" + 0.144296218234

Execute the SQL Code


In [7]:
# save the dataset in a database table

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

lTable = pd.DataFrame(X);
lTable.columns = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn,   if_exists='replace', index=False)

In [8]:
sql_output = pd.read_sql(lSQL , conn);
conn.close()

In [9]:
sql_output.sample(12, random_state=1960)


Out[9]:
KEY Score_0 Score_1 Score_2 Proba_0 Proba_1 Proba_2 LogProba_0 LogProba_1 LogProba_2 Decision DecisionProba
114 114 None None None 0.000801 0.063040 0.936158 -7.129114 -2.763981 -0.065971 2 0.936158
74 74 None None None 0.035078 0.853853 0.111069 -3.350170 -0.157996 -2.197607 1 0.853853
9 9 None None None 0.940009 0.059202 0.000789 -0.061866 -2.826803 -7.144567 0 0.940009
88 88 None None None 0.055286 0.731221 0.213492 -2.895232 -0.313039 -1.544154 1 0.731221
25 25 None None None 0.909896 0.088855 0.001248 -0.094424 -2.420748 -6.685931 0 0.909896
5 5 None None None 0.964028 0.035551 0.000421 -0.036635 -3.336794 -7.771959 0 0.964028
48 48 None None None 0.970925 0.028793 0.000282 -0.029506 -3.547623 -8.172936 0 0.970925
117 117 None None None 0.000690 0.146254 0.853056 -7.278340 -1.922412 -0.158930 2 0.853056
83 83 None None None 0.004807 0.302693 0.692499 -5.337587 -1.195035 -0.367448 2 0.692499
105 105 None None None 0.000305 0.114976 0.884719 -8.094618 -2.163033 -0.122485 2 0.884719
27 27 None None None 0.961258 0.038327 0.000414 -0.039512 -3.261589 -7.788609 0 0.961258
64 64 None None None 0.096049 0.786053 0.117897 -2.342892 -0.240731 -2.137940 1 0.786053

In [10]:
sql_output.Decision.value_counts()


Out[10]:
2    53
0    50
1    47
Name: Decision, dtype: int64

Scikit-learn Prediction


In [11]:
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)


Out[11]:
KEY Score_0 Score_1 Score_2 Proba_0 Proba_1 Proba_2 LogProba_0 LogProba_1 LogProba_2 Decision
114 114 NaN NaN NaN 0.000801 0.063040 0.936158 -7.129114 -2.763981 -0.065971 2
74 74 NaN NaN NaN 0.035078 0.853853 0.111069 -3.350170 -0.157996 -2.197607 1
9 9 NaN NaN NaN 0.940009 0.059202 0.000789 -0.061866 -2.826803 -7.144567 0
88 88 NaN NaN NaN 0.055286 0.731221 0.213492 -2.895232 -0.313039 -1.544154 1
25 25 NaN NaN NaN 0.909896 0.088855 0.001248 -0.094424 -2.420748 -6.685931 0
5 5 NaN NaN NaN 0.964028 0.035551 0.000421 -0.036635 -3.336794 -7.771959 0
48 48 NaN NaN NaN 0.970925 0.028793 0.000282 -0.029506 -3.547623 -8.172936 0
117 117 NaN NaN NaN 0.000690 0.146254 0.853056 -7.278340 -1.922412 -0.158930 2
83 83 NaN NaN NaN 0.004807 0.302693 0.692499 -5.337587 -1.195035 -0.367448 2
105 105 NaN NaN NaN 0.000305 0.114976 0.884719 -8.094618 -2.163033 -0.122485 2
27 27 NaN NaN NaN 0.961258 0.038327 0.000414 -0.039512 -3.261589 -7.788609 0
64 64 NaN NaN NaN 0.096049 0.786053 0.117897 -2.342892 -0.240731 -2.137940 1

Comparing the SQL and Scikit-learn Predictions


In [12]:
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')

In [13]:
sql_skl_join.sample(12, random_state=1960)


Out[13]:
KEY_skl Score_0_skl Score_1_skl Score_2_skl Proba_0_skl Proba_1_skl Proba_2_skl LogProba_0_skl LogProba_1_skl LogProba_2_skl ... Score_1_sql Score_2_sql Proba_0_sql Proba_1_sql Proba_2_sql LogProba_0_sql LogProba_1_sql LogProba_2_sql Decision_sql DecisionProba
114 114 NaN NaN NaN 0.000801 0.063040 0.936158 -7.129114 -2.763981 -0.065971 ... None None 0.000801 0.063040 0.936158 -7.129114 -2.763981 -0.065971 2 0.936158
74 74 NaN NaN NaN 0.035078 0.853853 0.111069 -3.350170 -0.157996 -2.197607 ... None None 0.035078 0.853853 0.111069 -3.350170 -0.157996 -2.197607 1 0.853853
9 9 NaN NaN NaN 0.940009 0.059202 0.000789 -0.061866 -2.826803 -7.144567 ... None None 0.940009 0.059202 0.000789 -0.061866 -2.826803 -7.144567 0 0.940009
88 88 NaN NaN NaN 0.055286 0.731221 0.213492 -2.895232 -0.313039 -1.544154 ... None None 0.055286 0.731221 0.213492 -2.895232 -0.313039 -1.544154 1 0.731221
25 25 NaN NaN NaN 0.909896 0.088855 0.001248 -0.094424 -2.420748 -6.685931 ... None None 0.909896 0.088855 0.001248 -0.094424 -2.420748 -6.685931 0 0.909896
5 5 NaN NaN NaN 0.964028 0.035551 0.000421 -0.036635 -3.336794 -7.771959 ... None None 0.964028 0.035551 0.000421 -0.036635 -3.336794 -7.771959 0 0.964028
48 48 NaN NaN NaN 0.970925 0.028793 0.000282 -0.029506 -3.547623 -8.172936 ... None None 0.970925 0.028793 0.000282 -0.029506 -3.547623 -8.172936 0 0.970925
117 117 NaN NaN NaN 0.000690 0.146254 0.853056 -7.278340 -1.922412 -0.158930 ... None None 0.000690 0.146254 0.853056 -7.278340 -1.922412 -0.158930 2 0.853056
83 83 NaN NaN NaN 0.004807 0.302693 0.692499 -5.337587 -1.195035 -0.367448 ... None None 0.004807 0.302693 0.692499 -5.337587 -1.195035 -0.367448 2 0.692499
105 105 NaN NaN NaN 0.000305 0.114976 0.884719 -8.094618 -2.163033 -0.122485 ... None None 0.000305 0.114976 0.884719 -8.094618 -2.163033 -0.122485 2 0.884719
27 27 NaN NaN NaN 0.961258 0.038327 0.000414 -0.039512 -3.261589 -7.788609 ... None None 0.961258 0.038327 0.000414 -0.039512 -3.261589 -7.788609 0 0.961258
64 64 NaN NaN NaN 0.096049 0.786053 0.117897 -2.342892 -0.240731 -2.137940 ... None None 0.096049 0.786053 0.117897 -2.342892 -0.240731 -2.137940 1 0.786053

12 rows × 23 columns


In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]


Out[14]:
KEY_skl Score_0_skl Score_1_skl Score_2_skl Proba_0_skl Proba_1_skl Proba_2_skl LogProba_0_skl LogProba_1_skl LogProba_2_skl ... Score_1_sql Score_2_sql Proba_0_sql Proba_1_sql Proba_2_sql LogProba_0_sql LogProba_1_sql LogProba_2_sql Decision_sql DecisionProba

0 rows × 23 columns


In [ ]: