In [1]:

    
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64

Scikit-Learn Model Deployment Use Case

In this case, we will build a model (here, MLP model) on a IRIS dataset (not very original !!!) and generate a SQL code for deployment using the web service.

We then execute the SQL code on a local database (postgresql) and compare the SQL execution result with scikit-learn predict/predict_proba/.predict_log_proba result.

Both results are stored in pandas dataframes.

Build a scikit-learn model



In [2]:

    
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data  
Y = iris.target
# print(iris.DESCR)



In [3]:

    
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1960)
clf.fit(X, Y)









    



/home/antoine/.local/lib/python3.6/site-packages/sklearn/neural_network/multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
  % self.max_iter, ConvergenceWarning)






    Out[3]:





MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1960,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)



In [4]:

    
#clf.__dict__

Generate SQL Code from the Model



In [5]:

    
def test_ws_sql_gen(pickle_data):
    WS_URL="https://sklearn2sql.herokuapp.com/model"
    b64_data = base64.b64encode(pickle_data).decode('utf-8')
    data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
    r = requests.post(WS_URL, json=data)
    #print(r.__dict__)
    content = r.json()
    # print(content)
    lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
    return lSQL;



In [6]:

    
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[0:2000])









    



WITH "IL" AS 
(SELECT "ADS"."KEY" AS "KEY", CAST("ADS"."Feature_0" AS FLOAT) AS "Feature_0", CAST("ADS"."Feature_1" AS FLOAT) AS "Feature_1", CAST("ADS"."Feature_2" AS FLOAT) AS "Feature_2", CAST("ADS"."Feature_3" AS FLOAT) AS "Feature_3" 
FROM "INPUT_DATA" AS "ADS"), 
"HL_BA_1" AS 
(SELECT "IL"."KEY" AS "KEY", -0.00157287498516 * "IL"."Feature_0" + 0.0077384928365 * "IL"."Feature_1" + -0.108342302544 * "IL"."Feature_2" + 0.0161288513631 * "IL"."Feature_3" + -0.0839798413234 AS "NEUR_1_1", 0.00311192859931 * "IL"."Feature_0" + -0.0239251031949 * "IL"."Feature_1" + -0.0245801148495 * "IL"."Feature_2" + -0.0817595256442 * "IL"."Feature_3" + 0.0275445187367 AS "NEUR_1_2", 0.248912741243 * "IL"."Feature_0" + -0.0552821531517 * "IL"."Feature_1" + 0.212159693381 * "IL"."Feature_2" + 0.137156022862 * "IL"."Feature_3" + -0.134948900283 AS "NEUR_1_3", -0.0471112122355 * "IL"."Feature_0" + -0.0558486707687 * "IL"."Feature_1" + 0.0254497024161 * "IL"."Feature_2" + -0.000331389841263 * "IL"."Feature_3" + -0.118351485716 AS "NEUR_1_4", -0.00692595810497 * "IL"."Feature_0" + -0.0440941170096 * "IL"."Feature_1" + -0.0560036165265 * "IL"."Feature_2" + -0.0315257105931 * "IL"."Feature_3" + 0.138809954137 AS "NEUR_1_5", -0.0486515679848 * "IL"."Feature_0" + -0.10643246192 * "IL"."Feature_1" + -0.0102961495528 * "IL"."Feature_2" + -0.0445269682379 * "IL"."Feature_3" + -0.216774857326 AS "NEUR_1_6", -0.0713261790304 * "IL"."Feature_0" + 0.00859019784703 * "IL"."Feature_1" + 0.00452850038773 * "IL"."Feature_2" + -0.0728581424756 * "IL"."Feature_3" + -0.00884230955846 AS "NEUR_1_7", -0.0725012359029 * "IL"."Feature_0" + -0.0972087837458 * "IL"."Feature_1" + 0.117611313404 * "IL"."Feature_2" + -0.0862775532873 * "IL"."Feature_3" + -0.0544282553946 AS "NEUR_1_8", 0.0763579919383 * "IL"."Feature_0" + -0.120668185977 * "IL"."Feature_1" + 0.0351149154507 * "IL"."Feature_2" + 0.253296739551 * "IL"."Feature_3" + -0.0651191018497 AS "NEUR_1_9", -0.172511083231 * "IL"."Feature_0" + 0.144296218234

Execute the SQL Code



In [7]:

    
# save the dataset in a database table

#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()

lTable = pd.DataFrame(X);
lTable.columns = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn,   if_exists='replace', index=False)



In [8]:

    
sql_output = pd.read_sql(lSQL , conn);
conn.close()



In [9]:

    
sql_output.sample(12, random_state=1960)









    Out[9]:







  
    
      
      KEY
      Score_0
      Score_1
      Score_2
      Proba_0
      Proba_1
      Proba_2
      LogProba_0
      LogProba_1
      LogProba_2
      Decision
      DecisionProba
    
  
  
    
      114
      114
      None
      None
      None
      0.000801
      0.063040
      0.936158
      -7.129114
      -2.763981
      -0.065971
      2
      0.936158
    
    
      74
      74
      None
      None
      None
      0.035078
      0.853853
      0.111069
      -3.350170
      -0.157996
      -2.197607
      1
      0.853853
    
    
      9
      9
      None
      None
      None
      0.940009
      0.059202
      0.000789
      -0.061866
      -2.826803
      -7.144567
      0
      0.940009
    
    
      88
      88
      None
      None
      None
      0.055286
      0.731221
      0.213492
      -2.895232
      -0.313039
      -1.544154
      1
      0.731221
    
    
      25
      25
      None
      None
      None
      0.909896
      0.088855
      0.001248
      -0.094424
      -2.420748
      -6.685931
      0
      0.909896
    
    
      5
      5
      None
      None
      None
      0.964028
      0.035551
      0.000421
      -0.036635
      -3.336794
      -7.771959
      0
      0.964028
    
    
      48
      48
      None
      None
      None
      0.970925
      0.028793
      0.000282
      -0.029506
      -3.547623
      -8.172936
      0
      0.970925
    
    
      117
      117
      None
      None
      None
      0.000690
      0.146254
      0.853056
      -7.278340
      -1.922412
      -0.158930
      2
      0.853056
    
    
      83
      83
      None
      None
      None
      0.004807
      0.302693
      0.692499
      -5.337587
      -1.195035
      -0.367448
      2
      0.692499
    
    
      105
      105
      None
      None
      None
      0.000305
      0.114976
      0.884719
      -8.094618
      -2.163033
      -0.122485
      2
      0.884719
    
    
      27
      27
      None
      None
      None
      0.961258
      0.038327
      0.000414
      -0.039512
      -3.261589
      -7.788609
      0
      0.961258
    
    
      64
      64
      None
      None
      None
      0.096049
      0.786053
      0.117897
      -2.342892
      -0.240731
      -2.137940
      1
      0.786053



In [10]:

    
sql_output.Decision.value_counts()









    Out[10]:





2    53
0    50
1    47
Name: Decision, dtype: int64

Scikit-learn Prediction



In [11]:

    
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)

Comparing the SQL and Scikit-learn Predictions



In [12]:

    
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')



In [13]:

    
sql_skl_join.sample(12, random_state=1960)









    Out[13]:







  
    
      
      KEY_skl
      Score_0_skl
      Score_1_skl
      Score_2_skl
      Proba_0_skl
      Proba_1_skl
      Proba_2_skl
      LogProba_0_skl
      LogProba_1_skl
      LogProba_2_skl
      ...
      Score_1_sql
      Score_2_sql
      Proba_0_sql
      Proba_1_sql
      Proba_2_sql
      LogProba_0_sql
      LogProba_1_sql
      LogProba_2_sql
      Decision_sql
      DecisionProba
    
  
  
    
      114
      114
      NaN
      NaN
      NaN
      0.000801
      0.063040
      0.936158
      -7.129114
      -2.763981
      -0.065971
      ...
      None
      None
      0.000801
      0.063040
      0.936158
      -7.129114
      -2.763981
      -0.065971
      2
      0.936158
    
    
      74
      74
      NaN
      NaN
      NaN
      0.035078
      0.853853
      0.111069
      -3.350170
      -0.157996
      -2.197607
      ...
      None
      None
      0.035078
      0.853853
      0.111069
      -3.350170
      -0.157996
      -2.197607
      1
      0.853853
    
    
      9
      9
      NaN
      NaN
      NaN
      0.940009
      0.059202
      0.000789
      -0.061866
      -2.826803
      -7.144567
      ...
      None
      None
      0.940009
      0.059202
      0.000789
      -0.061866
      -2.826803
      -7.144567
      0
      0.940009
    
    
      88
      88
      NaN
      NaN
      NaN
      0.055286
      0.731221
      0.213492
      -2.895232
      -0.313039
      -1.544154
      ...
      None
      None
      0.055286
      0.731221
      0.213492
      -2.895232
      -0.313039
      -1.544154
      1
      0.731221
    
    
      25
      25
      NaN
      NaN
      NaN
      0.909896
      0.088855
      0.001248
      -0.094424
      -2.420748
      -6.685931
      ...
      None
      None
      0.909896
      0.088855
      0.001248
      -0.094424
      -2.420748
      -6.685931
      0
      0.909896
    
    
      5
      5
      NaN
      NaN
      NaN
      0.964028
      0.035551
      0.000421
      -0.036635
      -3.336794
      -7.771959
      ...
      None
      None
      0.964028
      0.035551
      0.000421
      -0.036635
      -3.336794
      -7.771959
      0
      0.964028
    
    
      48
      48
      NaN
      NaN
      NaN
      0.970925
      0.028793
      0.000282
      -0.029506
      -3.547623
      -8.172936
      ...
      None
      None
      0.970925
      0.028793
      0.000282
      -0.029506
      -3.547623
      -8.172936
      0
      0.970925
    
    
      117
      117
      NaN
      NaN
      NaN
      0.000690
      0.146254
      0.853056
      -7.278340
      -1.922412
      -0.158930
      ...
      None
      None
      0.000690
      0.146254
      0.853056
      -7.278340
      -1.922412
      -0.158930
      2
      0.853056
    
    
      83
      83
      NaN
      NaN
      NaN
      0.004807
      0.302693
      0.692499
      -5.337587
      -1.195035
      -0.367448
      ...
      None
      None
      0.004807
      0.302693
      0.692499
      -5.337587
      -1.195035
      -0.367448
      2
      0.692499
    
    
      105
      105
      NaN
      NaN
      NaN
      0.000305
      0.114976
      0.884719
      -8.094618
      -2.163033
      -0.122485
      ...
      None
      None
      0.000305
      0.114976
      0.884719
      -8.094618
      -2.163033
      -0.122485
      2
      0.884719
    
    
      27
      27
      NaN
      NaN
      NaN
      0.961258
      0.038327
      0.000414
      -0.039512
      -3.261589
      -7.788609
      ...
      None
      None
      0.961258
      0.038327
      0.000414
      -0.039512
      -3.261589
      -7.788609
      0
      0.961258
    
    
      64
      64
      NaN
      NaN
      NaN
      0.096049
      0.786053
      0.117897
      -2.342892
      -0.240731
      -2.137940
      ...
      None
      None
      0.096049
      0.786053
      0.117897
      -2.342892
      -0.240731
      -2.137940
      1
      0.786053
    
  

12 rows × 23 columns



In [14]:

    
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]









    Out[14]:







  
    
      
      KEY_skl
      Score_0_skl
      Score_1_skl
      Score_2_skl
      Proba_0_skl
      Proba_1_skl
      Proba_2_skl
      LogProba_0_skl
      LogProba_1_skl
      LogProba_2_skl
      ...
      Score_1_sql
      Score_2_sql
      Proba_0_sql
      Proba_1_sql
      Proba_2_sql
      LogProba_0_sql
      LogProba_1_sql
      LogProba_2_sql
      Decision_sql
      DecisionProba
    
  
  
  

0 rows × 23 columns



In [ ]:

	KEY	Score_0	Score_1	Score_2	Proba_0	Proba_1	Proba_2	LogProba_0	LogProba_1	LogProba_2	Decision	DecisionProba
114	114	None	None	None	0.000801	0.063040	0.936158	-7.129114	-2.763981	-0.065971	2	0.936158
74	74	None	None	None	0.035078	0.853853	0.111069	-3.350170	-0.157996	-2.197607	1	0.853853
9	9	None	None	None	0.940009	0.059202	0.000789	-0.061866	-2.826803	-7.144567	0	0.940009
88	88	None	None	None	0.055286	0.731221	0.213492	-2.895232	-0.313039	-1.544154	1	0.731221
25	25	None	None	None	0.909896	0.088855	0.001248	-0.094424	-2.420748	-6.685931	0	0.909896
5	5	None	None	None	0.964028	0.035551	0.000421	-0.036635	-3.336794	-7.771959	0	0.964028
48	48	None	None	None	0.970925	0.028793	0.000282	-0.029506	-3.547623	-8.172936	0	0.970925
117	117	None	None	None	0.000690	0.146254	0.853056	-7.278340	-1.922412	-0.158930	2	0.853056
83	83	None	None	None	0.004807	0.302693	0.692499	-5.337587	-1.195035	-0.367448	2	0.692499
105	105	None	None	None	0.000305	0.114976	0.884719	-8.094618	-2.163033	-0.122485	2	0.884719
27	27	None	None	None	0.961258	0.038327	0.000414	-0.039512	-3.261589	-7.788609	0	0.961258
64	64	None	None	None	0.096049	0.786053	0.117897	-2.342892	-0.240731	-2.137940	1	0.786053