In [1]:
import numpy as np
import pandas as pd
import sqlalchemy as sa
import pickle, json, requests, base64
In this case, we will build a model (here, MLP model) on a IRIS dataset (not very original !!!) and generate a SQL code for deployment using the web service.
We then execute the SQL code on a local database (postgresql) and compare the SQL execution result with scikit-learn predict/predict_proba/.predict_log_proba result.
Both results are stored in pandas dataframes.
In [2]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
Y = iris.target
# print(iris.DESCR)
In [3]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1960)
clf.fit(X, Y)
Out[3]:
In [4]:
#clf.__dict__
In [5]:
def test_ws_sql_gen(pickle_data):
WS_URL="https://sklearn2sql.herokuapp.com/model"
b64_data = base64.b64encode(pickle_data).decode('utf-8')
data={"Name":"model1", "PickleData":b64_data , "SQLDialect":"postgresql"}
r = requests.post(WS_URL, json=data)
#print(r.__dict__)
content = r.json()
# print(content)
lSQL = content["model"]["SQLGenrationResult"][0]["SQL"]
return lSQL;
In [6]:
pickle_data = pickle.dumps(clf)
lSQL = test_ws_sql_gen(pickle_data)
print(lSQL[0:2000])
In [7]:
# save the dataset in a database table
#engine = sa.create_engine('sqlite://' , echo=False)
engine = sa.create_engine("postgresql://db:db@localhost/db?port=5432", echo=False)
conn = engine.connect()
lTable = pd.DataFrame(X);
lTable.columns = ['Feature_0', 'Feature_1', 'Feature_2', 'Feature_3']
lTable['KEY'] = range(lTable.shape[0])
lTable.to_sql("INPUT_DATA" , conn, if_exists='replace', index=False)
In [8]:
sql_output = pd.read_sql(lSQL , conn);
conn.close()
In [9]:
sql_output.sample(12, random_state=1960)
Out[9]:
In [10]:
sql_output.Decision.value_counts()
Out[10]:
In [11]:
skl_outputs = pd.DataFrame()
skl_output_key = pd.DataFrame(list(range(X.shape[0])), columns=['KEY']);
skl_output_score = pd.DataFrame(columns=['Score_0', 'Score_1', 'Score_2']);
skl_output_proba = pd.DataFrame(clf.predict_proba(X), columns=['Proba_0', 'Proba_1', 'Proba_2'])
skl_output_log_proba = pd.DataFrame(clf.predict_log_proba(X), columns=['LogProba_0', 'LogProba_1', 'LogProba_2'])
skl_output_decision = pd.DataFrame(clf.predict(X), columns=['Decision'])
skl_output = pd.concat([skl_output_key, skl_output_score, skl_output_proba, skl_output_log_proba, skl_output_decision] , axis=1)
skl_output.sample(12, random_state=1960)
Out[11]:
In [12]:
sql_skl_join = skl_output.join(sql_output , how='left', on='KEY', lsuffix='_skl', rsuffix='_sql')
In [13]:
sql_skl_join.sample(12, random_state=1960)
Out[13]:
In [14]:
condition = (sql_skl_join.Decision_sql != sql_skl_join.Decision_skl)
sql_skl_join[condition]
Out[14]:
In [ ]: