In [1]:
# Client for the MLDB REST API; the later cells issue all their requests through `mldb`.
from pymldb import Connection
mldb = Connection("http://localhost")

# We'll also need these later.
import pandas as pd
import numpy as np
from IPython.display import display, HTML
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
from pylab import rcParams
# Default figure size as (width, height) in inches for the plots below.
rcParams['figure.figsize'] = 15, 5
from ipywidgets import interact

In [2]:
# Create the tabular MLDB dataset "bank" from a semicolon-delimited CSV.
# The `archive+<zip url>#<member path>` form of dataFileUrl names one file
# inside the zip archive hosted by UCI.
bank_zip_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
bank_csv_member = "bank-additional/bank-additional-full.csv"

bank_dataset_config = {
    "type": "text.csv.tabular",
    "params": {
        "dataFileUrl": "archive+" + bank_zip_url + "#" + bank_csv_member,
        "delimiter": ";",
    },
}

# Keep the PUT as the last expression so the server response is displayed.
mldb.v1.datasets("bank").put(bank_dataset_config)


Out[2]:
PUT http://localhost/v1/datasets/bank
201 Created
{
  "status": {
    "num_skipped_lines": 0
  }, 
  "config": {
    "params": {
      "delimiter": ";", 
      "dataFileUrl": "archive+https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip#bank-additional/bank-additional-full.csv"
    }, 
    "type": "text.csv.tabular", 
    "id": "bank"
  }, 
  "state": "ok", 
  "type": "text.csv.tabular", 
  "id": "bank"
}

In [3]:
# Peek at ten rows of the freshly loaded dataset to check the columns parsed as expected.
preview_sql = "select * from bank limit 10"
mldb.query(preview_sql)


Out[3]:
age campaign cons.conf.idx cons.price.idx contact day_of_week default duration education emp.var.rate ... housing job loan marital month nr.employed pdays poutcome previous y
_rowName
000000022989 44 6 -36.1 93.444 cellular mon no 63 professional.course 1.4 ... yes technician no married aug 5228.1 999 nonexistent 0 no
000000012341 32 2 -42.7 93.918 telephone fri no 137 university.degree 1.4 ... no admin. yes divorced jul 5228.1 999 nonexistent 0 no
000000038427 30 2 -26.9 92.431 telephone wed no 196 university.degree -3.4 ... yes admin. no married oct 5017.5 999 nonexistent 0 no
000000021757 33 3 -36.1 93.444 cellular tue no 173 university.degree 1.4 ... yes self-employed no married aug 5228.1 999 nonexistent 0 no
000000001125 59 2 -36.4 93.994 telephone wed unknown 81 basic.4y 1.1 ... no retired no married may 5191.0 999 nonexistent 0 no
000000031134 46 1 -46.2 92.893 cellular wed unknown 19 basic.6y -1.8 ... no blue-collar no married may 5099.1 999 nonexistent 0 no
000000037727 28 1 -31.4 92.201 cellular thu no 64 professional.course -2.9 ... yes technician no single aug 5076.2 999 nonexistent 0 no
000000025535 29 1 -42.0 93.200 cellular wed no 224 basic.9y -0.1 ... no technician yes single nov 5195.8 999 nonexistent 0 no
000000015218 28 1 -42.7 93.918 cellular thu no 25 unknown 1.4 ... yes blue-collar no single jul 5228.1 999 nonexistent 0 no
000000009389 33 1 -41.8 94.465 telephone fri unknown 161 basic.6y 1.4 ... unknown blue-collar unknown married jun 5228.1 999 nonexistent 0 no

10 rows × 21 columns


In [15]:
# Configure and run a classifier experiment on the "bank" dataset.
#
# * The same dataset is used for training and testing; the single fold splits
#   it on rowHash() so the two row sets are disjoint (rows with
#   rowHash() % 3 = 0 are held out for testing, the rest are used to train).
# * The label is the boolean `y='yes'`; the `y` and `duration` columns are
#   excluded from the features.
# * keepArtifacts / outputAccuracyDataset ask MLDB to keep what the run
#   produces; the cells below read the model file and the accuracy dataset.
training_procedure = mldb.v1.procedures("bank_exp")
training_procedure.put({
    "type": "classifier.experiment",
    "params": {
        "experimentName": "bank",
        "keepArtifacts": True,
        "training_dataset": { "id": "bank" },
        "testing_dataset": { "id": "bank" },
        "dataset_folds": [{
            "training_where": "rowHash() % 3 != 0",
            "testing_where": "rowHash() % 3 = 0",
        }],
        "modelFileUrlPattern": "file://models/bank.cls",
        "algorithm": "bbdt",
        "select": "* excluding (y, duration)",
        "label": "y='yes'",
        "outputAccuracyDataset": True,
        "mode": "boolean"
    }
})

# Posting an empty run config starts a run of the procedure configured above.
result = training_procedure.runs.post({})

# Only one fold is configured, so its results are at index 0.
auc = result.json()["status"]["folds"][0]["results"]["auc"]

# Parenthesised single-argument print outputs the same text on Python 2 and
# is also valid on Python 3 (the bare print statement is not).
print("\nArea under ROC curve = %0.4f\n" % auc)


Area under ROC curve = 0.7863


In [26]:
# Fetch the details of the run posted in the training cell. The run id is taken
# from that run's response instead of being hard-coded, because a literal id
# only exists on the server/run that originally produced it and breaks a
# fresh Restart & Run All.
# NOTE(review): assumes the run response JSON exposes the run id under "id" - confirm.
training_procedure.runs(result.json()["id"]).details.get()


Out[26]:
GET http://localhost/v1/procedures/bank_exp/runs/2015-10-25T02:46:14.328745Z-5bc7042b732cb41f/details
200 OK
null

In [16]:
# Per-threshold accuracy statistics, highest score first.
# NOTE(review): "bank_results_0" is presumably the accuracy dataset written by
# the experiment above for fold 0 - confirm the naming against MLDB.
accuracy = mldb.query("select * from bank_results_0 order by score desc")

# A (min, max) tuple makes interact build an integer slider over every row of
# `accuracy`; a two-element list is treated by current ipywidgets as a dropdown
# offering only those two values.
@interact
def accuracy_plot(threshold_index=(0, len(accuracy)-1)):
    """Plot the ROC curve and the confusion-matrix breakdown for one threshold.

    threshold_index -- positional index of the row of `accuracy` to highlight
                       (supplied by the interact widget).

    Left panel: the full ROC curve with the selected row marked by a green
    square and its score shown in the title. Right panel: a pie chart of the
    selected row's true/false positive/negative counts.
    """
    row = accuracy.iloc[threshold_index]
    cols = ["trueNegatives", "falsePositives", "falseNegatives", "truePositives"]
    f, (ax1, ax2) = plt.subplots(1, 2)

    roc_ax = accuracy.plot(
        ax=ax1, x="falsePositiveRate", y="truePositiveRate",
        legend=False, title="ROC Curve, threshold=%.4f" % row.score)
    roc_ax.set_xlabel('falsePositiveRate')
    roc_ax.set_ylabel('truePositiveRate')
    # Mark the operating point of the selected threshold on the curve.
    ax1.plot(row.falsePositiveRate, row.truePositiveRate, 'gs')

    # Correct outcomes (TN, TP) share one colour, errors (FP, FN) the other.
    ax2.pie(row[cols], labels=cols, autopct='%1.1f%%', startangle=90,
            colors=['lightskyblue', 'lightcoral', 'lightcoral', 'lightskyblue'])
    ax2.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    # NOTE(review): hspace controls vertical spacing between subplot rows; with
    # a single row it has no visible effect. Kept to preserve the layout call.
    f.subplots_adjust(hspace=.75)
    plt.show()


None

In [17]:
# Register the MLDB function "bank_explainer", a classifier.explain function
# that loads the model file written by the training experiment.
explainer_config = {
    "type": "classifier.explain",
    "params": {"modelFileUrl": "file://models/bank.cls"},
}

explanation = mldb.v1.functions("bank_explainer")
# Keep the PUT as the last expression so the server response is displayed.
explanation.put(explainer_config)


Out[17]:
PUT http://localhost/v1/functions/bank_explainer
201 Created
{
  "status": {
    "mode": "regression", 
    "summary": "COMMITTEE"
  }, 
  "config": {
    "params": {
      "modelFileUrl": "file://models/bank.cls"
    }, 
    "type": "classifier.explain", 
    "id": "bank_explainer"
  }, 
  "state": "ok", 
  "type": "classifier.explain", 
  "id": "bank_explainer"
}

In [21]:
# Average the explainer's per-feature explanation over the held-out rows
# (rowHash() % 3 = 0), grouped by the value of y.
feature_importance_sql = """
select avg(
    @bank_explainer(
        label: y='yes', 
        features: {*}
    )[explanation]
) as *
from bank
where rowHash() % 3 = 0
group by y
"""
importance_by_label = mldb.query(feature_importance_sql)

# Transpose so each feature becomes one row, i.e. one group of horizontal bars.
importance_by_feature = importance_by_label.transpose()
importance_by_feature.plot(kind='barh', title="Feature Importance")
plt.xticks(rotation=0)
plt.show()



In [ ]: