Before dabl


In [8]:
!pwd


/home/andy/checkout/talks_odt/2019

In [9]:
!ls ../../ml-workshop-2-of-4


check_env.ipynb  LICENSE    README2-of-4.pdf  slides
images		 notebooks  README.md

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("../../ml-workshop-2-of-4/notebooks/data/adult.csv", index_col=0)
target = data.income
data_features = data.drop("income", axis=1)
X_train, X_test, y_train, y_test = train_test_split(data_features, target)
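For a classification target like income it usually makes sense to stratify the split; a minimal variant of the cell above (stratify and random_state are standard train_test_split parameters):

from sklearn.model_selection import train_test_split

# keep the class balance of income identical in train and test
X_train, X_test, y_train, y_test = train_test_split(
    data_features, target, stratify=target, random_state=0)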

In [1]:
import pandas as pd
data = pd.read_csv("adult.csv", index_col=0)

In [31]:
df.dtypes


Out[31]:
income      object
variable    object
value       object
dtype: object

In [48]:
import pandas as pd
import seaborn as sns

data = pd.read_csv("adult.csv", index_col=0)

cols = data.columns[data.dtypes != object].tolist() + ['income']
df = data.loc[:, cols].melt("income")
g = sns.FacetGrid(df, col='variable', hue='income',
                  sharey=False, sharex=False, col_wrap=3)
g = g.map(sns.kdeplot, "value", shade=True)
g.axes[0].legend()


Out[48]:
<matplotlib.legend.Legend at 0x7eff1d500668>
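For comparison, dabl wraps this kind of target-vs-feature exploration in a single call; a minimal sketch, assuming the same adult.csv layout as above:

import dabl
import pandas as pd

data = pd.read_csv("adult.csv", index_col=0)
# dabl picks reasonable univariate / pairwise plots against the target automatically
dabl.plot(data, target_col='income')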

In [20]:
# %load solutions/adult_classification.py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv("../../ml-workshop-2-of-4/notebooks/data/adult.csv", index_col=0)
target = data.income
data_features = data.drop("income", axis=1)
X_train, X_test, y_train, y_test = train_test_split(data_features, target)


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

categorical_columns = data_features.dtypes == object

cont_pipe = Pipeline([('scaler', StandardScaler()),
                      ('imputer', SimpleImputer(strategy='median', add_indicator=True))])
cat_pipe = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore')),
                     ('imputer', SimpleImputer(strategy='most_frequent', add_indicator=True))])

pre = ColumnTransformer([('categorical', cat_pipe, categorical_columns),
                         ('continuous', cont_pipe, ~categorical_columns),
                        ])

model = Pipeline([('preprocessing', pre), ('clf', LogisticRegression())])
param_grid = {'clf__C': np.logspace(-3, 3, 7)}
grid_search = GridSearchCV(model, param_grid=param_grid)
grid_search.fit(X_train, y_train)


/home/andy/checkout/scikit-learn/sklearn/linear_model/logistic.py:932: ConvergenceWarning: lbfgs failed to converge (status=1): b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'. Increase the number of iterations.
  n_iter_i = _check_optimize_result(solver, opt_res, max_iter)
(this warning is emitted once per cross-validation fit)
Out[20]:
GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('categorical',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('ohe',
                                                                                          OneHotEncoder(categories='auto',
                                                                                                        drop=None,
                                                                                                        dtype=<class 'numpy.float64'>,
                                                                                                        handle_unknown='ignore',
                                                                                                        sparse...
                                                           l1_ratio=None,
                                                           max_iter=100,
                                                           multi_class='auto',
                                                           n_jobs=None,
                                                           penalty='l2',
                                                           random_state=None,
                                                           solver='lbfgs',
                                                           tol=0.0001,
                                                           verbose=0,
                                                           warm_start=False))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'clf__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
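Once the search has finished, the standard GridSearchCV attributes are available for inspection; a short sketch using the fitted grid_search from the cell above:

# best hyper-parameter setting and its cross-validated score
print(grid_search.best_params_)
print(grid_search.best_score_)
# held-out performance of the refit pipeline
print(grid_search.score(X_test, y_test))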

In [19]:
categorical_columns


Out[19]:
age               False
workclass          True
education          True
education-num     False
marital-status     True
occupation         True
relationship       True
race               True
gender             True
capital-gain      False
capital-loss      False
hours-per-week    False
native-country     True
dtype: bool
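The boolean mask can also be used directly with pandas to look at just the categorical part of the frame; a quick sanity check, assuming data_features from the solution cell:

# select only the columns flagged as categorical
data_features.loc[:, categorical_columns].head()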

In [15]:



Out[15]:
array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])

In [ ]:
### Preprocessing
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

### Cross-validation with default parameters
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

scores = cross_val_score(LogisticRegression(), X_train, y_train, cv=5)
print(scores.mean())


### do grid search

import numpy as np

param_grid = {'C': np.logspace(-3, 3, 7)}
param_grid

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(LogisticRegression(solver='lbfgs'), param_grid, cv=5,
                    return_train_score=True)

grid.fit(X_train, y_train)

grid.best_params_
grid.best_score_

# some visualization

import pandas as pd
%matplotlib inline
res = pd.DataFrame(grid.cv_results_)
res.mean_test_score.plot()
res.mean_train_score.plot()
import matplotlib.pyplot as plt
plt.xscale("log")

grid.score(X_test, y_test)

important = np.argsort(np.abs(grid.best_estimator_.coef_)).ravel()

plt.barh(range(10), grid.best_estimator_.coef_.ravel()[important[-10:]])
plt.yticks(range(10), X_train.columns[important[-10:]]);

In [50]:
import dabl
ames_df = dabl.datasets.load_ames()
ames_df.head()


Out[50]:
Order PID MS SubClass MS Zoning Lot Frontage Lot Area Street Alley Lot Shape Land Contour ... Pool Area Pool QC Fence Misc Feature Misc Val Mo Sold Yr Sold Sale Type Sale Condition SalePrice
0 1 526301100 20 RL 141.0 31770 Pave NaN IR1 Lvl ... 0 NaN NaN NaN 0 5 2010 WD Normal 215000
1 2 526350040 20 RH 80.0 11622 Pave NaN Reg Lvl ... 0 NaN MnPrv NaN 0 6 2010 WD Normal 105000
2 3 526351010 20 RL 81.0 14267 Pave NaN IR1 Lvl ... 0 NaN NaN Gar2 12500 6 2010 WD Normal 172000
3 4 526353030 20 RL 93.0 11160 Pave NaN Reg Lvl ... 0 NaN NaN NaN 0 4 2010 WD Normal 244000
4 5 527105010 60 RL 74.0 13830 Pave NaN IR1 Lvl ... 0 NaN MnPrv NaN 0 3 2010 WD Normal 189900

5 rows × 82 columns


In [54]:
clean_df = dabl.clean(ames_df, verbose=2)


Detected feature types:
11 float, 28 int, 43 object, 0 date, 0 other
Interpreted as:
continuous      23
dirty_float      0
low_card_int     6
categorical     40
date             0
free_string      0
useless         13
dtype: int64
WARN dropped useless columns: ['Order', 'Street', 'Utilities', 'Land Slope', 'Condition 2', 'Roof Matl', 'Heating', 'Low Qual Fin SF', 'Kitchen AbvGr', 'Garage Cond', '3Ssn Porch', 'Pool Area', 'Misc Val']
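The type detection that clean reports can also be run on its own; a small sketch, assuming dabl.detect_types is the function used internally (it returns one boolean column per detected type):

# per-column type detection: continuous, low_card_int, categorical, useless, ...
types = dabl.detect_types(ames_df)
types.head()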

In [65]:
from dabl import EasyPreprocessor

In [61]:
X, y = ames_df.drop('SalePrice', axis=1), ames_df.SalePrice
ep = EasyPreprocessor().fit(X, y)


/home/andy/checkout/dabl/dabl/preprocessing.py:258: UserWarning: Discarding near-constant features: ['Street', 'Utilities', 'Land Slope', 'Condition 2', 'Roof Matl', 'Heating', 'Low Qual Fin SF', 'Kitchen AbvGr', 'Garage Cond', '3Ssn Porch', 'Pool Area', 'Misc Val']
  near_constant.index[near_constant].tolist()))

In [63]:
ep.ct_


Out[63]:
ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.1,
                  transformer_weights=None,
                  transformers=[('continuous',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True))],...
Garage Type        True
Garage Yr Blt     False
Garage Finish      True
Garage Cars       False
Garage Area       False
Garage Qual        True
Garage Cond       False
Paved Drive        True
Wood Deck SF      False
Open Porch SF     False
Enclosed Porch    False
3Ssn Porch        False
Screen Porch      False
Pool Area         False
Pool QC            True
Fence              True
Misc Feature       True
Misc Val          False
Mo Sold            True
Yr Sold            True
Sale Type          True
Sale Condition     True
Name: categorical, Length: 81, dtype: bool)],
                  verbose=False)
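EasyPreprocessor follows the scikit-learn transformer API, so the fitted object can produce the model-ready array; a minimal sketch using the ep fitted above:

# imputed, scaled and one-hot-encoded feature matrix
X_trans = ep.transform(X)
X_trans.shape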

In [1]:
data = pd.read_csv("../../ml-workshop-2-of-4/notebooks/data/adult.csv", index_col=0)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-a89d5705e541> in <module>
----> 1 data = pd.read_csv("../../ml-workshop-2-of-4/notebooks/data/adult.csv", index_col=0)

NameError: name 'pd' is not defined

In [74]:
!ln -s "../../ml-workshop-2-of-4/notebooks/data/adult.csv" .

In [3]:
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.simplefilter('ignore', ConvergenceWarning)

In [2]:
from dabl import SimpleClassifier
import pandas as pd
data = pd.read_csv("adult.csv", index_col=0)
SimpleClassifier().fit(data, target_col='income')


/home/andy/checkout/dabl/dabl/preprocessing.py:258: UserWarning: Discarding near-constant features: ['capital-loss']
  near_constant.index[near_constant].tolist()))
Running DummyClassifier(strategy='prior')
accuracy: 0.759 average_precision: 0.241 f1_macro: 0.432 recall_macro: 0.500 roc_auc: 0.500
=== new best DummyClassifier(strategy='prior') (using recall_macro):
accuracy: 0.759 average_precision: 0.241 f1_macro: 0.432 recall_macro: 0.500 roc_auc: 0.500

Running GaussianNB()
accuracy: 0.407 average_precision: 0.288 f1_macro: 0.405 recall_macro: 0.605 roc_auc: 0.607
=== new best GaussianNB() (using recall_macro):
accuracy: 0.407 average_precision: 0.288 f1_macro: 0.405 recall_macro: 0.605 roc_auc: 0.607

Running MultinomialNB()
accuracy: 0.831 average_precision: 0.773 f1_macro: 0.787 recall_macro: 0.815 roc_auc: 0.908
=== new best MultinomialNB() (using recall_macro):
accuracy: 0.831 average_precision: 0.773 f1_macro: 0.787 recall_macro: 0.815 roc_auc: 0.908

Running DecisionTreeClassifier(class_weight='balanced', max_depth=1)
accuracy: 0.710 average_precision: 0.417 f1_macro: 0.682 recall_macro: 0.759 roc_auc: 0.759
Running DecisionTreeClassifier(class_weight='balanced', max_depth=5)
accuracy: 0.784 average_precision: 0.711 f1_macro: 0.750 recall_macro: 0.811 roc_auc: 0.894
Running DecisionTreeClassifier(class_weight='balanced', min_impurity_decrease=0.01)
accuracy: 0.718 average_precision: 0.561 f1_macro: 0.693 recall_macro: 0.779 roc_auc: 0.848
/home/andy/checkout/scikit-learn/sklearn/linear_model/logistic.py:932: ConvergenceWarning: lbfgs failed to converge (status=1): b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'. Increase the number of iterations.
  n_iter_i = _check_optimize_result(solver, opt_res, max_iter)
/home/andy/checkout/scikit-learn/sklearn/linear_model/logistic.py:932: ConvergenceWarning: lbfgs failed to converge (status=1): b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'. Increase the number of iterations.
  n_iter_i = _check_optimize_result(solver, opt_res, max_iter)
/home/andy/checkout/scikit-learn/sklearn/linear_model/logistic.py:932: ConvergenceWarning: lbfgs failed to converge (status=1): b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'. Increase the number of iterations.
  n_iter_i = _check_optimize_result(solver, opt_res, max_iter)
/home/andy/checkout/scikit-learn/sklearn/linear_model/logistic.py:932: ConvergenceWarning: lbfgs failed to converge (status=1): b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'. Increase the number of iterations.
  n_iter_i = _check_optimize_result(solver, opt_res, max_iter)
/home/andy/checkout/scikit-learn/sklearn/linear_model/logistic.py:932: ConvergenceWarning: lbfgs failed to converge (status=1): b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'. Increase the number of iterations.
  n_iter_i = _check_optimize_result(solver, opt_res, max_iter)
Running LogisticRegression(C=0.1, class_weight='balanced')
accuracy: 0.819 average_precision: 0.789 f1_macro: 0.783 recall_macro: 0.832 roc_auc: 0.915
=== new best LogisticRegression(C=0.1, class_weight='balanced') (using recall_macro):
accuracy: 0.819 average_precision: 0.789 f1_macro: 0.783 recall_macro: 0.832 roc_auc: 0.915


Best model:
LogisticRegression(C=0.1, class_weight='balanced')
Best Scores:
accuracy: 0.819 average_precision: 0.789 f1_macro: 0.783 recall_macro: 0.832 roc_auc: 0.915
/home/andy/checkout/scikit-learn/sklearn/linear_model/logistic.py:932: ConvergenceWarning: lbfgs failed to converge (status=1): b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'. Increase the number of iterations.
  n_iter_i = _check_optimize_result(solver, opt_res, max_iter)
Out[2]:
SimpleClassifier(random_state=None, refit=True, verbose=1)
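The fitted SimpleClassifier behaves like a regular estimator afterwards; a hedged sketch of keeping a reference and predicting on the feature columns (dropping the target column before predicting):

sc = SimpleClassifier().fit(data, target_col='income')
# predictions for the training frame, minus the target column
sc.predict(data.drop('income', axis=1))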

In [6]:
import pandas as pd
data = pd.read_csv("adult.csv", index_col=0)
from dabl.models import AnyClassifier
AnyClassifier(verbose=1, force_exhaust_budget=False).fit(data, target_col='income')


/home/andy/checkout/dabl/dabl/preprocessing.py:258: UserWarning: Discarding near-constant features: ['capital-loss']
  near_constant.index[near_constant].tolist()))
n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 7
r_min_: 20
max_budget_: 32561
aggressive_elimination: False
force_exhaust_budget: False
ratio: 3
----------
iter_i: 0
n_candidates: 26
r_i: 20
Fitting 5 folds for each of 26 candidates, totalling 130 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 130 out of 130 | elapsed:  1.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
----------
iter_i: 1
n_candidates: 9
r_i: 60
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
----------
iter_i: 2
n_candidates: 3
r_i: 180
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   15.4s finished
best classifier:  HistGradientBoostingClassifier(l2_regularization=0.0001, learning_rate=0.1,
                               loss='auto', max_bins=16, max_depth=7,
                               max_iter=200, max_leaf_nodes=4,
                               min_samples_leaf=4, n_iter_no_change=None,
                               random_state=7320, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0,
                               warm_start=False)
best score: 0.773
Out[6]:
AnyClassifier(force_exhaust_budget=False, n_jobs=None, verbose=1)

In [7]:
ac = Out[6]

In [9]:
import dabl
dabl.explain(ac)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-4771deb5b001> in <module>
      1 import dabl
----> 2 dabl.explain(ac)

~/checkout/dabl/dabl/explain.py in explain(estimator, X_val, y_val, target_col, feature_names)
    124     else:
    125         raise ValueError("Don't know how to explain estimator {} "
--> 126                          "yet.".format(inner_estimator))
    127 
    128     if X_val is not None:

ValueError: Don't know how to explain estimator HistGradientBoostingClassifier(l2_regularization=0.0001, learning_rate=0.1,
                               loss='auto', max_bins=16, max_depth=7,
                               max_iter=200, max_leaf_nodes=4,
                               min_samples_leaf=4, n_iter_no_change=None,
                               random_state=7320, scoring=None, tol=1e-07,
                               validation_fraction=0.1, verbose=0,
                               warm_start=False) yet.
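dabl.explain only knows how to explain a few inner estimator types so far; the logistic regression found by SimpleClassifier earlier should be one of them. A hedged sketch, assuming explain accepts the fitted SimpleClassifier directly (as in the call that failed here):

sc = dabl.SimpleClassifier().fit(data, target_col='income')
# linear models are among the estimators explain can handle
dabl.explain(sc)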

In [ ]: