In [31]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.exceptions import NotFittedError
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.model_selection import train_test_split as tts
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from yellowbrick.classifier import ClassificationReport
In [2]:
names = [
'age',
'workclass',
'fnlwgt',
'education',
'education-num',
'marital-status',
'occupation',
'relationship',
'race',
'sex',
'capital-gain',
'capital-loss',
'hours-per-week',
'native-country',
'income',
]
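# adult.data (UCI Adult census) ships without a header row and marks
# missing values with the string ' ?'.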
data = pd.read_csv('data/adult.data', names=names)
data.head()
Out[2]:
In [3]:
class NanLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode each column of a DataFrame, converting the missing-value
    sentinel (the census data uses ' ?') to NaN so it can be imputed later.
    Note that transform() raises on labels it never saw during fit().
    """

    def __init__(self, missing_string=' ?', replacement=np.nan):
        # Assign hyperparameters directly: calling set_params() inside
        # __init__ breaks sklearn's get_params()/clone() contract.
        self.missing_string = missing_string
        self.replacement = replacement

    def fit(self, X, y=None):
        # Learn one LabelEncoder per column.
        self.encoders_ = {
            col: LabelEncoder().fit(X[col])
            for col in X.columns
        }
        return self

    def transform(self, X):
        if not hasattr(self, 'encoders_'):
            raise NotFittedError("This NanLabelEncoder is not fitted yet.")
        Xp = []
        for col in X.columns:
            encoder = self.encoders_[col]
            colp = encoder.transform(X[col])
            # Swap the integer code of the missing sentinel for NaN so
            # that SimpleImputer can fill it in downstream.
            if self.missing_string in encoder.classes_:
                label = np.where(encoder.classes_ == self.missing_string)[0][0]
                colp = np.where(colp == label, self.replacement, colp)
            Xp.append(pd.Series(colp, name=col))
        return pd.concat(Xp, axis=1)
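As a quick sanity check, here is a sketch of the encoder on a hypothetical toy column (not part of the census data): the ' ?' sentinel should come back as NaN while every other label gets an integer code.

In [ ]:
toy = pd.DataFrame({'workclass': [' Private', ' ?', ' State-gov']})
NanLabelEncoder().fit_transform(toy)
# expected: a one-column frame of [1.0, NaN, 2.0] -- ' ?' encodes to 0,
# which transform() then swaps for NaN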
In [5]:
class IdentityTransformer(BaseEstimator, TransformerMixin):
    """A no-op transformer, useful as a passthrough branch in a FeatureUnion."""

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X
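The pipeline below never actually uses IdentityTransformer, but it is the usual no-op branch when composing a FeatureUnion; a hypothetical sketch:

In [ ]:
# hypothetical: keep the raw columns alongside an integer-encoded copy
union = FeatureUnion([
    ('raw', IdentityTransformer()),
    ('encoded', NanLabelEncoder()),
])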
In [25]:
target_encoder = LabelEncoder()
X = data[[col for col in data.columns if col != 'income']]
y = target_encoder.fit_transform(data['income'])

# fnlwgt is a census sampling weight and education-num just encodes
# education ordinally, so both are dropped from the feature set.
remove = ['fnlwgt', 'education-num']
continuous = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']
categorical = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]
In [24]:
extractor = ColumnTransformer([
    # Categorical columns: integer-encode with the ' ?' sentinel mapped
    # to NaN, impute with the mode, then one-hot encode.
    ('encoder', Pipeline([
        ('nancode', NanLabelEncoder()),
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(categories='auto')),
    ]), categorical),
    # Numeric columns: drop the unwanted ones, pass the rest through.
    ('dropper', ColumnTransformer([
        ('drop', 'drop', remove),
        ('keep', 'passthrough', continuous),
    ]), continuous + remove),
])
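Nearly all of the transformed columns come from the one-hot step. One way to count them, assuming the get_feature_names API of the sklearn 0.20-era OneHotEncoder used here:

In [ ]:
onehot = extractor.fit(X, y).named_transformers_['encoder'].named_steps['onehot']
len(onehot.get_feature_names())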
In [27]:
extractor.fit_transform(X, y).shape
Out[27]:
In [29]:
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2)
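Roughly three quarters of the records fall in the ' <=50K' class, so stratifying the split is a reasonable refinement, and pinning random_state (an arbitrary choice here) makes the model comparison below repeatable:

In [ ]:
X_train, X_test, y_train, y_test = tts(
    X, y, test_size=0.2, stratify=y, random_state=42,
)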
In [38]:
def score_model(clf):
    """Fit a fresh extraction+classification pipeline on the training
    split and visualize its test-set precision/recall/F1."""
    _, ax = plt.subplots(figsize=(9, 6))
    # clone() gives every call its own unfitted copy of the extractor
    # and the classifier, so runs don't leak state into each other.
    model = Pipeline([
        ('extract', clone(extractor)),
        ('clf', clone(clf)),
    ])
    cr = ClassificationReport(model, classes=target_encoder.classes_, ax=ax)
    cr.fit(X_train, y_train)
    score = cr.score(X_test, y_test)
    cr.poof()  # renamed show() in Yellowbrick 1.0+
    return score
score_model(MultinomialNB())
Out[38]:
In [39]:
score_model(LogisticRegression())
Out[39]:
In [40]:
score_model(KNeighborsClassifier())
Out[40]:
In [41]:
score_model(RandomForestClassifier())
Out[41]:
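The same helper makes a side-by-side comparison straightforward; a sketch (exact scores depend on the split):

In [ ]:
models = {
    'naive bayes': MultinomialNB(),
    'logistic': LogisticRegression(),
    'knn': KNeighborsClassifier(),
    'forest': RandomForestClassifier(),
}
pd.Series({name: score_model(clf) for name, clf in models.items()})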