In [1]:
%pylab inline
pylab.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
In [3]:
train_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
train_df = pd.read_csv(train_url, header=None)
In [4]:
train_df.head()
Out[4]:
The raw file has no header row, so pandas labels the columns 0–14. Per the dataset's documentation (adult.names), the attributes are:
age: continuous.
workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
fnlwgt: continuous.
education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
education-num: continuous.
marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
sex: Female, Male.
capital-gain: continuous.
capital-loss: continuous.
hours-per-week: continuous.
native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
In [5]:
train_df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                    'marital_status', 'occupation', 'relationship', 'race', 'sex',
                    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_band']
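Equivalently, the names could be supplied at load time; a hypothetical one-step version of the read in In [3]:

# Supply the column names directly when parsing the CSV.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
           'marital_status', 'occupation', 'relationship', 'race', 'sex',
           'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income_band']
train_df_named = pd.read_csv(train_url, names=columns)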
In [6]:
train_df.head()
Out[6]:
In [7]:
train_df.dtypes
Out[7]:
In [8]:
# Remove excess whitespace from strings
nn_cols = train_df.dtypes[train_df.dtypes == object].index
for c in nn_cols:
    train_df.loc[:, c] = train_df.loc[:, c].str.strip()
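Aside: pandas can strip these leading spaces at parse time. A minimal sketch, assuming the same train_url, that also marks '?' as missing while reading:

# Alternative load: trim whitespace after each delimiter and treat '?' as NaN up front.
train_df_alt = pd.read_csv(train_url, header=None,
                           skipinitialspace=True, na_values='?')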
In [9]:
# Missing values per column
train_df.loc[:, nn_cols].isin(['?']).sum(axis=0).plot(kind='barh')
Out[9]:
In [10]:
# Drop rows with missing data
n_missing_per_row = train_df.loc[:, nn_cols].isin(['?']).sum(axis=1)
train_df_full = train_df.loc[n_missing_per_row == 0, :]
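Had '?' been mapped to NaN at read time (as in the hypothetical train_df_alt above), this filtering reduces to a one-liner:

# dropna removes every row that contains at least one missing value.
train_df_full_alt = train_df_alt.dropna()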
In [11]:
train_df_full.isin(['?']).sum(axis=0)
Out[11]:
In [12]:
train_df_full.income_band.value_counts().plot(kind='barh')
Out[12]:
In [13]:
train_df_full.dtypes
Out[13]:
In [14]:
num_cols = train_df_full.dtypes[train_df_full.dtypes == np.int64].index
for n_col in num_cols:
    g_col = sns.FacetGrid(col='income_band', data=train_df_full)
    g_col = g_col.map(pylab.hist, n_col)
In [15]:
obj_cols = train_df_full.dtypes[train_df_full.dtypes == object].index
_, axes = pylab.subplots(len(obj_cols) - 1, 1, figsize=(10, 20))
for i, colname in enumerate(obj_cols.drop('income_band')):
    sns.countplot(x=train_df_full[colname],
                  hue=train_df_full.income_band,
                  ax=axes[i])
pylab.tight_layout()
In [16]:
num_cols = train_df_full.dtypes[train_df_full.dtypes == np.int64].index
num_features = train_df_full.loc[:, num_cols]
f_corrs = num_features.corr()
sns.heatmap(f_corrs, annot=True)
Out[16]:
In [26]:
n_samples = train_df_full.shape[0]
label_counts = train_df_full.income_band.value_counts()
label_counts
Out[26]:
So assume a naive baseline model that predicts '<=50K' for every sample:
In [31]:
accuracy = label_counts['<=50K'] / label_counts.sum()
accuracy
Out[31]:
In [36]:
precision = label_counts['<=50K'] / label_counts.sum()  # precision = TP / (TP + FP); every sample is predicted '<=50K'
recall = 1.0  # recall = TP / (TP + FN); no true '<=50K' sample is missed, so FN = 0
f1_score = (2.0 * precision * recall) / (precision + recall)  # with recall = 1 this simplifies to 2p / (p + 1)
f1_score
Out[36]:
In [35]:
from sklearn.metrics import classification_report
predictions = pd.Series(index=train_df_full.index, data='<=50K')
print(classification_report(train_df_full.income_band, predictions))
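For comparison, scikit-learn ships this baseline ready-made; a minimal sketch using DummyClassifier (the features are ignored, so any matrix such as num_features defined earlier works):

from sklearn.dummy import DummyClassifier

# DummyClassifier with strategy='most_frequent' always predicts the majority class.
baseline = DummyClassifier(strategy='most_frequent')
baseline.fit(num_features, train_df_full.income_band)
print(classification_report(train_df_full.income_band,
                            baseline.predict(num_features)))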
In [67]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from IPython.display import display
In [68]:
def cross_validation_score(features, labels, C=1000, scoring='f1_macro'):
    """
    Build an SVM with the radial basis function (RBF) kernel and return cross-validation scores.

    A `BaggingClassifier` constructs an ensemble of SVMs, each trained on a small random
    subset of the samples with all features. Since SVM training scales poorly with the
    number of samples, this speeds up fitting considerably and also acts as a regularizer.
    """
    base = SVC(C=C, kernel='rbf')
    model = BaggingClassifier(base_estimator=base, n_estimators=20, max_samples=0.05)
    prep = StandardScaler()
    estimator = make_pipeline(prep, model)
    scores = cross_val_score(estimator=estimator,
                             X=features,
                             y=labels,
                             cv=10,
                             verbose=10,
                             scoring=scoring)
    return pd.Series(scores)
In [25]:
num_only_scores = cross_validation_score(
    features=num_features,
    labels=train_df_full.income_band)
num_only_scores.plot(kind='bar',
                     title='10-Fold CV scores for numeric only features.')
Out[25]:
In [33]:
num_only_scores.mean()
Out[33]:
In [37]:
from sklearn.preprocessing import MultiLabelBinarizer
def encode_one_hot(source, target, feature_name):
    """
    One-hot encode the categorical feature `feature_name` from the `source` DataFrame,
    and return a copy of `target` with the encoded columns appended.
    """
    labels = sorted(pd.unique(source.loc[:, feature_name]))
    encoder = MultiLabelBinarizer(classes=labels)
    # Each row becomes a length-1 sequence, so MultiLabelBinarizer yields plain one-hot columns.
    raw = np.atleast_2d(source.loc[:, feature_name].values).T
    encoded_df = pd.DataFrame(index=source.index, data=encoder.fit_transform(raw))
    encoded_df.columns = [feature_name + '_' + str(c) for c in encoder.classes_]
    return pd.concat([target, encoded_df], axis=1)
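pandas also has this built in; a hypothetical equivalent of the helper above using pd.get_dummies:

# Equivalent sketch using pandas' built-in one-hot encoding.
def encode_one_hot_pd(source, target, feature_name):
    dummies = pd.get_dummies(source[feature_name], prefix=feature_name)
    return pd.concat([target, dummies], axis=1)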
In [39]:
numeric_plus_occupation = encode_one_hot(train_df_full, num_features, 'occupation')
In [43]:
numeric_plus_occupation_scores = cross_validation_score(
    features=numeric_plus_occupation,
    labels=train_df_full.income_band)
numeric_plus_occupation_scores.plot(kind='bar',
                                    title='10-Fold CV scores for numeric only features + occupation')
Out[43]:
In [44]:
numeric_plus_occupation_scores.mean()
Out[44]:
In [46]:
numeric_plus_occupation_plus_sex = numeric_plus_occupation.assign(
    sex=train_df_full.sex.map(lambda s: 0 if s == 'Male' else 1))
numeric_plus_occupation_plus_sex_scores = cross_validation_score(
    features=numeric_plus_occupation_plus_sex,
    labels=train_df_full.income_band)
numeric_plus_occupation_plus_sex_scores.plot(kind='bar',
                                             title='10-Fold CV scores for numeric only features + occupation + sex')
Out[46]:
In [48]:
numeric_plus_occupation_plus_sex_scores.mean()
Out[48]:
In [50]:
numeric_plus_3 = numeric_plus_occupation_plus_sex.assign(
    race=train_df_full.race.map(lambda s: 0 if s == 'White' else 1))
numeric_plus_3_scores = cross_validation_score(
    features=numeric_plus_3,
    labels=train_df_full.income_band)
numeric_plus_3_scores.plot(kind='bar',
                           title='10-Fold CV scores for numeric only features + occupation + sex + race')
Out[50]:
In [51]:
numeric_plus_3_scores.mean()
Out[51]:
In [52]:
# Rebuild numeric_plus_3 from occupation + sex, swapping race for relationship
numeric_plus_3 = encode_one_hot(train_df_full, numeric_plus_occupation_plus_sex, 'relationship')
numeric_plus_3_scores = cross_validation_score(
    features=numeric_plus_3,
    labels=train_df_full.income_band)
numeric_plus_3_scores.plot(kind='bar',
                           title='10-Fold CV scores for numeric only features + occupation + sex + relationship')
Out[52]:
In [53]:
numeric_plus_3_scores.mean()
Out[53]:
In [56]:
numeric_plus_4 = encode_one_hot(train_df_full, numeric_plus_3, 'workclass')
numeric_plus_4_scores = cross_validation_score(
    features=numeric_plus_4,
    labels=train_df_full.income_band)
numeric_plus_4_scores.plot(
    kind='bar',
    title='10-Fold CV scores for numeric only features + occupation + sex + relationship + workclass')
Out[56]:
In [57]:
numeric_plus_4_scores.mean()
Out[57]:
In [58]:
numeric_plus_4 = encode_one_hot(train_df_full, numeric_plus_3, 'marital_status')
numeric_plus_4_scores = cross_validation_score(
    features=numeric_plus_4,
    labels=train_df_full.income_band)
numeric_plus_4_scores.plot(
    kind='bar',
    title='10-Fold CV scores for numeric only features + occupation + sex + relationship + marital_status')
Out[58]:
In [59]:
numeric_plus_4_scores.mean()
Out[59]:
In [61]:
numeric_plus_4 = numeric_plus_3.assign(
    native_country=train_df_full.native_country.map(lambda v: 0.0 if v == 'United-States' else 1.0))
numeric_plus_4_scores = cross_validation_score(
    features=numeric_plus_4,
    labels=train_df_full.income_band)
numeric_plus_4_scores.plot(
    kind='bar',
    title='10-Fold CV scores for numeric only features + occupation + sex + relationship + native_country')
Out[61]:
In [63]:
numeric_plus_4_scores.mean()
Out[63]:
In [77]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

base = SVC()
model = BaggingClassifier(base_estimator=base, n_estimators=20, max_samples=0.05)
prep = StandardScaler()
estimator = Pipeline([
    ('prep', prep),
    ('model', model),
])
params = [
    {'model__base_estimator__C': [100, 1000],
     'model__base_estimator__gamma': [0.1, 0.001]},
]
grid_search = GridSearchCV(estimator=estimator, param_grid=params, scoring='accuracy', verbose=10, cv=10)
grid_search = grid_search.fit(numeric_plus_3, train_df_full.income_band)
In [78]:
grid_search.best_score_
Out[78]:
In [79]:
grid_search.best_params_
Out[79]:
In [80]:
params = [
    {'model__base_estimator__C': [1000, 5000],
     'model__base_estimator__gamma': [0.001, 0.005]},
]
grid_search = GridSearchCV(estimator=estimator, param_grid=params, scoring='accuracy', verbose=10, cv=10)
grid_search = grid_search.fit(numeric_plus_3, train_df_full.income_band)
In [81]:
grid_search.best_score_
Out[81]:
In [82]:
grid_search.best_params_
Out[82]:
In [83]:
params = [
    {'model__base_estimator__C': [6000, 10000],
     'model__base_estimator__gamma': [0.006, 0.0001]},
]
grid_search = GridSearchCV(estimator=estimator, param_grid=params, scoring='accuracy', verbose=10, cv=10)
grid_search = grid_search.fit(numeric_plus_3, train_df_full.income_band)
In [84]:
grid_search.best_score_
Out[84]:
In [87]:
grid_search.best_params_
Out[87]:
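To see how all candidate settings compared, not just the winner, the full grid can be inspected; a minimal sketch using GridSearchCV's cv_results_ attribute:

# cv_results_ holds per-candidate mean/std test scores across the 10 folds.
results = pd.DataFrame(grid_search.cv_results_)
display(results.loc[:, ['params', 'mean_test_score', 'std_test_score']]
               .sort_values('mean_test_score', ascending=False))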
In [ ]: