Preprocessing


In [ ]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [ ]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and REMOVED
# in 1.2 (ethical concerns about the "B" feature).  This cell only runs on
# scikit-learn < 1.2; consider fetch_california_housing or fetch_openml
# as a replacement (would require updating the feature-name cells below).
from sklearn.datasets import load_boston
boston = load_boston()
from sklearn.model_selection import train_test_split
# Features / target, then a reproducible 75/25 train-test split.
X, y = boston.data, boston.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)

In [ ]:
# Full plain-text dataset description (feature meanings, provenance).
print(boston.DESCR)

In [ ]:
# One scatter panel per feature against the target (MEDV = median home value
# per the dataset description printed above).  The grid has 15 axes; any axes
# beyond the number of features are hidden.  The original hard-coded the
# feature count as the constant 12 — derive it from X instead so the cell
# keeps working if the feature set changes.
n_features = X.shape[1]
fig, axes = plt.subplots(3, 5, figsize=(20, 10))
for i, ax in enumerate(axes.ravel()):
    if i >= n_features:
        ax.set_visible(False)  # blank out unused panels
        continue
    ax.plot(X[:, i], y, 'o', alpha=.5)
    ax.set_title("{}: {}".format(i, boston.feature_names[i]))
    ax.set_ylabel("MEDV")

In [ ]:
# Side-by-side boxplots of the raw feature columns: shows the features live
# on wildly different scales, motivating the StandardScaler step below.
plt.boxplot(X)
# boxplot positions start at 1, hence the shifted tick range; trailing ";"
# suppresses the dict repr that plt.xticks returns.
plt.xticks(np.arange(1, X.shape[1] + 1),
           boston.feature_names, rotation=30, ha="right");

In [ ]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training data only (mean/std are learned here and
# reused on the test set later), then standardize the training features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [ ]:
from sklearn.neighbors import KNeighborsRegressor
knr = KNeighborsRegressor().fit(X_train, y_train)
knr.score(X_train, y_train)

In [ ]:
# R^2 of the unscaled k-NN model on the held-out test set.
knr.score(X_test, y_test)

In [ ]:
# k-NN regression on the standardized features.  The original cell fitted the
# model twice — once via the chained .fit() and again on the next line — the
# second call was a redundant no-op and has been removed.
knr_scaled = KNeighborsRegressor().fit(X_train_scaled, y_train)
# Training-set R^2 on the scaled data.
knr_scaled.score(X_train_scaled, y_train)

In [ ]:
# Transform the test set with the scaler FITTED ON THE TRAINING DATA —
# re-fitting on the test set would leak information.
X_test_scaled = scaler.transform(X_test)
# Test-set R^2 with scaling; compare against the unscaled score above.
knr_scaled.score(X_test_scaled, y_test)

In [ ]:
from sklearn.ensemble import RandomForestRegressor
# Random forest on the raw features; random_state pins the bootstrap/feature
# sampling so the score is reproducible.
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)
# Test-set R^2.
rf.score(X_test, y_test)

In [ ]:
# Same forest, same seed, but on the standardized features.  Tree splits
# depend only on feature ORDER, not scale, so the score should match the
# unscaled forest above — the point of this comparison.
rf_scaled = RandomForestRegressor(n_estimators=100, random_state=0)
rf_scaled.fit(X_train_scaled, y_train)
rf_scaled.score(X_test_scaled, y_test)

Categorical Variables


In [ ]:
import pandas as pd
# Toy frame with one numeric and one categorical (string) column, used to
# demonstrate dummy/one-hot encoding below.
records = [
    (103, 'Manhattan'),
    (89, 'Queens'),
    (142, 'Manhattan'),
    (54, 'Brooklyn'),
    (63, 'Brooklyn'),
    (219, 'Bronx'),
]
df = pd.DataFrame.from_records(records, columns=['salary', 'boro'])
df

In [ ]:
# One-hot encode the string column(s); numeric columns (salary) pass through.
pd.get_dummies(df)

In [ ]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Boolean mask over the columns: True for object (string) dtype columns.
categorical = df.dtypes == object
# make_column_transformer expects (transformer, columns) tuples — the
# original passed (columns, transformer), which raises an error on
# scikit-learn >= 0.22.  One-hot encode the categorical columns and
# standardize the numeric ones.
ct = make_column_transformer((OneHotEncoder(), categorical),
                             (StandardScaler(), ~categorical))
ct.fit_transform(df)

Exercises

Exercise 1

Load the "adult" dataset consisting of income data from the census, classifying adults into those earning above $50k a year vs those earning below.

Exercise 2

Experiment with visualizing the data. Can you find out which features influence the income the most?

Exercise 3

Separate the target variable from the features. Split the data into training and test set. Apply dummy encoding and scaling. How did this change the number of variables?

Exercise 4

Build and evaluate a LogisticRegression model on the data.


In [ ]:
# Census income data for the exercises; first CSV column is the row index.
# Path is relative to the notebook's working directory.
data = pd.read_csv("data/adult.csv", index_col=0)

In [ ]:
# %load solutions/load_adult.py