In [ ]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
In [ ]:
from sklearn.datasets import load_boston
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2; this cell requires an older scikit-learn version to run.
boston = load_boston()
from sklearn.model_selection import train_test_split
# Feature matrix (13 columns, see the plot grid below) and the regression
# target MEDV (median house value).
X, y = boston.data, boston.target
# Default 75/25 train/test split; fixed seed so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=0)
In [ ]:
# Print the dataset's bundled text description (feature meanings, provenance).
print(boston.DESCR)
In [ ]:
# Scatter each of the 13 features against the target MEDV, one subplot per
# feature. The 3x5 grid has two spare axes; those are simply hidden.
fig, axes = plt.subplots(3, 5, figsize=(20, 10))
for idx, axis in enumerate(axes.ravel()):
    if idx >= 13:
        # No feature left for this axis — blank it out.
        axis.set_visible(False)
        continue
    axis.plot(X[:, idx], y, 'o', alpha=.5)
    axis.set_title("{}: {}".format(idx, boston.feature_names[idx]))
    axis.set_ylabel("MEDV")
In [ ]:
# Box plot of all raw features on one shared y axis — the very different
# feature ranges motivate the standardization step below.
plt.boxplot(X)
plt.xticks(np.arange(1, X.shape[1] + 1),
boston.feature_names, rotation=30, ha="right");
In [ ]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only so no test-set statistics leak
# into preprocessing; X_test is transformed with the same parameters later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
In [ ]:
from sklearn.neighbors import KNeighborsRegressor
# Baseline k-nearest-neighbors regression on the UNscaled features.
knr = KNeighborsRegressor().fit(X_train, y_train)
# Training-set score (R^2 for scikit-learn regressors).
knr.score(X_train, y_train)
In [ ]:
# Test-set score of the unscaled k-NN model — compare with the scaled variant below.
knr.score(X_test, y_test)
In [ ]:
# k-NN regression on the standardized features. The original cell called
# .fit() twice on the same data — the chained fit already trains the model,
# so the second call was redundant and has been removed.
knr_scaled = KNeighborsRegressor().fit(X_train_scaled, y_train)
# Training-set score on the scaled features.
knr_scaled.score(X_train_scaled, y_train)
In [ ]:
# Apply the training-set scaling parameters to the test data (transform, not
# fit_transform), then evaluate the scaled k-NN model on it.
X_test_scaled = scaler.transform(X_test)
knr_scaled.score(X_test_scaled, y_test)
In [ ]:
from sklearn.ensemble import RandomForestRegressor
# Random forest on the raw features; fixed seed for reproducible trees.
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)
# Test-set score on unscaled features.
rf.score(X_test, y_test)
In [ ]:
# Same forest on the standardized features — tree models split on thresholds,
# so scaling is expected to make little difference (the point of this cell).
rf_scaled = RandomForestRegressor(n_estimators=100, random_state=0)
rf_scaled.fit(X_train_scaled, y_train)
rf_scaled.score(X_test_scaled, y_test)
In [ ]:
import pandas as pd
# Small toy frame mixing a numeric column (salary) and a categorical string
# column (boro), used below to demonstrate dummy encoding.
records = [
    (103, 'Manhattan'),
    (89, 'Queens'),
    (142, 'Manhattan'),
    (54, 'Brooklyn'),
    (63, 'Brooklyn'),
    (219, 'Bronx'),
]
df = pd.DataFrame(records, columns=['salary', 'boro'])
df
In [ ]:
# One-hot (dummy) encode the string column 'boro'; the numeric 'salary'
# column is passed through unchanged.
pd.get_dummies(df)
In [ ]:
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Boolean mask over df's columns: True for string (object-dtype) columns.
categorical = df.dtypes == object
# BUG FIX: make_column_transformer takes (transformer, columns) tuples —
# the (columns, transformer) order used originally was removed in
# scikit-learn 0.20.1 and raises a TypeError on any modern version.
# One-hot encode the categorical columns, standardize the numeric ones.
ct = make_column_transformer((OneHotEncoder(), categorical),
                             (StandardScaler(), ~categorical))
ct.fit_transform(df)
Load the "adult" dataset, consisting of income data from the census, which classifies adults into those earning above $50k a year and those earning below.
Experiment with visualizing the data. Can you find out which features influence the income the most?
Separate the target variable from the features. Split the data into training and test sets. Apply dummy encoding and scaling. How did this change the number of variables?
Build and evaluate a LogisticRegression model on the data.
In [ ]:
# Adult census-income data; the first CSV column is used as the row index.
data = pd.read_csv("data/adult.csv", index_col=0)
In [ ]:
# %load solutions/load_adult.py