In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; see the note below
boston_data = load_boston()
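Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions the same data can be pulled from OpenML instead; a minimal sketch, assuming the OpenML copy of the dataset lives at data_id=531:

In [ ]:
# Fallback for scikit-learn >= 1.2, where load_boston no longer exists.
# data_id=531 is assumed to be the OpenML listing of the Boston dataset.
from sklearn.datasets import fetch_openml
boston = fetch_openml(data_id=531, as_frame=True)
# boston.data is a DataFrame of the 13 features; boston.target is MEDV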
In [3]:
dir(boston_data)
Out[3]:
In [4]:
data_df = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)
In [5]:
data_df = data_df.assign(target=boston_data.target)
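Before plotting anything, a quick pandas sanity check of dtypes, missing values, and scales is cheap; this is just standard DataFrame introspection:

In [ ]:
# Structural overview: dtypes and non-null counts, then summary stats
data_df.info()
data_df.describe().T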
In [6]:
data_df.head()
Out[6]:
In [7]:
data_df.shape[1]
Out[7]:
In [8]:
_, axes = plt.subplots(5, 3, figsize=(20, 20))
for idx, colname in enumerate(data_df.columns):
    row, col = divmod(idx, 3)
    data_df.loc[:, colname].plot(kind='hist', ax=axes[row][col], title=colname)
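As an aside, pandas can produce the same histogram grid in a single call and pick the layout itself; a minimal equivalent sketch:

In [ ]:
# One-call alternative to the manual subplot loop above
data_df.hist(figsize=(20, 20), bins=30)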
In [9]:
_, axes = plt.subplots(5, 3, figsize=(20, 20))
for idx, colname in enumerate(data_df.columns.drop('target')):
    row, col = divmod(idx, 3)
    sns.regplot(data=data_df, x=colname, y='target', ax=axes[row][col])
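For a joint view of a handful of columns, seaborn's pairplot is a compact alternative; the subset below is an arbitrary example (RM and LSTAT tend to be the strongest single predictors here):

In [ ]:
# Pairwise scatter matrix for a hand-picked feature subset
sns.pairplot(data_df[['RM', 'LSTAT', 'PTRATIO', 'target']])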
In [10]:
f_corrs = data_df.drop('target', axis=1).corrwith(data_df.target)
f_corrs.plot(kind='bar')
Out[10]:
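Sign matters for interpretation, but for a pure ranking it helps to sort by magnitude; a small variation on the cell above:

In [ ]:
# Rank features by the absolute value of their correlation with the target
f_corrs.abs().sort_values(ascending=True).plot(kind='barh')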
In [11]:
_, ax = plt.subplots(1, 1, figsize=(12, 10))
corrs = data_df.drop('target', axis=1).corr()
sns.heatmap(corrs, annot=True, ax=ax)
Out[11]:
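The correlation matrix is symmetric, so masking the upper triangle drops the duplicate half; a common seaborn pattern applied to the same corrs frame:

In [ ]:
# Hide the redundant upper triangle of the symmetric correlation matrix
mask = np.triu(np.ones_like(corrs, dtype=bool))
_, ax = plt.subplots(1, 1, figsize=(12, 10))
sns.heatmap(corrs, mask=mask, annot=True, ax=ax)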
In [12]:
from sklearn.feature_selection import mutual_info_regression
importances = mutual_info_regression(data_df.drop('target', axis=1), data_df.target)
importances = pd.Series(data=importances, index=data_df.columns.drop('target'))
importances.sort_values(ascending=True).plot(kind='barh')
Out[12]:
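Mutual information can detect nonlinear dependence that Pearson correlation misses, so putting the two rankings side by side is a quick consistency check (reusing f_corrs and importances from above):

In [ ]:
# Absolute correlation vs. mutual information, sorted by MI
pd.concat({'abs_corr': f_corrs.abs(), 'mutual_info': importances}, axis=1) \
  .sort_values('mutual_info', ascending=False)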
In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
In [14]:
def score_with_best_k_features(k):
    """10-fold cross-validated R^2 scores for a random forest with
    max_features=k, i.e. at most k candidate features per split.
    Note: this does not pre-select the k best features."""
    model = RandomForestRegressor(n_estimators=50, max_features=k)
    features = data_df.drop('target', axis=1)
    target = data_df.target
    cv = KFold(n_splits=10, shuffle=True, random_state=1234)
    scores = cross_val_score(estimator=model,
                             X=features,
                             y=target,
                             cv=cv,
                             scoring='r2')
    return pd.Series(scores)
In [15]:
scores = {k: score_with_best_k_features(k) for k in (5, 9, 13)}
scores = pd.concat(scores, axis=1)
In [16]:
scores.plot(kind='bar')
Out[16]:
In [17]:
scores.mean().plot(kind='bar')
Out[17]:
In [18]:
scores.T
Out[18]:
In [19]:
scores.mean()
Out[19]:
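A mean alone hides fold-to-fold variance; pairing it with the standard deviation of the same scores frame gives a fuller comparison:

In [ ]:
# Mean and spread of the 10 fold scores per max_features setting
scores.agg(['mean', 'std']).T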
In [20]:
from sklearn.model_selection import GridSearchCV
In [21]:
model = RandomForestRegressor()
param_grid = {
    'n_estimators': [100, 120, 140, 160, 180, 200],
    'max_depth': [10],
    'max_features': [5],
}
cv = KFold(n_splits=10, shuffle=True, random_state=1234)
grid_search = GridSearchCV(model, param_grid, cv=cv, verbose=1)
features = data_df.drop('target', axis=1)
target = data_df.target
grid_search.fit(X=features, y=target)
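GridSearchCV keeps the full per-candidate results in cv_results_; loading them into a DataFrame makes the n_estimators sweep easy to scan:

In [ ]:
# Per-candidate CV results as a table, best mean test R^2 first
results = pd.DataFrame(grid_search.cv_results_)
results[['param_n_estimators', 'mean_test_score', 'std_test_score']] \
    .sort_values('mean_test_score', ascending=False)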
In [22]:
grid_search.best_params_
Out[22]:
In [23]:
grid_search.best_score_
Out[23]:
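The refit best model is exposed as best_estimator_; its impurity-based feature importances give a final ranking to set against the mutual-information scores from earlier:

In [ ]:
# Impurity-based importances of the refit best forest
best_importances = pd.Series(grid_search.best_estimator_.feature_importances_,
                             index=features.columns)
best_importances.sort_values(ascending=True).plot(kind='barh')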