In [16]:
import pandas as pd
# Remote CSV that the rest of the notebook works from.
DATA_URL = 'https://query.data.world/s/afb7vfbdtzot2kgsqa3obq2jqra5y2'
data_df = pd.read_csv(DATA_URL)
In [17]:
import numpy as np
np.random.seed(12345)
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
In [18]:
# Preview the first rows of the loaded frame.
data_df.head()
Out[18]:
In [19]:
# Summarise the frame: column names, dtypes and non-null counts.
data_df.info()
In [20]:
# Bar chart of how many rows fall into each TARGET_5Yrs class.
ax = data_df['TARGET_5Yrs'].value_counts().plot.bar()
In [21]:
# Remove the 'Name' column before the numeric analysis below.
# errors='ignore' keeps this cell idempotent: re-running it after 'Name' has
# already been dropped is a no-op instead of a KeyError.
data_df = data_df.drop(columns='Name', errors='ignore')
In [22]:
# Per-class mean of every column, transposed so that each column becomes one
# group of bars.
_, ax = plt.subplots(figsize=(12, 4))
class_means = data_df.groupby('TARGET_5Yrs', as_index=False).mean()
class_means.T.plot.bar(ax=ax, rot=30)
Out[22]:
In [23]:
# Per-class standard deviation of every column, transposed so that each column
# becomes one group of bars.
_, ax = plt.subplots(figsize=(12, 4))
class_stds = data_df.groupby('TARGET_5Yrs', as_index=False).std()
class_stds.T.plot.bar(ax=ax, rot=30)
Out[23]:
In [24]:
# One box plot per column of data_df, split by TARGET_5Yrs, on a 4-wide grid.
# The number of grid rows is derived from the column count so the loop can
# never index past the axes array (the previous fixed 5x4 grid raised an
# IndexError for any frame with more than 20 columns).
N_GRID_COLS = 4
plot_columns = list(data_df.columns)
n_grid_rows = max(1, -(-len(plot_columns) // N_GRID_COLS))  # ceiling division

_, axes = plt.subplots(
    n_grid_rows,
    N_GRID_COLS,
    sharey=False,
    figsize=(16, 3.2 * n_grid_rows),  # 16 x 16 for the 5-row case
    squeeze=False,  # always a 2-D array, even with a single row
)
for i, col in enumerate(plot_columns):
    row_i, col_i = divmod(i, N_GRID_COLS)
    _ = data_df.boxplot(col, ax=axes[row_i][col_i], by='TARGET_5Yrs')

# Hide any panels left over when the column count is not a multiple of 4.
for unused_ax in axes.ravel()[len(plot_columns):]:
    unused_ax.set_visible(False)

plt.tight_layout()
In [25]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
# Baseline model: keep the 5 features with the highest ANOVA F-score, then fit
# Gaussian naive Bayes with equal class priors.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=5)),
    ('model', GaussianNB(priors=[0.5, 0.5])),
])

# random_state only has an effect when shuffle=True. Without it the seed is
# silently ignored by older scikit-learn releases and rejected with a
# ValueError by newer ones.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

# 10-fold stratified cross-validation, scored with F1, on every column except
# the target and '3P%'.
# NOTE(review): '3P%' is presumably excluded because of missing values -- confirm.
results = cross_validate(
    pipeline,
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values,
    cv=cv,
    scoring='f1',
    return_train_score=True,
)

# Turn the dict returned by cross_validate into a DataFrame for plotting and
# averaging.
results = pd.DataFrame.from_dict(results)
In [26]:
# Train vs. test score for each cross-validation fold.
results[['train_score', 'test_score']].plot.bar()
Out[26]:
In [27]:
# Column-wise mean of the cross-validation results table.
results.mean()
Out[27]:
In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
# Logistic regression: keep the 10 features with the highest ANOVA F-score,
# standardise them, then fit the classifier with its default regularisation.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs')),
])

# random_state only has an effect when shuffle=True. Without it the seed is
# silently ignored by older scikit-learn releases and rejected with a
# ValueError by newer ones.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

# 10-fold stratified cross-validation, scored with F1, on every column except
# the target and '3P%'.
# NOTE(review): '3P%' is presumably excluded because of missing values -- confirm.
results = cross_validate(
    pipeline,
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values,
    scoring='f1',
    cv=cv,
    return_train_score=True,
)

# Turn the dict returned by cross_validate into a DataFrame for plotting and
# averaging.
results = pd.DataFrame.from_dict(results)
In [29]:
# Train vs. test score for each cross-validation fold.
results[['train_score', 'test_score']].plot.bar()
Out[29]:
In [30]:
# Column-wise mean of the cross-validation results table.
results.mean()
Out[30]:
In [38]:
from sklearn.model_selection import GridSearchCV
# Same selector -> scaler -> logistic regression pipeline, now used to tune the
# regularisation strength C.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs')),
])

# random_state only has an effect when shuffle=True. Without it the seed is
# silently ignored by older scikit-learn releases and rejected with a
# ValueError by newer ones.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

# Candidate values for the 'model' step's C parameter, one per decade.
param_grid = {'model__C': [0.01, 0.1, 1, 10, 100]}

gs = GridSearchCV(pipeline, param_grid, scoring='f1', cv=cv, return_train_score=True)

# Search over every column except the target and '3P%'.
# NOTE(review): '3P%' is presumably excluded because of missing values -- confirm.
gs = gs.fit(
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values,
)

# Display the best-scoring parameter combination.
gs.best_params_
Out[38]:
In [39]:
# Re-evaluate the logistic regression pipeline with a fixed C=0.1.
# NOTE(review): 0.1 is presumably the value reported by the grid search in the
# previous cell -- confirm it still matches gs.best_params_ after a re-run.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs', C=0.1)),
])

# random_state only has an effect when shuffle=True. Without it the seed is
# silently ignored by older scikit-learn releases and rejected with a
# ValueError by newer ones.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)

# 10-fold stratified cross-validation, scored with F1, on every column except
# the target and '3P%'.
# NOTE(review): '3P%' is presumably excluded because of missing values -- confirm.
results = cross_validate(
    pipeline,
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values,
    scoring='f1',
    cv=cv,
    return_train_score=True,
)

# Turn the dict returned by cross_validate into a DataFrame for plotting and
# averaging.
results = pd.DataFrame.from_dict(results)
In [40]:
# Train vs. test score for each cross-validation fold.
results[['train_score', 'test_score']].plot.bar()
Out[40]:
In [41]:
# Column-wise mean of the cross-validation results table.
results.mean()
Out[41]: