In [16]:
    
import pandas as pd

# Location of the source CSV; it is downloaded over HTTP every time this cell runs.
DATA_URL = 'https://query.data.world/s/afb7vfbdtzot2kgsqa3obq2jqra5y2'
data_df = pd.read_csv(DATA_URL)
    
In [17]:
    
# Notebook-wide setup: seed NumPy's global RNG and configure plotting.
import numpy as np
# Fixed seed for NumPy's legacy global random state.
np.random.seed(12345)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): seaborn is not referenced by name in the cells visible here —
# confirm whether a later cell uses it before dropping the import.
import seaborn as sns
import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to every figure created after this point.
plt.style.use('ggplot')
    
In [18]:
    
# Preview the first five rows (the default for head) to eyeball the columns.
data_df.head(n=5)
    
    Out[18]:
In [19]:
    
# Print the frame summary (columns, non-null counts, dtypes) to spot missing values.
data_df.info()
    
    
In [20]:
    
# Bar chart of how many rows fall into each TARGET_5Yrs class.
target_counts = data_df['TARGET_5Yrs'].value_counts()
ax = target_counts.plot.bar()
    
    
In [21]:
    
# Remove the 'Name' column before the numeric analysis in the cells below.
# errors='ignore' keeps this cell idempotent: data_df is reassigned here, so without
# it a second run of the cell raises KeyError because 'Name' has already been dropped.
data_df = data_df.drop(columns='Name', errors='ignore')
    
In [22]:
    
# Per-class mean of every column, drawn as one bar group per column.
# NOTE(review): as_index=False keeps TARGET_5Yrs as a regular column, so after the
# transpose the target itself is drawn as a bar group too — confirm that is intended.
_, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
class_means = data_df.groupby('TARGET_5Yrs', as_index=False).mean()
class_means.T.plot(kind='bar', ax=ax, rot=30)
    
    Out[22]:
    
In [23]:
    
# Per-class standard deviation of every column, drawn as one bar group per column.
# NOTE(review): as_index=False keeps TARGET_5Yrs as a regular column, so after the
# transpose the target itself is drawn as a bar group too — confirm that is intended.
_, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 4))
class_stds = data_df.groupby('TARGET_5Yrs', as_index=False).std()
class_stds.T.plot(kind='bar', ax=ax, rot=30)
    
    Out[23]:
    
In [24]:
    
# One box plot per column of data_df, each split by TARGET_5Yrs.
# The grid is sized from the number of columns instead of a fixed 5x4, so the cell
# does not raise IndexError when the frame has more than 20 columns and does not
# leave blank panels when it has fewer. With 20 columns the layout is unchanged.
N_GRID_COLS = 4
plot_columns = list(data_df.columns)
n_grid_rows = max(1, -(-len(plot_columns) // N_GRID_COLS))  # ceiling division
_, axes = plt.subplots(
    n_grid_rows,
    N_GRID_COLS,
    sharey=False,
    squeeze=False,  # always a 2-D array of axes, even for a single row
    figsize=(16, 3.2 * n_grid_rows),  # 16 x 16 for the original five-row layout
)
panels = axes.ravel()
for panel, col in zip(panels, plot_columns):
    _ = data_df.boxplot(col, ax=panel, by='TARGET_5Yrs')

# Hide panels left over when the column count is not a multiple of the grid width.
for panel in panels[len(plot_columns):]:
    panel.set_visible(False)

plt.tight_layout()
    
    
In [25]:
    
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Baseline model: keep the 5 best features by f_classif score, then fit a Gaussian
# naive Bayes classifier with equal class priors.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=5)),
    ('model', GaussianNB(priors=[0.5, 0.5]))
])

# random_state is deliberately not passed. With shuffle left at its default (False)
# the seed has no effect, and scikit-learn >= 0.24 raises ValueError for that
# combination. The unshuffled stratified split is already deterministic.
cv = StratifiedKFold(n_splits=10)

# Features are every column except the target and '3P%'.
# NOTE(review): '3P%' is presumably excluded because it contains missing values —
# confirm against the data_df.info() output.
features = data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values
target = data_df.TARGET_5Yrs.values

# 10-fold cross-validated F1, keeping train scores to gauge over-fitting.
results = cross_validate(
    pipeline,
    features,
    target,
    cv=cv,
    scoring='f1',
    return_train_score=True
)
results = pd.DataFrame.from_dict(results)
    
In [26]:
    
# Train vs. test score for each cross-validation fold, one pair of bars per fold.
fold_scores = results[['train_score', 'test_score']]
fold_scores.plot.bar()
    
    Out[26]:
    
In [27]:
    
# Average every column of the cross-validation results over the folds.
results.mean(axis=0)
    
    Out[27]:
In [28]:
    
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler

# Logistic regression on the 10 best features by f_classif score. The features are
# standardised inside the pipeline, so the scaler is fitted on each training fold only.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs'))
])

# random_state is deliberately not passed. With shuffle left at its default (False)
# the seed has no effect, and scikit-learn >= 0.24 raises ValueError for that
# combination. The unshuffled stratified split is already deterministic.
cv = StratifiedKFold(n_splits=10)

# Features are every column except the target and '3P%'.
# NOTE(review): '3P%' is presumably excluded because it contains missing values —
# confirm against the data_df.info() output.
features = data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values
target = data_df.TARGET_5Yrs.values

# 10-fold cross-validated F1, keeping train scores to gauge over-fitting.
results = cross_validate(
    pipeline,
    features,
    target,
    scoring='f1',
    cv=cv,
    return_train_score=True
)
results = pd.DataFrame.from_dict(results)
    
In [29]:
    
# Train vs. test score for each cross-validation fold, one pair of bars per fold.
fold_scores = results[['train_score', 'test_score']]
fold_scores.plot.bar()
    
    Out[29]:
    
In [30]:
    
# Average every column of the cross-validation results over the folds.
results.mean(axis=0)
    
    Out[30]:
In [38]:
    
from sklearn.model_selection import GridSearchCV

# Tune the regularisation strength C of the logistic-regression pipeline.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs'))
])

# random_state is deliberately not passed. With shuffle left at its default (False)
# the seed has no effect, and scikit-learn >= 0.24 raises ValueError for that
# combination. The unshuffled stratified split is already deterministic.
cv = StratifiedKFold(n_splits=10)

# 'model__C' addresses the C parameter of the pipeline step named 'model'.
param_grid = {'model__C': [0.01, 0.1, 1, 10, 100]}
gs = GridSearchCV(pipeline, param_grid, scoring='f1', cv=cv, return_train_score=True)

# Features are every column except the target and '3P%'.
features = data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values
target = data_df.TARGET_5Yrs.values

# fit() returns the fitted search object, so gs is rebound to the same instance.
gs = gs.fit(features, target)
gs.best_params_
    
    Out[38]:
In [39]:
    
# Re-run the cross-validation with a fixed regularisation strength.
# NOTE(review): C=0.1 is presumably the value the grid search selected — confirm
# against gs.best_params_ whenever the grid or the data changes.
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs', C=0.1))
])

# random_state is deliberately not passed. With shuffle left at its default (False)
# the seed has no effect, and scikit-learn >= 0.24 raises ValueError for that
# combination. The unshuffled stratified split is already deterministic.
cv = StratifiedKFold(n_splits=10)

# Features are every column except the target and '3P%'.
features = data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values
target = data_df.TARGET_5Yrs.values

# 10-fold cross-validated F1, keeping train scores to gauge over-fitting.
results = cross_validate(
    pipeline,
    features,
    target,
    scoring='f1',
    cv=cv,
    return_train_score=True
)
results = pd.DataFrame.from_dict(results)
    
In [40]:
    
# Train vs. test score for each cross-validation fold, one pair of bars per fold.
fold_scores = results[['train_score', 'test_score']]
fold_scores.plot.bar()
    
    Out[40]:
    
In [41]:
    
# Average every column of the cross-validation results over the folds.
results.mean(axis=0)
    
    Out[41]: