In [16]:
import pandas as pd
data_df = pd.read_csv('https://query.data.world/s/afb7vfbdtzot2kgsqa3obq2jqra5y2')

In [17]:
import numpy as np
np.random.seed(12345)
%matplotlib inline
import seaborn as sns

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [18]:
data_df.head()


Out[18]:
Name GP MIN PTS FGM FGA FG% 3P Made 3PA 3P% ... FTA FT% OREB DREB REB AST STL BLK TOV TARGET_5Yrs
0 Brandon Ingram 36 27.4 7.4 2.6 7.6 34.7 0.5 2.1 25.0 ... 2.3 69.9 0.7 3.4 4.1 1.9 0.4 0.4 1.3 0.0
1 Andrew Harrison 35 26.9 7.2 2.0 6.7 29.6 0.7 2.8 23.5 ... 3.4 76.5 0.5 2.0 2.4 3.7 1.1 0.5 1.6 0.0
2 JaKarr Sampson 74 15.3 5.2 2.0 4.7 42.2 0.4 1.7 24.4 ... 1.3 67.0 0.5 1.7 2.2 1.0 0.5 0.3 1.0 0.0
3 Malik Sealy 58 11.6 5.7 2.3 5.5 42.6 0.1 0.5 22.6 ... 1.3 68.9 1.0 0.9 1.9 0.8 0.6 0.1 1.0 1.0
4 Matt Geiger 48 11.5 4.5 1.6 3.0 52.4 0.0 0.1 0.0 ... 1.9 67.4 1.0 1.5 2.5 0.3 0.3 0.4 0.8 1.0

5 rows × 21 columns


In [19]:
data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 21 columns):
Name           1340 non-null object
GP             1340 non-null int64
MIN            1340 non-null float64
PTS            1340 non-null float64
FGM            1340 non-null float64
FGA            1340 non-null float64
FG%            1340 non-null float64
3P Made        1340 non-null float64
3PA            1340 non-null float64
3P%            1329 non-null float64
FTM            1340 non-null float64
FTA            1340 non-null float64
FT%            1340 non-null float64
OREB           1340 non-null float64
DREB           1340 non-null float64
REB            1340 non-null float64
AST            1340 non-null float64
STL            1340 non-null float64
BLK            1340 non-null float64
TOV            1340 non-null float64
TARGET_5Yrs    1340 non-null float64
dtypes: float64(19), int64(1), object(1)
memory usage: 219.9+ KB

Class Distribution


In [20]:
ax = data_df.TARGET_5Yrs.value_counts().plot(kind='bar')


Mean and STD of Both Groups`


In [21]:
data_df = data_df.drop('Name', axis=1)

In [22]:
_, ax = plt.subplots(1, 1, figsize=(12, 4))
data_df.groupby('TARGET_5Yrs', as_index=False).mean().T.plot(kind='bar', ax=ax, rot=30)


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f12ae1a0e80>

In [23]:
_, ax = plt.subplots(1, 1, figsize=(12, 4))
data_df.groupby('TARGET_5Yrs', as_index=False).std().T.plot(kind='bar', ax=ax, rot=30)


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f12adafe208>

Boxplot Per Feature


In [24]:
_, axes = plt.subplots(5, 4, sharey=False, figsize=(16, 16))

for i, col in enumerate(data_df.columns):
    row_i, col_i = divmod(i, 4)
    _ = data_df.boxplot(col, ax=axes[row_i][col_i], by='TARGET_5Yrs')
    
plt.tight_layout()


First Model: Naive Bayes


In [25]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif


pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=5)),
    ('model', GaussianNB(priors=[0.5, 0.5]))
])

cv = StratifiedKFold(n_splits=10, random_state=12345)

results = cross_validate(
    pipeline,
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values,
    cv=cv,
    scoring='f1',
    return_train_score=True
)

results = pd.DataFrame.from_dict(results)

In [26]:
results.loc[:, ['train_score', 'test_score']].plot(kind='bar')


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f12ad328eb8>

In [27]:
results.mean()


Out[27]:
fit_time       0.002678
score_time     0.001037
test_score     0.628162
train_score    0.630542
dtype: float64

Logistic Regression`


In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler


pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs'))
])

cv = StratifiedKFold(n_splits=10, random_state=12345)

results = cross_validate(
    pipeline,
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values,
    scoring='f1',
    cv=cv,
    return_train_score=True
)

results = pd.DataFrame.from_dict(results)

In [29]:
results.loc[:, ['train_score', 'test_score']].plot(kind='bar')


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f12ad26e208>

In [30]:
results.mean()


Out[30]:
fit_time       0.018507
score_time     0.000974
test_score     0.774837
train_score    0.779661
dtype: float64

Hyperparameter Optimization


In [38]:
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs'))
])

cv = StratifiedKFold(n_splits=10, random_state=12345)

param_grid = {'model__C': [0.01, 0.1, 1, 10, 100]}
gs = GridSearchCV(pipeline, param_grid, scoring='f1', cv=cv, return_train_score=True)
gs = gs.fit(
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values
)


gs.best_params_


Out[38]:
{'model__C': 0.1}

Best Model


In [39]:
pipeline = Pipeline([
    ('selector', SelectKBest(f_classif, k=10)),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(solver='lbfgs', C=0.1))
])

cv = StratifiedKFold(n_splits=10, random_state=12345)

results = cross_validate(
    pipeline,
    data_df.drop(['TARGET_5Yrs', '3P%'], axis=1).values,
    data_df.TARGET_5Yrs.values,
    scoring='f1',
    cv=cv,
    return_train_score=True
)

results = pd.DataFrame.from_dict(results)

In [40]:
results.loc[:, ['train_score', 'test_score']].plot(kind='bar')


Out[40]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f12acfef7b8>

In [41]:
results.mean()


Out[41]:
fit_time       0.011130
score_time     0.000856
test_score     0.777136
train_score    0.781080
dtype: float64