In [1]:
%load_ext sql
%matplotlib inline

In [2]:
import logging
import os
import sys

from configparser import ConfigParser
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [3]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
config = ConfigParser()
configfn = os.path.join(os.path.expanduser('~'), '.nbadb')
config.read(configfn)


Out[3]:
['/home/sansbacon/.nbadb']
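
For reference, ~/.nbadb is a plain ConfigParser INI file. A minimal sketch with placeholder values (the section and key names match the lookups in the next cell; the values shown are not real credentials):

[nbadb]
username = your_username
password = your_password
database = nbadb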

In [4]:
user = config['nbadb']['username']
password = config['nbadb']['password']
db = config['nbadb']['database']

In [5]:
connection_string = "postgresql://{user}:{password}@localhost/{db}".format(user=user, password=password, db=db)
%sql $connection_string


Out[5]:
u'Connected: nbadb@nbadb'
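
One caveat on the cell above: if the password contains URL-reserved characters such as @ or /, raw .format interpolation produces an invalid connection URL. A hedged sketch that percent-encodes the credentials first (quote_plus lives in urllib on Python 2 and urllib.parse on Python 3):

try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus        # Python 2

connection_string = "postgresql://{user}:{password}@localhost/{db}".format(
    user=quote_plus(user), password=quote_plus(password), db=db)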

In [25]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

2017 Player Gamelogs


In [55]:
result = %sql SELECT * FROM tmpmodel


5586 rows affected.

In [56]:
df = result.DataFrame()
df.head(10)


Out[56]:
salary minema dkema consensus_game_ou dtot back_to_back three_in_four def_rating_ema pace_ema y
0 5000 23.3 22.8 197.0 -4.44 0 0 98.2 98.3 0
1 5200 25.0 22.6 196.5 -7.08 0 0 107.8 93.0 1
2 5800 26.5 27.2 207.5 -1.12 0 0 110.6 96.0 1
3 6000 27.4 28.0 198.25 -4.65 0 0 97.1 97.2 1
4 6100 27.5 29.6 197.5 -4.98 0 0 106.1 98.1 0
5 5900 27.4 29.5 200.5 0.75 0 0 105.7 101.6 1
6 5400 27.8 29.6 188.5 -7.69 1 1 101.9 95.4 0
7 5500 27.1 26.1 207.5 2.06 0 1 106.2 102.1 0
8 5500 25.3 25.8 201.0 -3.61 0 0 107.3 98.4 1
9 5000 25.2 27.6 195.5 -15.79 0 0 109.1 99.4 1
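
The same frame can also be built without the %sql magic; a minimal sketch, assuming SQLAlchemy is installed and reusing connection_string from above:

from sqlalchemy import create_engine

engine = create_engine(connection_string)
df = pd.read_sql('SELECT * FROM tmpmodel', engine)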

In [57]:
df = df.dropna(how='any')
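
dropna(how='any') silently discards every row with at least one null. A quick sketch, run before the cell above, to see what is being thrown away:

print(df.isnull().sum())  # null count per column
print('%d of %d rows contain a null' % (df.isnull().any(axis=1).sum(), len(df)))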

In [58]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1],
                                                    train_size=0.75, test_size=0.25)

In [59]:
# fit the scaler on the training split only, then apply the same transform
# to both splits so test-set statistics never leak into the model
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [60]:
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train_scale, y_train)
print(tpot.score(X_test_scale, y_test))


Optimization Progress:  17%|█▋        | 20/120 [00:35<02:20,  1.40s/pipeline]
Version 0.6.8 of tpot is outdated. Version 0.7.0 was released Wednesday March 22, 2017.
Generation 1 - Current best internal CV score: 0.531110408573
Optimization Progress:  32%|███▏      | 38/120 [03:28<12:16,  8.98s/pipeline]
Generation 2 - Current best internal CV score: 0.531110408573
Optimization Progress:  49%|████▉     | 59/120 [06:11<11:07, 10.95s/pipeline]
Generation 3 - Current best internal CV score: 0.540038846303
Optimization Progress:  62%|██████▎   | 75/120 [08:07<05:38,  7.51s/pipeline]
Generation 4 - Current best internal CV score: 0.540038846303
Optimization Progress:  82%|████████▏ | 98/120 [10:36<03:20,  9.11s/pipeline]
Generation 5 - Current best internal CV score: 0.540038846303
                                                                              

Best pipeline: ExtraTreesClassifier(input_matrix, 43, 0.82000000000000006)
0.512391465097
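
TPOT can also write the winning pipeline out as a standalone script via its export method; the filename here is just an example:

tpot.export('tpot_nbadb_pipeline.py')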


In [61]:
from sklearn.ensemble import ExtraTreesClassifier

# refit the model family TPOT selected so we can inspect its feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)

In [62]:
forest.fit(X_train_scale, y_train)


Out[62]:
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)
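
As a sanity check, the refit forest can be scored on the same held-out split TPOT used (score returns mean accuracy for classifiers):

print(forest.score(X_test_scale, y_test))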

In [63]:
importances = forest.feature_importances_   # mean impurity decrease per feature
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)                        # spread of importances across trees
indices = np.argsort(importances)[::-1]     # feature positions, most important first

In [66]:
print("Feature ranking:")

for f in range(X_train_scale.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train_scale.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train_scale.shape[1]), indices)
plt.xlim([-1, X_train_scale.shape[1]])
plt.show()


Feature ranking:
1. feature 2 (0.159057)
2. feature 1 (0.158966)
3. feature 0 (0.148447)
4. feature 4 (0.131073)
5. feature 7 (0.128373)
6. feature 8 (0.127443)
7. feature 3 (0.126365)
8. feature 6 (0.010289)
9. feature 5 (0.009987)

In [68]:
df.columns


Out[68]:
Index([u'salary', u'minema', u'dkema', u'consensus_game_ou', u'dtot', u'back_to_back', u'three_in_four', u'def_rating_ema', u'pace_ema', u'y'], dtype='object')
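
The ranking above prints positional indices; a short sketch to translate them into the column names just listed (the final column, y, is the label and was excluded from X):

feature_names = df.columns[:-1]
for rank, idx in enumerate(indices, start=1):
    print('%d. %s (%f)' % (rank, feature_names[idx], importances[idx]))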

In [ ]: