In [1]:
%load_ext sql
%matplotlib inline

In [2]:
import logging
import os
import sys

from configparser import ConfigParser
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [3]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
config = ConfigParser()
configfn = os.path.join(os.path.expanduser('~'), '.nbadb')
config.read(configfn)


Out[3]:
['/home/sansbacon/.nbadb']
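
For reference, ~/.nbadb is a plain ConfigParser INI file. A minimal sketch with placeholder values (the section and key names match the lookups in the next cell; the values shown are not real credentials):

[nbadb]
username = your_username
password = your_password
database = nbadb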

In [4]:
user = config['nbadb']['username']
password = config['nbadb']['password']
db = config['nbadb']['database']

In [5]:
connection_string = "postgresql://{user}:{password}@localhost/{db}".format(user=user, password=password, db=db)
%sql $connection_string


Out[5]:
u'Connected: nbadb@nbadb'
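
One caveat on the cell above: if the password contains URL-reserved characters such as @ or /, raw .format interpolation produces an invalid connection URL. A hedged sketch that percent-encodes the credentials first (quote_plus lives in urllib on Python 2 and urllib.parse on Python 3):

try:
    from urllib.parse import quote_plus  # Python 3
except ImportError:
    from urllib import quote_plus        # Python 2

connection_string = "postgresql://{user}:{password}@localhost/{db}".format(
    user=quote_plus(user), password=quote_plus(password), db=db)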

In [25]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

2017 Player Gamelogs


In [55]:
result = %sql SELECT * FROM tmpmodel


5586 rows affected.

In [56]:
df = result.DataFrame()
df.head(10)


Out[56]:
salary minema dkema consensus_game_ou dtot back_to_back three_in_four def_rating_ema pace_ema y
0 5000 23.3 22.8 197.0 -4.44 0 0 98.2 98.3 0
1 5200 25.0 22.6 196.5 -7.08 0 0 107.8 93.0 1
2 5800 26.5 27.2 207.5 -1.12 0 0 110.6 96.0 1
3 6000 27.4 28.0 198.25 -4.65 0 0 97.1 97.2 1
4 6100 27.5 29.6 197.5 -4.98 0 0 106.1 98.1 0
5 5900 27.4 29.5 200.5 0.75 0 0 105.7 101.6 1
6 5400 27.8 29.6 188.5 -7.69 1 1 101.9 95.4 0
7 5500 27.1 26.1 207.5 2.06 0 1 106.2 102.1 0
8 5500 25.3 25.8 201.0 -3.61 0 0 107.3 98.4 1
9 5000 25.2 27.6 195.5 -15.79 0 0 109.1 99.4 1
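
The same frame can also be built without the %sql magic; a minimal sketch, assuming SQLAlchemy is installed and reusing connection_string from above:

from sqlalchemy import create_engine

engine = create_engine(connection_string)
df = pd.read_sql('SELECT * FROM tmpmodel', engine)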

In [57]:
df = df.dropna(how='any')
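
dropna(how='any') silently discards every row with at least one null. A quick sketch, run before the cell above, to see what is being thrown away:

print(df.isnull().sum())  # null count per column
print('%d of %d rows contain a null' % (df.isnull().any(axis=1).sum(), len(df)))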

In [58]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1],
                                                    train_size=0.75, test_size=0.25)

In [59]:
# fit the scaler on the training split only, then apply the same transform
# to both splits so test-set statistics never leak into the model
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [60]:
tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)
tpot.fit(X_train_scale, y_train)
print(tpot.score(X_test_scale, y_test))


Optimization Progress:  17%|█▋        | 20/120 [00:35<02:20,  1.40s/pipeline]
Version 0.6.8 of tpot is outdated. Version 0.7.0 was released Wednesday March 22, 2017.
Generation 1 - Current best internal CV score: 0.531110408573
Optimization Progress:  32%|███▏      | 38/120 [03:28<12:16,  8.98s/pipeline]
Generation 2 - Current best internal CV score: 0.531110408573
Optimization Progress:  49%|████▉     | 59/120 [06:11<11:07, 10.95s/pipeline]
Generation 3 - Current best internal CV score: 0.540038846303
Optimization Progress:  62%|██████▎   | 75/120 [08:07<05:38,  7.51s/pipeline]
Generation 4 - Current best internal CV score: 0.540038846303
Optimization Progress:  82%|████████▏ | 98/120 [10:36<03:20,  9.11s/pipeline]
Generation 5 - Current best internal CV score: 0.540038846303
                                                                              

Best pipeline: ExtraTreesClassifier(input_matrix, 43, 0.82000000000000006)
0.512391465097
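
TPOT can also write the winning pipeline out as a standalone script via its export method; the filename here is just an example:

tpot.export('tpot_nbadb_pipeline.py')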


In [61]:
from sklearn.ensemble import ExtraTreesClassifier

# refit the model family TPOT selected so we can inspect its feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)

In [62]:
forest.fit(X_train_scale, y_train)


Out[62]:
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=250, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)
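
As a sanity check, the refit forest can be scored on the same held-out split TPOT used (score returns mean accuracy for classifiers):

print(forest.score(X_test_scale, y_test))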

In [63]:
importances = forest.feature_importances_   # mean impurity decrease per feature
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)                        # spread of importances across trees
indices = np.argsort(importances)[::-1]     # feature positions, most important first

In [66]:
print("Feature ranking:")

for f in range(X_train_scale.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train_scale.shape[1]), importances[indices],
       color="r", yerr=std[indices], align="center")
plt.xticks(range(X_train_scale.shape[1]), indices)
plt.xlim([-1, X_train_scale.shape[1]])
plt.show()


Feature ranking:
1. feature 2 (0.159057)
2. feature 1 (0.158966)
3. feature 0 (0.148447)
4. feature 4 (0.131073)
5. feature 7 (0.128373)
6. feature 8 (0.127443)
7. feature 3 (0.126365)
8. feature 6 (0.010289)
9. feature 5 (0.009987)

In [68]:
df.columns


Out[68]:
Index([u'salary', u'minema', u'dkema', u'consensus_game_ou', u'dtot', u'back_to_back', u'three_in_four', u'def_rating_ema', u'pace_ema', u'y'], dtype='object')
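
The ranking above prints positional indices; a short sketch to translate them into the column names just listed (the final column, y, is the label and was excluded from X):

feature_names = df.columns[:-1]
for rank, idx in enumerate(indices, start=1):
    print('%d. %s (%f)' % (rank, feature_names[idx], importances[idx]))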

In [ ]: