In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.cross_validation import cross_val_score, train_test_split
import os
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

# Set some Pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 25)

#http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/week3/exploratory_graphs.ipynb
#http://nbviewer.ipython.org/gist/fonnesbeck/5850463

Training


Tournament data

load data


In [44]:
with open('20150423_zurich_classic_of_new_orleans/past.pkl', 'r') as f:
    df = pd.DataFrame.from_dict(pickle.load(f)).set_index('player_id')
print df


             money      player_name  points  pos  r1  r2  r3  r4  score
player_id                                                              
29289      1224000    Seung-Yul Noh     500    1  65  68  65  71    269
34431       598400     Robert Streb     245   T2  67  66  68  70    271
29425       598400   Andrew Svoboda     245   T2  64  68  70  69    271
28475       326400     Jeff Overton     135    4  67  68  67  70    272
24494       248200     Erik Compton     100   T5  66  68  72  68    274
24358       248200  Robert Garrigus     100   T5  73  69  68  64    274
12716       248200  Charley Hoffman     100   T5  68  67  68  71    274
33141       197200   Keegan Bradley      80   T8  69  66  65  75    275
30750       197200     Tommy Gainey      80   T8  71  66  67  71    275
22405       197200      Justin Rose      80   T8  71  67  69  68    275
25364       149600       Paul Casey      63  T11  71  68  64  73    276
34021       149600       Bud Cauley      63  T11  71  68  66  71    276
...            ...              ...     ...  ...  ..  ..  ..  ..    ...
34409            0  David Lingmerth       0  CUT  68  79 NaN NaN    147
35541            0    John Peterson       0  CUT  76  71 NaN NaN    147
20691            0        Greg Owen       0  CUT  76  72 NaN NaN    148
01948            0     Tim Petrovic       0  CUT  72  76 NaN NaN    148
26679            0    Kevin Stadler       0  CUT  74  74 NaN NaN    148
30692            0  Scott Stallings       0  CUT  74  74 NaN NaN    148
24490            0   George McNeill       0  CUT  75  74 NaN NaN    149
02239            0   Scott Verplank       0  CUT  73  76 NaN NaN    149
34360            0     Patrick Reed       0  CUT  74  76 NaN NaN    150
32698            0      Bobby Gates       0  CUT  76  75 NaN NaN    151
12782            0       Tim Herron       0  CUT  79  76 NaN NaN    155
28307            0       Matt Every       0  W/D  76 NaN NaN NaN     76

[134 rows x 9 columns]

Clean tournament data

first scale the scores for the labels


In [45]:
# top half labels
df_top = df[np.isfinite(df['r4'])]
# print tdf_top.sort('score')
df_top['score_scaled'] = MinMaxScaler(feature_range=(0.5, 1.)).fit_transform(df_top['score'].astype(float))[::-1]
# print tdf_top.sort('score')

# bottom half
df_bot = df[df['pos'] == 'CUT']
# print tdf_bot.sort('score')
df_bot['score_scaled'] = MinMaxScaler(feature_range=(0., 0.5)).fit_transform(df_bot['score'].astype(float))[::-1]
# print tdf_bot

# combine labels
df = pd.concat([df_top, df_bot])['score_scaled']
print df


/Users/jaco/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/jaco/anaconda/lib/python2.7/site-packages/IPython/kernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
player_id
29289        1.000
34431        0.975
29425        0.975
28475        0.975
24494        0.950
24358        0.950
12716        0.950
33141        0.925
30750        0.925
22405        0.925
...
34409        0.041667
35541        0.041667
20691        0.041667
01948        0.041667
26679        0.041667
30692        0.000000
24490        0.000000
02239        0.000000
34360        0.000000
32698        0.000000
12782        0.000000
Name: score_scaled, Length: 123, dtype: float64

Players data

load data


In [46]:
df_players = pd.DataFrame()
for player_id, score_scaled in df.iteritems():
    player_file = 'players/{}/{}.pkl'.format(2013, player_id)
#     print pfile
    if os.path.isfile(player_file):
        with open(player_file, 'r') as f:
            player_data = pickle.load(f)
            player_data['player_id'] = player_id
#             print pdata
            df_players = df_players.append(player_data, ignore_index=True)
#             break
    else:
        print 'could not load {}'.format(player_id)
df_players = df_players.set_index('player_id')
print df_players.head(1)


could not load 21805
           approach the green_approaches_from_100-125_yards  \
player_id                                                     
29289                                              5.230769   

           approach the green_approaches_from_100-125_yards_(rgh)  \
player_id                                                           
29289                                               9.538462        

           approach the green_approaches_from_125-150_yards  \
player_id                                                     
29289                                              7.076923   

           approach the green_approaches_from_125-150_yards_(rgh)  \
player_id                                                           
29289                                               9.846154        

           approach the green_approaches_from_150-175_yards  \
player_id                                                     
29289                                              8.923077   

           approach the green_approaches_from_150-175_yards_(rgh)  \
player_id                                                           
29289                                              13.538462        

           approach the green_approaches_from_175-200_yards  \
player_id                                                     
29289                                             10.461538   

           approach the green_approaches_from_175-200_yards_(rgh)  \
player_id                                                           
29289                                              16.307692        

           approach the green_approaches_from_200-225_yards  \
player_id                                                     
29289                                             14.153846   

           approach the green_approaches_from_200-225_yards_(rgh)  \
player_id                                                           
29289                                              19.076923        

                         ...                  recap_putting_average  \
player_id                ...                                          
29289                    ...                                    NaN   

           recap_scoring_average_(actual)  \
player_id                                   
29289                                 NaN   

           recap_web.com_tour_finals_priority_rankings  \
player_id                                                
29289                                              NaN   

           recap_web.com_tour_regular_season_money_list  \
player_id                                                 
29289                                               NaN   

           scoring_scoring_average_final_round  putting_3_putt_avoidance  \
player_id                                                                  
29289                                      NaN                       NaN   

           putting_one_putt_percentage  scoring_scoring_average_final_rnd  \
player_id                                                                   
29289                              NaN                                NaN   

           streaks_consecutive_par_4_birdies  \
player_id                                      
29289                                    NaN   

           streaks_consecutive_par_5_birdies  
player_id                                     
29289                                    NaN  

[1 rows x 152 columns]

Clean player data

remove the features which has mostly nan


In [47]:
# print df_players
cols = []
for col in df_players.columns:
    #print col, df_players[col].isnull().sum()
    if df_players[col].isnull().sum() > len(df_players) * 0.2:
        cols.append(col)
print len(df_players.columns), 'vs', len(cols)
print 'dropping', cols
df_players_cleaned = df_players.drop(cols, axis=1)
print len(df_players_cleaned.columns)
print len(df_players_cleaned)
df_players_cleaned = df_players_cleaned.dropna()
print len(df_players_cleaned)
cols_to_use = df_players_cleaned.columns


152 vs 26
dropping ['approach the green_approaches_from_250-275_yards_(rgh)', 'approach the green_approaches_from_50-75_yards', 'approach the green_approaches_from_>_275_yards', 'approach the green_approaches_from_>_275_yards_(rgh)', 'approach the green_longest_hole_outs_(in_yards)', 'off the tee_longest_drives', 'off the tee_total_driving', 'off the tee_total_driving_efficiency', 'putting_longest_putts', 'putting_total_putting', 'recap_all-around_ranking', 'recap_ball_striking', 'recap_fedexcup_season_points', 'recap_total_driving', 'recap_fedexcup_playoffs_points', 'streaks_consecutive_cuts', 'recap_putting_average', 'recap_scoring_average_(actual)', 'recap_web.com_tour_finals_priority_rankings', 'recap_web.com_tour_regular_season_money_list', 'scoring_scoring_average_final_round', 'putting_3_putt_avoidance', 'putting_one_putt_percentage', 'scoring_scoring_average_final_rnd', 'streaks_consecutive_par_4_birdies', 'streaks_consecutive_par_5_birdies']
126
122
87

Merge data

merge players data with history


In [48]:
print df.tail()
df_players_cleaned['score'] = df
# print df_players_cleaned.tail()
# print df_players_cleaned.head()
print len(df_players_cleaned)
df_merged = df_players_cleaned.dropna()
print len(df_merged)


player_id
24490        0
02239        0
34360        0
32698        0
12782        0
Name: score_scaled, dtype: float64
87
87

In [49]:
labels = df_merged['score']
print 'labels\n', labels

# print df_merged.columns[-10:]
features = scale(df_merged.drop('score', axis=1))
# print df_merged.columns[-10:]
print 'features\n', features[0]

X_train, X_test, y_train, y_test = train_test_split(features, labels)


labels
player_id
29289        1.000
34431        0.975
29425        0.975
28475        0.975
24494        0.950
24358        0.950
12716        0.950
33141        0.925
30750        0.925
22405        0.925
...
27770        0.041667
32448        0.041667
27936        0.041667
34409        0.041667
20691        0.041667
01948        0.041667
26679        0.041667
24490        0.000000
34360        0.000000
32698        0.000000
12782        0.000000
Name: score, Length: 87, dtype: float64
features
[ -1.16407126e+00   4.96469085e-01   3.99732679e-01  -4.81634729e-01
   1.12233865e+00   8.56188699e-01   3.84548421e-01   1.76989773e-01
   1.47462328e+00  -1.32195914e-01  -2.33193037e-01   8.13805473e-01
   5.29031808e-01  -1.30395827e+00   1.78732032e-01  -5.90176450e-01
  -3.80896160e-01  -6.27762265e-01  -1.86984555e-01   9.46497355e-01
   3.74840505e-01  -5.51319097e-01   9.97096032e-01   6.32521585e-01
  -3.09595370e-01   8.14773162e-01  -1.66290904e-01  -6.75559418e-01
   1.59558622e+00   7.00659205e-01   4.09921758e-02  -8.70723590e-01
   2.85668743e-01  -8.57160540e-01  -1.24100579e+00  -1.83950580e-01
   1.08371325e-01  -7.14582545e-01  -1.11249627e+00  -9.42987783e-01
  -5.36092520e-01   1.73416607e+00   2.85795509e+00  -2.16552496e+00
   1.37707116e+00   9.71797071e-01   2.29149714e+00   4.96297455e-01
  -1.15629039e-01   4.19499060e-01   1.07832773e-01  -3.28548007e-01
   1.61458791e-01  -3.54037446e-01   5.90315254e-01  -2.07299123e+00
   3.50249634e-01  -1.23373415e-01   8.24663191e-01  -5.66173569e-01
  -6.35288284e-01  -8.64130921e-01   6.99604529e-01  -1.08244124e+00
  -1.80597735e+00  -8.06822021e-01   2.75251206e-02  -7.29633413e-01
  -2.38917985e+00  -5.30731781e-01   5.90618517e-01  -2.77244547e-01
   7.29206242e-01   3.28649740e-01   1.41443757e+00  -1.17687547e+00
   4.21046970e-02  -2.16552496e+00   1.37707116e+00   3.88033032e-01
  -1.66290904e-01  -7.32781610e-01  -1.04522993e-01  -8.57160540e-01
   9.92144879e-01  -1.17687547e+00  -1.15629039e-01  -8.92067092e-01
   1.58303355e+00   4.21046970e-02  -8.72017349e-02   1.24462346e+00
   3.88033032e-01   1.00422829e+00  -8.51555548e-01   1.62798643e+00
   7.74001465e-02   5.74006201e-01   1.23534510e+00   4.28872049e-01
  -2.45508389e-03  -4.95710003e-01   6.30939118e-01   1.28509499e-02
   1.35353909e+00  -8.72017349e-02   1.04349063e-01   9.08910673e-01
   6.59040798e-01   1.61122382e+00   9.92144879e-01   9.84359181e-01
  -8.92067092e-01  -3.24990753e-02  -8.50557336e-01   1.19348829e+00
   5.02997033e-01  -3.03840509e-01  -3.16306853e-02  -1.36671747e+00
  -1.05405292e+00  -1.05673169e-01   1.00259885e+00   4.65241935e-01
  -7.53319047e-01   7.12313633e-02]

Machine Learning



In [50]:
from sklearn.ensemble import GradientBoostingRegressor

Cross Validate

CV the data


In [51]:
tree = GradientBoostingRegressor()
cv = cross_val_score(tree, X_train, y_train, cv=10, scoring='r2')
print np.mean(cv), np.std(cv)
print cv


-0.518658628948 0.77034501557
[-0.11986646 -0.49213462  0.12505254 -0.0707887  -0.53345185 -0.88065398
  0.00876729 -2.667447   -0.18558574 -0.37047777]

Training

fit the model


In [52]:
tree.fit(X_train, y_train)
tree.score(X_test, y_test)


Out[52]:
-0.54581798121072689

Predict

predict using current players


In [54]:
df_players = pd.DataFrame(columns=cols_to_use)
for player_id, score_scaled in df.iteritems():
    player_file = 'players/{}/{}.pkl'.format(2014, player_id)
#     print pfile
    if os.path.isfile(player_file):
        with open(player_file, 'r') as f:
            player_data = pickle.load(f)
            player_data['player_id'] = player_id
#             print pdata
            df_players = df_players.append(player_data, ignore_index=True)
#             break
    else:
        print 'could not load {}'.format(player_id)
df_players = df_players.set_index('player_id')
print df_players.head(1)


could not load 12716
could not load 25364
could not load 34021
could not load 22792
could not load 27141
could not load 33413
could not load 20098
could not load 36689
could not load 25834
could not load 09011
could not load 27436
could not load 01161
could not load 20160
could not load 23497
could not load 24663
could not load 20498
could not load 30110
could not load 33418
could not load 06567
could not load 06621
could not load 20766
could not load 27120
could not load 22913
could not load 06522
could not load 23320
could not load 27556
could not load 28310
could not load 06004
could not load 23614
could not load 22621
could not load 21805
could not load 35139
could not load 28093
could not load 32200
could not load 31289
could not load 12652
could not load 20070
could not load 10213
could not load 20850
could not load 32448
could not load 01948
could not load 26679
could not load 02239
could not load 34360
could not load 32698
could not load 12782
           approach the green_approaches_from_100-125_yards  \
player_id                                                     
29289                                              6.153846   

           approach the green_approaches_from_100-125_yards_(rgh)  \
player_id                                                           
29289                                               9.538462        

           approach the green_approaches_from_125-150_yards  \
player_id                                                     
29289                                              6.769231   

           approach the green_approaches_from_125-150_yards_(rgh)  \
player_id                                                           
29289                                               9.230769        

           approach the green_approaches_from_150-175_yards  \
player_id                                                     
29289                                              8.923077   

           approach the green_approaches_from_150-175_yards_(rgh)  \
player_id                                                           
29289                                              11.692308        

           approach the green_approaches_from_175-200_yards  \
player_id                                                     
29289                                              9.538462   

           approach the green_approaches_from_175-200_yards_(rgh)  \
player_id                                                           
29289                                              14.769231        

           approach the green_approaches_from_200-225_yards  \
player_id                                                     
29289                                             12.307692   

           approach the green_approaches_from_200-225_yards_(rgh)  \
player_id                                                           
29289                                              19.692308        

                          ...                   recap_ball_striking  \
player_id                 ...                                         
29289                     ...                                   167   

           recap_fedexcup_playoffs_points  recap_fedexcup_season_points  \
player_id                                                                 
29289                                1590                           950   

           recap_total_driving  streaks_consecutive_cuts  \
player_id                                                  
29289                      201                         3   

           recap_putting_average  recap_scoring_average_(actual)  \
player_id                                                          
29289                        NaN                             NaN   

           recap_web.com_tour_finals_priority_rankings  \
player_id                                                
29289                                              NaN   

           recap_web.com_tour_regular_season_money_list  \
player_id                                                 
29289                                               NaN   

           scoring_scoring_average_final_round  
player_id                                       
29289                                      NaN  

[1 rows x 147 columns]

In [55]:
print len(df_players_cleaned)
df_players_cleaned = df_players_cleaned.dropna()
print len(df_players_cleaned)
#print df_players_cleaned.tail()


87
87

In [58]:
#print df_players_cleaned
# for i, row in df_players_cleaned.iterrows():
#     print row
print len(df_players_cleaned.columns)
features = scale(df_players_cleaned.drop('score', axis=1))
print len(features[0])


127
126

In [78]:
prediction = pd.DataFrame(index=df_players_cleaned.index)
prediction['p'] = tree.predict(features)
print prediction


                  p
player_id          
29289      0.995145
34431      0.975571
29425      0.969948
28475      0.854066
24494      0.943813
24358      0.944726
12716      0.948466
33141      0.696594
30750      0.920409
22405      0.922537
34021      0.513123
22792      0.898801
...             ...
27963      0.045627
27770      0.043564
32448      0.043936
27936      0.048787
34409      0.047952
20691      0.822302
01948      0.456366
26679      0.046156
24490      0.004876
34360      0.007630
32698      0.652090
12782      0.001841

[87 rows x 1 columns]

In [79]:
prediction['p_w'] = prediction['p'] / prediction['p'].sum()
print prediction


                  p       p_w
player_id                    
29289      0.995145  0.022182
34431      0.975571  0.021745
29425      0.969948  0.021620
28475      0.854066  0.019037
24494      0.943813  0.021037
24358      0.944726  0.021058
12716      0.948466  0.021141
33141      0.696594  0.015527
30750      0.920409  0.020516
22405      0.922537  0.020563
34021      0.513123  0.011437
22792      0.898801  0.020034
...             ...       ...
27963      0.045627  0.001017
27770      0.043564  0.000971
32448      0.043936  0.000979
27936      0.048787  0.001087
34409      0.047952  0.001069
20691      0.822302  0.018329
01948      0.456366  0.010172
26679      0.046156  0.001029
24490      0.004876  0.000109
34360      0.007630  0.000170
32698      0.652090  0.014535
12782      0.001841  0.000041

[87 rows x 2 columns]

In [75]:



Out[75]:
20.916847480658383

In [73]:
#print prediction.index.values
with open('players/_list.pkl', 'r') as f:
    players = pickle.load(f)
    players = {v: k for k, v in players.iteritems()}
    #print players
    prediction['player_name'] = [players[i] if i in players else None for i in prediction.index.values]
prediction_found = prediction.dropna()
print prediction_found


                  p      player_name
player_id                           
29289      0.995145    Seung-Yul Noh
34431      0.975571     Robert Streb
29425      0.969948   Andrew Svoboda
28475      0.854066     Jeff Overton
24494      0.943813     Erik Compton
24358      0.944726  Robert Garrigus
12716      0.948466  Charley Hoffman
33141      0.696594   Keegan Bradley
30750      0.920409     Tommy Gainey
22405      0.922537      Justin Rose
34021      0.513123       Bud Cauley
22792      0.898801     Peter Hanson
...             ...              ...
27963      0.045627     Chris Stroud
27770      0.043564  Camilo Villegas
32448      0.043936       James Hahn
27936      0.048787     Martin Laird
34409      0.047952  David Lingmerth
20691      0.822302        Greg Owen
01948      0.456366     Tim Petrovic
26679      0.046156    Kevin Stadler
24490      0.004876   George McNeill
34360      0.007630     Patrick Reed
32698      0.652090      Bobby Gates
12782      0.001841       Tim Herron

[87 rows x 2 columns]

In [ ]: