In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, scale
from sklearn.cross_validation import cross_val_score, train_test_split
import os
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide pandas display settings: plain-text reprs, capped table size.
for option_name, option_value in [
        ('display.notebook_repr_html', False),
        ('display.max_columns', 20),
        ('display.max_rows', 25),
]:
    pd.set_option(option_name, option_value)
#http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/week3/exploratory_graphs.ipynb
#http://nbviewer.ipython.org/gist/fonnesbeck/5850463
Load the past tournament results
In [44]:
with open('20150423_zurich_classic_of_new_orleans/past.pkl', 'r') as f:
df = pd.DataFrame.from_dict(pickle.load(f)).set_index('player_id')
print df
First, scale the scores to produce the labels
In [45]:
# top half labels
df_top = df[np.isfinite(df['r4'])]
# print tdf_top.sort('score')
df_top['score_scaled'] = MinMaxScaler(feature_range=(0.5, 1.)).fit_transform(df_top['score'].astype(float))[::-1]
# print tdf_top.sort('score')
# bottom half
df_bot = df[df['pos'] == 'CUT']
# print tdf_bot.sort('score')
df_bot['score_scaled'] = MinMaxScaler(feature_range=(0., 0.5)).fit_transform(df_bot['score'].astype(float))[::-1]
# print tdf_bot
# combine labels
df = pd.concat([df_top, df_bot])['score_scaled']
print df
Load the 2013 player data
In [46]:
df_players = pd.DataFrame()
for player_id, score_scaled in df.iteritems():
player_file = 'players/{}/{}.pkl'.format(2013, player_id)
# print pfile
if os.path.isfile(player_file):
with open(player_file, 'r') as f:
player_data = pickle.load(f)
player_data['player_id'] = player_id
# print pdata
df_players = df_players.append(player_data, ignore_index=True)
# break
else:
print 'could not load {}'.format(player_id)
df_players = df_players.set_index('player_id')
print df_players.head(1)
Remove the features that are missing (NaN) for more than 20% of the players
In [47]:
# print df_players
cols = []
for col in df_players.columns:
#print col, df_players[col].isnull().sum()
if df_players[col].isnull().sum() > len(df_players) * 0.2:
cols.append(col)
print len(df_players.columns), 'vs', len(cols)
print 'dropping', cols
df_players_cleaned = df_players.drop(cols, axis=1)
print len(df_players_cleaned.columns)
print len(df_players_cleaned)
df_players_cleaned = df_players_cleaned.dropna()
print len(df_players_cleaned)
cols_to_use = df_players_cleaned.columns
Merge the player data with the historical scores
In [48]:
print df.tail()
df_players_cleaned['score'] = df
# print df_players_cleaned.tail()
# print df_players_cleaned.head()
print len(df_players_cleaned)
df_merged = df_players_cleaned.dropna()
print len(df_merged)
In [49]:
labels = df_merged['score']
print 'labels\n', labels
# print df_merged.columns[-10:]
features = scale(df_merged.drop('score', axis=1))
# print df_merged.columns[-10:]
print 'features\n', features[0]
X_train, X_test, y_train, y_test = train_test_split(features, labels)
In [50]:
from sklearn.ensemble import GradientBoostingRegressor
Cross-validate the model
In [51]:
tree = GradientBoostingRegressor()
cv = cross_val_score(tree, X_train, y_train, cv=10, scoring='r2')
print np.mean(cv), np.std(cv)
print cv
fit the model
In [52]:
# Train on the training split, then show the score on the held-out split
# (fit returns the estimator itself, so the calls can be chained).
tree.fit(X_train, y_train).score(X_test, y_test)
Out[52]:
predict using current players
In [54]:
df_players = pd.DataFrame(columns=cols_to_use)
for player_id, score_scaled in df.iteritems():
player_file = 'players/{}/{}.pkl'.format(2014, player_id)
# print pfile
if os.path.isfile(player_file):
with open(player_file, 'r') as f:
player_data = pickle.load(f)
player_data['player_id'] = player_id
# print pdata
df_players = df_players.append(player_data, ignore_index=True)
# break
else:
print 'could not load {}'.format(player_id)
df_players = df_players.set_index('player_id')
print df_players.head(1)
In [55]:
print len(df_players_cleaned)
df_players_cleaned = df_players_cleaned.dropna()
print len(df_players_cleaned)
#print df_players_cleaned.tail()
In [58]:
#print df_players_cleaned
# for i, row in df_players_cleaned.iterrows():
# print row
print len(df_players_cleaned.columns)
features = scale(df_players_cleaned.drop('score', axis=1))
print len(features[0])
In [78]:
prediction = pd.DataFrame(index=df_players_cleaned.index)
prediction['p'] = tree.predict(features)
print prediction
In [79]:
prediction['p_w'] = prediction['p'] / prediction['p'].sum()
print prediction
In [75]:
Out[75]:
In [73]:
#print prediction.index.values
with open('players/_list.pkl', 'r') as f:
players = pickle.load(f)
players = {v: k for k, v in players.iteritems()}
#print players
prediction['player_name'] = [players[i] if i in players else None for i in prediction.index.values]
prediction_found = prediction.dropna()
print prediction_found
In [ ]: