In [2]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
import os

import MySQLdb
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # very old scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import cross_val_score
In [3]:
# Connection settings are read from the environment so that no secret lives in
# the notebook. Host, user and schema fall back to the values this notebook has
# always used; the password has no fallback on purpose.
# NOTE(review): the previous password was committed in plain text here and
# should be rotated on the server.
db_password = os.environ.get("FOOTBALL_DB_PASSWORD")
if db_password is None:
    raise RuntimeError(
        "Set the FOOTBALL_DB_PASSWORD environment variable before running this notebook."
    )

db = MySQLdb.connect(
    host=os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    user=os.environ.get("FOOTBALL_DB_USER", "root"),
    passwd=db_password,
    db=os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)
def make_query(position):
    """Build the SELECT statement that loads player stats for one position.

    Parameters
    ----------
    position : str
        Position code; must be one of "M", "D", "F" or "G". The code is
        matched as a substring of the ``position`` column
        (``LIKE "%<code>%"``).

    Returns
    -------
    str
        The SQL query text. ``rating`` is the last selected column.

    Raises
    ------
    ValueError
        If ``position`` is not one of the supported codes. The value is
        interpolated directly into the SQL text, so anything outside the
        whitelist is rejected instead of being sent to the database.
    """
    valid_positions = ("M", "D", "F", "G")
    # Tuple membership compares with ==, so unhashable junk input also ends
    # up in the ValueError branch instead of raising TypeError.
    if position not in valid_positions:
        raise ValueError(
            "position must be one of {}, got {!r}".format(valid_positions, position)
        )

    SQL_QUERY = """
        SELECT
            age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
            , spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
            , owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
        FROM player
        WHERE position like "%{position}%"
        ;
    """.format(position=position)
    return SQL_QUERY
# Goalkeepers: build the query for position code "G" and load the matching
# rows into a DataFrame through the open `db` connection.
SQL_QUERY = make_query(position="G")
goalkeeper_df = pd.read_sql(sql=SQL_QUERY, con=db)
# Number of rows loaded, shown as the cell output.
goalkeeper_df.shape[0]
Out[3]:
In [4]:
# Build the regression table: scaled predictors + constant column + target.
# `.ix` was removed in pandas 1.0, so purely positional `.iloc` is used instead.

# Every column except the last one is a predictor; the last one is `rating`.
X = goalkeeper_df.iloc[:, :-1]

# with_mean=False: columns are divided by their standard deviation but not
# centred.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)
dfX0 = pd.DataFrame(X_scaled, columns=X.columns)

# Add the constant column used for the intercept by the statsmodels fits below.
dfX = sm.add_constant(dfX0)

# Target as a one-column frame named "rating".
dfy = goalkeeper_df.iloc[:, -1].to_frame(name="rating")

# Predictors first, target last: later cells rely on this column order.
g_df = pd.concat([dfX, dfy], axis=1)
g_df.head()
Out[4]:
In [5]:
# OLS on the full feature set: the last column of g_df is the target, all
# preceding columns are regressors. `.iloc` replaces `.ix`, which was removed
# in pandas 1.0.
model = sm.OLS(g_df.iloc[:, -1], g_df.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [6]:
# First pruning pass: drop the listed features, then refit OLS on what is left.
remove_column_list = [
    "weight", "apps_start", "apps_sub", "mins", "goals", "assists", "yel", "red", "spg", "motm",
    "tackles", "inter", "fouls", "blocks", "owng", "keyp_x", "fouled", "off", "disp", "unstch",
]
removed_g_df = g_df.drop(columns=remove_column_list)

# Target is still the last column. `.iloc` replaces `.ix`, which was removed
# in pandas 1.0.
model = sm.OLS(removed_g_df.iloc[:, -1], removed_g_df.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [7]:
# Refit the reduced model through the formula interface and show the ANOVA
# table for the fitted result as the cell output.
formula_str = """
rating ~ age + tall + ps_x + aw + offsides + clear + drb + avgp + ps_y
"""
model = sm.OLS.from_formula(formula=formula_str, data=removed_g_df)
result = model.fit()

table_anova = sm.stats.anova_lm(result)
table_anova
Out[7]:
In [8]:
# Second pruning pass: drop three more features and refit OLS.
remove_column_list = ["age", "offsides", "ps_y"]
removed2_g_df = removed_g_df.drop(columns=remove_column_list)

# Target is still the last column. `.iloc` replaces `.ix`, which was removed
# in pandas 1.0.
model = sm.OLS(removed2_g_df.iloc[:, -1], removed2_g_df.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [17]:
# Fit the same reduced feature set with scikit-learn. LinearRegression is
# already imported in the notebook's import cell, so it is not re-imported here.
# NOTE: from here on `model` is the scikit-learn estimator, not the statsmodels
# model of the earlier cells.
# NOTE(review): the design matrix still contains the `const` column while
# LinearRegression fits its own intercept by default, so the intercept is
# expected in model.intercept_ rather than in coef_[0].
model = LinearRegression()
# `.iloc` replaces `.ix`, which was removed in pandas 1.0.
model.fit(removed2_g_df.iloc[:, :-1], removed2_g_df.iloc[:, -1])
Out[17]:
In [10]:
# Keep every fitted coefficient except the first one. The first design-matrix
# column is presumably the `const` column added by sm.add_constant; verify
# against the cell that fits `model`.
# NOTE(review): assumes `model` is the scikit-learn LinearRegression here;
# its intercept is stored separately in model.intercept_, not in coef_.
w = model.coef_[1:]
In [21]:
# Reproduce the model's prediction for one row by hand.
N = 5  # row label to inspect
print(removed2_g_df.loc[N])

# Feature values of that row without the first column (`const`) and the last
# column (`rating`), so they line up with `w`. Explicit positional slicing.
v = np.array(removed2_g_df.loc[N].iloc[1:-1])
print(w)
print(v)

# predict
# A linear model predicts intercept + w . v. The previous expression
# `(w*v).sum()*10` left out the fitted intercept and multiplied by an
# unexplained factor of 10, so it was not the model's prediction.
# NOTE(review): this assumes the dropped `const` coefficient is zero, which is
# what a model that fits its own intercept should give; confirm against
# model.predict for this row.
print(model.intercept_ + (w * v).sum())
In [ ]:
In [ ]: