In [2]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
In [3]:
db = MySQLdb.connect(
"db.fastcamp.us",
"root",
"dkstncks",
"football",
charset='utf8',
)
def make_query(position):
"""
parameter------------
position : M, D, F, G
return---------------
SQL_QUERY String
"""
SQL_QUERY = """
SELECT
age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
, spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
, owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
FROM player
"""
if position == "F":
SQL_QUERY += """
WHERE position not like "%,%" and position like "%FW%" and mins > 270
"""
if position == "M":
SQL_QUERY += """
WHERE position not like "%,%" and position like "%M%" and mins > 270
"""
if position == "D":
SQL_QUERY += """
WHERE position not like "%,%" and position like "%D%" and position not like " DMC" and mins > 270
"""
if position == "G":
SQL_QUERY += """
WHERE position not like "%,%" and position like "%G%" and mins > 270
"""
return SQL_QUERY
# defencer
SQL_QUERY = make_query("D")
defenser_df = pd.read_sql(SQL_QUERY, db)
len(defenser_df)
Out[3]:
In [4]:
X = defenser_df.ix[:,:-1]
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)
dfX0 = pd.DataFrame(X_scaled, columns=X.columns)
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(defenser_df.ix[:,-1], columns=["rating"])
d_df = pd.concat([dfX, dfy], axis=1)
d_df.head()
Out[4]:
In [5]:
model = sm.OLS(d_df.ix[:, -1], d_df.ix[:, :-1])
result = model.fit()
print(result.summary())
In [6]:
# remove features
remove_column_list = [
"age", "tall", "weight", "apps_start", "apps_sub", "mins", "yel", "unstch"
]
removed_d_df = d_df.drop(remove_column_list, axis=1)
model = sm.OLS(removed_d_df.ix[:, -1], removed_d_df.ix[:, :-1])
result = model.fit()
print(result.summary())
In [7]:
formula_str = """
rating ~ goals + assists + red + spg + ps_x + motm + aw
+ tackles + inter + fouls + offsides + clear + drb + blocks
+ owng + keyp_x + fouled + off + disp + avgp + ps_y
"""
model = sm.OLS.from_formula(formula_str, data=removed_d_df)
result = model.fit()
table_anova = sm.stats.anova_lm(result)
table_anova
Out[7]:
In [8]:
# remove feature 2
remove_column_list = [
"red", "offsides", "drb", "blocks", "off", "disp", "ps_y"
]
removed2_d_df = removed_d_df.drop(remove_column_list, axis=1)
model = sm.OLS(removed2_d_df.ix[:, -1], removed2_d_df.ix[:, :-1])
result = model.fit()
print(result.summary())
In [9]:
# tackles
# aw : 공중볼 경합
# inter
# clear
In [ ]: