In [25]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}
import os

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import MySQLdb
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
In [26]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook. Host, user and schema fall back to the values this
# analysis was written against; the password has no fallback and must be
# exported as FOOTBALL_DB_PASSWORD before the kernel starts (KeyError otherwise).
db = MySQLdb.connect(
    os.environ.get("FOOTBALL_DB_HOST", "db.fastcamp.us"),
    os.environ.get("FOOTBALL_DB_USER", "root"),
    os.environ["FOOTBALL_DB_PASSWORD"],
    os.environ.get("FOOTBALL_DB_NAME", "football"),
    charset='utf8',
)
def make_query(position):
    """
    Build the SELECT statement that pulls per-player statistics for one position.

    parameter------------
    position : M, D, F, G
        Letters-only position code, matched as a substring of player.position.
    return---------------
    SQL_QUERY String
    raise----------------
    ValueError : position is not a non-empty, letters-only string
    """
    # position is pasted into the statement text, so only letters are accepted:
    # a quote could close the string literal, and % or _ would act as LIKE
    # wildcards.
    if not (hasattr(position, "isalpha") and position.isalpha()):
        raise ValueError(
            "position must be a non-empty alphabetic code such as M, D, F or G; "
            "got {!r}".format(position)
        )

    # Rows whose position contains D are excluded from the other positions'
    # samples. Applying that exclusion to a D request would contradict the
    # match itself and could never return a row, so it is skipped there.
    defender_filter = "" if "D" in position else ' and position not like "%D%"'

    SQL_QUERY = """
SELECT
age, tall, weight, apps_start, apps_sub, mins, goals, assists, yel, red
, spg, ps_x, motm, aw, tackles, inter, fouls, offsides, clear, drb, blocks
, owng, keyp_x, fouled, off, disp, unstch, avgp, ps_y, rating
FROM player
WHERE position like "%{position}%"{defender_filter} and mins > 270
;
""".format(position=position, defender_filter=defender_filter)
    return SQL_QUERY
# Forwards: build the position-specific statement, then load the matching
# player rows into a DataFrame over the open connection.
SQL_QUERY = make_query("F")
forword_df = pd.read_sql(sql=SQL_QUERY, con=db)

# Number of players returned by the query.
forword_df.shape[0]
Out[26]:
In [27]:
# Predictors are every column except the last one; the last column (rating)
# is the response. `.iloc` replaces the removed `.ix` indexer; the selection
# is positional either way because the column labels are strings.
X = forword_df.iloc[:, :-1]

# Put the predictors on a common scale (with_mean=False, so the scaler is
# configured not to centre them) and restore the column names that
# fit_transform drops.
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X)
dfX0 = pd.DataFrame(X_scaled, columns=X.columns)

# Add the constant column used for the intercept, then put the response back
# as the last column so later cells can split the frame by position.
dfX = sm.add_constant(dfX0)
dfy = pd.DataFrame(forword_df.iloc[:, -1], columns=["rating"])
f_df = pd.concat([dfX, dfy], axis=1)
f_df.head()
Out[27]:
In [28]:
# Baseline fit: ordinary least squares of rating (last column) on the
# constant plus every scaled predictor (all remaining columns).
# `.iloc` replaces the removed `.ix` indexer with the same positional split.
model = sm.OLS(f_df.iloc[:, -1], f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [37]:
# First reduction: drop a hand-picked set of predictors and refit.
# NOTE(review): presumably these are the terms that were not significant in
# the baseline summary; that output is not visible here, so confirm.
remove_column_list = [
    "age", "tall", "weight", "apps_start", "apps_sub", "red", "clear",
    "blocks", "owng", "unstch", "offsides", "disp", "off",
]
removed_f_df = f_df.drop(remove_column_list, axis=1)

# rating is still the last column, so the positional split is unchanged.
# `.iloc` replaces the removed `.ix` indexer.
model = sm.OLS(removed_f_df.iloc[:, -1], removed_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [39]:
# Refit the reduced model through the formula interface. The right-hand side
# lists the 16 predictors left in removed_f_df after the first reduction; the
# frame's `const` column is not referenced by the formula.
formula_str = """
rating ~ mins + goals + assists + yel + spg + ps_x
+ motm + aw + tackles + inter + fouls + drb + keyp_x
+ fouled + avgp + ps_y
"""
model = sm.OLS.from_formula(formula_str, data=removed_f_df)
result = model.fit()
# Build an ANOVA table from the fitted model.
table_anova = sm.stats.anova_lm(result)
# Bare last expression so the notebook displays the table.
table_anova
Out[39]:
In [40]:
# Second reduction: drop three more predictors from the already reduced frame
# and refit.
# NOTE(review): presumably chosen from the ANOVA table of the previous fit;
# that output is not visible here, so confirm.
remove_column_list = [
    "fouls", "drb", "ps_y",
]
removed2_f_df = removed_f_df.drop(remove_column_list, axis=1)

# rating is still the last column, so the positional split is unchanged.
# `.iloc` replaces the removed `.ix` indexer.
model = sm.OLS(removed2_f_df.iloc[:, -1], removed2_f_df.iloc[:, :-1])
result = model.fit()
print(result.summary())
In [1]:
# forward
# goals
# keyp_x : key passes
# spg : shots per game
# assists
# aw : aerial duels won
# fouled : number of times fouled