In [16]:
import pandas as pd
import os
import numpy as np

# Load the 2013-14 NBA schedule/results. The file's second physical line is a
# stray header row, so skip it; parse the "Date" column as datetimes.
data_filename = os.getcwd() + "/data/NBA_2014_games_new.csv"
dataset = pd.read_csv(data_filename, parse_dates=["Date"], skiprows=[1, ])
dataset.columns = ['Date', 'time', 'visitor', 'visitorPts', 'home', 'homePts',
                   'scoreType', 'OT', 'Notes']
# The tip-off time is not used by any feature below.
dataset.drop(['time'], axis=1, inplace=True)
# BUG FIX: .ix was deprecated and removed in pandas 1.0; .loc is the
# label-based (inclusive) equivalent on this default integer index.
dataset.loc[:5]
Out[16]:
In [17]:
# Class label: True when the home team scored more points than the visitor.
dataset["HomeWin"] = dataset["homePts"] > dataset["visitorPts"]
y_true = dataset["HomeWin"].values
# BUG FIX: .ix was removed in pandas 1.0; .loc gives the same inclusive
# label-based slice on this default integer index.
dataset.loc[:5]
Out[17]:
In [18]:
# Home win rate — how often home-court advantage translates into a win.
print("Home win percentage: {0:.1f}%".format(
    100 * dataset["HomeWin"].sum() / dataset["HomeWin"].count()))
# Visitor win rate (the complement of the home win rate).
# BUG FIX: the original label said "Home win percentage" here too, even though
# this line computes the visitor win rate (the 客场胜率 comment confirms it).
print("Visitor win percentage: {0:.1f}%".format(
    100 * (dataset["HomeWin"].count() - dataset["HomeWin"].sum())
    / dataset["HomeWin"].count()))
In [28]:
from collections import defaultdict

# won_last[team] -> did `team` win its most recent game?
# defaultdict(int) yields 0 (falsy) before a team's first game of the season.
won_last = defaultdict(int)
dataset["HomeLastWin"] = False
dataset["VisitorLastWin"] = False
for index, row in dataset.iterrows():
    home_team = row["home"]
    visitor_team = row["visitor"]
    # BUG FIX: .ix was removed in pandas 1.0. Write the two cells directly
    # with .loc instead of mutating the row copy and writing it back.
    dataset.loc[index, "HomeLastWin"] = won_last[home_team]
    dataset.loc[index, "VisitorLastWin"] = won_last[visitor_team]
    # Update the rolling state AFTER using it, so a game never sees its own result.
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
dataset.loc[20:25]
Out[28]:
In [34]:
# Decision-tree baseline on the two "won their last game" features.
from sklearn.tree import DecisionTreeClassifier
# BUG FIX: sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(random_state=14)
X_previouswins = dataset[['HomeLastWin', 'VisitorLastWin']].values
# Mean accuracy across the default cross-validation folds.
scores1 = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores1) * 100))
In [38]:
# Build a new feature from the previous season's standings: a prior measure of
# whether the home side is usually the stronger team.
data_filename = os.getcwd() + "/data/NBA_2013_standings.xlsx"
standings = pd.read_excel(data_filename)
# Keep only the rank, team name and overall record columns.
standings = standings[['Rk', 'Team', 'Overall']]
standings.head(3)
Out[38]:
In [37]:
dataset['HomeTeamRanksHigher'] = 0
for index, row in dataset.iterrows():
    home_team = row['home']
    visitor_team = row['visitor']
    # The Pelicans were still named the Hornets in the 2013 standings file.
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    # Look up each side's rank number from the standings table.
    home_rank = standings[standings["Team"] == home_team]['Rk'].values[0]
    visitor_rank = standings[standings["Team"] == visitor_team]['Rk'].values[0]
    # NOTE(review): if Rk uses 1 = best, `home_rank > visitor_rank` marks the
    # home team as the numerically LOWER-ranked side — kept as-is to preserve
    # the original feature, but confirm the intended direction.
    # BUG FIX: .ix was removed in pandas 1.0; write the cell directly via .loc.
    dataset.loc[index, 'HomeTeamRanksHigher'] = int(home_rank > visitor_rank)
In [36]:
# Re-evaluate the tree with the standings-based feature added to the
# last-win features.
feature_columns = ['HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher']
x_homehighter = dataset[feature_columns].values
clf = DecisionTreeClassifier(random_state=14)
score2 = cross_val_score(clf, x_homehighter, y_true, scoring="accuracy")
print("Accuracy: {0:.1f}%".format(np.mean(score2) * 100))
In [62]:
# last_match_winner[(teamA, teamB)] -> winner of the pair's most recent
# meeting; defaultdict(int) yields 0 (matches no team name) before the first.
last_match_winner = defaultdict(int)
dataset['HomeTeamWonLast'] = 0
for index, row in dataset.iterrows():
    home_team = row['home']
    visitor_team = row['visitor']
    # Sort so (A, B) and (B, A) map to the same key regardless of venue.
    teams = tuple(sorted([home_team, visitor_team]))
    # BUG FIX: .ix was removed in pandas 1.0; write the cell directly via .loc
    # instead of mutating the row copy and writing it back.
    dataset.loc[index, 'HomeTeamWonLast'] = (
        1 if last_match_winner[teams] == home_team else 0)
    # Update AFTER the lookup so a game never sees its own result.
    last_match_winner[teams] = home_team if row["HomeWin"] else visitor_team
In [58]:
# Evaluate the head-to-head winner feature alongside the ranking feature.
x_lastWinner = dataset[["HomeTeamWonLast", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores3 = cross_val_score(clf, x_lastWinner, y_true, scoring='accuracy')
mean_accuracy = np.mean(scores3) * 100
print("Accuracy: {0:.1f}%".format(mean_accuracy))
特征数增加后，决策树处理不当。鉴于此，我们尝试修改算法，看看会不会起作用。数据挖掘有时就是不断尝试新算法、使用新特征这样一个过程。
决策树在训练数据量很大的情况下，能否得到有效的分类模型？我们将会为决策树添加球队信息，以检测它是否能整合新增的信息。虽然决策树能够处理特征值为类别型的数据，但 scikit-learn 库所实现的决策树算法要求先对这类特征进行处理。用 LabelEncoder 转换器就能把字符串类型的球队名转化为整型。
In [69]:
# Encode team names as integers so scikit-learn can consume them.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

encoding = LabelEncoder()
# Fit on the home column, then reuse the same mapping for visitors so each
# team gets one id regardless of venue.
# NOTE(review): this assumes every team appears at least once as the home
# side (true for a full NBA season) — otherwise transform() would fail.
home_teams = encoding.fit_transform(dataset["home"].values)
visitor_teams = encoding.transform(dataset["visitor"].values)
# One row per game: [home_team_id, visitor_team_id].
X_teams = np.vstack([home_teams, visitor_teams]).T
决策树可以用这些特征值进行训练，但 DecisionTreeClassifier 仍把它们当作连续型特征。例如，编号从0到16的17支球队，算法会认为球队1和2相似，而球队4和10不同。但其实这没有意义：对于两支球队而言，它们要么是同一支球队，要么不同，没有中间状态！为了消除这种和实际情况不一致的现象，我们可以使用 OneHotEncoder 转换器把这些整数转换为二进制数字，每个可能的特征值用一个二进制位来表示。例如，LabelEncoder 为芝加哥公牛队分配的数值是7，那么 OneHotEncoder 为它分配的二进制向量的第七位就是1，其余队伍的第七位就是0。每个可能的特征值都这样处理，而数据集会变得很大。
In [77]:
# One-hot encode the two team-id columns so the tree cannot treat the integer
# ids as ordered quantities (team 1 is not "closer" to team 2 than to team 10).
onehot = OneHotEncoder()
X_teams_expanded = onehot.fit_transform(X_teams).todense()
clf = DecisionTreeClassifier(random_state=14)
scores4 = cross_val_score(clf, X_teams_expanded, y_true, scoring='accuracy')
mean_accuracy4 = np.mean(scores4) * 100
print("Accuracy: {0:.1f}%".format(mean_accuracy4))
In [78]:
# Try a random forest on the one-hot encoded team features.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(random_state=14)
scores5 = cross_val_score(clf, X_teams_expanded, y_true, scoring="accuracy")
# BUG FIX: the original printed np.mean(socres5) — a misspelling of scores5
# that raises NameError at runtime.
print("Accuracy: {0:.1f}%".format(np.mean(scores5) * 100))
In [84]:
# Combine the engineered features with the raw team ids and re-run the forest.
X_all = np.hstack([x_homehighter, X_teams])
clf = RandomForestClassifier(random_state=14)
scores6 = cross_val_score(clf, X_all, y_true, scoring='accuracy')
mean_accuracy6 = np.mean(scores6) * 100
print("Accuracy: {0:.1f}%".format(mean_accuracy6))
In [ ]: