In [16]:
import pandas as pd
import os
import numpy as np


# Load the game results. skiprows=[1] drops the file line with index 1
# (presumably a stray second header row - verify against the CSV).
data_filename = os.path.join(os.getcwd(), "data", "NBA_2014_games_new.csv")
dataset = pd.read_csv(data_filename, parse_dates=["Date"], skiprows=[1])
dataset.columns = ['Date', 'time', 'visitor', 'visitorPts', 'home', 'homePts', 'scoreType', 'OT', 'Notes']
# The tip-off time is not used anywhere below; drop it without mutating in place
# so that re-running the cell from a fresh read stays idempotent.
dataset = dataset.drop('time', axis=1)
# .loc replaces the removed .ix indexer; a label slice includes both endpoints (rows 0-5).
dataset.loc[:5]


Out[16]:
Date visitor visitorPts home homePts scoreType OT Notes
0 2013-10-29 Los Angeles Clippers 103 Los Angeles Lakers 116 Box Score NaN NaN
1 2013-10-29 Chicago Bulls 95 Miami Heat 107 Box Score NaN NaN
2 2013-10-30 Brooklyn Nets 94 Cleveland Cavaliers 98 Box Score NaN NaN
3 2013-10-30 Atlanta Hawks 109 Dallas Mavericks 118 Box Score NaN NaN
4 2013-10-30 Washington Wizards 102 Detroit Pistons 113 Box Score NaN NaN
5 2013-10-30 Los Angeles Lakers 94 Golden State Warriors 125 Box Score NaN NaN

In [17]:
# Target label: True when the home team outscored the visitor.
dataset["HomeWin"] = dataset["homePts"] > dataset["visitorPts"]
y_true = dataset["HomeWin"].values
# .loc replaces the removed .ix indexer; a label slice includes both endpoints (rows 0-5).
dataset.loc[:5]


Out[17]:
Date visitor visitorPts home homePts scoreType OT Notes HomeWin
0 2013-10-29 Los Angeles Clippers 103 Los Angeles Lakers 116 Box Score NaN NaN True
1 2013-10-29 Chicago Bulls 95 Miami Heat 107 Box Score NaN NaN True
2 2013-10-30 Brooklyn Nets 94 Cleveland Cavaliers 98 Box Score NaN NaN True
3 2013-10-30 Atlanta Hawks 109 Dallas Mavericks 118 Box Score NaN NaN True
4 2013-10-30 Washington Wizards 102 Detroit Pistons 113 Box Score NaN NaN True
5 2013-10-30 Los Angeles Lakers 94 Golden State Warriors 125 Box Score NaN NaN True

In [18]:
# Baseline rates: share of games won by the home side and by the visiting side.
home_wins = dataset["HomeWin"].sum()
games_played = dataset["HomeWin"].count()
# Home win rate
print("Home win percentage: {0:.1f}%".format(100 * home_wins / games_played))
# Visitor win rate (the original label wrongly repeated "Home")
print("Visitor win percentage: {0:.1f}%".format(100 * (games_played - home_wins) / games_played))


Home win percentage: 57.9%
Home win percentage: 42.1%

In [28]:
from collections import defaultdict


# For every game, record whether each side won its own previous game.
# A team with no earlier game in the data is treated as not having won.
won_last = defaultdict(bool)
home_last_win = []
visitor_last_win = []

# Games are processed in the row order of the dataset.
for home_team, visitor_team, home_win in zip(dataset["home"], dataset["visitor"], dataset["HomeWin"]):
    # Read the stored results first, before this game's outcome overwrites them.
    home_last_win.append(won_last[home_team])
    visitor_last_win.append(won_last[visitor_team])
    won_last[home_team] = bool(home_win)
    won_last[visitor_team] = not home_win

# Assign each column once instead of writing every row back inside the loop
# (the removed .ix row assignment was slow and could coerce dtypes).
dataset["HomeLastWin"] = home_last_win
dataset["VisitorLastWin"] = visitor_last_win
dataset.loc[20:25]


Out[28]:
Date visitor visitorPts home homePts scoreType OT Notes HomeWin HomeLastWin VisitorLastWin
20 2013-11-01 Miami Heat 100 Brooklyn Nets 101 Box Score NaN NaN True False False
21 2013-11-01 Cleveland Cavaliers 84 Charlotte Bobcats 90 Box Score NaN NaN True False True
22 2013-11-01 Portland Trail Blazers 113 Denver Nuggets 98 Box Score NaN NaN False False False
23 2013-11-01 Dallas Mavericks 105 Houston Rockets 113 Box Score NaN NaN True True True
24 2013-11-01 San Antonio Spurs 91 Los Angeles Lakers 85 Box Score NaN NaN False False True
25 2013-11-01 Detroit Pistons 108 Memphis Grizzlies 111 Box Score OT NaN True False True

In [34]:
# Decision tree classifier on the two "won previous game" features.
from sklearn.tree import DecisionTreeClassifier
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import cross_val_score
clf = DecisionTreeClassifier(random_state=14)
X_previouswins = dataset[['HomeLastWin', 'VisitorLastWin']].values
# Mean accuracy over the cross-validation folds.
# NOTE(review): the default number of folds differs between scikit-learn
# releases, so the printed figure may not match the recorded output exactly.
scores1 = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores1) * 100))


Accuracy: 57.4%

In [38]:
# Load the standings table from which the "home team ranks higher" feature is derived.
data_filename = os.getcwd() + "/data/NBA_2013_standings.xlsx"
# Keep only the rank, the team name and the overall record.
standings = pd.read_excel(data_filename)[['Rk', 'Team', 'Overall']]
standings.head(3)


Out[38]:
Rk Team Overall
0 1 Miami Heat 66-16
1 2 Oklahoma City Thunder 60-22
2 3 San Antonio Spurs 58-24

In [37]:
# Flag games in which the home team is ranked above the visitor in the standings.
# Build the name -> rank lookup once instead of filtering the table for every game.
rank_by_team = dict(zip(standings["Team"], standings["Rk"]))
# The standings list this franchise under its earlier name.
renamed_teams = {"New Orleans Pelicans": "New Orleans Hornets"}
home_names = dataset["home"].replace(renamed_teams)
visitor_names = dataset["visitor"].replace(renamed_teams)

# Fail loudly on unknown teams rather than silently comparing against NaN.
unknown_teams = sorted((set(home_names) | set(visitor_names)) - set(rank_by_team))
if unknown_teams:
    raise KeyError("Teams missing from standings: {0}".format(unknown_teams))

home_rank = home_names.map(rank_by_team)
visitor_rank = visitor_names.map(rank_by_team)
# Rk counts up from 1 for the top team, so "ranks higher" means a SMALLER Rk.
# (The original compared with '>', which flagged the lower-ranked home teams.)
dataset["HomeTeamRanksHigher"] = (home_rank < visitor_rank).astype(int)

In [36]:
# Decision tree on the previous-result flags plus the ranking comparison.
x_homehighter = dataset.loc[:, ['HomeLastWin', 'VisitorLastWin', 'HomeTeamRanksHigher']].values
clf = DecisionTreeClassifier(random_state=14)
score2 = cross_val_score(estimator=clf, X=x_homehighter, y=y_true, scoring="accuracy")
# Report the mean fold accuracy as a percentage.
print("Accuracy: {0:.1f}%".format(100 * score2.mean()))


Accuracy: 59.5%

新建一个特征

  • 两支球队上场比赛的情况
  • 有时排名靠后的球队反而能战胜排名靠前的。原因有很多。例如,排名靠后的球队某些打法恰好能击中强者的软肋

In [62]:
# For each game, flag whether the home team won the previous meeting of the same two teams.
# The default of 0 never equals a team name, so a first meeting yields 0.
last_match_winner = defaultdict(int)
home_team_won_last = []
for home_team, visitor_team, home_win in zip(dataset["home"], dataset["visitor"], dataset["HomeWin"]):
    # Sort the pair so the key is the same whichever side hosts.
    teams = tuple(sorted([home_team, visitor_team]))
    home_team_won_last.append(int(last_match_winner[teams] == home_team))
    # Store this game's winner for the next meeting of the pair.
    last_match_winner[teams] = home_team if home_win else visitor_team

# Assign the column once instead of writing every row back through the removed .ix indexer.
dataset["HomeTeamWonLast"] = home_team_won_last

In [58]:
# Decision tree on the head-to-head flag plus the ranking comparison.
x_lastWinner = dataset.loc[:, ["HomeTeamWonLast", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores3 = cross_val_score(estimator=clf, X=x_lastWinner, y=y_true, scoring='accuracy')
# Report the mean fold accuracy as a percentage.
print("Accuracy: {0:.1f}%".format(100 * scores3.mean()))


Accuracy: 57.8%

特征数增加后,决策树处理不当。鉴于此,我们尝试修改算法,看看会不会起作用。数据挖掘有时就是不断尝试新算法、使用新特征这样一个过程


接下来看看决策树在训练数据量很大的情况下,能否得到有效的分类模型。我们将会为决策树添加球队,以检测它是否能整合新增的信息。虽然决策树能够处理特征值为类别型的数据,但scikit-learn库所实现的决策树算法要求先对这类特征进行处理。用LabelEncoder转换器就能把字符串类型的球队名转化为整型


In [69]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# Turn team names into integer labels so they can be fed to the classifier.
encoding = LabelEncoder()

# Fit on both columns: a team that appears only as a visitor would otherwise
# make transform() fail with an unseen-label error.
encoding.fit(np.concatenate([dataset["home"].values, dataset["visitor"].values]))
home_teams = encoding.transform(dataset["home"].values)
visitor_teams = encoding.transform(dataset["visitor"].values)
# One row per game: [home label, visitor label].
X_teams = np.vstack([home_teams, visitor_teams]).T

决策树可以用这些特征值进行训练,但DecisionTreeClassifier仍把它们当作连续型特征。例如,编号从0到16的17支球队,算法会认为球队1和2相似,而球队4和10不同。但其实这没意义,对于两支球队而言,它们要么是同一支球队,要么不同,没有中间状态! 为了消除这种和实际情况不一致的现象,我们可以使用OneHotEncoder转换器把这些整数转换为二进制数字。每个特征用一个二进制数字来表示。例如,LabelEncoder为芝加哥公牛队分配的数值是7,那么OneHotEncoder为它分配的二进制数字的第七位就是1,其余队伍的第七位就是0。每个可能的特征值都这样处理,而数据集会变得很大


In [77]:
# One-hot encode the integer team labels so the tree does not treat them as ordered numbers.
onehot = OneHotEncoder()
# toarray() returns a plain ndarray; todense() returns np.matrix, which recent
# scikit-learn estimators refuse as input.
X_teams_expanded = onehot.fit_transform(X_teams).toarray()

clf = DecisionTreeClassifier(random_state=14)
scores4 = cross_val_score(clf, X_teams_expanded, y_true,
                          scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores4) * 100))


Accuracy: 59.5%

使用随机森林


In [78]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the one-hot encoded team features.
clf = RandomForestClassifier(random_state=14)
scores5 = cross_val_score(clf, X_teams_expanded, y_true, scoring="accuracy")
# Fixed the misspelled name 'socres5', which raised NameError on a fresh kernel.
print("Accuracy: {0:.1f}%".format(np.mean(scores5) * 100))


Accuracy: 59.3%

In [84]:
# Random forest on the hand-built features combined with the team labels.
# NOTE(review): this stacks the label-encoded X_teams, not the one-hot
# X_teams_expanded - confirm that this is intended.
X_all = np.concatenate([x_homehighter, X_teams], axis=1)
clf = RandomForestClassifier(random_state=14)
scores6 = cross_val_score(estimator=clf, X=X_all, y=y_true, scoring='accuracy')
# Report the mean fold accuracy as a percentage.
print("Accuracy: {0:.1f}%".format(100 * scores6.mean()))


Accuracy: 59.1%

In [ ]: