In [47]:
import pandas
import math
import collections
In [38]:
# Kaggle competition tables (paths are relative to the notebook's working directory).
teams_data = pandas.read_csv("kaggle_data/Teams.csv")
reg_season_detail = pandas.read_csv("kaggle_data/RegularSeasonDetailedResults.csv")
seasons_data = pandas.read_csv("kaggle_data/Seasons.csv")
tourney_compact = pandas.read_csv("kaggle_data/TourneyCompactResults.csv")
tourney_detail = pandas.read_csv("kaggle_data/TourneyDetailedResults.csv")
tourney_seeds = pandas.read_csv("kaggle_data/TourneySeeds.csv")
tourney_slots = pandas.read_csv("kaggle_data/TourneySlots.csv")

# Supplementary tables. The notebook imports the library only as `pandas`
# (no `pd` alias exists), so the module is referenced by that name here too.
conference_pd = pandas.read_csv('Conference.csv')
tourney_results_pd = pandas.read_csv('TourneyResults.csv')
# One entry per row of TourneyResults.csv; getNumChampionships counts names in it.
NCAAChampionsList = tourney_results_pd['NCAA Champion'].tolist()

# NOTE(review): the helper functions below read `teams_pd`, `tourney_seeds_pd`,
# `reg_season_compact_pd` and `teamList`, none of which is assigned in the
# cells visible here -- confirm where they come from before relying on
# Restart Kernel -> Run All.
Credit to https://github.com/adeshpande3/March-Madness-2017 for the aggregation helper functions and the logic behind them.
In [39]:
# Hard-coded membership of the six "power" conferences, keyed by the team
# names that checkPower6Conference compares against.
listACCteams = [
    'North Carolina', 'Virginia', 'Florida St', 'Louisville', 'Notre Dame',
    'Syracuse', 'Duke', 'Virginia Tech', 'Georgia Tech', 'Miami',
    'Wake Forest', 'Clemson', 'NC State', 'Boston College', 'Pittsburgh',
]
listPac12teams = [
    'Arizona', 'Oregon', 'UCLA', 'California', 'USC', 'Utah',
    'Washington St', 'Stanford', 'Arizona St', 'Colorado', 'Washington',
    'Oregon St',
]
listSECteams = [
    'Kentucky', 'South Carolina', 'Florida', 'Arkansas', 'Alabama',
    'Tennessee', 'Mississippi St', 'Georgia', 'Ole Miss', 'Vanderbilt',
    'Auburn', 'Texas A&M', 'LSU', 'Missouri',
]
listBig10teams = [
    'Maryland', 'Wisconsin', 'Purdue', 'Northwestern', 'Michigan St',
    'Indiana', 'Iowa', 'Michigan', 'Penn St', 'Nebraska', 'Minnesota',
    'Illinois', 'Ohio St', 'Rutgers',
]
listBig12teams = [
    'Kansas', 'Baylor', 'West Virginia', 'Iowa St', 'TCU', 'Kansas St',
    'Texas Tech', 'Oklahoma St', 'Texas', 'Oklahoma',
]
listBigEastteams = [
    'Butler', 'Creighton', 'DePaul', 'Georgetown', 'Marquette',
    'Providence', 'Seton Hall', "St John's", 'Villanova', 'Xavier',
]
def checkPower6Conference(team_id):
    """Return 1 if the team is in one of the six hard-coded power conferences, else 0.

    The team name is resolved with getTeamName (a lookup on the Team_Id
    column), matching the other helpers. The previous positional lookup,
    ``teams_pd.values[team_id - 1101]``, was only right while the ids in
    teams_pd were contiguous and started at 1101.

    Parameters:
        team_id: team identifier as stored in teams_pd.

    Raises:
        IndexError: propagated from getTeamName when team_id is not in teams_pd.
    """
    teamName = getTeamName(team_id)
    power6Lists = (listACCteams, listBig10teams, listBig12teams,
                   listSECteams, listPac12teams, listBigEastteams)
    return int(any(teamName in conference for conference in power6Lists))
# Substring (regex) rewrites, applied first and in this order.
_SCHOOL_REGEX_FIXES = (
    ('(State)', 'St'),
    ('(Eastern)', 'E'),
    ('University of California', 'California'),
)

# Whole-name rewrites applied after the regex pass, so keys are spelled the way
# names look *after* that pass (e.g. "... St", "Maryland-E Shore").
_SCHOOL_NAME_FIXES = {
    'Albany (NY)': 'Albany NY',
    'Boston University': 'Boston Univ',
    'Central Michigan': 'C Michigan',
    'Louisiana St': 'LSU',
    'North Carolina St': 'NC State',
    'Southern California': 'USC',
    'American': 'American Univ',
    'Arkansas-Little Rock': 'Ark Little Rock',
    'Arkansas-Pine Bluff': 'Ark Pine Bluff',
    'Bowling Green St': 'Bowling Green',
    'Brigham Young': 'BYU',
    'Cal Poly': 'Cal Poly SLO',
    'Centenary (LA)': 'Centenary',
    'Central Connecticut St': 'Central Conn',
    'Charleston Southern': 'Charleston So',
    'Coastal Carolina': 'Coastal Car',
    'College of Charleston': 'Col Charleston',
    'Cal St Fullerton': 'CS Fullerton',
    'Cal St Sacramento': 'CS Sacramento',
    'Cal St Bakersfield': 'CS Bakersfield',
    'Cal St Northridge': 'CS Northridge',
    'East Tennessee St': 'ETSU',
    'Detroit Mercy': 'Detroit',
    'Fairleigh Dickinson': 'F Dickinson',
    'Florida Atlantic': 'FL Atlantic',
    'Florida Gulf Coast': 'FL Gulf Coast',
    'Florida International': 'Florida Intl',
    'George Washington': 'G Washington',
    'Georgia Southern': 'Ga Southern',
    'Gardner-Webb': 'Gardner Webb',
    'Illinois-Chicago': 'IL Chicago',
    'Kent St': 'Kent',
    'Long Island University': 'Long Island',
    'Loyola Marymount': 'Loy Marymount',
    'Loyola (MD)': 'Loyola MD',
    'Loyola (IL)': 'Loyola-Chicago',
    # Was 'Massachusetts' -> 'MA Lowell', which relabelled the plain
    # Massachusetts row as the Lowell campus.
    # NOTE(review): source spelling assumed to be 'Massachusetts-Lowell' -- confirm against the stats CSVs.
    'Massachusetts-Lowell': 'MA Lowell',
    # The (Eastern) regex has already turned 'Maryland-Eastern Shore' into
    # 'Maryland-E Shore', so the original key could never match.
    'Maryland-E Shore': 'MD E Shore',
    'Miami (FL)': 'Miami FL',
    'Miami (OH)': 'Miami OH',
    'Missouri-Kansas City': 'Missouri KC',
    'Monmouth': 'Monmouth NJ',
    'Mississippi Valley St': 'MS Valley St',
    # Was 'Montana St' -> 'MTSU', contradicting the later
    # 'Montana State' -> 'Montana St' rule. Montana St is now left alone.
    # NOTE(review): 'MTSU' presumably stands for Middle Tennessee -- confirm the source spelling.
    'Middle Tennessee': 'MTSU',
    'Middle Tennessee St': 'MTSU',
    'Northern Colorado': 'N Colorado',
    'North Dakota St': 'N Dakota St',
    'Northern Illinois': 'N Illinois',
    'Northern Kentucky': 'N Kentucky',
    'North Carolina A&T': 'NC A&T',
    'North Carolina Central': 'NC Central',
    'Pennsylvania': 'Penn',
    'South Carolina St': 'S Carolina St',
    'Southern Illinois': 'S Illinois',
    'UC-Santa Barbara': 'Santa Barbara',
    'Southeastern Louisiana': 'SE Louisiana',
    'Southeast Missouri St': 'SE Missouri St',
    'Stephen F. Austin': 'SF Austin',
    'Southern Methodist': 'SMU',
    'Southern Mississippi': 'Southern Miss',
    'Southern': 'Southern Univ',
    'St. Bonaventure': 'St Bonaventure',
    'St. Francis (NY)': 'St Francis NY',
    'Saint Francis (PA)': 'St Francis PA',
    "St. John's (NY)": "St John's",
    "Saint Joseph's": "St Joseph's PA",
    'Saint Louis': 'St Louis',
    "Saint Mary's (CA)": "St Mary's CA",
    "Mount Saint Mary's": "Mt St Mary's",
    "Saint Peter's": "St Peter's",
    # Was '...Corpus Christian' -> 'TAM C. Christian'.
    # NOTE(review): assumed to be a typo for Corpus Christi on both sides -- confirm.
    'Texas A&M-Corpus Christi': 'TAM C. Christi',
    'Texas Christian': 'TCU',
    'Tennessee-Martin': 'TN Martin',
    'Texas-Rio Grande Valley': 'UTRGV',
    'Texas Southern': 'TX Southern',
    'Alabama-Birmingham': 'UAB',
    'UC-Davis': 'UC Davis',
    'UC-Irvine': 'UC Irvine',
    'UC-Riverside': 'UC Riverside',
    'Central Florida': 'UCF',
    'Louisiana-Lafayette': 'ULL',
    'Louisiana-Monroe': 'ULM',
    'Maryland-Baltimore County': 'UMBC',
    'North Carolina-Asheville': 'UNC Asheville',
    'North Carolina-Greensboro': 'UNC Greensboro',
    'North Carolina-Wilmington': 'UNC Wilmington',
    'Nevada-Las Vegas': 'UNLV',
    'Texas-Arlington': 'UT Arlington',
    'Texas-San Antonio': 'UT San Antonio',
    'Texas-El Paso': 'UTEP',
    'Virginia Commonwealth': 'VA Commonwealth',
    'Western Carolina': 'W Carolina',
    'Western Illinois': 'W Illinois',
    'Western Kentucky': 'WKU',
    'Western Michigan': 'W Michigan',
    'Abilene Christian': 'Abilene Chr',
    'Central Arkansas': 'Cent Arkansas',
    'Houston Baptist': 'Houston Bap',
    'South Dakota St': 'S Dakota St',
}


def handleDifferentCSV(df):
    """Rewrite the 'School' column so its names match the team names used elsewhere.

    Two passes are applied:
      1. regex substring rewrites ("State" -> "St", "Eastern" -> "E",
         "University of California" -> "California");
      2. a whole-name lookup in _SCHOOL_NAME_FIXES; names without an entry
         are kept as they are.

    Parameters:
        df: DataFrame with a 'School' column. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    school = df['School']
    for pattern, replacement in _SCHOOL_REGEX_FIXES:
        school = school.replace(pattern, replacement, regex=True)
    # A dict passed to replace() matches whole cell values only (regex is off).
    df['School'] = school.replace(_SCHOOL_NAME_FIXES)
    return df
def getTeamID(name):
    """Return the first-column value of the teams_pd row whose Team_Name equals ``name``.

    Raises IndexError when no row matches.
    """
    matchingRows = teams_pd.loc[teams_pd['Team_Name'] == name]
    firstRow = matchingRows.values[0]
    return firstRow[0]
def getTeamName(team_id):
    """Return the second-column value of the teams_pd row whose Team_Id equals ``team_id``.

    Raises IndexError when no row matches.
    """
    matchingRows = teams_pd.loc[teams_pd['Team_Id'] == team_id]
    firstRow = matchingRows.values[0]
    return firstRow[1]
def getNumChampionships(team_id):
    """Count how many entries of NCAAChampionsList equal the team's name."""
    teamName = getTeamName(team_id)
    return sum(1 for champion in NCAAChampionsList if champion == teamName)
def getListForURL(team_list):
    """Build a sports-reference.com school URL for every team name.

    Each name is lower-cased, spaces become hyphens, a standalone ``st`` word
    becomes ``state``, and a few abbreviation rewrites are applied to the slug.

    Changes from the previous version:
      * ``st`` is expanded only when it is a whole word. The old substring
        replace turned e.g. "boston" into "bostateon" and needed follow-up
        patches ("maristate", "stateate") to undo part of the damage.
      * The URLs are returned. Previously each one was built and then
        discarded, so the function always returned None.

    Parameters:
        team_list: iterable of team-name strings.

    Returns:
        List of URL strings, in input order.
    """
    # Substring rewrites applied to the hyphenated slug, in this order.
    slugRewrites = (
        ('northern-dakota', 'north-dakota'),
        ('nc-', 'north-carolina-'),
        ('fl-', 'florida-'),
        ('ga-', 'georgia-'),
        ('lsu', 'louisiana-state'),
        ('northernorthern', 'northern'),
        ('usc', 'southern-california'),
    )
    base = 'http://www.sports-reference.com/cbb/schools/'
    urls = []
    for name in team_list:
        words = name.lower().split(' ')
        slug = '-'.join('state' if word == 'st' else word for word in words)
        for old, new in slugRewrites:
            slug = slug.replace(old, new)
        urls.append(base + slug + '/')
    return urls
# The trailing semicolon keeps the cell from echoing a result.
# NOTE(review): `teamList` is not assigned in any cell visible here -- confirm it
# is defined before this line runs on a fresh kernel.
getListForURL(teamList);
# Function for handling the annoying cases of Florida and FL, as well as State and St
def handleCases(arr):
    """Fold every 'St' / 'FL' token into the token before it.

    For example ``['Kansas', 'St', 'Miami', 'FL']`` becomes
    ``['Kansas St', 'Miami FL']``.

    A suffix token at index 0 has nothing before it and is left in place.
    The previous version used ``arr[p-1]`` there, which wrapped around to the
    last element and appended the suffix to an unrelated name.

    Parameters:
        arr: list of word tokens. It is modified in place.

    Returns:
        The same list object.
    """
    suffixTokens = ('St', 'FL')
    positions = [i for i, token in enumerate(arr)
                 if i > 0 and token in suffixTokens]
    for pos in positions:
        arr[pos - 1] = arr[pos - 1] + ' ' + arr[pos]
    # Delete from the back so earlier positions stay valid.
    for pos in reversed(positions):
        del arr[pos]
    return arr
def checkConferenceChamp(team_id, year):
    """Return 1 if the team's name appears among the year's regular-season champions, else 0.

    Every 'Regular Season Champ' entry for ``year`` is split on whitespace
    (an entry may list more than one champion), 'St'/'FL' tokens are merged
    back by handleCases, and the team name is looked up in the resulting list.

    NOTE(review): because entries are split into single words, a multi-word
    name without 'St'/'FL' (e.g. two plain words) can never match -- confirm
    this is acceptable for the contents of Conference.csv.
    """
    seasonRows = conference_pd[conference_pd['Year'] == year]
    champWords = []
    for entry in seasonRows['Regular Season Champ'].tolist():
        champWords.extend(entry.split())
    teamName = getTeamName(team_id)
    champWords = handleCases(champWords)
    return 1 if teamName in champWords else 0
def checkConferenceTourneyChamp(team_id, year):
    """Return 1 if the team's name equals a 'Tournament Champ' entry for ``year``, else 0."""
    isSeason = conference_pd['Year'] == year
    tourneyChamps = conference_pd[isSeason]['Tournament Champ'].tolist()
    teamName = getTeamName(team_id)
    return int(teamName in tourneyChamps)
def getTourneyAppearances(team_id):
    """Count the rows of tourney_seeds_pd whose 'Team' value equals ``team_id``."""
    isTeam = tourney_seeds_pd['Team'] == team_id
    return int(isTeam.sum())
In [45]:
def getSeasonData(team_id, year):
    """Build the 16-element feature vector for one team in one season.

    Inputs read:
      * module-level ``reg_season_compact_pd`` (Season, Wteam, Lteam, Wscore, Lscore);
      * ``MMStats/MMStats_<year>.csv`` and ``RatingStats/RatingStats_<year>.csv``,
        both normalised with handleDifferentCSV;
      * module-level ``tourney_seeds_pd``.

    Changes from the previous version:
      * games are counted from the two filtered frames instead of via
        ``DataFrame.append``, which pandas 2.0 removed;
      * CSVs are read through ``pandas`` -- the notebook never imports ``pd``;
      * every per-team stat gets the same NaN -> 0 guard. X3P, SOS and SRS
        were previously unguarded and could put NaN into the vector.

    Returns:
        list: [numWins, avgPointsScored, avgPointsAllowed, power6 flag,
        avg3sMade, avgAssists, avgTurnovers, conference champ flag,
        conference tourney champ flag, tournamentSeed, sos, srs, avgRebounds,
        avgSteals, tourney appearances, championships].
    """
    # Placeholder seed for teams with no seed row this year (value kept from the original).
    noTourneySeed = 25

    # Every regular-season game of the requested year.
    seasonGames = reg_season_compact_pd[reg_season_compact_pd['Season'] == year]
    gamesWon = seasonGames[seasonGames.Wteam == team_id]
    gamesLost = seasonGames[seasonGames.Lteam == team_id]
    numWins = len(gamesWon.index)
    numGames = numWins + len(gamesLost.index)

    totalPointsScored = gamesWon['Wscore'].sum() + gamesLost['Lscore'].sum()
    totalPointsAllowed = gamesWon['Lscore'].sum() + gamesLost['Wscore'].sum()

    stats_SOS_pd = handleDifferentCSV(
        pandas.read_csv('MMStats/MMStats_' + str(year) + '.csv'))
    ratings_pd = handleDifferentCSV(
        pandas.read_csv('RatingStats/RatingStats_' + str(year) + '.csv'))
    name = getTeamName(team_id)
    team = stats_SOS_pd[stats_SOS_pd['School'] == name]
    # NOTE(review): the ratings row is only checked for presence; no column of
    # it is read (SOS and SRS both come from the MMStats row) -- confirm intent.
    team_rating = ratings_pd[ratings_pd['School'] == name]

    def statOrZero(column):
        """First value of ``column`` in the team's stats row, with NaN mapped to 0."""
        value = team[column].values[0]
        return 0 if math.isnan(value) else value

    if len(team.index) == 0 or len(team_rating.index) == 0:
        # Team not found in one of the yearly files: fall back to zeros.
        total3sMade = totalTurnovers = totalAssists = 0
        totalRebounds = totalSteals = 0
        sos = srs = 0
    else:
        total3sMade = statOrZero('X3P')
        totalTurnovers = statOrZero('TOV')
        totalAssists = statOrZero('AST')
        sos = statOrZero('SOS')
        srs = statOrZero('SRS')
        totalRebounds = statOrZero('TRB')
        totalSteals = statOrZero('STL')

    # Tournament seed for that year: characters 1-2 of the second column value.
    seedRows = tourney_seeds_pd[tourney_seeds_pd['Season'] == year]
    seedRows = seedRows[seedRows['Team'] == team_id]
    if len(seedRows.index) != 0:
        tournamentSeed = int(seedRows.values[0][1][1:3])
    else:
        tournamentSeed = noTourneySeed

    # A team with no games that year would divide by zero, so its averages are 0.
    if numGames == 0:
        avgPointsScored = avgPointsAllowed = 0
        avg3sMade = avgTurnovers = avgAssists = 0
        avgRebounds = avgSteals = 0
    else:
        avgPointsScored = totalPointsScored / numGames
        avgPointsAllowed = totalPointsAllowed / numGames
        avg3sMade = total3sMade / numGames
        avgTurnovers = totalTurnovers / numGames
        avgAssists = totalAssists / numGames
        avgRebounds = totalRebounds / numGames
        avgSteals = totalSteals / numGames

    return [numWins, avgPointsScored, avgPointsAllowed, checkPower6Conference(team_id),
            avg3sMade, avgAssists, avgTurnovers,
            checkConferenceChamp(team_id, year), checkConferenceTourneyChamp(team_id, year),
            tournamentSeed, sos, srs, avgRebounds, avgSteals,
            getTourneyAppearances(team_id), getNumChampionships(team_id)]
In [43]:
### Also adapted from adeshpande3's code, but heavily modified
def createSeasonDict(year):
    """Map every team id to a dict of named season features for ``year``.

    Iterates over the module-level ``teamList``, resolves each name to its id,
    and labels the vector returned by getSeasonData with the header names below.

    Returns:
        collections.defaultdict keyed by team id; each value is a
        ``{header: value}`` dict.
    """
    vector_headers = (
        "num_wins", "avg_points_scored", "avg_points_allowed", "power6",
        "avg3s_made", "avg_assists", "avg_turnovers",
        "conference_chap_this_year", "conference_tourney_champ_this_year",
        "tournament_seed", "sos", "srs", "avg_rebounds", "avg_steals",
        "num_tourney_appearances", "num_championships",
    )
    seasonDictionary = collections.defaultdict(list)
    for team in teamList:
        team_id = getTeamID(team)
        seasonDictionary[team_id] = dict(zip(vector_headers, getSeasonData(team_id, year)))
    return seasonDictionary
In [48]:
# Per-team feature dicts for the 2016 season, keyed by team id.
s2016_results = createSeasonDict(2016)
# Built through `pandas`: the notebook never imports the `pd` alias.
# A dict of dicts gives one column per team id and one row per feature name.
df_2016_results = pandas.DataFrame(s2016_results)
In [49]:
# Columns of df_2016_results are team ids (the keys of the dict it was built
# from), so indexing by an id shows that team's feature values.
df_2016_results[1211] #Stats for Gonzaga, for example
Out[49]:
In [6]:
# Show the column names of the detailed tournament results loaded from
# TourneyDetailedResults.csv.
tourney_detail.columns
Out[6]:
In [34]:
# Column names holding the winning ('W...') and losing ('L...') team's
# figures; both lists share the same suffixes in the same order.
_boxScoreSuffixes = ['team', 'score', 'fgm', 'fga', 'fgm3', 'fga3', 'ftm',
                     'fta', 'or', 'dr', 'ast', 'to', 'stl', 'blk', 'pf']
winHeaders = ['W' + suffix for suffix in _boxScoreSuffixes]
lossHeaders = ['L' + suffix for suffix in _boxScoreSuffixes]
In [37]:
# Column names for the two sides of a game once winner/loser has been
# relabelled as team A / team B.
aHeaders = [
    'Ateam', 'Ascore', 'Afgm', 'Afga', 'Afgm3',
    'Afga3', 'Aftm', 'Afta', 'Aor', 'Adr',
    'Aast', 'Ato', 'Astl', 'Ablk', 'Apf',
]
bHeaders = [
    'Bteam', 'Bscore', 'Bfgm', 'Bfga', 'Bfgm3',
    'Bfga3', 'Bftm', 'Bfta', 'Bor', 'Bdr',
    'Bast', 'Bto', 'Bstl', 'Bblk', 'Bpf',
]

# Empty frame with one row per row of tourney_detail: the A-side columns,
# then the B-side columns, then the 'Awin' label.
df = pandas.DataFrame(
    index=range(tourney_detail.shape[0]),
    columns=aHeaders + bHeaders + ['Awin'],
)
In [38]:
import numpy as np

# Fixed seed so the A/B assignment (and anything trained on it) is reproducible.
rng = np.random.default_rng(42)
# One coin flip per game: True means the winning team is recorded as team A.
winnerIsA = rng.integers(2, size=tourney_detail.shape[0]).astype(bool)

winnerStats = tourney_detail[winHeaders].to_numpy()
loserStats = tourney_detail[lossHeaders].to_numpy()
pickWinner = winnerIsA[:, None]  # column vector, broadcast across the 15 stat columns

# Whole-column assignment. The previous `df[col][i] = value` form is chained
# indexing, which pandas does not guarantee to write back into `df`, and it
# filled the frame one cell at a time.
df[aHeaders] = np.where(pickWinner, winnerStats, loserStats)
df[bHeaders] = np.where(pickWinner, loserStats, winnerStats)
df['Awin'] = winnerIsA.astype(int)
Out[38]:
In [44]:
# A bare last expression gets the notebook's rich table rendering; print() only
# shows the plain-text repr.
df.head()
In [47]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

# Feature columns: all A-side figures followed by all B-side figures.
# NOTE(review): these are figures from the very game whose outcome is being
# predicted, including both 'Ascore' and 'Bscore', and 'Awin' marks the side
# that was filled from the 'W...' columns -- check for label leakage before
# trusting the accuracy printed below.
columns = [
    'Ateam', 'Ascore', 'Afgm', 'Afga', 'Afgm3', 'Afga3', 'Aftm', 'Afta',
    'Aor', 'Adr', 'Aast', 'Ato', 'Astl', 'Ablk', 'Apf',
    'Bteam', 'Bscore', 'Bfgm', 'Bfga', 'Bfgm3', 'Bfga3', 'Bftm', 'Bfta',
    'Bor', 'Bdr', 'Bast', 'Bto', 'Bstl', 'Bblk', 'Bpf',
]

Y = [label for label in df["Awin"].values]
X = df[columns].values

# Fit on the full data so `clf` is usable afterwards (the next cell reads its
# feature importances).
clf = AdaBoostClassifier(n_estimators=100).fit(X, Y)
scores = cross_val_score(clf, X, Y)
print(scores.mean())
In [51]:
# Print each feature name next to the importance the fitted classifier gave it.
for featureName, importance in zip(columns, clf.feature_importances_):
    print(featureName, ":", importance)
In [ ]: