In [1]:
# imports
import sqlite3 as sql
from sklearn import datasets
from sklearn import metrics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# creates a connection to by local SQLite NCAA database
## * I pull from the database each time to guarantee that no raw/original data is modified
db = r'<path to database>'
conn = sql.connect(db)
In [4]:
# Creates dataframe for each year, puts it in a dictonary with the year as a key
d={}
for year in range(2009,2017,1):
year = str(year)
SQL_str = 'SELECT * FROM RegularSeasonDetailedResults WHERE Season =' + year
d["df{0}".format(year)]= pd.read_sql_query(SQL_str, conn)
In [5]:
# creates a copy of the dataframe dictionary
data_d = d.copy()
In [ ]:
# takes a dictionary of dataframes and modifies each df. Returns dictionary with modified dfs.
def modify_dictdf(d):
# new dictionary
new_d = {}
# adds some low level agg. stats like %freethrow
for year, df in d.items():
df['Wspread'] = df['Wscore'] - df['Lscore']
df['Lspread'] = df['Lscore'] - df['Wscore']
df['W%_freethrow'] = df['Wftm']/(df['Wftm']+df['Wfta'])
df['W%_field_goals'] = df['Wfgm']/(df['Wfgm']+df['Wfga'])
df['W%_3pt'] = df['Wfgm3']/(df['Wfgm3']+df['Wfga3'])
df['L%_freethrow'] = df['Lftm']/(df['Lftm']+df['Lfta'])
df['L%_field_goals'] = df['Lfgm']/(df['Lfgm']+df['Lfga'])
df['L%_3pt'] = df['Lfgm3']/(df['Lfgm3']+df['Lfga3'])
# difference in offensive rebounds
df['W_dor'] = df['Wor']-df['Lor']
df['L_dor'] = df['Lor']-df['Wor']
# difference in defensive rebounds
df['W_ddR'] = df['Wdr']-df['Ldr']
df['L_ddR'] = df['Ldr']-df['Wdr']
# creates two rows for every game so that each game's stats are represented for both teams
#splits game data to team data
df_a = df[['Wteam', 'Wscore', 'Wfgm','Wfga', 'Wfgm3', 'Wfga3', 'Wftm', 'Wfta', 'Wor',
'Wdr', 'Wast', 'Wto', 'Wstl', 'Wblk', 'Wpf', 'W%_freethrow', 'W%_field_goals',
'W%_3pt', 'Wspread', 'W_dor', 'W_ddR']]
df_b = df[['Lteam', 'Lscore','Lfgm', 'Lfga', 'Lfgm3', 'Lfga3','Lftm', 'Lfta', 'Lor', 'Ldr',
'Last', 'Lto', 'Lstl', 'Lblk', 'Lpf','L%_freethrow', 'L%_field_goals', 'L%_3pt',
'Lspread', 'L_dor', 'L_ddR']]
# renames columns of winner and loser dataframes
df_a = df_a.rename(columns = {'Wteam':'team', 'Wscore': 'score', 'Wfgm': 'fgm','Wfga': 'fga',
'Wfgm3': 'fgm3', 'Wfga3':'fga3', 'Wftm':'ftm', 'Wfta':'fta',
'Wor':'or', 'Wdr':'dr', 'Wast':'ast','Wto':'to', 'Wstl':'stl',
'Wblk':'blk', 'Wpf':'pf', 'W%_freethrow':'%_freethrow',
'W%_field_goals':'%field_goal', 'W%_3pt':'%_3pt',
'Wspread':'spread', 'W_dor':'dor', 'W_ddR':'ddR' })
df_b = df_b.rename(columns = {'Lteam':'team', 'Lscore': 'score','Lfgm': 'fgm', 'Lfga': 'fga',
'Lfgm3': 'fgm3', 'Lfga3':'fga3','Lftm':'ftm', 'Lfta':'fta',
'Lor':'or', 'Ldr':'dr', 'Last':'ast','Lto':'to','Lstl':'stl',
'Lblk':'blk','Lpf':'pf','L%_freethrow':'%_freethrow',
'L%_field_goals':'%field_goal', 'L%_3pt':'%_3pt',
'Lspread':'spread', 'L_dor':'dor', 'L_ddR':'ddR'})
# combines winner and loser dataframes so that each game has two data rows and some extra info
df_comp = df_a.append(df_b)
# applies outcome function
add_outcome = outcome(df_comp)
# makes a new dictionary of dataframes
new_d[year] = add_outcome
return new_d
In [7]:
# takes a dataframe, adds an 'outcome' column where win = 1 and loss = 0, returns dataframe
def outcome(df):
outcome = []
for num in df.spread:
if num > 0:
outcome.append(1)
if num < 0:
outcome.append(0)
df['outcome'] = outcome
return df
In [13]:
def team_data(df):
clean_data = modify_dictdf(data_d)
# creates dictionary for every season where team is key, and game df for team is value
team_data = {}
for key in clean_data.keys():
df = clean_data[key]
name = 'team_dict' + key[2:]
name = {}
# list of team unique team ID numbers
teams = list(set(list(df.team)))
for team in teams:
name[team] = df[df.team == team]
team_data[key] = name
return team_data
In [8]:
# addition of low level statistics and separating winner and loser fields so that each game has two rows
modified_data = modify_dictdf(data_d)
In [14]:
# adds low level statistics, splits each game into two rows, and separates
# **depending on your computer this could take a few minutes
team_data = team_data(data_d)
In [18]:
# closes the connection to the database
conn.close()
In [ ]: