In [1]:
# Core data-handling libraries.
import pandas as pd
import numpy as np
import matplotlib

from scipy import stats, integrate

import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import cufflinks as cf

import plotly
# Load plotly.js into the notebook so offline plotly figures can be displayed.
plotly.offline.init_notebook_mode()
import plotly.offline as py
import plotly.graph_objs as go
# NOTE(review): the wildcard import below pulls every graph_objs name into the
# notebook namespace; none of the visible cells appear to rely on it.
from plotly.graph_objs import *

import seaborn as sns
# Apply seaborn's default theme; color_codes=True also remaps matplotlib's
# single-letter colour codes (e.g. "b", "k") to the seaborn palette.
sns.set(color_codes=True)

# Record the pandas version this notebook was executed with.
print(pd.__version__)


0.19.2

In [6]:
# Load the Big 12 game-level dataset (2014-2016, per the file name).
# The location can be overridden with the NCAA_BBALL_DATA environment variable
# so the notebook can run on machines other than the original author's; when
# the variable is unset, the original absolute path is used unchanged.
import os

DATA_PATH = os.environ.get(
    'NCAA_BBALL_DATA',
    '/Users/DanMoeller/git/ncaa-bball-attendance/data/big_12/big_12_2014_2016.csv'
)
df = pd.read_csv(DATA_PATH, sep=",", header='infer')

In [7]:
# Preview the first five rows (head() defaults to n=5).
df.head()


Out[7]:
away_wins attendance capacity home_id home_opener home_losses away_win_pct away_losses home_win_pct home_score ... game_id line day away_id home_rank home_wins scoring_line time away_score tv_coverage
0 1 5706 10284.0 239 0 0 1.000 0 1.000 66 ... 400506825 0.0 tuesday 2579 0 1 NaN 20:00 64 espn
1 2 5290 10284.0 239 0 0 0.667 1 1.000 87 ... 400506826 0.0 sunday 309 0 2 NaN 22:00 68 NaN
2 2 5185 10284.0 239 0 0 0.500 2 1.000 69 ... 400506827 0.0 thursday 2127 0 3 NaN 0:00 64 NaN
3 4 5667 10284.0 239 0 1 0.444 5 0.889 91 ... 400506831 0.0 thursday 2466 0 8 NaN 2:30 84 NaN
4 3 5562 10284.0 239 0 1 0.273 8 0.900 81 ... 400506832 0.0 sunday 2582 0 9 NaN 22:00 56 NaN

5 rows × 22 columns


In [8]:
# Preview the last five rows (tail() defaults to n=5).
df.tail()


Out[8]:
away_wins attendance capacity home_id home_opener home_losses away_win_pct away_losses home_win_pct home_score ... game_id line day away_id home_rank home_wins scoring_line time away_score tv_coverage
534 2 8731 14000.0 277 0 0 0.222 7 1.000 87 ... 400813814 0.0 saturday 338 0 6 NaN 17:30 54 NaN
535 3 8323 14000.0 277 0 0 0.375 5 1.000 100 ... 400813815 -21.5 sunday 2433 0 8 133.0 22:00 58 NaN
536 8 8121 14000.0 277 0 1 0.615 5 0.900 84 ... 400813816 -26.0 tuesday 2198 0 9 162.0 0:00 59 espnu
537 10 9567 9900.0 259 0 4 0.909 1 0.667 63 ... 400813817 8.0 wednesday 277 0 8 148.0 17:00 88 espnu
538 17 11611 10151.0 57 0 7 0.850 3 0.650 88 ... 400809416 1.0 saturday 277 0 13 144.0 17:00 71 espn

5 rows × 22 columns


In [9]:
# Summary statistics for the numeric columns; the `count` row exposes columns
# with missing values.
df.describe()


Out[9]:
away_wins attendance capacity home_id home_opener home_losses away_win_pct away_losses home_win_pct home_score away_rank game_id line away_id home_rank home_wins scoring_line away_score
count 539.000000 539.000000 504.000000 539.000000 539.000000 539.000000 539.000000 539.000000 539.000000 539.000000 539.000000 5.390000e+02 539.000000 539.000000 539.000000 539.000000 142.000000 539.000000
mean 9.231911 10169.486085 12986.073413 1073.256030 0.038961 4.035250 0.617479 4.755102 0.719631 75.764378 8.129870 4.006388e+08 -1.441558 1261.717996 2.230056 9.831169 144.507042 65.528757
std 6.559424 4033.831734 3183.263947 1115.437195 0.193682 4.028784 0.270841 3.825097 0.231882 12.679864 8.428456 1.383145e+05 5.198818 1129.874449 5.901527 5.963900 8.613651 12.107197
min 0.000000 1365.000000 2500.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 42.000000 0.000000 4.004969e+08 -29.500000 2.000000 0.000000 0.000000 126.000000 27.000000
25% 3.000000 6177.000000 11562.000000 201.000000 0.000000 1.000000 0.500000 2.000000 0.625000 67.000000 0.000000 4.004984e+08 0.000000 201.000000 0.000000 5.000000 138.250000 57.000000
50% 9.000000 10597.000000 13611.000000 251.000000 0.000000 3.000000 0.667000 4.000000 0.759000 76.000000 6.000000 4.005858e+08 0.000000 299.000000 0.000000 10.000000 144.500000 65.000000
75% 14.000000 13239.500000 15098.000000 2306.000000 0.000000 6.000000 0.805000 7.000000 0.867000 85.000000 16.000000 4.008170e+08 0.000000 2435.000000 0.000000 14.000000 149.750000 73.000000
max 25.000000 24340.000000 21750.000000 2641.000000 1.000000 20.000000 1.000000 19.000000 1.000000 116.000000 25.000000 4.008582e+08 13.000000 3084.000000 25.000000 26.000000 170.000000 106.000000

In [10]:
# Column dtypes as inferred by read_csv.
df.dtypes


Out[10]:
away_wins         int64
attendance        int64
capacity        float64
home_id           int64
home_opener       int64
home_losses       int64
away_win_pct    float64
away_losses       int64
home_win_pct    float64
home_score        int64
away_rank         int64
date             object
game_id           int64
line            float64
day              object
away_id           int64
home_rank         int64
home_wins         int64
scoring_line    float64
time             object
away_score        int64
tv_coverage      object
dtype: object

In [11]:
# Full list of column names (head()/tail() elide the middle columns).
df.columns


Out[11]:
Index([u'away_wins', u'attendance', u'capacity', u'home_id', u'home_opener',
       u'home_losses', u'away_win_pct', u'away_losses', u'home_win_pct',
       u'home_score', u'away_rank', u'date', u'game_id', u'line', u'day',
       u'away_id', u'home_rank', u'home_wins', u'scoring_line', u'time',
       u'away_score', u'tv_coverage'],
      dtype='object')

In [12]:
# Attendance and arena capacity for every row of the dataset, as NumPy arrays.
attend = df['attendance'].values
cap = df['capacity'].values

In [13]:
# Every game in the Big 12 dataset (2014-16): attendance against arena capacity.
# Rows whose capacity is missing (NaN) are not drawn by matplotlib.
fig, ax = plt.subplots()
ax.plot(cap, attend, "bo")
ax.set_xlabel("Stadium Capacity")
ax.set_ylabel("Game Attendance")
plt.show()

# Each vertical column of points shares one capacity value, i.e. it is one
# team's home games.



In [15]:
# Keep only Big 12 conference games: rows where both the home team and the
# away team are conference members.
# IDs, in order: Baylor, Iowa State, Kansas, Kansas State, Oklahoma,
# Oklahoma State, TCU, Texas, Texas Tech, West Virginia.
big_12_teams = [239, 66, 2305, 2306, 201, 197, 2628, 251, 2641, 277]
home_in_conf = df.home_id.isin(big_12_teams)
away_in_conf = df.away_id.isin(big_12_teams)
df_conf = df[home_in_conf & away_in_conf]

In [27]:
# One DataFrame of home games per Big 12 team, selected by home_id from the
# full dataset `df` (so non-conference home games are included).
# To restrict these to conference games only, select from `df_conf` instead.
def _home_games(team_id):
    """Return the rows of `df` in which `team_id` is the home team."""
    return df[df.home_id == team_id]


df_baylor = _home_games(239)
df_iowa_state = _home_games(66)
df_kansas = _home_games(2305)
df_kansas_state = _home_games(2306)
df_oklahoma = _home_games(201)
df_ok_state = _home_games(197)
df_tcu = _home_games(2628)
df_texas = _home_games(251)
df_texas_tech = _home_games(2641)
df_west_virginia = _home_games(277)

In [28]:
# Conference-only arrays. NOTE: this rebinds `attend` and `cap`, which earlier
# held the values for the full dataset.
attend = df_conf['attendance'].values
cap = df_conf['capacity'].values
game = df_conf['game_id'].values
line = df_conf['line'].values

# Fraction of arena capacity filled for each conference game.
pct_full = attend / cap

In [29]:
# Share of arena capacity filled vs. the betting line, for each Big 12 team's
# home games. Each entry is (home-game frame, marker colour, legend label).
# NOTE(review): in the describe() output the quartiles of `line` are all 0.0,
# so 0.0 may be a placeholder for a missing line rather than a pick'em - confirm.
team_series = [
    (df_baylor, '#00558c', 'Baylor'),
    (df_iowa_state, '#13294b', 'Iowa State'),
    (df_kansas, '#001F5B', 'Kansas'),
    (df_kansas_state, '#011e41', 'Kansas St'),
    (df_oklahoma, 'k', 'Oklahoma'),
    (df_ok_state, '#FFCC00', 'Ok State'),
    (df_tcu, '#004488', 'TCU'),
    (df_texas, '#002857', 'Texas'),
    (df_texas_tech, '#CF102D', 'Texas Tech'),
    (df_west_virginia, '#3c4982', 'West Virginia'),
]

fig, ax = plt.subplots()
for team_df, colour, team_label in team_series:
    # attendance / capacity; NaN where the capacity is missing.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    ax.plot(team_df.line.values, team_pct_full, "o", color=colour, label=team_label)
ax.set_xlabel("Betting Line")
ax.set_ylabel("Percent Full")
# Draw the legend (beside the axes, so it hides no points) so that the
# per-team labels are actually shown.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [30]:
# Share of arena capacity filled vs. the visiting team's win percentage, for
# each Big 12 team's home games.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_baylor, '#00558c', 'Baylor'),
    (df_iowa_state, '#13294b', 'Iowa State'),
    (df_kansas, '#001F5B', 'Kansas'),
    (df_kansas_state, '#011e41', 'Kansas St'),
    (df_oklahoma, 'k', 'Oklahoma'),
    (df_ok_state, '#FFCC00', 'Ok State'),
    (df_tcu, '#004488', 'TCU'),
    (df_texas, '#002857', 'Texas'),
    (df_texas_tech, '#CF102D', 'Texas Tech'),
    (df_west_virginia, '#3c4982', 'West Virginia'),
]

fig, ax = plt.subplots()
for team_df, colour, team_label in team_series:
    # attendance / capacity; NaN where the capacity is missing.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    ax.plot(team_df.away_win_pct.values, team_pct_full, "o", color=colour, label=team_label)
ax.set_xlabel("Away Win Percentage")
ax.set_ylabel("Percent Full")
# Draw the legend (beside the axes, so it hides no points) so that the
# per-team labels are actually shown.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [31]:
# Share of arena capacity filled vs. the home team's win percentage, for each
# Big 12 team's home games.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_baylor, '#00558c', 'Baylor'),
    (df_iowa_state, '#13294b', 'Iowa State'),
    (df_kansas, '#001F5B', 'Kansas'),
    (df_kansas_state, '#011e41', 'Kansas St'),
    (df_oklahoma, 'k', 'Oklahoma'),
    (df_ok_state, '#FFCC00', 'Ok State'),
    (df_tcu, '#004488', 'TCU'),
    (df_texas, '#002857', 'Texas'),
    (df_texas_tech, '#CF102D', 'Texas Tech'),
    (df_west_virginia, '#3c4982', 'West Virginia'),
]

fig, ax = plt.subplots()
for team_df, colour, team_label in team_series:
    # attendance / capacity; NaN where the capacity is missing.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    ax.plot(team_df.home_win_pct.values, team_pct_full, "o", color=colour, label=team_label)
ax.set_xlabel("Home Win Percentage")
ax.set_ylabel("Percent Full")
# Draw the legend (beside the axes, so it hides no points) so that the
# per-team labels are actually shown.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [32]:
# Share of arena capacity filled vs. the mean of the home and away win
# percentages, for each Big 12 team's home games.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_baylor, '#00558c', 'Baylor'),
    (df_iowa_state, '#13294b', 'Iowa State'),
    (df_kansas, '#001F5B', 'Kansas'),
    (df_kansas_state, '#011e41', 'Kansas St'),
    (df_oklahoma, 'k', 'Oklahoma'),
    (df_ok_state, '#FFCC00', 'Ok State'),
    (df_tcu, '#004488', 'TCU'),
    (df_texas, '#002857', 'Texas'),
    (df_texas_tech, '#CF102D', 'Texas Tech'),
    (df_west_virginia, '#3c4982', 'West Virginia'),
]

fig, ax = plt.subplots()
for team_df, colour, team_label in team_series:
    # Simple (unweighted) average of the two teams' win percentages.
    avg_win_pct = (team_df.home_win_pct.values + team_df.away_win_pct.values)/2
    # attendance / capacity; NaN where the capacity is missing.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    ax.plot(avg_win_pct, team_pct_full, "o", color=colour, label=team_label)
ax.set_xlabel("Average Win Percentage")
ax.set_ylabel("Percent Full")
# Draw the legend (beside the axes, so it hides no points) so that the
# per-team labels are actually shown.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [33]:
# Share of arena capacity filled vs. the visiting team's rank, for each Big 12
# team's home games.
# NOTE(review): away_rank ranges 0-25 in the describe() output; 0 presumably
# means "unranked" - confirm.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_baylor, '#00558c', 'Baylor'),
    (df_iowa_state, '#13294b', 'Iowa State'),
    (df_kansas, '#001F5B', 'Kansas'),
    (df_kansas_state, '#011e41', 'Kansas St'),
    (df_oklahoma, 'k', 'Oklahoma'),
    (df_ok_state, '#FFCC00', 'Ok State'),
    (df_tcu, '#004488', 'TCU'),
    (df_texas, '#002857', 'Texas'),
    (df_texas_tech, '#CF102D', 'Texas Tech'),
    (df_west_virginia, '#3c4982', 'West Virginia'),
]

fig, ax = plt.subplots()
for team_df, colour, team_label in team_series:
    # attendance / capacity; NaN where the capacity is missing.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    ax.plot(team_df.away_rank.values, team_pct_full, "o", color=colour, label=team_label)
ax.set_xlabel("Away Rank")
ax.set_ylabel("Percent Full")
# Draw the legend (beside the axes, so it hides no points) so that the
# per-team labels are actually shown.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [34]:
# Share of arena capacity filled vs. the home team's rank, for each Big 12
# team's home games.
# NOTE(review): home_rank ranges 0-25 in the describe() output; 0 presumably
# means "unranked" - confirm.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_baylor, '#00558c', 'Baylor'),
    (df_iowa_state, '#13294b', 'Iowa State'),
    (df_kansas, '#001F5B', 'Kansas'),
    (df_kansas_state, '#011e41', 'Kansas St'),
    (df_oklahoma, 'k', 'Oklahoma'),
    (df_ok_state, '#FFCC00', 'Ok State'),
    (df_tcu, '#004488', 'TCU'),
    (df_texas, '#002857', 'Texas'),
    (df_texas_tech, '#CF102D', 'Texas Tech'),
    (df_west_virginia, '#3c4982', 'West Virginia'),
]

fig, ax = plt.subplots()
for team_df, colour, team_label in team_series:
    # attendance / capacity; NaN where the capacity is missing.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    ax.plot(team_df.home_rank.values, team_pct_full, "o", color=colour, label=team_label)
ax.set_xlabel("Home Rank")
ax.set_ylabel("Percent Full")
# Draw the legend (beside the axes, so it hides no points) so that the
# per-team labels are actually shown.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [35]:
# Share of arena capacity filled vs. the scoring line, for each Big 12 team's
# home games. scoring_line is missing (NaN) for most rows (count 142 of 539 in
# the describe() output); those games are not drawn.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_baylor, '#00558c', 'Baylor'),
    (df_iowa_state, '#13294b', 'Iowa State'),
    (df_kansas, '#001F5B', 'Kansas'),
    (df_kansas_state, '#011e41', 'Kansas St'),
    (df_oklahoma, 'k', 'Oklahoma'),
    (df_ok_state, '#FFCC00', 'Ok State'),
    (df_tcu, '#004488', 'TCU'),
    (df_texas, '#002857', 'Texas'),
    (df_texas_tech, '#CF102D', 'Texas Tech'),
    (df_west_virginia, '#3c4982', 'West Virginia'),
]

fig, ax = plt.subplots()
for team_df, colour, team_label in team_series:
    # attendance / capacity; NaN where the capacity is missing.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    ax.plot(team_df.scoring_line.values, team_pct_full, "o", color=colour, label=team_label)
ax.set_xlabel("Scoring Line")
ax.set_ylabel("Percent Full")
# Draw the legend (beside the axes, so it hides no points) so that the
# per-team labels are actually shown.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [ ]:


In [ ]: