In [2]:
import pandas as pd
import numpy as np
import matplotlib

from scipy import stats, integrate

import matplotlib.pyplot as plt
%matplotlib inline
import cufflinks as cf

# Plotly setup: switch on offline (in-notebook) rendering right after importing plotly.
import plotly
plotly.offline.init_notebook_mode()
import plotly.offline as py
import plotly.graph_objs as go
# NOTE(review): this wildcard import puts every plotly.graph_objs name into the
# notebook namespace in addition to the explicit `go` alias above -- confirm it
# is still needed; none of the cells visible here use those bare names.
from plotly.graph_objs import *

# Seaborn styling is applied globally, so it also affects the matplotlib plots below.
import seaborn as sns
sns.set(color_codes=True)

# Record the pandas version used for this run (the saved output shows 0.19.2).
print(pd.__version__)


0.19.2

In [1]:
# Load the Big East 2014-2016 game log into `df`.
#
# The CSV location may be overridden through the NCAA_BBALL_CSV environment
# variable so the notebook can run on machines other than the original
# author's; when the variable is unset the original absolute path is used.
#
# This cell needs `pd` from the import cell: the traceback recorded below this
# cell comes from executing it before the imports had been run.
import os

BIG_EAST_CSV = os.environ.get(
    'NCAA_BBALL_CSV',
    '/Users/DanMoeller/git/ncaa-bball-attendance/data/big_east/big_east_2014_2016.csv'
)
df = pd.read_csv(BIG_EAST_CSV, sep=",", header='infer')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-c4dfddaf5c6d> in <module>()
----> 1 df = pd.read_csv('/Users/DanMoeller/git/ncaa-bball-attendance/data/big_east/big_east_2014_2016.csv',sep=",",header='infer')
      2 df['pct_full'] = df['attendance'] / df['capacity']

NameError: name 'pd' is not defined

In [37]:
# Peek at the first five games to sanity-check the load.
df.head(n=5)


Out[37]:
away_wins attendance capacity home_id home_opener home_losses away_win_pct away_losses home_win_pct home_score ... game_id line day away_id home_rank home_wins scoring_line time away_score tv_coverage
0 0 16859 15500.0 156 0 0 0.000 1 1.000 96 ... 400499763 0.0 tuesday 140 0 1 NaN 1:00 70 fs1
1 2 4200 4200.0 2603 0 0 1.000 0 1.000 79 ... 400504465 3.0 sunday 156 0 2 NaN 0:00 83 NaN
2 0 18078 15500.0 156 0 0 0.000 3 1.000 82 ... 400499278 0.0 saturday 202 0 3 NaN 20:30 72 fsn
3 5 3481 4200.0 299 0 7 0.714 2 0.125 61 ... 400502942 13.0 wednesday 156 0 1 NaN 3:00 78 NaN
4 6 17530 15500.0 156 0 2 0.750 2 0.750 82 ... 400504490 -12.5 sunday 158 0 6 NaN 23:07 67 fs1

5 rows × 22 columns


In [38]:
# Peek at the last five games (confirms the file was read to the end).
df.tail(n=5)


Out[38]:
away_wins attendance capacity home_id home_opener home_losses away_win_pct away_losses home_win_pct home_score ... game_id line day away_id home_rank home_wins scoring_line time away_score tv_coverage
528 4 6500 6500.0 222 0 1 0.571 3 0.875 76 ... 400825756 -24.5 sunday 2325 0 7 139.0 22:00 47 NaN
529 8 14593 14593.0 258 0 1 0.889 1 0.889 86 ... 400830367 -5.5 saturday 222 8 8 130.0 17:00 75 espn2
530 5 6500 6500.0 222 0 2 0.556 4 0.800 78 ... 400833111 -25.0 wednesday 48 0 8 137.0 0:00 48 fs1
531 5 6500 6500.0 222 0 2 0.455 6 0.818 77 ... 400840332 -26.5 tuesday 219 0 9 132.0 0:00 57 fs1
532 22 10472 10200.0 218 0 8 0.880 3 0.667 67 ... 400825573 10.0 thursday 222 0 16 133.0 0:00 83 espn2

5 rows × 22 columns


In [39]:
# Summary statistics for the numeric columns.
# In the recorded output below, `capacity` has 503 non-null values and
# `scoring_line` only 157 (out of 533 rows), so anything derived from those
# columns contains NaNs; max attendance (28135) also exceeds max capacity (21678).
df.describe()


Out[39]:
away_wins attendance capacity home_id home_opener home_losses away_win_pct away_losses home_win_pct home_score away_rank game_id line away_id home_rank home_wins scoring_line away_score
count 533.000000 533.000000 503.000000 533.000000 533.000000 533.000000 533.000000 533.000000 533.000000 533.000000 533.000000 5.330000e+02 533.000000 533.000000 533.000000 533.000000 157.000000 533.000000
mean 8.909944 9743.247655 13997.536779 1241.365854 0.043152 4.615385 0.601780 5.052533 0.679454 74.395872 4.497186 4.006500e+08 -6.312383 1308.727955 0.476548 9.294559 144.987261 67.060038
std 6.366276 4555.379015 5097.732443 1153.232948 0.203390 4.440533 0.261039 4.253377 0.237606 11.897390 7.295746 1.411536e+05 9.284986 1157.007260 2.600230 6.031053 6.858447 11.839471
min 0.000000 754.000000 2500.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 35.000000 0.000000 4.004969e+08 -35.000000 2.000000 0.000000 0.000000 129.000000 37.000000
25% 3.000000 6500.000000 9100.000000 222.000000 0.000000 1.000000 0.455000 2.000000 0.567000 67.000000 0.000000 4.005046e+08 -12.500000 222.000000 0.000000 5.000000 140.000000 58.000000
50% 8.000000 8600.000000 15500.000000 305.000000 0.000000 3.000000 0.643000 4.000000 0.688000 74.000000 0.000000 4.005884e+08 -5.500000 333.000000 0.000000 9.000000 145.000000 66.000000
75% 13.000000 12736.000000 18711.000000 2550.000000 0.000000 7.000000 0.778000 7.000000 0.857000 83.000000 7.000000 4.008278e+08 0.000000 2550.000000 0.000000 13.000000 150.000000 75.000000
max 27.000000 28135.000000 21678.000000 2752.000000 1.000000 22.000000 1.000000 22.000000 1.000000 114.000000 25.000000 4.008723e+08 20.000000 2916.000000 24.000000 29.000000 166.000000 99.000000

In [40]:
# Column dtypes. In the recorded output, `date`, `day`, `time` and `tv_coverage`
# are object columns; every other column is int64 or float64.
df.dtypes


Out[40]:
away_wins         int64
attendance        int64
capacity        float64
home_id           int64
home_opener       int64
home_losses       int64
away_win_pct    float64
away_losses       int64
home_win_pct    float64
home_score        int64
away_rank         int64
date             object
game_id           int64
line            float64
day              object
away_id           int64
home_rank         int64
home_wins         int64
scoring_line    float64
time             object
away_score        int64
tv_coverage      object
dtype: object

In [41]:
# Full list of column names (head()/tail() above elide the middle columns).
df.columns


Out[41]:
Index([u'away_wins', u'attendance', u'capacity', u'home_id', u'home_opener',
       u'home_losses', u'away_win_pct', u'away_losses', u'home_win_pct',
       u'home_score', u'away_rank', u'date', u'game_id', u'line', u'day',
       u'away_id', u'home_rank', u'home_wins', u'scoring_line', u'time',
       u'away_score', u'tv_coverage'],
      dtype='object')

In [42]:
# Raw NumPy arrays for every game in the file (conference and non-conference).
cap = df["capacity"].values
attend = df["attendance"].values

In [43]:
# Attendance against arena capacity for every game in the 2014-16 data set.
# Points stack into vertical lines because each team's home games share one
# capacity value.
plt.plot(
    cap,
    attend,
    "bo",
)
plt.ylabel("Game Attendance")
plt.xlabel("Stadium Capacity")
plt.show()



In [44]:
# Keep only conference games, i.e. games where BOTH teams are Big East members.
# Team ids, matching the per-team frames built elsewhere in this notebook:
# 305 DePaul, 2086 Butler, 222 Villanova, 46 Georgetown, 2507 Providence,
# 269 Marquette, 2550 Seton Hall, 2752 Xavier, 2599 St. John's, 156 Creighton.
big_east_teams = [305, 2086, 222, 46, 2507, 269, 2550, 2752, 2599, 156]
is_home_member = df["home_id"].isin(big_east_teams)
is_away_member = df["away_id"].isin(big_east_teams)
df_conf = df[is_home_member & is_away_member]

In [45]:
# One frame of conference home games per Big East team, selected by home_id.
def _home_games(team_id):
    """Return the rows of df_conf whose home_id equals team_id."""
    return df_conf[df_conf["home_id"] == team_id]

df_depaul = _home_games(305)
df_butler = _home_games(2086)
df_villanova = _home_games(222)
df_georgetown = _home_games(46)
df_providence = _home_games(2507)
df_marquette = _home_games(269)
df_seton_hall = _home_games(2550)
df_xavier = _home_games(2752)
df_st_johns = _home_games(2599)
df_creighton = _home_games(156)

In [46]:
# Column arrays for the conference-only games.
# Note: this rebinds `attend` and `cap`, which earlier held the all-games arrays.
game = df_conf["game_id"].values
line = df_conf["line"].values
attend = df_conf["attendance"].values
cap = df_conf["capacity"].values
# Fraction of the arena filled per game (1.0 == exactly at listed capacity).
pct_full = attend / cap

In [47]:
# Fraction of capacity filled vs. betting line, one marker series per home team.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_depaul, '#00558c', 'DePaul'),
    (df_butler, '#13294b', 'Butler'),
    (df_villanova, '#001F5B', 'Villanova'),
    (df_georgetown, '#011e41', 'Georgetown'),
    (df_providence, 'k', 'Providence'),
    (df_marquette, '#FFCC00', 'Marquette'),
    (df_seton_hall, '#004488', 'Seton_Hall'),
    (df_xavier, '#002857', 'Xavier'),
    (df_st_johns, '#CF102D', 'St Johns'),
    (df_creighton, '#3c4982', 'Creighton'),
]
for team_df, team_color, team_label in team_series:
    # attendance / capacity is a ratio (1.0 == full), plotted on the y axis.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    plt.plot(team_df["line"].values, team_pct_full, "o", color=team_color, label=team_label)
plt.xlabel("Betting Line")
plt.ylabel("Percent Full")
# The label= values were never rendered before; draw the legend beside the
# axes so the ten teams can be told apart without covering any points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [48]:
# Fraction of capacity filled vs. the visiting team's win percentage,
# one marker series per home team.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_depaul, '#00558c', 'DePaul'),
    (df_butler, '#13294b', 'Butler'),
    (df_villanova, '#001F5B', 'Villanova'),
    (df_georgetown, '#011e41', 'Georgetown'),
    (df_providence, 'k', 'Providence'),
    (df_marquette, '#FFCC00', 'Marquette'),
    (df_seton_hall, '#004488', 'Seton_Hall'),
    (df_xavier, '#002857', 'Xavier'),
    (df_st_johns, '#CF102D', 'St Johns'),
    (df_creighton, '#3c4982', 'Creighton'),
]
for team_df, team_color, team_label in team_series:
    # attendance / capacity is a ratio (1.0 == full), plotted on the y axis.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    plt.plot(team_df["away_win_pct"].values, team_pct_full, "o", color=team_color, label=team_label)
plt.xlabel("Away Win Percentage")
plt.ylabel("Percent Full")
# The label= values were never rendered before; draw the legend beside the
# axes so the ten teams can be told apart without covering any points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [49]:
# Fraction of capacity filled vs. the home team's win percentage,
# one marker series per home team.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_depaul, '#00558c', 'DePaul'),
    (df_butler, '#13294b', 'Butler'),
    (df_villanova, '#001F5B', 'Villanova'),
    (df_georgetown, '#011e41', 'Georgetown'),
    (df_providence, 'k', 'Providence'),
    (df_marquette, '#FFCC00', 'Marquette'),
    (df_seton_hall, '#004488', 'Seton_Hall'),
    (df_xavier, '#002857', 'Xavier'),
    (df_st_johns, '#CF102D', 'St Johns'),
    (df_creighton, '#3c4982', 'Creighton'),
]
for team_df, team_color, team_label in team_series:
    # attendance / capacity is a ratio (1.0 == full), plotted on the y axis.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    plt.plot(team_df["home_win_pct"].values, team_pct_full, "o", color=team_color, label=team_label)
plt.xlabel("Home Win Percentage")
plt.ylabel("Percent Full")
# The label= values were never rendered before; draw the legend beside the
# axes so the ten teams can be told apart without covering any points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [50]:
# Fraction of capacity filled vs. the mean of the two teams' win percentages,
# one marker series per home team.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_depaul, '#00558c', 'DePaul'),
    (df_butler, '#13294b', 'Butler'),
    (df_villanova, '#001F5B', 'Villanova'),
    (df_georgetown, '#011e41', 'Georgetown'),
    (df_providence, 'k', 'Providence'),
    (df_marquette, '#FFCC00', 'Marquette'),
    (df_seton_hall, '#004488', 'Seton_Hall'),
    (df_xavier, '#002857', 'Xavier'),
    (df_st_johns, '#CF102D', 'St Johns'),
    (df_creighton, '#3c4982', 'Creighton'),
]
for team_df, team_color, team_label in team_series:
    # x: simple average of the home and away win percentages for each game.
    avg_win_pct = (team_df.home_win_pct.values + team_df.away_win_pct.values) / 2
    # attendance / capacity is a ratio (1.0 == full), plotted on the y axis.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    plt.plot(avg_win_pct, team_pct_full, "o", color=team_color, label=team_label)
plt.xlabel("Average Win Percentage")
plt.ylabel("Percent Full")
# The label= values were never rendered before; draw the legend beside the
# axes so the ten teams can be told apart without covering any points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [51]:
# Fraction of capacity filled vs. the visiting team's rank,
# one marker series per home team.
# NOTE(review): away_rank is 0 for most rows in the recorded describe() output;
# presumably 0 means "unranked" -- confirm before reading the x axis as ordinal.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_depaul, '#00558c', 'DePaul'),
    (df_butler, '#13294b', 'Butler'),
    (df_villanova, '#001F5B', 'Villanova'),
    (df_georgetown, '#011e41', 'Georgetown'),
    (df_providence, 'k', 'Providence'),
    (df_marquette, '#FFCC00', 'Marquette'),
    (df_seton_hall, '#004488', 'Seton_Hall'),
    (df_xavier, '#002857', 'Xavier'),
    (df_st_johns, '#CF102D', 'St Johns'),
    (df_creighton, '#3c4982', 'Creighton'),
]
for team_df, team_color, team_label in team_series:
    # attendance / capacity is a ratio (1.0 == full), plotted on the y axis.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    plt.plot(team_df["away_rank"].values, team_pct_full, "o", color=team_color, label=team_label)
plt.xlabel("Away Rank")
plt.ylabel("Percent Full")
# The label= values were never rendered before; draw the legend beside the
# axes so the ten teams can be told apart without covering any points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [52]:
# Fraction of capacity filled vs. the home team's rank,
# one marker series per home team.
# NOTE(review): home_rank is 0 for most rows in the recorded describe() output;
# presumably 0 means "unranked" -- confirm before reading the x axis as ordinal.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_depaul, '#00558c', 'DePaul'),
    (df_butler, '#13294b', 'Butler'),
    (df_villanova, '#001F5B', 'Villanova'),
    (df_georgetown, '#011e41', 'Georgetown'),
    (df_providence, 'k', 'Providence'),
    (df_marquette, '#FFCC00', 'Marquette'),
    (df_seton_hall, '#004488', 'Seton_Hall'),
    (df_xavier, '#002857', 'Xavier'),
    (df_st_johns, '#CF102D', 'St Johns'),
    (df_creighton, '#3c4982', 'Creighton'),
]
for team_df, team_color, team_label in team_series:
    # attendance / capacity is a ratio (1.0 == full), plotted on the y axis.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    plt.plot(team_df["home_rank"].values, team_pct_full, "o", color=team_color, label=team_label)
plt.xlabel("Home Rank")
plt.ylabel("Percent Full")
# The label= values were never rendered before; draw the legend beside the
# axes so the ten teams can be told apart without covering any points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [53]:
# Fraction of capacity filled vs. the scoring line,
# one marker series per home team.
# NOTE(review): the recorded describe() output shows scoring_line populated for
# only 157 of 533 rows, so many games will not appear in this plot.
# Each entry is (home-game frame, marker colour, legend label).
team_series = [
    (df_depaul, '#00558c', 'DePaul'),
    (df_butler, '#13294b', 'Butler'),
    (df_villanova, '#001F5B', 'Villanova'),
    (df_georgetown, '#011e41', 'Georgetown'),
    (df_providence, 'k', 'Providence'),
    (df_marquette, '#FFCC00', 'Marquette'),
    (df_seton_hall, '#004488', 'Seton_Hall'),
    (df_xavier, '#002857', 'Xavier'),
    (df_st_johns, '#CF102D', 'St Johns'),
    (df_creighton, '#3c4982', 'Creighton'),
]
for team_df, team_color, team_label in team_series:
    # attendance / capacity is a ratio (1.0 == full), plotted on the y axis.
    team_pct_full = team_df.attendance.values / team_df.capacity.values
    plt.plot(team_df["scoring_line"].values, team_pct_full, "o", color=team_color, label=team_label)
plt.xlabel("Scoring Line")
plt.ylabel("Percent Full")
# The label= values were never rendered before; draw the legend beside the
# axes so the ten teams can be told apart without covering any points.
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()



In [ ]: