In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import os
import seaborn as sns
import charts_function_list
_,data,outputs = charts_function_list.folder_setup()
In [6]:
empty = []
def extract_player_data(table_rows):
"""
Extract and return the the desired information from the td elements within
the table rows.
"""
# create the empty list to store the player data
player_data = []
for row in table_rows: # for each row do the following
player_list = [th.get_text() for th in row.find_all("th")]+[td.get_text() for td in row.find_all("td")]
if not player_list:
continue
player_data.append(player_list)
return player_data
column_headers = [th.getText() for th in
soup.findAll('tr', limit=2)[0].findAll('th')]
for i,v in enumerate(['WAS','MIL','TOR','ORL','MIA',
'CLE','NJN','DET','CHA','IND',
'BOS','ATL','PHI','NYK','CHI',
'LAC','SAS','MEM','HOU',
'MIN','POR','UTA','GSW','OKC',
'NOH','LAL','DEN','PHO','SAC','DAL']):
url = "https://www.basketball-reference.com/teams/"+v
html = urlopen(url) #get_request
soup = BeautifulSoup(html,"lxml") #load into beuatiful soup
table_rows = soup.select('#'+v+' tr')[2:] #selecting table row elements from within Team css selector
data = extract_player_data(table_rows)
empty.append(pd.DataFrame(data, columns=column_headers,index=[i for x in range(len(data))]))
all_nba = pd.concat(empty)
all_nba['Team']=all_nba['Team'].str.strip('*')
all_nba['W/L%'] = all_nba['W/L%'].astype('float')
os.chdir('/Users/alexanderpudlin/Documents/GitHub/charts_and_more_charts/outputs')
all_nba.to_csv('ALL_NBA_team_.csv')
In [2]:
os.chdir(outputs)
all_nba = pd.read_csv('ALL_NBA_team.csv',index_col=0)
all_nba = all_nba[all_nba['Lg']=='NBA']
team_dictionary = dict(all_nba[~all_nba.index.duplicated(keep='first')]['Team'])
all_nba['Current Team'] = pd.Series(all_nba.index).map(team_dictionary).values
med_win_percent_group = all_nba.groupby('Current Team').median().sort_values(by='W/L%',ascending=False)
med_win_percent_group['Team Order']=[i+1 for i,v in enumerate(med_win_percent_group.index)]
all_nba['Median Win Percent Index']=all_nba['Current Team'].map(dict(med_win_percent_group['Team Order']))
#sort values by win index for plotting
all_nba = all_nba.sort_values(by=['Median Win Percent Index','Season'])
In [3]:
#New dataframe
winners = all_nba[all_nba['Playoffs']=='Won Finals']
plt.rcParams["font.family"]='Futura Bk BT'
chart_setup = charts_function_list.chart_maker(title='Distribution of Current NBA Teams\' Regular Season Winning Percentages\n', title_size=30)
#FIGURE
fig = chart_setup.initial_fig_axis(figsize=(18,18))
fig.subplots_adjust(top=.92)
fig.text(.5,.94,'From each team\'s founding thru Oct 2017',
fontsize=25,alpha=.9,ha='center')
#AXES
ax = chart_setup.axes_set_up(fig=fig)
#PLOTS
sns.violinplot(x='W/L%',y='Current Team',data=all_nba,
palette='RdBu',linewidth=0,
scale='count',saturation=.7)
plt.scatter(y=winners['Median Win Percent Index']-1,
x=winners['W/L%'],color='white',edgecolor='black',
marker='o',linewidth=.7,alpha=.8,label='NBA Championship')
ax.vlines(.5,ymin=0,ymax=30,color='black',alpha=.4)
#LABELS
ax.set_xlabel('Winning Percentage',fontsize=26,alpha=.8)
ax.set_ylabel('',fontsize=26,alpha=.8)
chart_setup.x_axis_setup(ax,0,1,interval=.1)
chart_setup.tick_params_(ax=ax,labelcolor='#8c2d04',pad=20,fontsize=18)
ax_text_y = ax.set_yticklabels(all_nba.sort_values(by='Median Win Percent Index')['Current Team'].unique(),rotation=0,fontsize=24)
ax_text_x = ax.set_xticklabels([str(int(x*100))+'%' for x in ax.get_xticks()],fontsize=22)
#PATCH
patch = ax.add_patch(patches.Rectangle(xy=(0,0), width=40,height=1,facecolor='black',alpha=.01,zorder=1))
#LEGEND
leg = ax.legend(bbox_to_anchor=(.95, -.15),loc=8,
frameon=True,facecolor='None',
edgecolor='black',fontsize=26,markerscale=2)
leg.get_frame().set_linewidth(0)
#CITATIONS
chart_setup.citations(ax=ax,source='Source: basketball-reference.com',x=-.2,source_y=-.14,chart_tag_y=-.16,alpha=.7)
#SAVE
os.chdir(outputs)
plt.savefig('NBA distribution.png',bbox_inches = 'tight', dpi = 160, pad_inches = .5,transparent=False)
In [216]:
#in process had noticed some didn't matched. found those and updated
empty_dict = {}
for i,v in enumerate(['WAS','MIL','TOR','ORL','MIA',
'CLE','BRK','DET','CHO','IND',
'BOS','ATL','PHI','NYK','CHI',
'LAC','SAS','MEM','HOU',
'MIN','POR','UTA','GSW','OKC',
'NOP','LAL','DEN','PHO','SAC','DAL']):
empty_dict[i] = v
def match_lookup():
non_match_keys = [x for x in set(empty_dict.keys()) if x not in all_nba.index.value_counts().sort_index().index]
return non_match_keys
#if len(non_match_keys)>0:
# return [empty_dict[x] for x in [x for x in non_match_keys]]
#else:
# return 'All keys match'
#more research shows URL forwarding
#NOP's ID is NOH
#BRK's ID is NJN
#CHO's ID is CHA
In [379]:
#non_match_keys = [x for x in set(empty_dict.keys()) if x not in all_nba.index.value_counts().sort_index().index]
#non_match_values = [empty_dict[x] for x in [x for x in non_match_keys]]
#non_match_values
#juxtaposition fun .. maybe resume something with this
for item in [['Minnesota Timberwolves','green',.9],
['San Antonio Spurs','black',.6],
['Los Angeles Clippers','red',.4]]:
plt.hist(all_nba[all_nba['Current Team']==item[0]]['W/L%'],normed=True,color=item[1],alpha=item[2])
plt.vlines(all_nba[all_nba['Current Team']==item[0]]['W/L%'].median(),ymin=0,ymax=6)
#plt.hist(all_nba[all_nba['Current Team']=='Los Angeles Clippers']['W/L%'],color='green',alpha=.6,normed=True)
#plt.hist(all_nba[all_nba['Current Team']=='San Antonio Spurs']['W/L%'],color='black',alpha=.6,normed=True)
#plt.vlines(all_nba[all_nba['Current Team']=='Dallas Mavericks']['W/L%'].median(),
# ymin=0,ymax=1)
In [ ]: