See Terry's blog LFC: The Elite Strikers for a discussion of of the data analysis.
This notebook analyses Liverpool FC's elite strikers data from 1962-1963 to 2014-2015. An elite striker is defined as a player who scored 20 or more league goals in a season. I've started the analysis at 1962 as this was when Bill Shankly returned Liverpool to the top flight of English football. The analysis uses IPython Notebook, python, pandas and matplotlib to explore the data.
In [1]:
%%html
<! left align the change log table in next cell >
<style>
table {float:left}
</style>
Date | Change Description |
---|---|
23rd July 2015 | Initial baseline |
Import the modules needed for the analysis.
In [2]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import sys
import collections
from datetime import datetime
from __future__ import division
# enable inline plotting
%matplotlib inline
Print version numbers.
In [3]:
print 'python version: {}'.format(sys.version)
print 'pandas version: {}'.format(pd.__version__)
print 'matplotlib version: {}'.format(mpl.__version__)
print 'numpy version: {}'.format(np.__version__)
In [4]:
LFC_SCORERS_CSV_FILE = 'data\lfc_scorers_1892-1893_2014-2015.csv'
dflfc_scorers = pd.read_csv(LFC_SCORERS_CSV_FILE)
# sort by season, then league goals
dflfc_scorers = dflfc_scorers.sort(['season', 'league'], ascending=([False, False]))
dflfc_scorers.shape
Out[4]:
In [5]:
dflfc_scorers.head()
Out[5]:
In [6]:
dflfc_scorers.tail()
Out[6]:
In [7]:
# note that scorers includes own goals
dflfc_scorers[dflfc_scorers.player == 'Own goals'].head()
Out[7]:
In [8]:
# note: war years already excluded
LANCS_YRS = ['1892-1893']
SECOND_DIV_YRS = ['1893-1894', '1895-1896', '1904-1905', '1961-1962',
'1954-1955', '1955-1956', '1956-1957', '1957-1958',
'1958-1959', '1959-1960', '1960-1961']
NOT_TOP_LEVEL_YRS = LANCS_YRS + SECOND_DIV_YRS
dflfc_scorers_tl = dflfc_scorers[~dflfc_scorers.season.isin(NOT_TOP_LEVEL_YRS)].copy()
dflfc_scorers_tl.shape
Out[8]:
In [9]:
## check number of seasons aligns with http://www.lfchistory.net/Stats/LeagueOverall.aspx
## expect 100 total
print 'num seasons is {}'.format(len(dflfc_scorers_tl.season.unique()))
In [10]:
# show most league goals in a season in top level
# cross-check with http://en.wikipedia.org/wiki/List_of_Liverpool_F.C._records_and_statistics#Goalscorers
# expect 101 in 2013-14
dflfc_scorers_tl[['season', 'league']].groupby(['season']).sum().sort('league', ascending=False).head(1)
Out[10]:
In [11]:
# remove OG
dflfc_scorers_tl = dflfc_scorers_tl[dflfc_scorers_tl.player != 'Own goals']
dflfc_scorers_tl.shape
Out[11]:
In [12]:
# check 2013-14
dflfc_scorers_tl[dflfc_scorers_tl.season == '2013-2014'].head(10)
Out[12]:
In [13]:
LFC_SQUADS_CSV_FILE = 'data/lfc_squads_1892-1893_2014-2015.csv'
dflfc_squads = pd.read_csv(LFC_SQUADS_CSV_FILE)
dflfc_squads.shape
Out[13]:
In [14]:
dflfc_squads.head()
Out[14]:
In [15]:
dflfc_squads.tail()
Out[15]:
In [16]:
LFC_LEAGUE_CSV_FILE = 'data/lfc_league_1892-1893_2014-2015.csv'
dflfc_league = pd.read_csv(LFC_LEAGUE_CSV_FILE)
dflfc_league.shape
Out[16]:
In [17]:
dflfc_league.head()
Out[17]:
In [18]:
dflfc_league.tail()
Out[18]:
In [19]:
dflfc_scorers_tl_pos = pd.DataFrame.merge(dflfc_scorers_tl, dflfc_squads)
dflfc_scorers_tl_pos.shape
Out[19]:
In [20]:
dflfc_scorers_tl_pos.head()
Out[20]:
In [21]:
dflfc_scorers_tl_pos.tail()
Out[21]:
In [22]:
LFC_PLAYERS_CSV_FILE = 'data\lfc_players_july2015_upd.csv'
dflfc_players = pd.read_csv(LFC_PLAYERS_CSV_FILE, parse_dates=['birthdate'])
dflfc_players.shape
Out[22]:
In [23]:
dflfc_players.head()
Out[23]:
In [24]:
dflfc_players.tail()
Out[24]:
In [25]:
def age_at_season(row):
"""Return player's age at mid-point of season, assumed to be 1st Jan.
row.player -> player's name
row.season -> season
uses dflfc_players to look-up birthdate, keyed on player
- return average age if player is missing from dflfc_players
"""
AVERAGE_AGE = 26.5
mid_point = '01 January {}'.format(row.season[-4:])
try:
dob = dflfc_players[dflfc_players.player == row.player].birthdate.values[0]
except:
# use average age if player's birthdate not available
print 'error: age not found for player {} in season {}, using average age {}'.format(row.player,
row.season,
AVERAGE_AGE)
return AVERAGE_AGE
return round((pd.Timestamp(mid_point) - dob).days/365.0, 1)
In [26]:
# add age column
dflfc_scorers_tl_pos['age'] = dflfc_scorers_tl_pos.apply(lambda row: age_at_season(row), axis=1)
In [27]:
dflfc_scorers_tl_pos_age = dflfc_scorers_tl_pos.copy()
In [28]:
dflfc_scorers_tl_pos_age.head()
Out[28]:
In [29]:
# check number of seasons in top flight between 1960 and 2014 (expect 53)
tf = dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1960') &
(dflfc_scorers_tl_pos_age.season.str[0:4] <= '2014')].season.unique()
tf[-1], tf[0], len(tf)
Out[29]:
In [30]:
# show modern scorers who scored >20, with age - aka elite strikers
dflfc_elite = dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1962') &
(dflfc_scorers_tl_pos_age.league >= 20)].sort('league', ascending=False)
dflfc_elite.reset_index(drop=True, inplace=True)
dflfc_elite
Out[30]:
In [31]:
# show near misses - modern scorers who scored >=18 and <=19, with age
dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1962') &
(dflfc_scorers_tl_pos_age.league >= 18) &
(dflfc_scorers_tl_pos_age.league <= 19)].sort('league', ascending=False)
Out[31]:
In [32]:
# show number of games played in seasons when strikers scored >20
dflfc_league[['Season', 'League', 'Pos', 'PLD']][dflfc_league.Season.isin(dflfc_elite.season)]
Out[32]:
In [33]:
# show strikers with total scored >20
df_striker_tot = dflfc_elite[['player', 'league']].groupby('player').sum()
df_striker_tot.rename(columns={'league': 'Tot'}, inplace=True)
df_striker_tot.index.name = 'Player'
df_striker_tot.sort('Tot', ascending=False).head()
Out[33]:
In [34]:
# show number of elite strikers who scored >20
len(dflfc_elite)
Out[34]:
In [35]:
# show number of unique strikers who scored >20
top20_lu = dflfc_elite.sort('league', ascending=False).player.unique()
print len(top20_lu)
print top20_lu
In [36]:
# show number of unique scorers who scored >20 - pretty print
player_d = collections.defaultdict(list)
top20_l = dflfc_elite.sort('league', ascending=False).values
for season, player, goals, position, age, in top20_l:
# season, player, goals, age
player_d[player].append('{} ({})'.format(season, goals))
for k,v in player_d.iteritems():
print '{}: {}'.format(k, ', '.join(v))
In [37]:
# construct table (dataframe) to summarise the data
# col 0 of dataframe is the list of season (goals) converted to a comma separated string
df = pd.DataFrame.from_dict({k: ', '.join(v) for k,v in player_d.iteritems()}, orient='index')
# col 1 is number of times player has scored 20+ (derived from number of brackets)
df[1] = df[0].apply(lambda x: x.count('('))
# change column order and rename
df = df[[1, 0]]
df.rename(columns={1: 'Times 20+ Achieved',
0: 'Seasons in which player scored 20+ goals (with number of league goals scored)'}, inplace=True)
df.index.name = 'Player'
# add total (sum of goals in brackets)
df = df.merge(df_striker_tot, left_index=True, right_index=True)
# set pandas option that avoids the default curtailing of long rows with ...
pd.set_option('display.max_colwidth', -1)
# show dataframe sorted by number of times achieved
df = df.sort(['Times 20+ Achieved', 'Tot'], ascending=(False, False))
df[['Times 20+ Achieved', 'Seasons in which player scored 20+ goals (with number of league goals scored)']]
Out[37]:
In [38]:
# show youngest elite scorers who scored >20
dflfc_elite.sort('league', ascending=False).sort('age').head(5)
Out[38]:
In [39]:
# check to see if anyone has matched Robbie's feat...
# show youngest elite scorers who scored >20 for ALL seasons
dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.league >= 20)].sort('league', ascending=False).sort('age').head(5)
Out[39]:
No, only God has done that!
In [40]:
# show oldest elite scorers who scored >20
dflfc_elite.sort('league', ascending=False).sort('age').tail(5)
Out[40]:
In [41]:
# show most appearances of elite strikers who scored >20
dflfc_elite.sort('league', ascending=False).player.value_counts()
Out[41]:
In [42]:
# show average (mean) age of elite strikers who scored >20
dflfc_elite.sort('league', ascending=False).mean()
Out[42]:
In [43]:
# show average (median) age of elite strikers who scored >20
dflfc_elite.sort('league', ascending=False).median()
Out[43]:
In [44]:
# show most appearances of elite strikers who scored >20 for ALL seasons
dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.league >= 20)].sort('league', ascending=False).player.value_counts()
Out[44]:
In [45]:
# show most appearances of elite strikers who scored >25 for ALL seasons
dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.league >= 25)].sort('league', ascending=False).player.value_counts()
Out[45]:
In [46]:
# show frequency of elite scorers who scored >20
top20_freq = dflfc_elite['player'].groupby(dflfc_elite.season).count()
top20_freq
Out[46]:
In [47]:
# produce list of all modern seasons (since 1962)
START_MOD_YR = 1962
END_MOD_YR = 2015
all_mod_yrs = list('{}-{}'.format(i, i+1) for i in range(START_MOD_YR, END_MOD_YR))
print len(all_mod_yrs) # expect 53
all_mod_yrs[0:5]
Out[47]:
In [48]:
# identify missing years and update frequency so it shows all years
missing_mod_years = [year for year in all_mod_yrs if year not in top20_freq.index.values]
for yr in missing_mod_years:
top20_freq[yr] = 0
top20_freq = top20_freq.sort_index()
top20_freq[0:5]
Out[48]:
In [49]:
# create list of seasons when LFC were champions
CHAMPS = ['1900-1901', '1905-1906', '1921-1922', '1922-1923', '1946-1947', '1963-1964',\
'1965-1966', '1972-1973', '1975-1976', '1976-1977', '1978-1979', '1979-1980',\
'1981-1982', '1982-1983', '1983-1984', '1985-1986', '1987-1988', '1989-1990']
CHAMPS[0:5]
Out[49]:
Plot the number of players scoring 20 or more goals by season
In [50]:
ax = top20_freq.plot(kind='bar', figsize=(14, 5), color='y', yticks=[0, 1, 2], ylim=(0,2.5), grid=False)
ax.set_xlabel('Season')
ax.set_ylabel('Number of players')
ax.set_title('Number of players scoring 20 or more goals by Season')
ax.text(-0.2, 2.36, 'prepared by: @terry8dolan', bbox=dict(facecolor='none', edgecolor='none', alpha=0.6))
# add vertical line for title winning seasons since 1962
champs_since_62 = [list(top20_freq.index.values).index(yr) for yr in CHAMPS if yr[0:4] >= '1962']
for x in champs_since_62:
plt.axvline(x, color='r', linestyle='--')
# create fake legend
l1 = plt.Line2D([], [], linewidth=1, color='r', linestyle='--')
l2 = plt.Line2D([], [], linewidth=5, color='y')
labels = ['LFC Title Winning Seasons', 'Players scoring 20+ goals in the league']
ax.legend([l1, l2], labels, fancybox=True, shadow=True, framealpha=0.8)
# plot and save current figure
fig = plt.gcf()
plt.show()
fig.savefig('20+PlayervsGoals.png', bbox_inches='tight')
Show percentage of title winning seasons that have had striker with >20 goals
In [51]:
# create a dataframe showing position in league in top flight season since 1962
dflfc_league_pos = dflfc_league[['Season', 'Pos']][dflfc_league.Season.str[0:4] >= '1962']
dflfc_league_pos.rename(columns={'Season': 'season', 'Pos': 'pos'}, inplace=True)
dflfc_league_pos.head()
Out[51]:
In [52]:
# create dataframe showing position in league when players have scored >20
top20_seasons = dflfc_elite.season.unique()
dflfc_elite_pos = dflfc_league_pos[dflfc_league_pos.season.isin(top20_seasons)]
dflfc_elite_pos.head()
Out[52]:
In [53]:
# show position in league when players have scored >20
val_cnts = dflfc_league_pos[['season', 'pos']][dflfc_league_pos.season.isin(top20_seasons)]['pos'].value_counts()
val_cnts
Out[53]:
In [54]:
# show % of title winning seasons that have had striker with >20 goals
tot_champs_since_62 = len(dflfc_league_pos[(dflfc_league_pos.pos == 1)])
tot_elite_champs = len(dflfc_elite_pos[(dflfc_elite_pos.pos == 1)])
print '{} of {} ({}%) of title winning seasons have had striker with >20 goals'\
.format(tot_elite_champs,
tot_champs_since_62,
round(100*tot_elite_champs/tot_champs_since_62, 1))
In [55]:
# show the 4 title winning seasons since 1962 that didn't have an elite scorer
dflfc_scorers_tl_pos_age[(dflfc_scorers_tl_pos_age.season.str[0:4] >= '1962') &
(dflfc_scorers_tl_pos_age.season.isin(CHAMPS))].sort(['season', 'league'], ascending=[True, False])\
.groupby(dflfc_scorers_tl_pos_age.season).head(1).sort('league').head(4)
Out[55]:
In [56]:
# show top goalscorers in modern seasons when Liverpool won the league without a striker scoring >20
s = ['1976-1977', '1972-1973', '1975-1976', '1981-1982']
dflfc_scorers_tl_pos_age[dflfc_scorers_tl_pos_age.season.isin(s)].groupby('season').head(3)
Out[56]:
Show distribution of strikers with more than 20 goals by age
In [57]:
# create 'cut' of value counts
pd.value_counts(pd.cut(dflfc_elite.age.values, range(19,32)))
Out[57]:
In [58]:
# create dataframe of value counts of age cut into 1 year bins from 19 to 31
MIN_AGE = int(np.floor(dflfc_elite.age.min()))
MAX_AGE = int(np.ceil(dflfc_elite.age.max()))
df = pd.DataFrame(pd.value_counts(pd.cut(dflfc_elite.age.values,
range(MIN_AGE, MAX_AGE+1),
labels=['{} to {}'.format(i, i+1) for i in range(MIN_AGE, MAX_AGE)])))
df.sort_index(inplace=True)
df
Out[58]:
In [59]:
# plot
ax = df.plot(kind='bar', figsize=(9, 6), grid=False, ylim=(0, 4.5), yticks=range(1,5), legend=False, color='r')
ax.set_xlabel('Age')
ax.set_ylabel('Number of players')
ax.set_title('Number of players scoring 20 or more goals by Age')
ax.text(-0.4, 4.3, 'prepared by: @terry8dolan', bbox=dict(facecolor='none', edgecolor='none', alpha=0.6))
# plot and save current figure
fig = plt.gcf()
plt.show()
fig.savefig('20+PlayervsAge.png', bbox_inches='tight')
In [ ]: