In [47]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromLeague.isin(leagueList))&(transfers.duration>0)]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
ss=s.groupby(['season','marketTier'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[47]:
In [74]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromLeague.isin(leagueList))&(transfers.duration>0)]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
ss=s.groupby(['season','marketTier'])['durationWithLoans']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[74]:
In [65]:
t=nonloans
leagueList
s=t[(t.season>1994)&(t.fromCountry.isin(uefa))&(t.duration>0)&(t.fromLeague.isin(leagueList))&(~t.fromLeague.isin(['GB1','IT1','FR1','L1','ES1']))]
s.groupby('season').mean()['duration'].plot.area()
Out[65]:
In [116]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
ss=s.groupby(['season','marketTier'])['id']
(ss.count()).unstack().plot.line(figsize=(10,10))
Out[116]:
In [75]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)]
s=s.merge(countryInfo[['federation']],right_index=True,left_on='Nationality',how='left')
ss=s[s.federation.isin(['UEFA','CAF','CONMEBOL'])].groupby(['season','federation'])['durationWithLoans']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[75]:
In [114]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&(transfers.fromCountry!=transfers.Nationality)]
s=s.merge(countryInfo[['federation']],right_index=True,left_on='Nationality',how='left')
ss=s[s.federation.isin(['UEFA','CAF','CONMEBOL'])].groupby(['season','federation'])['id']
(ss.count()).unstack().plot.line(figsize=(10,10))
Out[114]:
In [77]:
transfers['fromTeamClass']=pd.Series(np.where(transfers.fromTeamId.isin(top100clubids),'top 100','the rest'))
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromLeague.isin(leagueList))&(transfers.duration>0)]
ss=s.groupby(['season','fromTeamClass'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[77]:
In [111]:
transfers['fromTeamClass']=pd.Series(np.where(transfers.fromTeamId.isin(top100clubids),'top 100','the rest'))
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromLeague.isin(leagueList))&(transfers.duration>0)]
ss=s.groupby(['season','fromTeamClass'])['fromTeamId']
(ss.nunique()).unstack().plot.line(figsize=(10,10))
Out[111]:
In [79]:
transfers['fromLeagueLevel']=transfers.fromLeague.str.slice(start=-1)
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3'])]
ss=s.groupby(['season','fromLeagueLevel'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[79]:
In [4]:
transfers['fromLeagueLevel']=transfers.fromLeague.str.slice(start=-1)
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3'])]
ss=s.groupby(['season','fromLeagueLevel'])['id']
(ss.count()).unstack().plot.line(figsize=(10,10))
Out[4]:
In [81]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3','4'])]
ss=s.groupby(['season','positionGroup'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[81]:
In [6]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)]
s['isForeign']=s.fromCountry!=s.Nationality
ss=s.groupby(['season','isForeign'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[6]:
In [13]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)]
s['isDomesticTransfer']=s.fromCountry==s.toCountry
ss=s.groupby(['season','isDomesticTransfer'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[13]:
In [16]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.fromLeague.isin(leagueList))&(transfers.duration>0)]
s=s.merge(DataFrame({'nationalityTier':countryInfo.marketTier.fillna(countryInfo.federation)}),how='left',left_on='Nationality',right_index=True)
ss=s[s.nationalityTier.isin([1,2,3,'CAF','CONMEBOL'])].groupby(['season','nationalityTier'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[16]:
In [19]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)]
s=s.merge(DataFrame({'fromMarketTier':countryInfo.marketTier.fillna(countryInfo.federation)}),how='left',left_on='fromCountry',right_index=True)
ss=s.groupby(['season','fromMarketTier'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[19]:
In [10]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)]
s['isForeign']=s.fromCountry!=s.Nationality
s2=s[s.fromCountry.isin(['Serbia','Croatia','Bosnia-Herzegovina'])]
#s2=s[s.fromCountry.isin(['Croatia'])]
ss=s2.groupby(['season','isForeign'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[10]:
In [108]:
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3','4'])]
ss=s.groupby(['season','positionGroup'])['id']
(ss.count()).unstack().plot.line(figsize=(10,10))
Out[108]:
In [65]:
transfers['isIntl']=transfers['intlCaps']>0
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3','4'])]
ss=s.groupby(['season','isIntl'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[65]:
In [107]:
transfers['isIntl']=transfers['intlCaps']>0
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3','4'])]
ss=s.groupby(['season','isIntl'])['id']
(ss.count()).unstack().plot.line(figsize=(10,10))
Out[107]:
In [69]:
transfers['hasAgent']=(transfers['playersAgentId']>0)&(transfers['playersAgentId']!=96)
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3','4'])]
ss=s.groupby(['season','hasAgent'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[69]:
In [106]:
transfers['hasAgent']=(transfers['playersAgentId']>0)&(transfers['playersAgentId']!=96)
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)&transfers['fromLeagueLevel'].isin(['1','2','3','4'])]
ss=s.groupby(['season','hasAgent'])['id']
(ss.count()).unstack().plot.line(figsize=(10,10))
Out[106]:
In [112]:
s=nonloans[(nonloans.season>1994)&(nonloans.duration>0)&(nonloans.fromCountry.isin(uefa))&(nonloans.fromCountry!=nonloans.Nationality)&(nonloans.age>23)] #&(nonloans.fromLeague.isin(leagueList))]
ss=s.groupby(['fromCountry','season'])['durationWithLoans'].mean().unstack().sort_values(2015)
#pd.concat([ss[:5],ss[-5:]]).T.plot.line(figsize=(10,10))
def gini(list_of_values):
sorted_list = sorted([x for x in list_of_values if x>0])[:200]
height, area = 0, 0
for value in sorted_list:
height += value
area += height - value / 2.
fair_area = height * len(list_of_values) / 2.
return (fair_area - area) / fair_area
def gini(list_of_values):
return min(list_of_values)
before=ss[range(1996,2001)].mean(axis=1)
after=ss[range(2011,2016)].mean(axis=1)
changepct=((before-after)/before).fillna(0).sort_values()
def describe(t):
return t.T.drop('season').dropna().describe().T
#ss .T.reset_index().groupby('season').apply(describe)[['25%','50%','mean','75%','min']].plot.line()
ss.ix[['Sweden','Bosnia-Herzegovina','Albania']].T.plot.line()
Out[112]:
In [150]:
big5=['GB1','ES1','L1','IT1','FR1']
tier2=['PO1','NL1','SC1','BE1']
north=['DK1','SE1','NO1','A1','C1','LUX1','FI1','IR1','IS1','WAL','NIR','FARO']
east=['LI1','ISR1','PL1','UNG1','MAL1','KAS1','TS1','SLO1','SL1','FI1','AZ1','EST1','LET1','ARM1','LI1','WER1','UKR1','RU1','TR1']
southeast=['GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','ALB1','MNE1','MAL1','MOL1','MAZ1']
def category(league):
if league in big5:
return '1 - big 5'
elif league in tier2:
return '2 - tier 2'
elif league in north:
return '3 - north'
elif league in east:
return '4 - east'
elif league in southeast:
return '5 - southeast'
else:
return 'WTF'
nonloans['category']=nonloans.fromLeague.apply(category)
nonloans[(nonloans.season>1994)&(nonloans['category']!='WTF')].groupby(['season','category'])['id'].count().unstack().plot.area()
Out[150]:
In [149]:
print (1044+758+989)/(952.+722+614)
(2292+1829 )/(1189.+673 )
Out[149]:
In [122]:
s=transfers[(transfers.age>21)&(transfers.season>1994)&(transfers.duration>0)&(transfers.fromCountry.isin(uefa))]
ss=s.groupby(['season','Nationality'])['duration']
i=(ss.describe()).unstack(level=0).swaplevel().ix['count'].sum(axis=1).sort_values()[-30:].index #.T #.plot.line(figsize=(10,10))
s2=ss.describe().unstack(level=0).ix[i].swaplevel().ix['mean'].sort_values(2010)
s2=(pd.concat([s2.iloc[:8],s2.iloc[-8:]])).T
(s2/366).rolling(window=3,center=False).mean().plot.line(figsize=(20,10),colormap='Dark2')
Out[122]:
In [17]:
nonloans.columns
Out[17]:
In [7]:
z=nonloans.merge(countryInfo[['Corruption']],right_index=True,left_on='fromCountry',how='left')
s=nonloans[(nonloans.facebook.notnull())]#|(nonloans.instagram.notnull())|(nonloans.twitter.notnull())|(nonloans.website.notnull())]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
s=s[(s.age>21)&(s.season>2000)&(s.duration>0)&(s.id.isin(z[z.Corruption<50]['id'].drop_duplicates().tolist()))] #&(s.feeValue.notnull())]
s=s.groupby('id').agg({'duration':'mean','fromTeamId': 'count'}).rename(columns={'fromTeamId':'transferCount'})
z=s[s.transferCount>3].sort_values('duration').reset_index().merge(nonloans[['id','Nationality','facebook','instagram','twitter','website']].drop_duplicates(), how='left')
#z[z.Nationality=='Germany']
z #[:300].groupby('Nationality')['id'].count().sort_values().plot.pie()
#.hist(bins=60,figsize=(20,10))
Out[7]:
In [6]:
nonloans[:1]
Out[6]:
In [8]:
nonloans.groupby('fromCountry')
Out[8]:
In [14]:
z=nonloans.merge(countryInfo[['Corruption']],right_index=True,left_on='fromCountry',how='left')
s=z[(z.facebook.notnull())]#|(z.instagram.notnull())|(z.twitter.notnull())|(z.website.notnull())]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
s=s[(s.age>21)&(s.season>2000)&(s.duration>0)&(s.duration<244)&(z.Corruption<50)&(z.fromCountry != 'Italy')] #&(s.feeValue.notnull())]
s=s.groupby('id').agg({'duration':'mean','fromTeamId': 'count'}).rename(columns={'fromTeamId':'transferCount'})
z=s[s.transferCount>0].sort_values('duration').reset_index().merge(nonloans[['id','playerName','Nationality','facebook','instagram','twitter','website']].drop_duplicates(), how='left')
#z[z.Nationality=='Germany']
z #[:300].groupby('Nationality')['id'].count().sort_values().plot.pie()
#.hist(bins=60,figsize=(20,10))
Out[14]:
In [6]:
def avgAge(t):
allIn=t[t.toLeague.isin(leagueList)].groupby('toCountry')['ageFloat'].mean().sort_values()
allOut=t[t.fromLeague.isin(leagueList)].groupby('fromCountry')['ageFloat'].mean().sort_values()
freeIn=t[t.toLeague.isin(leagueList)&(t.feeValue==0)].groupby('toCountry')['ageFloat'].mean().sort_values()
freeOut=t[t.fromLeague.isin(leagueList)&(t.feeValue==0)].groupby('fromCountry')['ageFloat'].mean().sort_values()
paidIn=t[t.toLeague.isin(leagueList)&(t.feeValue>0)].groupby('toCountry')['ageFloat'].mean().sort_values()
paidOut=t[t.fromLeague.isin(leagueList)&(t.feeValue>0)].groupby('fromCountry')['ageFloat'].mean().sort_values()
z=DataFrame({'all imports':allIn,'all exports':allOut,'free imports':freeIn,'free exports':freeOut,'paid imports':paidIn,'paid exports':paidOut}).sort_values('all imports')
z=z[['all imports', 'all exports', 'paid imports', 'paid exports', 'free imports', 'free exports']]
z.columns.name='average age'
return z.reset_index().rename(columns={'index':'country'})
output=nonloans[(nonloans.fromCountry!=nonloans.toCountry)&(nonloans.season>2000)].groupby('period').apply(avgAge).unstack(level=0).swaplevel(axis=1).sort_index(axis=1,level=0)
output.to_excel('../data/analysis/average_age.xlsx')
output
Out[6]:
In [10]:
nonloans[(nonloans.season>1994)&(nonloans.duration>0)].groupby('season')['duration'].describe().unstack()[['min','25%','50%','75%','mean']].plot.line()
Out[10]:
In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import numpy as np
import re
import os.path
import datetime
import math
import matplotlib.pyplot as plt
transfers = pd.read_csv('../data/merged/transfers_q.csv')
loans = pd.read_csv('../data/merged/loans.csv')
nonloans = pd.read_csv('../data/merged/nonloans.csv')
players=pd.read_csv('../data/merged/players.csv')
leagueCodeMapping = {'GB1':'Premier League (England)','FR1':'Ligue 1 (France)',
'ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)',
'PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}
teamIdMap = pd.concat([transfers[['toTeamId','toTeamName','season']].rename(columns={'toTeamId':'teamId','toTeamName':'teamName'}),
transfers[['fromTeamId','fromTeamName','season']].rename(columns={'fromTeamId':'teamId','fromTeamName':'teamName'})],ignore_index=True) \
.sort_values('season',ascending=False).drop_duplicates('teamId').set_index('teamId')['teamName'].to_dict()
agentIdMap = transfers[['playersAgentId','playersAgent']].sort_values('playersAgent',ascending=False).drop_duplicates('playersAgentId').set_index('playersAgentId')['playersAgent'].to_dict()
leagueList = ['GB1','ES1','L1','IT1','FR1','PO1','NL1','TR1','RU1','BE1','MLS1','UKR1','GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','DK1','SE1','NO1','LI1','SC1','ISR1','PL1','UNG1','A1','C1','MAL1','KAS1','TS1','SLO1','SL1','FI1','LUX1','AZ1','IR1']
countryInfo=pd.read_excel('../data/reference/countries.xls').set_index('country')
countryInfo.marketTier=np.where(countryInfo.marketTier==4,3,countryInfo.marketTier)
top20clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:20].index
top50clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:50].index
top100clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:100].index
uefa=countryInfo[countryInfo.federation=='UEFA'].index
caf=countryInfo[countryInfo.federation=='CAF'].index
concacaf=countryInfo[countryInfo.federation=='CONCACAF'].index
concacaf_no_uscan=countryInfo[countryInfo.federation=='CONCACAF'].drop(['United States','Canada']).index
conmebol=countryInfo[countryInfo.federation=='CONMEBOL'].index
afc=countryInfo[countryInfo.federation=='AFC'].index
afc_no_anzac=countryInfo[countryInfo.federation=='AFC'].drop(['Australia','New Zealand']).index
country_coeff=pd.read_table('../data/reference/country_coefficient.csv').reset_index(drop=True)
country_coeff.columns=['Movement','Country','2012 coeff','2013 coeff','2014 coeff','2015 coeff','2016 coeff','Total coeff','Teams Total','CL','EL','Total']
country_coeff=country_coeff.drop(['Movement','2016 coeff','CL','EL','Total','Teams Total','Total coeff'],axis=1)
country_coeff['Country']=country_coeff['Country'].str.split('\(L',expand=True)[0].str.strip()
country_coeff=country_coeff.set_index('Country')
country_coeff = country_coeff.join(DataFrame({'total coeff':country_coeff.sum(axis=1)}))
natteam_coeff=pd.read_table('../data/reference/national_team_coefficient.csv').reset_index(drop=True).drop('Rank',axis=1)
natteam_coeff['Country']=natteam_coeff['Country'].str.strip()
natteam_coeff.Coefficient=natteam_coeff.Coefficient.apply(lambda s: int(s.replace(',','')))
natteam_coeff=natteam_coeff.rename(columns={'Coefficient':'National team coeff'}).set_index('Country')
club_coeff=pd.read_table('../data/reference/club_coefficient.csv').reset_index(drop=True).set_index('Club')
club_coeff=club_coeff.drop(['2017','2016','Mvmt','Total'],axis=1)
club_coeff.Association = club_coeff.Association.str.split(expand=True)[0]
# sanity check: club_coeff.merge(DataFrame({'Team Name':teamIdMap}),how='left',left_on='Team Id',right_index=True)
club_coeff=club_coeff.reset_index().set_index('Team Id').rename(columns=lambda c: 'Club coeff ' + c.split('–')[0] if '–' in c else c)
gdp=pd.read_table('../data/reference/country_gdp.csv').reset_index(drop=True).drop('2014 Rank',axis=1).rename(columns={'Country ':'Country'})
gdp.Country=gdp.Country.str.strip()
gdp=gdp.set_index('Country')
gdp.columns=['GDP '+c for c in gdp.columns.str.strip()]
for column in gdp.columns:
gdp[column]=gdp[column].apply(lambda s: float(str(s).replace(',','')))
unemployed=pd.read_table('../data/reference/country_unemployment.csv').drop(['Date','Employment rate'],axis=1)
unemployed.columns=['Country','Unemployment']
unemployed['Country']=unemployed['Country'].str.split(' ').apply(lambda arr: ' '.join(arr[:len(arr)/2]))
unemployed=unemployed.set_index('Country').rename(index={'United Kingdom':'England'})
corruption=pd.read_table('../data/reference/country_corruption.csv').drop(['Rank'],axis=1)
corruption.Country=corruption.Country.str.strip()
corruption=corruption.set_index('Country')
corruption=corruption[corruption.columns[:1]].rename(columns={'2015 score':'Corruption'}).rename(index={'United Kingdom':'England'})
population=pd.read_table('../data/reference/country_population.csv').drop(['Rank'],axis=1)
population.columns=population.columns.str.strip()
population.Country=population.Country.str.split('[',expand=True)[0].str.strip()
population['Population']=population['Population'].apply(lambda s: float(str(s).replace(',','')))
population=population[['Country','Population','Growth %']].set_index('Country').rename(index={'United Kingdom':'England'})
barometer=pd.read_table('../data/reference/country_corruption_barometer.csv').reset_index(drop=True)
barometer.columns=['Corruption Rank','Country','Bribe %']
barometer.Country=barometer.Country.str.strip()
barometer=barometer.set_index('Country').rename(index={'United Kingdom':'England'})
countryInfo=natteam_coeff.join(country_coeff).join(gdp).join(unemployed).join(corruption).join(population).join(countryInfo).join(barometer)