In [3]:
#number of transfers from individual clubs - first time a player changes countries
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['season'].between(1995,2015))&(~transfers['fromTeamId'].isin([75,123,515,2113,2077]))].drop_duplicates(subset=['id'],keep='last').groupby(['fromTeamName','fromTeamId'])['id'].count().sort_values(ascending=False)[:40].plot.bar(figsize=(20,5))
Out[3]:
In [4]:
#number of transfers from individual clubs - any time a player changes countries
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['season'].between(1995,2015))&(~transfers['fromTeamId'].isin([75,123,515,2113,2077]))].groupby(['fromTeamName','fromTeamId'])['id'].count().sort_values(ascending=False)[:40].plot.bar(figsize=(20,5))
Out[4]:
In [5]:
#total documented fees paid for transfers from individual clubs - any time a player changes countries
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['season'].between(1995,2015))&(~transfers['fromTeamId'].isin([75,123,515,2113,2077]))].groupby(['fromTeamName','fromTeamId'])['feeValue'].sum().sort_values(ascending=False)[:40].plot.bar(figsize=(20,5))
Out[5]:
In [9]:
#total documented fees paid for transfers from individual clubs - first time a player changes countries
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['season'].between(0,2015))].drop_duplicates(subset=['id'],keep='last').groupby(['fromTeamName','fromTeamId'])['feeValue'].sum().sort_values(ascending=False)[:40].plot.bar(figsize=(20,5))
Out[9]:
In [17]:
#number of transfers from each league to GB1 (English Premier) - first transfer to league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'GB1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['id'].count().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[17]:
In [16]:
#total documented fees paid for transfers from each league to GB1 (English Premier) - first transfer to league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'GB1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[16]:
In [19]:
#total documented fees paid for transfers from each league to PO1 (Portuguese Top Tier) - first transfer to league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'PO1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[19]:
In [54]:
#number transfers to each league from PO1 (Portuguese Top Tier) - first transfer from league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['fromLeague'] == 'PO1')&(transfers['season'].between(2000,2015))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['id'].count().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[54]:
In [53]:
#total documented fees paid for transfers to each league from PO1 (Portuguese Top Tier) - first transfer from league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['fromLeague'] == 'PO1')&(transfers['season'].between(2000,2015))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[53]:
In [21]:
#total documented fees paid for transfers from each league to NL1 (Dutch Top Tier) - first transfer to league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'NL1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[21]:
In [79]:
#total number of international transfers from each league to TR1 (Turkish top tier) - first transfer to league only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['toLeague'] == 'TR1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['id'].count().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[79]:
In [77]:
#total documented fees paid for transfers from each league to TR1 (Turkish Top Tier) - first transfer to league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'TR1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[77]:
In [81]:
#total number of international transfers from each league to RU1 (Russian top tier) - first transfer to league only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['toLeague'] == 'RU1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['id'].count().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[81]:
In [91]:
#total documented fees paid for transfers from each league to RU1 (Russian Top Tier) - first transfer to league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'RU1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[91]:
In [70]:
transfers[((transfers['fromCountry'] != transfers['toCountry']))&(transfers['fromCountry']=='Italy')].drop_duplicates(subset=['id'],keep='last').rename(columns={'toCountry':'destination'}).groupby('destination')['id'].count().sort_values()
Out[70]:
In [21]:
p=transfers[((transfers['fromCountry'] != transfers['toCountry']))].drop_duplicates(subset=['id'],keep='last')
topdestinations=p.groupby('toCountry')['id'].count().sort_values(ascending=False)
toporigins=p.groupby('fromCountry')['id'].count().sort_values(ascending=False)
print toporigins[toporigins>100].index.tolist()
print topdestinations[topdestinations>100].index.tolist()
In [22]:
def makeDestinationChart(country):
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5),sharex=False,sharey=False)
p=nonloans[((nonloans['fromCountry'] != nonloans['toCountry']))]#.drop_duplicates(subset=['id'],keep='last')
q=p[p['fromCountry']==country].rename(columns={'toCountry':'destination'})
r1=q[q['feeValue']==0].groupby('destination')['id'].count().sort_values().rename(columns={'toCountry':'destination'})[-10:]
r2=q[q['feeValue']>0].groupby('destination')['id'].count().sort_values(ascending=False).rename(columns={'toCountry':'destination'})[:10].sort_values()
r1.plot.barh(ax=axes[0],title='# of free int\'l transfers, from '+country+' to...')
r2.plot.barh(ax=axes[1],title='# of paid int\'l transfers, from '+country+' to...')
fig.savefig('../images/from_'+country+'.png')
for country in toporigins[toporigins>100].index.tolist():
makeDestinationChart(country)
In [23]:
def makeSourceChart(country):
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15,5),sharex=False,sharey=False)
p=nonloans[((nonloans['fromCountry'] != nonloans['toCountry']))]#.drop_duplicates(subset=['id'],keep='last')
q=p[p['toCountry']==country].rename(columns={'fromCountry':'origin'})
r1=q[q['feeValue']==0].groupby('origin')['id'].count().sort_values()[-10:]
r2=q[q['feeValue']>0].groupby('origin')['id'].count().sort_values(ascending=False)[:10].sort_values()
r1.plot.barh(ax=axes[0],title='# of free int\'l transfers, to '+country + ' from...')
r2.plot.barh(ax=axes[1],title='# of paid int\'l transfers, to '+country + ' from...')
fig.savefig('../images/to_'+country+'.png')
for country in topdestinations[topdestinations>100].index.tolist():
makeSourceChart(country)
In [11]:
nonloans[(nonloans['fromCountry']=='Portugal')&(nonloans['toCountry']=='Cyprus')].sort_values('mv',ascending=False)
Out[11]:
In [29]:
nonloans[(nonloans['fromCountry']=='Portugal')&(nonloans['toCountry']=='Romania')&(nonloans['Nationality']!='Romania')].sort_values('feeValue',ascending=False)
Out[29]:
In [ ]:
In [55]:
#total documented fees paid for transfers to each league from BE1 (Belgian Top Tier) - first transfer from league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['fromLeague'] == 'BE1')&(transfers['season'].between(2000,2015))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[55]:
In [63]:
#total number of transfers to each league from Brazil - first transfer from country only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['fromCountry'] == 'Brazil')&(transfers['season'].between(2000,2010))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['id'].count().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[63]:
In [60]:
#total documented fees paid for transfers to each league from Brazil - first transfer from country only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['fromCountry'] == 'Brazil')&(transfers['season'].between(2000,2010))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[60]:
In [62]:
#total number of transfers to each league from Argentina - first transfer from country only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['fromCountry'] == 'Argentina')&(transfers['season'].between(2000,2010))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['id'].count().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[62]:
In [65]:
#total documented fees paid for transfers to each league from Nigeria - first transfer from country only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['fromCountry'] == 'Argentina')&(transfers['season'].between(2000,2010))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[65]:
In [19]:
#total number of transfers to each league from Mexico - first transfer from country only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['fromCountry'] == 'Mexico')&(transfers['season'].between(1995,2015))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['id'].count().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[19]:
In [76]:
#total documented fees paid for transfers to each league from Mexico - first transfer from country only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['fromCountry'] == 'Mexico')&(transfers['season'].between(1995,2015))].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['feeValue'].sum().sort_values(ascending=False)[:20].plot.bar(figsize=(20,5))
Out[76]:
In [16]:
#total documented fees paid for transfers from each league to NL1 (Dutch Top Tier) - first transfer to league only
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'NL1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].sum().sort_values(ascending=False)[:30].plot.pie(figsize=(6,6))
Out[16]:
In [17]:
#total number of transfers from each league to NL1 (Dutch Top Tier) - first transfer to league only
transfers[((transfers['toCountry'] != transfers['fromCountry']))&(transfers['toLeague'] == 'NL1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['id'].count().sort_values(ascending=False)[:40].plot.bar(figsize=(20,5))
Out[17]:
In [18]:
#total number of transfers to each league from GB2 (Football League Championship) - first transfer from league only
transfers[((~transfers['toCountry'].isin(['England','Scotland'])))&(transfers['fromLeague'] == 'GB2')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('toLeague')['id'].count().sort_values(ascending=False)[:40].plot.bar(figsize=(20,5))
Out[18]:
In [3]:
nonloans[(nonloans['toCountry']!=nonloans['fromCountry'])&(nonloans['fromLeague'].isin(['GB1','ES1','L1','IT1','FR1','NL1','PO1']))] \
.rename(columns={'Age':'age','fromLeague':'origin'}).groupby(['age','origin'])['id'].count().unstack() \
.rolling(window=3,center=False).mean().rename(columns={'GB1':'Premier League (England)','FR1':'Ligue 1 (France)','ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)','PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}).plot.line(figsize=(9,6),title='Number of International Transfers, by Age & Origin')
Out[3]:
In [2]:
nonloans[(nonloans['toCountry']!=nonloans['fromCountry'])&(nonloans['toLeague'].isin(['GB1','ES1','L1','IT1','FR1','NL1','PO1']))] \
.rename(columns={'Age':'age','toLeague':'destination'}).groupby(['age','destination'])['id'].count().unstack() \
.rolling(window=3,center=False).mean().rename(columns={'GB1':'Premier League (England)','FR1':'Ligue 1 (France)','ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)','PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}).plot.line(figsize=(9,6),title='Number of International Transfers, by Age & Destination')
Out[2]:
In [23]:
transfers[transfers['feeValue']>0].drop_duplicates('id')['Player\'s agent id'].fillna('-1').value_counts()
Out[23]:
In [15]:
z=transfers.groupby(['playersAgentId','period'])['feeValue'].sum().unstack()#.sort_values('feeValue',ascending=False)
z#['avg fee']=z['feeValue']/z['id']
#z['avg mv']=z['mv']/z['id']
#z[z['id']>100].sort_values('avg fee',ascending=False)
z.join(agentIdTable)
In [9]:
def toFromMismatch(df):
#return (df['fromTeamId'].tolist()[:-1]==df['toTeamId'].tolist()[1:])
#return (df['fromCountry'].tolist()[:-1]==df['toCountry'].tolist()[1:]) or df['fromCountry']
return (df['fromCountry'].tolist()[:-1]==df['toCountry'].tolist()[1:])
topids=[186423, 39356, 5689, 3335, 75856 ]
z=nonloans.groupby('id').apply(toFromMismatch).sort_values()
z
In [103]:
nonloans[-2:]
Out[103]:
In [27]:
from itertools import groupby
def getCountryChain(df):
return [x[0] for x in groupby(df[-1:][df[-1:].fromCountry.notnull()].fromCountry.tolist() + df[df.toCountry.notnull()].toCountry.tolist()[::-1])]
def getLeagueChain(df):
return [x[0] for x in groupby(df[-1:][df[-1:].fromLeague.notnull()].fromLeague.tolist() + df[df.toLeague.notnull()].toLeague.tolist()[::-1])]
z=nonloans.groupby('id').apply(getCountryChain).sort_values().apply(lambda x:'->'.join(x)).rename('countryChain')
z.value_counts()
Out[27]:
In [ ]:
w=nonloans.groupby('id').apply(getLeagueChain).sort_values().apply(lambda x:'->'.join(x)).rename('getLeagueChain')
w=w.rename('getLeagueChain')
w.value_counts()
In [32]:
nonloans=nonloans.drop('countryChain',axis=1,errors='ignore').merge(DataFrame(z),right_index=True,left_on='id',how='left')
nonloans=nonloans.drop('leagueChain',axis=1,errors='ignore').merge(DataFrame(w),right_index=True,left_on='id',how='left')
In [31]:
z[(z.str.contains('Serbia->Hungary'))].value_counts()[:20].plot.barh()
Out[31]:
In [33]:
nonloans[nonloans.countryChain.str.contains('Serbia->Hungary')]
Out[33]:
In [54]:
transfers[transfers.feeValue>0]['feeValue'].median()
Out[54]:
In [7]:
transfers[(transfers.feeValue>0)&(transfers.season>=1990)].groupby('season')['feeValue'].describe().unstack().drop(['count','std','max'],errors='ignore',axis=1).plot.line()
Out[7]:
In [158]:
w[(w.str.contains('ZYP1'))&(z.str.contains('->'))].value_counts()[:20].plot.barh()
Out[158]:
In [138]:
z[z.apply(lambda x:len(x.split('->')))>12]
Out[138]:
In [143]:
nonloans[nonloans.countryChain=='Netherlands->Bulgaria->Netherlands'].drop_duplicates('id')[['id','playerName','Nationality','currentClub','Age','countryChain']]
Out[143]:
In [89]:
nonloans[nonloans.id== 151197 ]
Out[89]:
In [48]:
z
Out[48]:
In [46]:
transfers[transfers.countryChain=='Cyprus->Portugal'][['id','Nationality']]
In [35]:
DataFrame({'loans':loans.groupby('season')['feeValue'].sum()}).join(DataFrame({'nonloans':nonloans.groupby('season')['feeValue'].sum()/30})).plot.line()
Out[35]:
In [51]:
t=loans.groupby(['fromCountry','toCountry']).agg({'feeValue':'sum','id':'count'}).sort_values('feeValue',ascending=False)
t['avgFee']=t.feeValue/t.id
t.sort_values('avgFee',ascending=False)
Out[51]:
In [57]:
loans[loans.toCountry=='United Arab Emirates'].groupby('fromCountry').agg({'feeValue':'sum','id':'count'}).sort_values('feeValue',ascending=False)
Out[57]:
In [102]:
t.date.str.split('-',expand=True)[1].fillna('0').apply(int)<7
Out[102]:
In [104]:
t=nonloans
a=t[(t.season==2011)&(t.fromCountry!=t.toCountry)&(t.date.str.split('-',expand=True)[1].fillna('0').apply(int)<7)].groupby('fromCountry')['id'].count()
b=t[(t.season==2011)&(t.fromCountry!=t.toCountry)&(t.date.str.split('-',expand=True)[1].fillna('0').apply(int)<7)].groupby('toCountry')['id'].count()
c=DataFrame({'from':a,'to':b})
c['total']=c['from']+c['to']
c.sort_values('total',ascending=False)
Out[104]:
In [147]:
t=transfers[transfers.season>=2010]
domestic=t[t.fromCountry==t.toCountry].groupby('fromLeague').sum()['feeValue'].sort_values(ascending=False)
international=t[t.fromCountry!=t.toCountry].groupby('toLeague').sum()['feeValue'].sort_values(ascending=False)
tt=DataFrame({'domestic':domestic,'international':international})
#tt['international']=tt.internationalOut+tt.internationalIn
tt['total']=tt.domestic+tt.international
tt.sort_values('total',ascending=False)[:10]#.sum() #['total'].plot.pie()
Out[147]:
In [202]:
def avgAge(t):
allIn=t[t.toLeague.isin(leagueList)].groupby('toCountry')['ageFloat'].mean().sort_values()
allOut=t[t.fromLeague.isin(leagueList)].groupby('fromCountry')['ageFloat'].mean().sort_values()
freeIn=t[t.toLeague.isin(leagueList)&(t.feeValue==0)].groupby('toCountry')['ageFloat'].mean().sort_values()
freeOut=t[t.fromLeague.isin(leagueList)&(t.feeValue==0)].groupby('fromCountry')['ageFloat'].mean().sort_values()
paidIn=t[t.toLeague.isin(leagueList)&(t.feeValue>0)].groupby('toCountry')['ageFloat'].mean().sort_values()
paidOut=t[t.fromLeague.isin(leagueList)&(t.feeValue>0)].groupby('fromCountry')['ageFloat'].mean().sort_values()
z=DataFrame({'all imports':allIn,'all exports':allOut,'free imports':freeIn,'free exports':freeOut,'paid imports':paidIn,'paid exports':paidOut}).sort_values('all imports')
z=z[['all imports', 'all exports', 'paid imports', 'paid exports', 'free imports', 'free exports']]
z.columns.name='average age'
return z.reset_index().rename(columns={'index':'country'})
output=nonloans[(nonloans.fromCountry!=nonloans.toCountry)&(nonloans.season>2000)].groupby('period').apply(avgAge).unstack(level=0).swaplevel(axis=1).sort_index(axis=1,level=0)
output.to_excel('../data/analysis/average_age.xlsx')
In [210]:
t=nonloans[(nonloans.fromCountry!=nonloans.toCountry)&(nonloans.season>2000)&(nonloans.toCountry=='Cyprus')&(nonloans.Nationality!='Cyprus')]
t.groupby('Nationality')['id'].count().sort_values(ascending=False)
Out[210]:
In [178]:
{'all imports':allIn,'all exports':allOut,'free imports':freeIn,'free exports':freeOut,'paid imports':paidIn,'paid exports':paidOut}.keys()
Out[178]:
In [131]:
transfers.groupby('season').agg({'id':'count','feeValue':'sum'}).ix[[1994,1999,2005,2011]]
Out[131]:
In [143]:
t=transfers[transfers.fromCountry=='England']
t=t[t.season==2010]
t.groupby('fromTeamId').sum()[['feeValue']].join(DataFrame({'teamName':teamIdMap})).sort_values('feeValue',ascending=False)
Out[143]:
In [92]:
nonloans[nonloans.fromCountry=='Switzerland']
Out[92]:
In [6]:
transfers[(transfers.season==2011)&(transfers.toCountry!=transfers.fromCountry)].count()
Out[6]:
In [4]:
def top(df):
return df[df.toLeague.notnull()].sort_values('mv',ascending=False).drop_duplicates('id')[:5][['id','displayName','mv','toTeamName','toLeague']]
nonloans.groupby('Nationality').apply(top)
Out[4]:
In [76]:
def topClubs(df,direction='to',field='id',agg=('count','count'),ascending=False):
teamIdTable=DataFrame({'teamName':teamIdMap})
teamIdTable.index.name= direction+'TeamId'
s = df.groupby([direction+'TeamId'])[[field]] \
.agg(agg[1]).join(teamIdTable).sort_values(field,ascending=ascending) \
.reset_index(level=-1)[['teamName',field]]
s['cumulative %']=(s[field].cumsum())/(s[field].sum())
s=s[s['cumulative %'].shift(1).fillna(0)<2/3.]
s=s[s[field]>0]
return s.rename(columns={field:(agg[0] + ' of '+ field),('teamName'):(direction +' club')})
t=nonloans[(nonloans['fromCountry']!=nonloans['toCountry'])&(nonloans.season>2000)]
def countNonZero(l):
return l[l>0].count()
def countIsZero(l):
return l[l==0].count()
def createTable(tt,direction,field,agg,ascending=False):
o=tt.groupby([direction+'Country','period']).apply(topClubs,direction=direction,field=field,agg=agg,ascending=ascending).unstack(level=1).reorder_levels(axis=1,order=[1,0]).sort_index(axis=1,level=0)
#ew = pd.ExcelWriter('../data/output/'+direction+'_' +field + '_' + agg[0] + '.xls', encoding='utf-8')
#o.to_excel(ew)
#ew.save()
return o
#createTable(t[t.fromLeague.isin(leagueList)],'from','feeValue', ('sum','sum'), False)
#createTable(t[t.toLeague.isin(leagueList)],'to','feeValue', ('sum','sum'), False)
#createTable(t[t.toLeague.isin(leagueList)],'to','id', ('count','count'), False)
#createTable(t[t.fromLeague.isin(leagueList)],'from','id', ('count','count'), False)
createTable(t[(~t.fromCountry.isin(uefa))&t.toLeague.isin(leagueList)&t.toCountry.isin(uefa)],'to','id', ('count','count'), False)
Out[76]:
In [83]:
t=transfers
t[(t.fromCountry=='Portugal')&(t.toCountry=='Greece')&(t.facebook.notnull()|t.instagram.notnull()|t.twitter.notnull()|t.website.notnull())]
Out[83]:
In [29]:
agentIdTable=DataFrame({'playersAgent':agentIdMap})
agentIdTable.index.name='playersAgentId'
In [221]:
ew = pd.ExcelWriter('../data/reference/countries.xls', encoding='utf-8')
DataFrame({'country':pd.concat([transfers.fromCountry,transfers.toCountry,transfers.Nationality,transfers.nationality2]).drop_duplicates().sort_values()}).to_excel(ew)
ew.save()
In [40]:
uefa=countryInfo[countryInfo.federation=='UEFA'].index
caf=countryInfo[countryInfo.federation=='CAF'].index
concacaf=countryInfo[countryInfo.federation=='CONCACAF'].index
concacaf_no_uscan=countryInfo[countryInfo.federation=='CONCACAF'].drop(['United States','Canada']).index
conmebol=countryInfo[countryInfo.federation=='CONMEBOL'].index
afc=countryInfo[countryInfo.federation=='AFC'].index
afc_no_anzac=countryInfo[countryInfo.federation=='AFC'].drop(['Australia','New Zealand']).index
t=transfers[(transfers.fromCountry.str.contains('Jamaica'))&(transfers.toCountry.isin(uefa))&(transfers.fromCountry.notnull())&(transfers.toCountry.notnull())]
t=transfers[(transfers.fromCountry.isin(conmebol))&(transfers.toCountry.isin(uefa))&(transfers.fromCountry.notnull())&(transfers.toCountry.notnull())]
#
t=t[t.feeValue==0]
#t.groupby('fromCountry').agg({'id':'count','feeValue':'sum','season':'median'}).sort_values('id',ascending=False).id#.plot.pie()
t.groupby('toCountry').agg({'id':'count','feeValue':'sum','season':'median'}).sort_values('id',ascending=False).id#.plot.pie()
#t[t.toCountry=='Norway']
Out[40]:
In [175]:
t=nonloans
t=t[t.season>=2000]
t=t[t.toLeague!='GB1']
t.groupby('season').sum()['feeValue'].plot.line()
Out[175]:
In [57]:
transfers.columns
Out[57]:
In [44]:
t=transfers
z=t.groupby('id').apply(lambda x:max(x.feeValue.sort_values()))
In [58]:
tt=t.merge(DataFrame({'maxFee':z}),left_on='id',right_index=True,how='left')
tt=tt.drop_duplicates('id')[['id','playerName','intlCaps','intlGoals','countryOfBirth','Nationality','maxFee','Age','currentClub']]
tt[(tt.intlCaps==0)&(tt.maxFee>0)].sort_values('maxFee',ascending=False)
Out[58]:
In [66]:
def getMvIncreases(t):
return t[(t.mv>t.mv.shift(1))&(t.mv.notnull())&(t.mv.shift(1).notnull())]
def getFeeIncreases(t):
return t[(t.mv>t.feeValue.shift(1))&(t.feeValue.notnull())&(t.feeValue.shift(1).notnull())]
def getMvDecreases(t):
return t[(t.mv<t.mv.shift(1))&(t.mv.notnull())&(t.mv.shift(1).notnull())]
def getFeeDecreases(t):
return t[(t.mv<t.feeValue.shift(1))&(t.feeValue.notnull())&(t.feeValue.shift(1).notnull())]
tt=nonloans.groupby('id').apply(getMvIncreases)
In [10]:
import dateutil
t=nonloans[nonloans['id']==110858]
t.date.shift(1)
Out[10]:
In [27]:
nonloans[nonloans['id']==8806].feeValue.fillna(method='bfill').iloc[::-1].diff().iloc[::-1]
Out[27]:
In [43]:
transfers[transfers.intlCaps==0]
Out[43]:
In [165]:
t=transfers[~transfers.fromLeague.str.startswith('Other ').fillna(True)]
t=t[t.toCountry!=t.fromCountry]
#t=t[t.toLeague=='GB1']
t.groupby(['season','toLeague','toTeamId']).sum()[['feeValue']].unstack(level=0).reset_index().groupby('toLeague').sum()['feeValue'].sort_values(2015,ascending=False)[:10].T.plot.line()
Out[165]:
In [135]:
t[t.fromCountry.str.contains('France')&(t.season==2014)]
Out[135]:
In [119]:
countries=['Brazil','Serbia','Croatia','Sweden','Norway','Argentina','Nigeria','England','France','Germany']
countries=['Romania','Cyprus','Greece']
countries=['England','France','Germany','Spain','Italy']
countries=['Poland','Hungary','Slovakia','Czech Republic','d']
t=transfers[(transfers.fromCountry.isin(uefa))&(transfers.fromCountry.notnull())&(transfers.fromCountry!=transfers.Nationality)]
t=transfers[(transfers.fromCountry.isin(uefa))&(transfers.fromCountry.notnull())]
s=t[t.season>2000].groupby(['season','fromCountry'])['duration'].mean()
(s).unstack() #.plot.line(figsize=(10,10),colormap='Set1')
Out[119]:
In [8]:
#transfers.merge(countryInfo,right_index=True,left_on='fromCountry',how='left')
s=transfers[(transfers.age>20)&(transfers.season>1994)&(transfers.fromLeague.isin(leagueList))&(transfers.duration>0)&(transfers.fromCountry.notnull())]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
#s=s[s.marketTier==2]
ss=s.groupby(['season','marketTier'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[8]:
In [182]:
tier1=['England','France','Spain','Germany','France']
t=transfers[(transfers.season>2000)&(transfers.feeValue>0)]
allTransfers=t[(t.fromCountry!=t.toCountry)&(t.toCountry.isin(tier1))].groupby('season')['id'].count()
internalTransfers=t[(t.fromCountry!=t.toCountry)&(t.toCountry.isin(tier1))&(t.fromCountry.isin(tier1))].groupby('season')['id'].count()
s=DataFrame({'all':allTransfers,'internal':internalTransfers})
s['internalPct']=s.internal/s['all']
s.internalPct.plot.line()
Out[182]:
In [182]:
tier1=['England','France','Spain','Germany','France']
t=transfers[(transfers.season>2000)&(transfers.feeValue>0)]
allTransfers=t[(t.fromCountry!=t.toCountry)&(t.toCountry.isin(tier1))].groupby('season')['id'].count()
internalTransfers=t[(t.fromCountry!=t.toCountry)&(t.toCountry.isin(tier1))&(t.fromCountry.isin(tier1))].groupby('season')['id'].count()
s=DataFrame({'all':allTransfers,'internal':internalTransfers})
s['internalPct']=s.internal/s['all']
s.internalPct.plot.line()
Out[182]:
In [113]:
s
Out[113]:
In [6]:
#transfers.merge(countryInfo,right_index=True,left_on='fromCountry',how='left')
transfers['fromTeamClass']=pd.Series(np.where(transfers.fromTeamId.isin(top20clubids),'top 20','the rest'))
s=transfers[(transfers.age>23)&(transfers.season>1994)&(transfers.Nationality.isin(['France','Spain','Germany','Italy','England','Brazil','Norway']))&(transfers.duration>0)&(transfers.fromCountry.notnull())]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
#s=s[s.marketTier==2]
ss=s.groupby(['season','Nationality'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[6]:
In [190]:
transfers.groupby('positionGroup').count()['id']
Out[190]:
In [193]:
#transfers.merge(countryInfo,right_index=True,left_on='fromCountry',how='left')
transfers['intlStatus']=pd.Series(np.where(transfers.intlCaps>0,'international','not international'))
s=transfers[(transfers.age>21)&(transfers.season>1994)&(transfers.duration>0)&(transfers.fromCountry.notnull())&(transfers.positionGroup=='Midfield')]
s=s.merge(countryInfo[['marketTier']],right_index=True,left_on='fromCountry',how='left')
#s=s[s.marketTier==2]
ss=s.groupby(['season','mainPosition'])['duration']
(ss.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line(figsize=(10,10))
Out[193]:
In [3]:
transfers[transfers.age>20].groupby('age')['duration'].mean().plot.bar()
Out[3]:
In [131]:
t=transfers[(transfers.feeValue>0)&(transfers.season>2000)].groupby('season')['duration'].mean()
(t/366).plot.line()
Out[131]:
In [7]:
top20clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:20].index
In [41]:
def getTierOnePct(t):
return t[t.marketTier==1]['feeValue'].sum() / t.feeValue.sum()
s=transfers[(transfers.season>1994)&(transfers.fromLeague.isin(leagueList))&(transfers.duration>0)]
s=s.merge(countryInfo[['marketTier']].fillna(4),right_index=True,left_on='fromCountry',how='left')
ss=s.groupby(['season','marketTier'])
(ss.sum()).reset_index().groupby('season').apply(getTierOnePct).plot.line()
Out[41]:
In [7]:
transfers[transfers.id==7767][['fromTeamName','toTeamName','date','duration','isLoan']]
Out[7]:
In [2]:
#transfers.merge(countryInfo,right_index=True,left_on='fromCountry',how='left')
s=transfers[(transfers.season>1994)&(transfers.fromCountry.isin(uefa))&(transfers.duration>0)]
s=s.merge(countryInfo[['marketTier']].fillna(4),right_index=True,left_on='fromCountry',how='left')
s=s.groupby(['season','marketTier'])['duration']
(s.describe()/366).unstack(level=0).swaplevel().ix['mean'].T.plot.line()
In [209]:
def topTransferClubs(t):
t=t[(t.feeValue>0)&(t.fromCountry!=t.toCountry)].groupby('toTeamId').agg({'feeValue':'sum'}).sort_values('feeValue',ascending=False)
t['cumPct']=t.feeValue.cumsum()/t.feeValue.sum()
return t[t.feeValue>t.feeValue.mean()].join(DataFrame({'toTeamName':teamIdMap}),how='left').reset_index().count()
transfers[transfers.season>1994].groupby('season').apply(topTransferClubs).feeValue.plot.line()
Out[209]:
In [225]:
def topTransferCountries(t):
t=t[(t.feeValue>0)&(t.Nationality!=t.toCountry)].groupby('Nationality').agg({'feeValue':'sum'}).sort_values('feeValue',ascending=False)
t['cumPct']=t.feeValue.cumsum()/t.feeValue.sum()
return t[t.cumPct.shift(1)<0.66].reset_index().count()
transfers[transfers.season>1994].groupby('season').apply(topTransferCountries)['cumPct'].plot.line()
Out[225]:
In [171]:
def topTransferClubs(t):
t=t[(t.feeValue>0)&(t.fromCountry!=t.toCountry)].groupby('fromTeamId').agg({'feeValue':'sum'}).sort_values('feeValue',ascending=False)
t['cumPct']=t.feeValue.cumsum()/t.feeValue.sum()
return t[t.cumPct.shift(1).fillna(0)<0.666].join(DataFrame({'fromTeamName':teamIdMap}),how='left').reset_index().drop('fromTeamId',axis=1).count()
transfers[transfers.season>2000].groupby('season').apply(topTransferClubs).feeValue.plot.line()
Out[171]:
In [48]:
t=nonloans[nonloans.feeValue>0]
s=t[t.fromCountry!=t.toCountry].groupby(['fromCountry','toCountry']).sum()[['feeValue']].sort_values('feeValue',ascending=False).reset_index()
s['cumpct']=s['feeValue'].cumsum()/s['feeValue'].sum()
s=s[s.cumpct<0.66]
s.drop('cumpct',axis=1).rename(columns={'fromCountry':'source','toCountry':'target','feeValue':'value'}).to_csv('../html/transfers.csv',index=False)
s
Out[48]:
In [3]:
big5leagues=['GB1','FR1','L1','IT1','ES1']
#big5leagues=['ZYP1']
big5ids=nonloans[(nonloans.fromLeague.isin(big5leagues))|(nonloans.toLeague.isin(big5leagues))].id
t=nonloans[(nonloans.feeValue>0)&(nonloans.id.isin(big5ids))]
t=nonloans[nonloans.season.between(1995,2015)]
#t=t[t.feeValue>0]
s=t[t.fromCountry!=t.toCountry].groupby(['fromCountry','toCountry']).count()[['id']].sort_values('id',ascending=False).reset_index()
#s.groupby('toCountry').apply(lambda x:x[:5]).reset_index(drop=True).sort_values('id',ascending=False).reset_index(drop=True)
#s['cumpct']=s['id'].cumsum()/s['id'].sum()
s=s[:100]
#s['value']=pd.qcut(s.id, 10, labels=False)
s['value']=s.id/s.id.max()
s['decile']=pd.qcut(s.id, 10,labels=False)
#s.id=10+s.id.max()-s.id
s=s.rename(columns={'fromCountry':'source','toCountry':'target'})
s.to_csv('../html/transfers.csv',index=False)
s
#DataFrame(pd.concat([s.groupby('source').sum()['id'].fillna(0),s.groupby('target').sum()['id'].fillna(0)],axis=1).sum(axis=1)).
Out[3]:
In [54]:
big5leagues=['GB1','FR1','L1','IT1','ES1']
#big5leagues=['ZYP1']
big5ids=nonloans[(nonloans.toTeamId==148)].id
t=nonloans[(nonloans.feeValue>0)&(nonloans.id.isin(big5ids))]
t=t[t.season.between(1995,2015)]
t=t[t.fromCountry.notnull()&t.toCountry.notnull()&(t.fromLeague.isin(leagueList))]
#t=t[t.feeValue>0]
s=t.groupby(['fromTeamId','toTeamId']).count()[['id']].sort_values('id',ascending=False).reset_index()
#s.groupby('toCountry').apply(lambda x:x[:5]).reset_index(drop=True).sort_values('id',ascending=False).reset_index(drop=True)
#s['cumpct']=s['id'].cumsum()/s['id'].sum()
#s['value']=pd.qcut(s.id, 10, labels=False)
s['value']=s.id/s.id.max()
s['decile']=pd.qcut(s.id, 1,labels=False)*5
#s.id=10+s.id.max()-s.id
s=s.merge(DataFrame({"source":teamIdMap}).reset_index().rename(columns={'index':'fromTeamId'}),how='left')
s=s.merge(DataFrame({"target":teamIdMap}).reset_index().rename(columns={'index':'toTeamId'}),how='left')
s
s.to_csv('../html/transfers.csv',index=False)
In [22]:
def getImportExportRank(sortby='total',cutoff=50):
def f(t):
s=t[t.fromCountry!=t.toCountry].groupby(['fromCountry','toCountry']).count()[['id']].sort_values('id',ascending=False)
s1=s.swaplevel().reset_index().rename(columns={'id':'import','fromCountry':'toCountry','toCountry':'fromCountry'})
s2=s.reset_index().rename(columns={'id':'export'})
s3=s2.merge(s1)
s3['net']=s3['export']-s3['import']
s3['total']=s3['export']+s3['import']
s3['netted']=(s3['total']-s3['net'])/2
s3=s3[s3.net>0]
return s3.sort_values(sortby,ascending=False)[:cutoff].reset_index(drop=True)
return f
In [17]:
o=nonloans.groupby('period').apply(getImportExportRank()).unstack(level=0).swaplevel(axis=1).sort_index(level=0,axis=1)
ew = pd.ExcelWriter('../data/output/total_nonloans.xls', encoding='utf-8')
o.to_excel(ew)
ew.save()
o
Out[17]:
In [19]:
o=nonloans.groupby('period').apply(getImportExportRank('netted')).unstack(level=0).swaplevel(axis=1).sort_index(level=0,axis=1)
ew = pd.ExcelWriter('../data/output/netted_nonloans.xls', encoding='utf-8')
o.to_excel(ew)
ew.save()
o
Out[19]:
In [ ]:
o=nonloans[nonloans.feeValue>0].groupby('period').apply(getImportExportRank('netted')).unstack(level=0).swaplevel(axis=1).sort_index(level=0,axis=1)
ew = pd.ExcelWriter('../data/output/paid_netted_nonloans.xls', encoding='utf-8')
o.to_excel(ew)
ew.save()
o
In [21]:
o=nonloans[nonloans.feeValue>0].groupby('period').apply(getImportExportRank('total')).unstack(level=0).swaplevel(axis=1).sort_index(level=0,axis=1)
ew = pd.ExcelWriter('../data/output/paid_total_nonloans.xls', encoding='utf-8')
o.to_excel(ew)
ew.save()
o
Out[21]:
In [4]:
countryInfo.groupby('marketTier').count()
Out[4]:
In [9]:
s=transfers[transfers.season>2000].groupby(['season','toCountry']).sum()['feeValue'].unstack()
s[s.sum().sort_values(ascending=False)[:10].index]
Out[9]:
In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import numpy as np
import re
import os.path
import datetime
import math
import matplotlib.pyplot as plt
transfers=pd.read_csv('../data/merged/transfers_q.csv',index_col=False, warn_bad_lines =True,error_bad_lines=False)
loans = pd.read_csv('../data/merged/loans.csv')
nonloans = pd.read_csv('../data/merged/nonloans.csv')
players=pd.read_csv('../data/merged/players.csv')
leagueCodeMapping = {'GB1':'Premier League (England)','FR1':'Ligue 1 (France)',
'ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)',
'PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}
teamIdMap = pd.concat([transfers[['toTeamId','toTeamName','season']].rename(columns={'toTeamId':'teamId','toTeamName':'teamName'}),
transfers[['fromTeamId','fromTeamName','season']].rename(columns={'fromTeamId':'teamId','fromTeamName':'teamName'})],ignore_index=True) \
.sort_values('season',ascending=False).drop_duplicates('teamId').set_index('teamId')['teamName'].to_dict()
agentIdMap = transfers[['playersAgentId','playersAgent']].sort_values('playersAgent',ascending=False).drop_duplicates('playersAgentId').set_index('playersAgentId')['playersAgent'].to_dict()
leagueList = ['GB1','ES1','L1','IT1','FR1','PO1','NL1','TR1','RU1','BE1','MLS1','UKR1','GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','DK1','SE1','NO1','LI1','SC1','ISR1','PL1','UNG1','A1','C1','MAL1','KAS1','TS1','SLO1','SL1','FI1','LUX1','AZ1']
countryInfo=pd.read_excel('../data/reference/countries.xls').set_index('country')
In [305]:
countryInfo=pd.read_excel('../data/reference/countries.xls').set_index('country')
In [37]:
pd.Series(np.where(transfers.id==transfers.id.shift(1),transfers.mv-transfers.mv.shift(1).fillna(0),np.nan))
Out[37]:
In [54]:
transfers=pd.read_csv('../data/merged/transfers_q.csv',index_col=False, warn_bad_lines =True,error_bad_lines=False)
In [55]:
transfers.drop_duplicates('id')[['mv','feeValue']].plot.scatter('mv','feeValue',figsize=(10,10))
Out[55]:
In [36]:
s=transfers.drop_duplicates('id')
t=(s['mv']/s['feeValue'])
pd.Series(np.where((t!=float("Infinity")),t,np.nan)).mean()
Out[36]: