In [128]:
nonloans['isInternational']=(nonloans.fromCountry!=nonloans.toCountry)&(nonloans.fromCountry.notnull())&(nonloans.toCountry.notnull())
nonloans['isSameCountry']=(nonloans.countryOfBirth==nonloans.fromCountry)
nonloans['isPaid']=(nonloans.feeValue>0)
nonloans['isIntlPlayer']=(nonloans.intlCaps>0)


nonloans['isYouth']=nonloans['age']<21
s=nonloans[(nonloans.age>20)&(nonloans.season>1995)].groupby('season').id.count()#.plot.bar(stacked=True)
z=nonloans[(nonloans.season>1995)].groupby(['season','isYouth']).id.count().unstack()

x=nonloans.merge(countryInfo.marketTier.reset_index().rename(columns={'Country':'fromCountry'}),how='left')

t=x[(x.age>=21)&(x.season>1995)&(x.fromCountry.notnull())].groupby(['season','positionGroup'])['duration'].mean().unstack()

t#.plot.line()
#z.plot.bar(stacked=True)


Out[128]:
positionGroup Defence Goalkeeper Midfield Striker
season
1996 969.700252 1021.613497 859.392461 762.651515
1997 871.526946 1011.884422 834.697068 665.838337
1998 931.054545 993.980861 829.320402 681.956262
1999 828.331626 1026.421429 822.941725 661.373041
2000 851.044053 941.490506 815.181132 690.808511
2001 874.293694 886.581152 814.304212 689.688742
2002 900.790334 921.187097 829.486120 664.403863
2003 857.711846 889.418790 816.212180 688.717845
2004 870.731040 823.693617 788.577659 678.457503
2005 783.946123 818.007344 746.997707 618.343075
2006 763.466454 848.739583 728.938061 607.807224
2007 759.603662 769.477153 700.728419 596.934573
2008 741.712346 767.695370 669.525360 579.267692
2009 711.112984 748.902047 661.405310 571.202699
2010 715.314327 762.142285 659.878157 580.087623
2011 706.872969 727.019320 667.304088 568.060058
2012 704.402269 753.716511 646.472507 558.383692
2013 690.310890 710.303104 638.855779 540.512879
2014 682.995624 780.525935 633.069721 543.164284
2015 685.731753 721.617733 639.046519 544.145477

In [129]:
nonloans.id.nunique()


Out[129]:
56407

In [ ]:


In [ ]:
nonloans[(nonloans.age>20)&(nonloans.season>1995)].groupby(['fromCountry','season'])

In [92]:
countryInfo.marketTier.reset_index().rename(columns={'Country':'fromCountry'}) #.groupby('marketTier').count()


Out[92]:
fromCountry marketTier
0 Germany 1.0
1 Spain 1.0
2 England 1.0
3 Portugal 2.0
4 Belgium 2.0
5 Italy 1.0
6 Netherlands 2.0
7 France 1.0
8 Russia 2.0
9 Switzerland 2.0
10 Austria 2.0
11 Croatia 3.0
12 Bosnia-Herzegovina 3.0
13 Ukraine 2.0
14 Czech Republic 3.0
15 Sweden 3.0
16 Poland 3.0
17 Romania 3.0
18 Slovakia 3.0
19 Hungary 3.0
20 Denmark 2.0
21 Turkey 2.0
22 Ireland 3.0
23 Greece 2.0
24 Norway 3.0
25 Slovenia 3.0
26 Iceland 3.0
27 Wales 3.0
28 Israel 3.0
29 Scotland 2.0
30 Albania 3.0
31 Montenegro 3.0
32 Northern Ireland 3.0
33 Serbia 3.0
34 Finland 3.0
35 Bulgaria 3.0
36 Armenia 3.0
37 Estonia 3.0
38 Lithuania 3.0
39 Belarus 3.0
40 Georgia 3.0
41 Azerbaijan 3.0
42 Latvia 3.0
43 Cyprus 3.0
44 Moldova 3.0
45 Macedonia 3.0
46 Kazakhstan 3.0
47 Luxembourg 3.0
48 Liechtenstein 3.0
49 Faroe Islands NaN
50 Malta 3.0
51 Andorra 3.0
52 San Marino 3.0
53 Gibraltar 3.0

In [ ]:


In [8]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 16),sharex=False,sharey='row')

t=nonloans[(nonloans.fromCountry!=nonloans.toCountry)].rename(columns={'toCountry':'dest. country'}).groupby(['period','dest. country'])['feeValue'].count()
t['(1995, 2000]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,0],title='# of intl transfers, 1996-2000')
t['(2000, 2005]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,1],title='# of intl transfers,2001-2005')
t['(2005, 2010]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,2],title='# of intl transfers,2006-2010') 
t['(2010, 2015]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,3],title='# of intl transfers,2011-2015') 

s=nonloans[(nonloans.fromCountry!=nonloans.toCountry)].rename(columns={'toCountry':'dest. country'}).groupby(['period','dest. country'])['feeValue'].sum()
s['(1995, 2000]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,0],title='total intl fees, 1996-2000')
s['(2000, 2005]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,1],title='total intl fees, 2001-2005')
s['(2005, 2010]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,2],title='total intl fees, 2006-2010') 
s['(2010, 2015]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,3],title='total intl fees, 2011-2015') 

r=nonloans[(nonloans.fromCountry!=nonloans.toCountry)].rename(columns={'toCountry':'dest. country'}).groupby(['period','dest. country'])['feeValue'].mean()
r['(1995, 2000]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,0],title='avg intl fees, 1996-2000')
r['(2000, 2005]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,1],title='avg intl fees, 2001-2005')
r['(2005, 2010]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,2],title='avg intl fees, 2006-2010') 
r['(2010, 2015]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,3],title='avg intl fees, 2011-2015') 

plt.tight_layout()



In [7]:
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(16, 16),sharex=False,sharey='row')

t=nonloans[(nonloans.fromCountry!=nonloans.toCountry)&(nonloans.toCountry.notnull())].rename(columns={'toTeamId':'dest. club'}).groupby(['period','dest. club'])['feeValue'].count().rename(index=teamIdMap)
t['(1995, 2000]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,0],title='# of intl transfers, 1996-2000')
t['(2000, 2005]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,1],title='# of intl transfers,2001-2005')
t['(2005, 2010]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,2],title='# of intl transfers,2006-2010') 
t['(2010, 2015]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[0,3],title='# of intl transfers,2011-2015') 

s=nonloans[(nonloans.fromCountry!=nonloans.toCountry)&(nonloans.toCountry.notnull())].rename(columns={'toTeamId':'dest. club'}).groupby(['period','dest. club'])['feeValue'].sum().rename(index=teamIdMap)
s['(1995, 2000]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,0],title='total intl fees, 1996-2000')
s['(2000, 2005]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,1],title='total intl fees, 2001-2005')
s['(2005, 2010]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,2],title='total intl fees, 2006-2010') 
s['(2010, 2015]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[1,3],title='total intl fees, 2011-2015') 

r=nonloans[(nonloans.fromCountry!=nonloans.toCountry)&(nonloans.toCountry.notnull())].rename(columns={'toTeamId':'dest. club'}).groupby(['period','dest. club'])['feeValue'].mean().rename(index=teamIdMap)
r['(1995, 2000]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,0],title='avg intl fees, 1996-2000')
r['(2000, 2005]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,1],title='avg intl fees, 2001-2005')
r['(2005, 2010]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,2],title='avg intl fees, 2006-2010') 
r['(2010, 2015]'].sort_values(ascending=False)[:20].plot.bar(ax=axes[2,3],title='avg intl fees, 2011-2015') 

plt.tight_layout()



In [162]:
fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(18, 4),sharex=True,sharey='row')

country_coeff.join(natteam_coeff).plot.scatter('2012 coeff','National team coeff',ax=axes[0],title='2012 coefficient')
country_coeff.join(natteam_coeff).plot.scatter('2013 coeff','National team coeff',ax=axes[1],title='2013 coefficient')
country_coeff.join(natteam_coeff).plot.scatter('2014 coeff','National team coeff',ax=axes[2],title='2014 coefficient')
country_coeff.join(natteam_coeff).plot.scatter('2015 coeff','National team coeff',ax=axes[3],title='2015 coefficient')


Out[162]:
<matplotlib.axes._subplots.AxesSubplot at 0x118b05390>

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import numpy as np
import re
import os.path 
import datetime 
import math
import matplotlib.pyplot as plt

transfers = pd.read_csv('../data/merged/transfers_q.csv')
loans = pd.read_csv('../data/merged/loans.csv')
nonloans = pd.read_csv('../data/merged/nonloans.csv')
players=pd.read_csv('../data/merged/players.csv')

leagueCodeMapping = {'GB1':'Premier League (England)','FR1':'Ligue 1 (France)',
                                                         'ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)',
                                                         'PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}

teamIdMap = pd.concat([transfers[['toTeamId','toTeamName','season']].rename(columns={'toTeamId':'teamId','toTeamName':'teamName'}),
    transfers[['fromTeamId','fromTeamName','season']].rename(columns={'fromTeamId':'teamId','fromTeamName':'teamName'})],ignore_index=True) \
    .sort_values('season',ascending=False).drop_duplicates('teamId').set_index('teamId')['teamName'].to_dict()
    
agentIdMap = transfers[['playersAgentId','playersAgent']].sort_values('playersAgent',ascending=False).drop_duplicates('playersAgentId').set_index('playersAgentId')['playersAgent'].to_dict()

leagueList = ['GB1','ES1','L1','IT1','FR1','PO1','NL1','TR1','RU1','BE1','MLS1','UKR1','GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','DK1','SE1','NO1','LI1','SC1','ISR1','PL1','UNG1','A1','C1','MAL1','KAS1','TS1','SLO1','SL1','FI1','LUX1','AZ1','IR1']

countryInfo=pd.read_excel('../data/reference/countries.xls').set_index('country')
countryInfo.marketTier=np.where(countryInfo.marketTier==4,3,countryInfo.marketTier)

top20clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:20].index
top50clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:50].index
top100clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:100].index

uefa=countryInfo[countryInfo.federation=='UEFA'].index
caf=countryInfo[countryInfo.federation=='CAF'].index
concacaf=countryInfo[countryInfo.federation=='CONCACAF'].index
concacaf_no_uscan=countryInfo[countryInfo.federation=='CONCACAF'].drop(['United States','Canada']).index
conmebol=countryInfo[countryInfo.federation=='CONMEBOL'].index
afc=countryInfo[countryInfo.federation=='AFC'].index
afc_no_anzac=countryInfo[countryInfo.federation=='AFC'].drop(['Australia','New Zealand']).index

country_coeff=pd.read_table('../data/reference/country_coefficient.csv').reset_index(drop=True)
country_coeff.columns=['Movement','Country','2012 coeff','2013 coeff','2014 coeff','2015 coeff','2016 coeff','Total coeff','Teams Total','CL','EL','Total']
country_coeff=country_coeff.drop(['Movement','2016 coeff','CL','EL','Total','Teams Total','Total coeff'],axis=1)
country_coeff['Country']=country_coeff['Country'].str.split('\(L',expand=True)[0].str.strip()
country_coeff=country_coeff.set_index('Country')
country_coeff = country_coeff.join(DataFrame({'total coeff':country_coeff.sum(axis=1)}))

natteam_coeff=pd.read_table('../data/reference/national_team_coefficient.csv').reset_index(drop=True).drop('Rank',axis=1)
natteam_coeff['Country']=natteam_coeff['Country'].str.strip()
natteam_coeff.Coefficient=natteam_coeff.Coefficient.apply(lambda s: int(s.replace(',','')))
natteam_coeff=natteam_coeff.rename(columns={'Coefficient':'National team coeff'}).set_index('Country')

club_coeff=pd.read_table('../data/reference/club_coefficient.csv').reset_index(drop=True).set_index('Club')
club_coeff=club_coeff.drop(['2017','2016','Mvmt','Total'],axis=1)
club_coeff.Association = club_coeff.Association.str.split(expand=True)[0]
# sanity check: club_coeff.merge(DataFrame({'Team Name':teamIdMap}),how='left',left_on='Team Id',right_index=True)
club_coeff=club_coeff.reset_index().set_index('Team Id').rename(columns=lambda c: 'Club coeff ' + c.split('–')[0] if '–' in c else c)

gdp=pd.read_table('../data/reference/country_gdp.csv').reset_index(drop=True).drop('2014 Rank',axis=1).rename(columns={'Country ':'Country'})
gdp.Country=gdp.Country.str.strip()
gdp=gdp.set_index('Country')
gdp.columns=['GDP '+c for c in gdp.columns.str.strip()]
for column in gdp.columns:
    gdp[column]=gdp[column].apply(lambda s: float(str(s).replace(',','')))
    
unemployed=pd.read_table('../data/reference/country_unemployment.csv').drop(['Date','Employment rate'],axis=1)
unemployed.columns=['Country','Unemployment']
unemployed['Country']=unemployed['Country'].str.split(' ').apply(lambda arr: ' '.join(arr[:len(arr)/2]))
unemployed=unemployed.set_index('Country').rename(index={'United Kingdom':'England'})

corruption=pd.read_table('../data/reference/country_corruption.csv').drop(['Rank'],axis=1)
corruption.Country=corruption.Country.str.strip()
corruption=corruption.set_index('Country')
corruption=corruption[corruption.columns[:1]].rename(columns={'2015 score':'Corruption'}).rename(index={'United Kingdom':'England'})

population=pd.read_table('../data/reference/country_population.csv').drop(['Rank'],axis=1)
population.columns=population.columns.str.strip()
population.Country=population.Country.str.split('[',expand=True)[0].str.strip()
population['Population']=population['Population'].apply(lambda s: float(str(s).replace(',','')))
population=population[['Country','Population','Growth %']].set_index('Country').rename(index={'United Kingdom':'England'})

barometer=pd.read_table('../data/reference/country_corruption_barometer.csv').reset_index(drop=True)
barometer.columns=['Corruption Rank','Country','Bribe %']
barometer.Country=barometer.Country.str.strip()
barometer=barometer.set_index('Country').rename(index={'United Kingdom':'England'})

countryInfo=natteam_coeff.join(country_coeff).join(gdp).join(unemployed).join(corruption).join(population).join(countryInfo).join(barometer)


/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (0,38,40,41,45,47,49,55) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (45,47) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (40,45,47,51) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (32,34) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [111]:
t=nonloans[(nonloans.season>2000)&(nonloans.fromCountry!=nonloans.toCountry)]
s=nonloans[(nonloans.season>2000)&(nonloans.age>21)]

meanFeeImportPeriod = t.groupby(['toCountry','period']).feeValue.mean().unstack().rename(columns=lambda col:'avg import fee '+col)
totalFeeImportPeriod = t.groupby(['toCountry','period']).feeValue.sum().unstack().rename(columns=lambda col:'total import fees '+col)
meanAgeImportPeriod = t.groupby(['toCountry','period']).age.mean().unstack().rename(columns=lambda col:'avg import age '+col)
paidImportPeriod = t[t.feeValue>0].groupby(['toCountry','period']).id.count().unstack().rename(columns=lambda col:'# of paid imports '+col)
freeImportPeriod = t[t.feeValue==0].groupby(['toCountry','period']).id.count().unstack().rename(columns=lambda col:'# of free imports '+col)
totalImportPeriod = t.groupby(['toCountry','period']).id.count().unstack().rename(columns=lambda col:'# of imports '+col)

meanFeeIExportPeriod = t.groupby(['fromCountry','period']).feeValue.mean().unstack().rename(columns=lambda col:'avg export fee '+col)
totalFeeExportPeriod = t.groupby(['fromCountry','period']).feeValue.sum().unstack().rename(columns=lambda col:'total export fees '+col)
meanAgeExportPeriod = t.groupby(['fromCountry','period']).age.mean().unstack().rename(columns=lambda col:'avg export age '+col)
paidExportPeriod = t[t.feeValue>0].groupby(['fromCountry','period']).id.count().unstack().rename(columns=lambda col:'# of paid exports '+col)
freeExportPeriod = t[t.feeValue==0].groupby(['fromCountry','period']).id.count().unstack().rename(columns=lambda col:'# of free exports '+col)
totalExportPeriod = t.groupby(['fromCountry','period']).id.count().unstack().rename(columns=lambda col:'# of exports '+col)

meanDurationPeriod = s.groupby(['fromCountry','period']).duration.mean().unstack().rename(columns=lambda col:'avg duration '+col)
volumePeriod = s.groupby(['fromCountry','period']).id.count().unstack().rename(columns=lambda col:'21-plus transfer volume '+col)

meanFeeImportAll = t.groupby(['toCountry'])[['feeValue']].mean().rename(columns=lambda col:'avg import fee overall')
totalFeeImportAll = t.groupby(['toCountry'])[['feeValue']].sum().rename(columns=lambda col:'total import fees overall')
meanAgeImportAll = t.groupby(['toCountry'])[['age']].mean().rename(columns=lambda col:'avg import age overall')
paidImportAll = t[t.feeValue>0].groupby(['toCountry'])[['id']].count().rename(columns=lambda col:'# of paid imports overall')
freeImportAll = t[t.feeValue==0].groupby(['toCountry'])[['id']].count().rename(columns=lambda col:'# of free imports overall')
totalImportAll = t.groupby(['toCountry'])[['id']].count().rename(columns=lambda col:'# of imports overall')

meanFeeExportAll = t.groupby(['fromCountry'])[['feeValue']].mean().rename(columns=lambda col:'avg export fee overall')
totalFeeExportAll = t.groupby(['fromCountry'])[['feeValue']].sum().rename(columns=lambda col:'total export fees overall')
meanAgeExportAll = t.groupby(['fromCountry'])[['age']].mean().rename(columns=lambda col:'avg export age overall')
paidExportAll = t[t.feeValue>0].groupby(['fromCountry'])[['id']].count().rename(columns=lambda col:'# of paid exports overall')
freeExportAll= t[t.feeValue==0].groupby(['fromCountry'])[['id']].count().rename(columns=lambda col:'# of free exports overall')
totalExportAll = t.groupby(['fromCountry'])[['id']].count().rename(columns=lambda col:'# of exports overall')

meanDurationAll = s.groupby(['fromCountry'])[['duration']].mean().rename(columns=lambda col:'avg duration overall')
volumeAll = s.groupby(['fromCountry'])[['id']].count().rename(columns=lambda col:'21-plus transfer volume overall')

fulljoin=countryInfo.join(meanFeeImportPeriod).join(totalFeeImportPeriod).join(meanAgeImportPeriod).join(paidImportPeriod).join(freeImportPeriod)\
    .join(totalImportPeriod).join(meanFeeIExportPeriod).join(totalFeeExportPeriod).join(meanAgeExportPeriod).join(paidExportPeriod)\
    .join(freeExportPeriod).join(totalExportPeriod).join(meanDurationPeriod).join(volumePeriod)\
    .join(meanFeeImportAll).join(totalFeeImportAll).join(meanAgeImportAll).join(paidImportAll).join(freeImportAll)\
    .join(totalImportAll).join(meanFeeExportAll).join(totalFeeExportAll).join(meanAgeExportAll).join(paidExportAll)\
    .join(freeExportAll).join(totalExportAll).join(meanDurationAll).join(volumeAll)
    
fulljoin.to_excel('../data/output/country_summary.xlsx')
fulljoin


Out[111]:
National team coeff 2012 coeff 2013 coeff 2014 coeff 2015 coeff total coeff GDP 2010 GDP 2011 GDP 2012 GDP 2013 ... # of free imports overall # of imports overall avg export fee overall total export fees overall avg export age overall # of paid exports overall # of free exports overall # of exports overall avg duration overall 21-plus transfer volume overall
Country
Germany 40236 17.928 14.714 15.857 16.428 64.927 3418.371 3755.549 3535.199 3831.427 ... 1729.0 3507.0 4.966945e+05 1.280975e+09 27.401203 439.0 2140.0 4156.0 785.471215 7962.0
Spain 37962 17.714 23.000 20.214 23.928 84.856 1434.257 1495.968 1356.483 1393.476 ... 972.0 2155.0 1.652338e+06 2.932900e+09 28.050064 459.0 1316.0 3136.0 791.597415 7640.0
England 35963 16.428 16.785 13.571 14.250 61.034 2409.409 2594.114 2624.291 2680.123 ... 1345.0 3617.0 8.657902e+05 2.113394e+09 27.175012 419.0 2022.0 4276.0 876.007576 7969.0
Portugal 35138 11.750 9.916 9.083 10.500 41.249 238.748 245.120 216.488 224.983 ... 915.0 2168.0 1.226864e+06 1.828028e+09 26.929966 414.0 1076.0 2613.0 705.026633 5375.0
Belgium 34442 6.500 6.400 9.600 7.400 29.900 485.307 528.721 499.129 524.970 ... 1260.0 2512.0 3.758827e+05 5.863770e+08 26.734412 262.0 1298.0 2967.0 764.369454 5416.0
Italy 34345 14.416 14.166 19.000 11.500 59.082 2130.586 2280.315 2076.370 2137.615 ... 661.0 2594.0 2.090930e+06 2.553026e+09 27.883022 365.0 856.0 3163.0 746.764949 8595.0
Netherlands 33679 4.214 5.916 6.083 5.750 21.963 837.949 894.576 823.595 853.806 ... 852.0 1735.0 8.212738e+05 1.205630e+09 26.779124 363.0 1105.0 2558.0 903.022510 4789.0
France 33599 11.750 8.500 10.916 11.083 42.249 2651.772 2865.304 2688.210 2807.306 ... 1129.0 2511.0 9.595873e+05 2.285737e+09 26.222384 547.0 1835.0 4121.0 892.793771 6643.0
Russia 31345 9.750 10.416 9.666 11.500 41.332 1524.915 1904.790 2015.154 2079.134 ... 640.0 2091.0 5.642458e+05 5.693240e+08 26.746038 176.0 833.0 2398.0 687.315219 5231.0
Switzerland 31254 8.375 7.200 6.900 5.300 27.775 580.696 696.528 665.898 685.871 ... 687.0 1528.0 4.254467e+05 3.829020e+08 26.558775 203.0 697.0 1830.0 712.965558 3481.0
Austria 30932 2.250 7.800 4.125 3.800 17.975 390.383 429.493 407.801 428.456 ... 1145.0 2513.0 1.322733e+05 1.476170e+08 28.183950 130.0 986.0 2218.0 647.855680 5315.0
Croatia 30642 4.375 4.375 6.875 4.500 20.125 59.611 62.172 56.484 57.849 ... 848.0 1626.0 2.961482e+05 3.076980e+08 25.137339 243.0 796.0 2101.0 565.517544 3914.0
Bosnia-Herzegovina 30367 1.250 1.500 1.750 1.500 6.000 16.847 18.318 16.906 17.852 ... 820.0 1758.0 1.391046e+04 1.662300e+07 25.168550 104.0 1091.0 2181.0 476.268970 3255.0
Ukraine 30313 9.500 7.833 10.000 9.800 37.133 136.011 163.307 175.707 179.572 ... 507.0 1693.0 5.778412e+05 4.876980e+08 26.684802 120.0 724.0 2225.0 693.160855 4177.0
Czech Republic 29403 8.500 8.000 3.875 7.300 27.675 207.016 227.307 206.751 208.796 ... 563.0 1300.0 2.104867e+05 2.054350e+08 26.943251 165.0 811.0 1815.0 822.183267 3350.0
Sweden 29028 5.125 3.200 3.900 4.750 16.975 488.378 563.110 NaN 579.526 ... 694.0 1376.0 2.684537e+05 2.201320e+08 26.445906 186.0 634.0 1675.0 983.049724 2700.0
Poland 28306 2.500 3.125 4.750 5.500 15.875 476.528 524.104 496.687 526.031 ... 1036.0 1725.0 1.240000e+05 1.393760e+08 27.639298 168.0 956.0 2107.0 678.449509 5818.0
Romania 28038 6.800 6.875 5.125 2.250 21.050 167.986 186.113 172.041 191.598 ... 868.0 2105.0 1.980330e+05 2.037760e+08 27.493651 194.0 835.0 2522.0 604.264774 6012.0
Slovakia 27171 1.500 1.625 2.750 3.750 9.625 89.173 97.621 92.799 97.743 ... 580.0 1214.0 5.583749e+04 4.707100e+07 26.040050 100.0 743.0 1606.0 685.817049 2308.0
Hungary 27142 3.000 0.875 2.125 1.625 7.625 129.585 139.447 126.825 133.424 ... 906.0 1505.0 5.455117e+04 5.384200e+07 26.976919 94.0 893.0 1863.0 660.914337 4161.0
Denmark 27140 3.300 3.800 2.900 5.500 15.500 319.812 341.498 322.277 335.878 ... 465.0 969.0 3.400883e+05 2.618680e+08 26.665916 215.0 555.0 1336.0 887.447196 2241.0
Turkey 27033 10.200 6.700 6.000 6.600 29.500 731.539 774.729 788.605 821.918 ... 829.0 2159.0 4.105941e+05 3.600910e+08 28.331665 128.0 749.0 2397.0 561.250997 8256.0
Ireland 26902 1.000 0.250 2.000 0.700 3.950 218.843 237.990 222.089 232.150 ... 445.0 698.0 1.586853e+04 7.966000e+06 24.420725 45.0 457.0 969.0 647.597637 1753.0
Greece 26654 4.400 6.100 6.200 5.400 22.100 300.156 289.068 249.663 242.306 ... 1954.0 2622.0 1.136432e+05 2.194450e+08 28.367286 172.0 1759.0 3014.0 598.411266 7806.0
Norway 26439 4.900 2.600 2.200 7.250 16.950 428.527 498.157 509.705 522.349 ... 520.0 1251.0 3.361715e+05 2.195200e+08 27.757416 169.0 484.0 1518.0 964.326931 2783.0
Slovenia 25441 3.250 2.625 4.000 1.000 10.875 48.060 51.299 46.288 48.005 ... 577.0 1174.0 5.986398e+04 5.501500e+07 26.297330 96.0 823.0 1649.0 635.630586 2732.0
Iceland 25388 1.250 2.500 2.500 1.125 7.375 13.261 14.666 14.183 15.330 ... 161.0 315.0 1.438776e+04 2.115000e+06 25.511041 11.0 136.0 317.0 617.102410 362.0
Wales 24521 0.500 0.750 0.125 1.500 2.875 NaN NaN NaN NaN ... 38.0 66.0 3.337500e+04 1.068000e+06 23.823529 5.0 27.0 68.0 485.400000 46.0
Israel 24162 3.250 5.750 1.375 2.250 12.625 NaN NaN NaN NaN ... 378.0 927.0 1.114857e+05 5.462800e+07 26.882406 74.0 416.0 1120.0 608.986837 2576.0
Scotland 23259 4.300 3.250 4.000 3.000 14.550 NaN NaN NaN NaN ... 776.0 1561.0 3.656420e+05 3.502850e+08 26.553463 193.0 765.0 1805.0 799.362390 2867.0
Albania 23216 0.750 2.000 0.875 2.125 5.750 11.927 12.891 12.345 12.916 ... 161.0 341.0 4.706587e+04 7.860000e+06 26.678571 43.0 124.0 336.0 416.561798 455.0
Montenegro 22971 1.375 1.250 0.750 1.000 4.375 4.118 4.500 4.048 4.419 ... 157.0 450.0 6.722527e+04 1.223500e+07 25.211350 41.0 141.0 511.0 467.930328 559.0
Northern Ireland 22961 1.000 0.875 1.375 0.750 4.000 NaN NaN NaN NaN ... 130.0 175.0 5.381356e+03 6.350000e+05 24.000000 5.0 113.0 178.0 582.060403 158.0
Serbia 22127 3.000 2.500 2.750 4.250 12.500 39.035 46.488 40.749 45.520 ... 791.0 1941.0 2.691131e+05 3.307400e+08 25.402431 287.0 942.0 2886.0 535.282310 4735.0
Finland 21181 2.000 0.500 2.400 1.000 5.900 248.262 273.925 256.849 268.281 ... 468.0 886.0 3.881625e+04 2.197000e+07 26.457077 65.0 501.0 1301.0 778.222795 2387.0
Bulgaria 20766 0.750 5.625 4.250 1.000 11.625 48.782 55.799 52.613 54.517 ... 513.0 1203.0 1.475806e+05 1.006500e+08 26.947917 150.0 532.0 1556.0 544.390459 3623.0
Armenia 19476 0.875 1.125 0.375 1.625 4.000 NaN NaN NaN NaN ... 77.0 157.0 3.379747e+04 2.670000e+06 25.108434 8.0 71.0 166.0 483.376471 175.0
Estonia 19429 0.375 1.000 1.500 1.000 3.875 19.529 22.824 22.673 24.888 ... 95.0 203.0 3.158333e+04 3.790000e+06 25.342342 26.0 94.0 222.0 649.088000 254.0
Lithuania 19278 1.125 1.250 0.500 0.750 3.625 37.155 43.478 42.828 46.426 ... 350.0 687.0 1.368778e+04 6.050000e+06 25.130719 29.0 413.0 919.0 544.822798 1170.0
Belarus 18666 4.500 1.750 5.500 5.125 16.875 55.221 59.735 63.615 71.710 ... 247.0 454.0 1.259593e+05 3.400900e+07 26.151452 52.0 218.0 482.0 550.596154 681.0
Georgia 17786 1.500 1.875 1.250 0.625 5.250 NaN NaN NaN NaN ... 137.0 280.0 1.055229e+05 1.614500e+07 25.567797 43.0 110.0 354.0 446.819302 517.0
Azerbaijan 16941 3.000 2.500 3.625 4.375 13.500 NaN NaN NaN NaN ... 318.0 710.0 2.862319e+03 7.900000e+05 28.364553 5.0 271.0 695.0 559.718864 1187.0
Latvia 16911 1.250 1.625 0.250 1.625 4.750 24.112 28.488 28.343 30.838 ... 195.0 362.0 4.162011e+04 7.450000e+06 26.030534 25.0 154.0 393.0 581.174393 468.0
Cyprus 16898 4.000 2.750 3.300 3.000 13.050 25.294 27.114 24.954 24.065 ... 1270.0 1968.0 2.398184e+04 2.376600e+07 28.462775 58.0 933.0 1874.0 511.215501 2777.0
Moldova 16621 2.250 3.375 1.750 1.250 8.625 5.813 7.018 7.283 7.985 ... 164.0 412.0 7.267857e+04 1.221000e+07 26.217290 28.0 140.0 428.0 493.334029 511.0
Macedonia 15521 1.250 0.500 1.125 1.500 4.375 9.433 10.659 9.751 10.774 ... 201.0 377.0 3.119149e+04 7.330000e+06 25.286682 33.0 202.0 443.0 432.199584 530.0
Kazakhstan 14101 1.375 3.125 3.375 4.625 12.500 NaN NaN NaN NaN ... 421.0 799.0 1.566901e+03 4.450000e+05 27.974860 4.0 280.0 716.0 564.544172 1715.0
Luxembourg 13821 1.375 1.500 0.500 0.750 4.125 52.241 59.010 56.323 60.150 ... 703.0 919.0 2.668213e+02 1.150000e+05 28.164972 2.0 429.0 894.0 955.807912 2310.0
Liechtenstein 12725 NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Faroe Islands 12681 0.500 0.875 1.375 0.375 3.125 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Malta 11310 0.875 0.875 0.125 0.875 2.750 8.757 9.604 9.289 10.056 ... 230.0 655.0 3.183857e+03 7.100000e+05 27.237774 6.0 217.0 601.0 536.053232 1301.0
Andorra 8520 0.000 0.333 0.500 0.166 0.999 NaN NaN NaN NaN ... 3.0 11.0 0.000000e+00 0.000000e+00 26.714286 NaN 4.0 7.0 377.000000 8.0
San Marino 8110 0.000 0.333 0.000 0.000 0.333 2.143 2.056 1.802 1.802 ... NaN 29.0 0.000000e+00 0.000000e+00 25.692308 NaN 2.0 26.0 478.058824 19.0
Gibraltar 7300 NaN NaN 0.250 0.750 1.000 NaN NaN NaN NaN ... 14.0 19.0 0.000000e+00 0.000000e+00 28.000000 NaN 3.0 5.0 176.600000 5.0

54 rows × 75 columns


In [62]:
t=nonloans[(nonloans.season>2000)&(nonloans.fromCountry!=nonloans.toCountry)]
s=nonloans[(nonloans.season>2000)&(nonloans.age>21)]

meanFeeImportPeriod = t.groupby(['toTeamId','period']).feeValue.mean().unstack().rename(columns=lambda col:'avg import fee '+col)
totalFeeImportPeriod = t.groupby(['toTeamId','period']).feeValue.sum().unstack().rename(columns=lambda col:'total import fees '+col)
meanAgeImportPeriod = t.groupby(['toTeamId','period']).age.mean().unstack().rename(columns=lambda col:'avg import age '+col)
paidImportPeriod = t[t.feeValue>0].groupby(['toTeamId','period']).id.count().unstack().rename(columns=lambda col:'# of paid imports '+col)
freeImportPeriod = t[t.feeValue==0].groupby(['toTeamId','period']).id.count().unstack().rename(columns=lambda col:'# of free imports '+col)
totalImportPeriod = t.groupby(['toTeamId','period']).id.count().unstack().rename(columns=lambda col:'# of imports '+col)

meanFeeIExportPeriod = t.groupby(['fromTeamId','period']).feeValue.mean().unstack().rename(columns=lambda col:'avg export fee '+col)
totalFeeExportPeriod = t.groupby(['fromTeamId','period']).feeValue.sum().unstack().rename(columns=lambda col:'total export fees '+col)
meanAgeExportPeriod = t.groupby(['fromTeamId','period']).age.mean().unstack().rename(columns=lambda col:'avg export age '+col)
paidExportPeriod = t[t.feeValue>0].groupby(['fromTeamId','period']).id.count().unstack().rename(columns=lambda col:'# of paid exports '+col)
freeExportPeriod = t[t.feeValue==0].groupby(['fromTeamId','period']).id.count().unstack().rename(columns=lambda col:'# of free exports '+col)
totalExportPeriod = t.groupby(['fromTeamId','period']).id.count().unstack().rename(columns=lambda col:'# of exports '+col)

meanDurationPeriod = s.groupby(['fromTeamId','period']).duration.mean().unstack().rename(columns=lambda col:'avg duration '+col)

meanFeeImportAll = t.groupby(['toTeamId'])[['feeValue']].mean().rename(columns=lambda col:'avg import fee overall')
totalFeeImportAll = t.groupby(['toTeamId'])[['feeValue']].sum().rename(columns=lambda col:'total import fees overall')
meanAgeImportAll = t.groupby(['toTeamId'])[['age']].mean().rename(columns=lambda col:'avg import age overall')
paidImportAll = t[t.feeValue>0].groupby(['toTeamId'])[['id']].count().rename(columns=lambda col:'# of paid imports overall')
freeImportAll = t[t.feeValue==0].groupby(['toTeamId'])[['id']].count().rename(columns=lambda col:'# of free imports overall')
totalImportAll = t.groupby(['toTeamId'])[['id']].count().rename(columns=lambda col:'# of imports overall')

meanFeeExportAll = t.groupby(['fromTeamId'])[['feeValue']].mean().rename(columns=lambda col:'avg export fee overall')
totalFeeExportAll = t.groupby(['fromTeamId'])[['feeValue']].sum().rename(columns=lambda col:'total export fees overall')
meanAgeExportAll = t.groupby(['fromTeamId'])[['age']].mean().rename(columns=lambda col:'avg export age overall')
paidExportAll = t[t.feeValue>0].groupby(['fromTeamId'])[['id']].count().rename(columns=lambda col:'# of paid exports overall')
freeExportAll= t[t.feeValue==0].groupby(['fromTeamId'])[['id']].count().rename(columns=lambda col:'# of free exports overall')
totalExportAll = t.groupby(['fromTeamId'])[['id']].count().rename(columns=lambda col:'# of exports overall')

meanDurationAll = s.groupby(['fromTeamId'])[['duration']].mean().rename(columns=lambda col:'avg duration overall')

clubjoin=club_coeff.join(meanFeeImportPeriod).join(totalFeeImportPeriod).join(meanAgeImportPeriod).join(paidImportPeriod).join(freeImportPeriod)\
    .join(totalImportPeriod).join(meanFeeIExportPeriod).join(totalFeeExportPeriod).join(meanAgeExportPeriod).join(paidExportPeriod)\
    .join(freeExportPeriod).join(totalExportPeriod).join(meanDurationPeriod)\
    .join(meanFeeImportAll).join(totalFeeImportAll).join(meanAgeImportAll).join(paidImportAll).join(freeImportAll)\
    .join(totalImportAll).join(meanFeeIExportAll).join(totalFeeExportAll).join(meanAgeExportAll).join(paidExportAll)\
    .join(freeExportAll).join(totalExportAll).join(meanDurationAll)

ew = pd.ExcelWriter('../data/output/top_club_summary.xls', encoding='utf-8')
j=clubjoin.merge(fulljoin,how='left',left_on='Association',right_index=True,suffixes=('',', country'))
j.to_excel(ew)
ew.save()

In [2]:
%ls ../data/reference/GCB2013_DataPack


GCB2013_Data.xls*                   GCB_CountryCoverage_2003to2013.xls*
GCB2013_Methodology_EN.pdf          GCB_Questions_2003to2013.xls*

In [18]:
bribes=pd.read_excel('../data/reference/GCB2013_DataPack/GCB2013_Data.xls',sheetname=7,skiprows=3)

bribes=bribes[['Country']+[bribes.columns[-1]]][1:].set_index('Country')

bribes.ix['Germany']


Out[18]:
Have you paid a bribe to any of the 8 services listed?
Country
Germany NaN
Germany NaN

In [134]:
t=countryInfo.join(nonloans[(nonloans.toCountry!=nonloans.fromCountry)|(nonloans.age>21)].groupby(['toCountry','season'])['id'].count().unstack()).groupby('marketTier').sum().T[-20:] #.plot.line()
t[3]=t[3]+t[4]
t=t.drop(4,axis=1)
s=t.sum(axis=1)
t[1]=t[1]/s
t[2]=t[2]/s
t[3]=t[3]/s
t.plot.area(stacked=True,figsize=(10,10))


Out[134]:
<matplotlib.axes._subplots.AxesSubplot at 0x129b9f9d0>

In [109]:
countryInfo['Corruption Level']=countryInfo['Corruption'].apply(lambda x: math.floor(x/20))
countryInfo['Corruption Level']=countryInfo['Corruption Rank'].fillna(1).apply(lambda x: math.floor(x/10))
t=countryInfo.join(nonloans.groupby(['toCountry','season'])['id'].count().unstack())
t=t[t['marketTier']==2].groupby('Corruption Level').sum().T[-20:] #.plot.line()
s=t.sum(axis=1)
for c in t.columns:
    t[c]=t[c]/s
t.plot.line(stacked=False,figsize=(10,10))


Out[109]:
<matplotlib.axes._subplots.AxesSubplot at 0x114a7d1d0>

In [121]:
countryInfo[countryInfo.marketTier>=3].count()


Out[121]:
National team coeff    37
2012 coeff             35
2013 coeff             35
2014 coeff             36
2015 coeff             36
total coeff            36
GDP 2010               27
GDP 2011               27
GDP 2012               26
GDP 2013               27
GDP 2014               27
Unemployment           33
Corruption             31
Population             33
Growth %               33
federation             37
marketTier             37
Corruption Rank        21
Bribe %                21
Corruption Level       37
dtype: int64

In [132]:
nonloans[(nonloans.feeValue>0)&(nonloans.fromCountry!=nonloans.toCountry)].groupby('season').feeValue.sum().plot.line()
#nonloans[nonloans.fromCountry!=nonloans.toCountry].groupby('season').feeValue.mean().plot.line()


Out[132]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c124510>

In [135]:
countryInfo['marketTier'].sort_values()


Out[135]:
Country
Germany               1.0
Spain                 1.0
England               1.0
Italy                 1.0
France                1.0
Scotland              2.0
Greece                2.0
Denmark               2.0
Ukraine               2.0
Austria               2.0
Turkey                2.0
Russia                2.0
Netherlands           2.0
Belgium               2.0
Portugal              2.0
Switzerland           2.0
Croatia               3.0
Cyprus                3.0
Belarus               3.0
Lithuania             3.0
Estonia               3.0
Bulgaria              3.0
Finland               3.0
Serbia                3.0
Kazakhstan            3.0
Luxembourg            3.0
Israel                3.0
Malta                 3.0
Slovenia              3.0
Ireland               3.0
Azerbaijan            3.0
Hungary               3.0
Slovakia              3.0
Romania               3.0
Poland                3.0
Sweden                3.0
Czech Republic        3.0
Bosnia-Herzegovina    3.0
Latvia                3.0
Andorra               4.0
Macedonia             4.0
Moldova               4.0
Liechtenstein         4.0
Iceland               4.0
Armenia               4.0
Northern Ireland      4.0
Montenegro            4.0
Albania               4.0
Wales                 4.0
San Marino            4.0
Norway                4.0
Georgia               4.0
Gibraltar             4.0
Faroe Islands         NaN
Name: marketTier, dtype: float64