"And since 2004, when ten Eastern European countries joined the European Union, much of that growth has been concentrated in the east – while the number of transfers per year in Western Europe increased by 20% from 2004 to 2015, the number of transfers per year in Eastern Europe increased by a whopping 120%."



In [77]:

    
big5=['GB1','ES1','L1','IT1','FR1']
tier2=['PO1','NL1','SC1','BE1']
north=['DK1','SE1','NO1','A1','C1','LUX1','FI1','IR1','IS1','WAL','NIR','FARO','FI1']
east=['PL1','UNG1','TS1','SLO1','SL1','AZ1','EST1','LET1','ARM1','LI1','WER1','UKR1','RU1','TR1','KAS1','ISR1']
southeast=['GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','ALB1','MNE1','MAL1','MOL1','MAZ1','MAL1']

def category(league):
    if league in big5:
        return '1 - big 5'
    elif league in tier2:
        return '2 - tier 2'
    elif league in north:
        return '3 - north'
    elif league in east:
        return '4 - east'
    elif league in southeast:
        return '5 - southeast'
    else:
        return 'WTF'
    
def category(league):
    if league in big5:
        return '1 - WEST'
    elif league in tier2:
        return 'WTF'
    elif league in north:
        return 'WTF'
    elif league in east:
        return '2 - EAST'
    elif league in southeast:
        return '2 - EAST'
    else:
        return 'WTF'
    
t=nonloans[['toCountry','toLeague']].drop_duplicates().rename(columns={'toCountry':'country','toLeague':'league'})
    
nonloans['category']=nonloans.fromLeague.apply(category)
t['category']=t.league.apply(category)

table=nonloans.merge(t,how='left',left_on='toCountry',right_on='country',suffixes=('','Country')) #[(nonloans.season>1994)&(nonloans.age>0)].groupby(['season','categoryCountry'])['feeValue'].sum().unstack()

table=table[(table.season>1994)&(table.age>23)&(table.duration.notnull())].groupby(['season','categoryCountry'])['duration'].describe().unstack()

#table=table.drop('WTF',axis=1)



In [83]:

    
def getFreesAndNotFrees(t):
    t1=t[t.toCountry.notnull()]
    free=len(t1[t1.feeValue==0])
    paid=len(t1[(t1.feeValue>0)&(t1.feeValue.notnull())])
    return free/(float(free)+paid),free,paid,free+paid
z=nonloans[(nonloans.season>1994)&(nonloans.toLeague.isin(big5+tier2+north+east+southeast))].groupby('season').apply(getFreesAndNotFrees)
z









    Out[83]:





season
1995       (0.575163398693, 176, 130, 306)
1996       (0.595698924731, 277, 188, 465)
1997       (0.587328767123, 343, 241, 584)
1998       (0.578581363004, 416, 303, 719)
1999       (0.589524969549, 484, 337, 821)
2000       (0.60797034291, 656, 423, 1079)
2001      (0.629921259843, 800, 470, 1270)
2002      (0.76217765043, 1064, 332, 1396)
2003     (0.783412572636, 1483, 410, 1893)
2004     (0.760352760736, 1983, 625, 2608)
2005     (0.721873968987, 2188, 843, 3031)
2006     (0.689202178789, 2151, 970, 3121)
2007     (0.61351511559, 2070, 1304, 3374)
2008    (0.632174392936, 2291, 1333, 3624)
2009     (0.736756756757, 2726, 974, 3700)
2010    (0.736408142231, 2858, 1023, 3881)
2011    (0.738681102362, 3002, 1062, 4064)
2012     (0.749746450304, 2957, 987, 3944)
2013     (0.771331883007, 3191, 946, 4137)
2014     (0.800292255236, 3286, 820, 4106)
2015     (0.781719924812, 3327, 929, 4256)
dtype: object



In [52]:

    
table.plot.area(stacked=True)









    Out[52]:





<matplotlib.axes._subplots.AxesSubplot at 0x108dcf990>



In [55]:

    
(table[table.columns[:4]].sum(axis=1)/table.sum(axis=1)).plot.line()









    Out[55]:





<matplotlib.axes._subplots.AxesSubplot at 0x10bc599d0>



In [16]:

    
t[t.categoryCountry!='WTF']









    Out[16]:






  
    
      
      country
      league
      categoryCountry
    
  
  
    
      2
      England
      GB1
      1 - big 5
    
    
      7
      Ireland
      IR1
      3 - north
    
    
      121
      Scotland
      SC1
      2 - tier 2
    
    
      136
      Denmark
      DK1
      3 - north
    
    
      2344
      Cyprus
      ZYP1
      5 - southeast
    
    
      2391
      Iceland
      IS1
      3 - north
    
    
      3615
      Hungary
      UNG1
      4 - east
    
    
      3803
      Bosnia-Herzegovina
      BOS1
      5 - southeast
    
    
      3900
      Sweden
      SE1
      3 - north
    
    
      3946
      France
      FR1
      1 - big 5
    
    
      3948
      Switzerland
      C1
      3 - north
    
    
      4091
      Azerbaijan
      AZ1
      4 - east
    
    
      4127
      Moldova
      MOL1
      5 - southeast
    
    
      4183
      Belgium
      BE1
      2 - tier 2
    
    
      4219
      Russia
      RU1
      4 - east
    
    
      4266
      Macedonia
      MAZ1
      5 - southeast
    
    
      4322
      Albania
      ALB1
      5 - southeast
    
    
      4326
      Latvia
      LET1
      4 - east
    
    
      4509
      Portugal
      PO1
      2 - tier 2
    
    
      4528
      Turkey
      TR1
      4 - east
    
    
      4728
      Belarus
      WER1
      4 - east
    
    
      4844
      Montenegro
      MNE1
      5 - southeast
    
    
      5007
      Netherlands
      NL1
      2 - tier 2
    
    
      5204
      Bulgaria
      BU1
      5 - southeast
    
    
      5341
      Armenia
      ARM1
      4 - east
    
    
      5384
      Estonia
      EST1
      4 - east
    
    
      5588
      Greece
      GR1
      5 - southeast
    
    
      5591
      Ukraine
      UKR1
      4 - east
    
    
      5606
      Serbia
      SER1
      5 - southeast
    
    
      5879
      Luxembourg
      LUX1
      3 - north
    
    
      10747
      Germany
      L1
      1 - big 5
    
    
      11574
      Austria
      A1
      3 - north
    
    
      11725
      Finland
      FI1
      3 - north
    
    
      12064
      Croatia
      KR1
      5 - southeast
    
    
      12212
      Norway
      NO1
      3 - north
    
    
      14579
      Poland
      PL1
      4 - east
    
    
      14657
      Italy
      IT1
      1 - big 5
    
    
      14781
      Malta
      MAL1
      4 - east
    
    
      14816
      Lithuania
      LI1
      4 - east
    
    
      15072
      Israel
      ISR1
      4 - east
    
    
      16244
      Slovenia
      SL1
      4 - east
    
    
      17369
      Slovakia
      SLO1
      4 - east
    
    
      17740
      Faroe Island
      FARO
      3 - north
    
    
      19562
      Spain
      ES1
      1 - big 5
    
    
      20914
      Czech Republic
      TS1
      4 - east
    
    
      21887
      Kazakhstan
      KAS1
      4 - east
    
    
      22683
      Romania
      RO1
      5 - southeast



In [88]:

    
(table[table.columns[4:]].sum(axis=1))/table.sum(axis=1)









    Out[88]:





season
1995    0.003007
1996    0.006897
1997    0.009604
1998    0.005837
1999    0.023176
2000    0.012394
2001    0.019861
2002    0.010974
2003    0.013466
2004    0.014965
2005    0.019698
2006    0.023021
2007    0.032792
2008    0.045546
2009    0.022018
2010    0.018827
2011    0.013259
2012    0.010392
2013    0.008632
2014    0.013272
2015    0.011675
dtype: float64



In [61]:

    
f=(lambda x: x.ix[2015]/float(x.ix[2004]))
print f(table[table.columns[:3]].sum(axis=1))
print f(table[table.columns[3:]].sum(axis=1))









    



2.95487421325
1.35105045158



In [27]:

    
(4134+2104+2977 )/(3271+1569+1524.  )
(4134+2104+2977 )/(3512+1699+1799. )









    Out[27]:





1.3145506419400856



In [26]:

    
(5590+4288 )/(2727+1428. )
(5590+4288 )/(3209+1836. )









    Out[26]:





1.957978196233895



In [2]:

    
nonloans.merge(t,how='left',left_on='fromCountry',right_on='country',suffixes=('','Country'))[(nonloans.season>1994)&(nonloans['category']!='WTF')].groupby(['season','categoryCountry'])['id'].count().unstack().plot.area()









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-51d600d76b12> in <module>()
----> 1 nonloans.merge(t,how='left',left_on='fromCountry',right_on='country',suffixes=('','Country'))[(nonloans.season>1994)&(nonloans['category']!='WTF')].groupby(['season','categoryCountry'])['id'].count().unstack().plot.area()

NameError: name 't' is not defined



In [91]:

    
transfers[transfers.season>1994].groupby('season')[['duration','durationWithLoans']].mean() #.plot.line()









    Out[91]:






  
    
      
      duration
      durationWithLoans
    
    
      season
      
      
    
  
  
    
      1995.0
      753.199219
      711.217668
    
    
      1996.0
      739.529543
      692.462487
    
    
      1997.0
      716.974171
      685.197794
    
    
      1998.0
      723.825450
      683.028843
    
    
      1999.0
      696.709514
      644.470947
    
    
      2000.0
      704.939108
      651.704998
    
    
      2001.0
      726.451448
      672.095318
    
    
      2002.0
      751.084050
      690.670323
    
    
      2003.0
      744.191820
      674.670266
    
    
      2004.0
      732.657691
      657.059183
    
    
      2005.0
      693.240520
      615.167929
    
    
      2006.0
      681.911796
      599.275364
    
    
      2007.0
      660.797215
      576.776976
    
    
      2008.0
      636.359997
      554.113707
    
    
      2009.0
      617.524194
      535.018812
    
    
      2010.0
      616.688243
      531.324944
    
    
      2011.0
      609.097860
      525.996771
    
    
      2012.0
      599.201683
      518.497293
    
    
      2013.0
      586.571652
      507.756664
    
    
      2014.0
      589.456897
      505.400147
    
    
      2015.0
      588.278417
      503.978023



In [94]:

    
503/366.









    Out[94]:





1.3743169398907105



In [1]:

    
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import numpy as np
import re
import os.path 
import datetime 
import math
import matplotlib.pyplot as plt

transfers = pd.read_csv('../data/merged/transfers_q.csv')
loans = pd.read_csv('../data/merged/loans.csv')
nonloans = pd.read_csv('../data/merged/nonloans.csv')
players=pd.read_csv('../data/merged/players.csv')
leagueTables=pd.read_csv('../data/merged/leagueTables.csv')

leagueCodeMapping = {'GB1':'Premier League (England)','FR1':'Ligue 1 (France)',
                                                         'ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)',
                                                         'PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}

teamIdMap = pd.concat([transfers[['toTeamId','toTeamName','season']].rename(columns={'toTeamId':'teamId','toTeamName':'teamName'}),
    transfers[['fromTeamId','fromTeamName','season']].rename(columns={'fromTeamId':'teamId','fromTeamName':'teamName'})],ignore_index=True) \
    .sort_values('season',ascending=False).drop_duplicates('teamId').set_index('teamId')['teamName'].to_dict()
    
agentIdMap = transfers[['playersAgentId','playersAgent']].sort_values('playersAgent',ascending=False).drop_duplicates('playersAgentId').set_index('playersAgentId')['playersAgent'].to_dict()

leagueList = ['GB1','ES1','L1','IT1','FR1','PO1','NL1','TR1','RU1','BE1','MLS1','UKR1','GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','DK1','SE1','NO1','LI1','SC1','ISR1','PL1','UNG1','A1','C1','MAL1','KAS1','TS1','SLO1','SL1','FI1','LUX1','AZ1','IR1']

countryInfo=pd.read_excel('../data/reference/countries.xls').set_index('country')
countryInfo.marketTier=np.where(countryInfo.marketTier==4,3,countryInfo.marketTier)

top20clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:20].index
top50clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:50].index
top100clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:100].index

uefa=countryInfo[countryInfo.federation=='UEFA'].index
caf=countryInfo[countryInfo.federation=='CAF'].index
concacaf=countryInfo[countryInfo.federation=='CONCACAF'].index
concacaf_no_uscan=countryInfo[countryInfo.federation=='CONCACAF'].drop(['United States','Canada']).index
conmebol=countryInfo[countryInfo.federation=='CONMEBOL'].index
afc=countryInfo[countryInfo.federation=='AFC'].index
afc_no_anzac=countryInfo[countryInfo.federation=='AFC'].drop(['Australia','New Zealand']).index

country_coeff=pd.read_csv('../data/reference/country_coefficient2.csv').reset_index(drop=True).set_index('Country').rename(index={
        'Bosnia and Herzegovina': 'Bosnia-Herzegovina',
        'Republic of Ireland': 'Ireland',
        'FYR Macedonia': 'Macedonia',
        'Faroe Islands':'Faroe Islands'
    })

natteam_coeff=pd.read_table('../data/reference/national_team_coefficient.csv').reset_index(drop=True).drop('Rank',axis=1)
natteam_coeff['Country']=natteam_coeff['Country'].str.strip()
natteam_coeff.Coefficient=natteam_coeff.Coefficient.apply(lambda s: int(s.replace(',','')))
natteam_coeff=natteam_coeff.rename(columns={'Coefficient':'National team coeff'}).set_index('Country')

club_coeff=pd.read_csv('../data/reference/club_coefficients.csv')[['rank','clubId','clubName','coeff12','coeff13','coeff14','coeff15','points']]

gdp=pd.read_table('../data/reference/country_gdp.csv').reset_index(drop=True).drop('2014 Rank',axis=1).rename(columns={'Country ':'Country'})
gdp.Country=gdp.Country.str.strip()
gdp=gdp.set_index('Country')
gdp.columns=['GDP '+c for c in gdp.columns.str.strip()]
for column in gdp.columns:
    gdp[column]=gdp[column].apply(lambda s: float(str(s).replace(',','')))
    
unemployed=pd.read_table('../data/reference/country_unemployment.csv').drop(['Date','Employment rate'],axis=1)
unemployed.columns=['Country','Unemployment']
unemployed['Country']=unemployed['Country'].str.split(' ').apply(lambda arr: ' '.join(arr[:len(arr)/2]))
unemployed=unemployed.set_index('Country').rename(index={'United Kingdom':'England'})

corruption=pd.read_table('../data/reference/country_corruption.csv').drop(['Rank'],axis=1)
corruption.Country=corruption.Country.str.strip()
corruption=corruption.set_index('Country')
corruption=corruption[corruption.columns[:1]].rename(columns={'2015 score':'Corruption'}).rename(index={'United Kingdom':'England'})

population=pd.read_table('../data/reference/country_population.csv').drop(['Rank'],axis=1)
population.columns=population.columns.str.strip()
population.Country=population.Country.str.split('[',expand=True)[0].str.strip()
population['Population']=population['Population'].apply(lambda s: float(str(s).replace(',','')))
population=population[['Country','Population','Growth %']].set_index('Country').rename(index={'United Kingdom':'England'})

barometer=pd.read_table('../data/reference/country_corruption_barometer.csv').reset_index(drop=True)
barometer.columns=['Corruption Rank','Country','Bribe %']
barometer.Country=barometer.Country.str.strip()
barometer=barometer.set_index('Country').rename(index={'United Kingdom':'England'})

countryInfo=natteam_coeff.join(country_coeff).join(gdp).join(unemployed).join(corruption).join(population).join(countryInfo).join(barometer)









    



/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (0,38,40,41,45,47,49,55) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (45,47) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (40,45,47,51) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (32,34) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

	country	league	categoryCountry
2	England	GB1	1 - big 5
7	Ireland	IR1	3 - north
121	Scotland	SC1	2 - tier 2
136	Denmark	DK1	3 - north
2344	Cyprus	ZYP1	5 - southeast
2391	Iceland	IS1	3 - north
3615	Hungary	UNG1	4 - east
3803	Bosnia-Herzegovina	BOS1	5 - southeast
3900	Sweden	SE1	3 - north
3946	France	FR1	1 - big 5
3948	Switzerland	C1	3 - north
4091	Azerbaijan	AZ1	4 - east
4127	Moldova	MOL1	5 - southeast
4183	Belgium	BE1	2 - tier 2
4219	Russia	RU1	4 - east
4266	Macedonia	MAZ1	5 - southeast
4322	Albania	ALB1	5 - southeast
4326	Latvia	LET1	4 - east
4509	Portugal	PO1	2 - tier 2
4528	Turkey	TR1	4 - east
4728	Belarus	WER1	4 - east
4844	Montenegro	MNE1	5 - southeast
5007	Netherlands	NL1	2 - tier 2
5204	Bulgaria	BU1	5 - southeast
5341	Armenia	ARM1	4 - east
5384	Estonia	EST1	4 - east
5588	Greece	GR1	5 - southeast
5591	Ukraine	UKR1	4 - east
5606	Serbia	SER1	5 - southeast
5879	Luxembourg	LUX1	3 - north
10747	Germany	L1	1 - big 5
11574	Austria	A1	3 - north
11725	Finland	FI1	3 - north
12064	Croatia	KR1	5 - southeast
12212	Norway	NO1	3 - north
14579	Poland	PL1	4 - east
14657	Italy	IT1	1 - big 5
14781	Malta	MAL1	4 - east
14816	Lithuania	LI1	4 - east
15072	Israel	ISR1	4 - east
16244	Slovenia	SL1	4 - east
17369	Slovakia	SLO1	4 - east
17740	Faroe Island	FARO	3 - north
19562	Spain	ES1	1 - big 5
20914	Czech Republic	TS1	4 - east
21887	Kazakhstan	KAS1	4 - east
22683	Romania	RO1	5 - southeast

	duration	durationWithLoans
season
1995.0	753.199219	711.217668
1996.0	739.529543	692.462487
1997.0	716.974171	685.197794
1998.0	723.825450	683.028843
1999.0	696.709514	644.470947
2000.0	704.939108	651.704998
2001.0	726.451448	672.095318
2002.0	751.084050	690.670323
2003.0	744.191820	674.670266
2004.0	732.657691	657.059183
2005.0	693.240520	615.167929
2006.0	681.911796	599.275364
2007.0	660.797215	576.776976
2008.0	636.359997	554.113707
2009.0	617.524194	535.018812
2010.0	616.688243	531.324944
2011.0	609.097860	525.996771
2012.0	599.201683	518.497293
2013.0	586.571652	507.756664
2014.0	589.456897	505.400147
2015.0	588.278417	503.978023