"And since 2004, when ten Eastern European countries joined the European Union, much of that growth has been concentrated in the east – while the number of transfers per year in Western Europe increased by 20% from 2004 to 2015, the number of transfers per year in Eastern Europe increased by a whopping 120%."


In [77]:
big5=['GB1','ES1','L1','IT1','FR1']
tier2=['PO1','NL1','SC1','BE1']
north=['DK1','SE1','NO1','A1','C1','LUX1','FI1','IR1','IS1','WAL','NIR','FARO','FI1']
east=['PL1','UNG1','TS1','SLO1','SL1','AZ1','EST1','LET1','ARM1','LI1','WER1','UKR1','RU1','TR1','KAS1','ISR1']
southeast=['GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','ALB1','MNE1','MAL1','MOL1','MAZ1','MAL1']

def category(league):
    if league in big5:
        return '1 - big 5'
    elif league in tier2:
        return '2 - tier 2'
    elif league in north:
        return '3 - north'
    elif league in east:
        return '4 - east'
    elif league in southeast:
        return '5 - southeast'
    else:
        return 'WTF'
    
def category(league):
    if league in big5:
        return '1 - WEST'
    elif league in tier2:
        return 'WTF'
    elif league in north:
        return 'WTF'
    elif league in east:
        return '2 - EAST'
    elif league in southeast:
        return '2 - EAST'
    else:
        return 'WTF'
    
t=nonloans[['toCountry','toLeague']].drop_duplicates().rename(columns={'toCountry':'country','toLeague':'league'})
    
nonloans['category']=nonloans.fromLeague.apply(category)
t['category']=t.league.apply(category)

table=nonloans.merge(t,how='left',left_on='toCountry',right_on='country',suffixes=('','Country')) #[(nonloans.season>1994)&(nonloans.age>0)].groupby(['season','categoryCountry'])['feeValue'].sum().unstack()

table=table[(table.season>1994)&(table.age>23)&(table.duration.notnull())].groupby(['season','categoryCountry'])['duration'].describe().unstack()

#table=table.drop('WTF',axis=1)

In [83]:
def getFreesAndNotFrees(t):
    t1=t[t.toCountry.notnull()]
    free=len(t1[t1.feeValue==0])
    paid=len(t1[(t1.feeValue>0)&(t1.feeValue.notnull())])
    return free/(float(free)+paid),free,paid,free+paid
z=nonloans[(nonloans.season>1994)&(nonloans.toLeague.isin(big5+tier2+north+east+southeast))].groupby('season').apply(getFreesAndNotFrees)
z


Out[83]:
season
1995       (0.575163398693, 176, 130, 306)
1996       (0.595698924731, 277, 188, 465)
1997       (0.587328767123, 343, 241, 584)
1998       (0.578581363004, 416, 303, 719)
1999       (0.589524969549, 484, 337, 821)
2000       (0.60797034291, 656, 423, 1079)
2001      (0.629921259843, 800, 470, 1270)
2002      (0.76217765043, 1064, 332, 1396)
2003     (0.783412572636, 1483, 410, 1893)
2004     (0.760352760736, 1983, 625, 2608)
2005     (0.721873968987, 2188, 843, 3031)
2006     (0.689202178789, 2151, 970, 3121)
2007     (0.61351511559, 2070, 1304, 3374)
2008    (0.632174392936, 2291, 1333, 3624)
2009     (0.736756756757, 2726, 974, 3700)
2010    (0.736408142231, 2858, 1023, 3881)
2011    (0.738681102362, 3002, 1062, 4064)
2012     (0.749746450304, 2957, 987, 3944)
2013     (0.771331883007, 3191, 946, 4137)
2014     (0.800292255236, 3286, 820, 4106)
2015     (0.781719924812, 3327, 929, 4256)
dtype: object

In [52]:
table.plot.area(stacked=True)


Out[52]:
<matplotlib.axes._subplots.AxesSubplot at 0x108dcf990>

In [55]:
(table[table.columns[:4]].sum(axis=1)/table.sum(axis=1)).plot.line()


Out[55]:
<matplotlib.axes._subplots.AxesSubplot at 0x10bc599d0>

In [16]:
t[t.categoryCountry!='WTF']


Out[16]:
country league categoryCountry
2 England GB1 1 - big 5
7 Ireland IR1 3 - north
121 Scotland SC1 2 - tier 2
136 Denmark DK1 3 - north
2344 Cyprus ZYP1 5 - southeast
2391 Iceland IS1 3 - north
3615 Hungary UNG1 4 - east
3803 Bosnia-Herzegovina BOS1 5 - southeast
3900 Sweden SE1 3 - north
3946 France FR1 1 - big 5
3948 Switzerland C1 3 - north
4091 Azerbaijan AZ1 4 - east
4127 Moldova MOL1 5 - southeast
4183 Belgium BE1 2 - tier 2
4219 Russia RU1 4 - east
4266 Macedonia MAZ1 5 - southeast
4322 Albania ALB1 5 - southeast
4326 Latvia LET1 4 - east
4509 Portugal PO1 2 - tier 2
4528 Turkey TR1 4 - east
4728 Belarus WER1 4 - east
4844 Montenegro MNE1 5 - southeast
5007 Netherlands NL1 2 - tier 2
5204 Bulgaria BU1 5 - southeast
5341 Armenia ARM1 4 - east
5384 Estonia EST1 4 - east
5588 Greece GR1 5 - southeast
5591 Ukraine UKR1 4 - east
5606 Serbia SER1 5 - southeast
5879 Luxembourg LUX1 3 - north
10747 Germany L1 1 - big 5
11574 Austria A1 3 - north
11725 Finland FI1 3 - north
12064 Croatia KR1 5 - southeast
12212 Norway NO1 3 - north
14579 Poland PL1 4 - east
14657 Italy IT1 1 - big 5
14781 Malta MAL1 4 - east
14816 Lithuania LI1 4 - east
15072 Israel ISR1 4 - east
16244 Slovenia SL1 4 - east
17369 Slovakia SLO1 4 - east
17740 Faroe Island FARO 3 - north
19562 Spain ES1 1 - big 5
20914 Czech Republic TS1 4 - east
21887 Kazakhstan KAS1 4 - east
22683 Romania RO1 5 - southeast

In [88]:
(table[table.columns[4:]].sum(axis=1))/table.sum(axis=1)


Out[88]:
season
1995    0.003007
1996    0.006897
1997    0.009604
1998    0.005837
1999    0.023176
2000    0.012394
2001    0.019861
2002    0.010974
2003    0.013466
2004    0.014965
2005    0.019698
2006    0.023021
2007    0.032792
2008    0.045546
2009    0.022018
2010    0.018827
2011    0.013259
2012    0.010392
2013    0.008632
2014    0.013272
2015    0.011675
dtype: float64

In [61]:
f=(lambda x: x.ix[2015]/float(x.ix[2004]))
print f(table[table.columns[:3]].sum(axis=1))
print f(table[table.columns[3:]].sum(axis=1))


2.95487421325
1.35105045158

In [27]:
(4134+2104+2977 )/(3271+1569+1524.  )
(4134+2104+2977 )/(3512+1699+1799. )


Out[27]:
1.3145506419400856

In [26]:
(5590+4288 )/(2727+1428. )
(5590+4288 )/(3209+1836. )


Out[26]:
1.957978196233895

In [2]:
nonloans.merge(t,how='left',left_on='fromCountry',right_on='country',suffixes=('','Country'))[(nonloans.season>1994)&(nonloans['category']!='WTF')].groupby(['season','categoryCountry'])['id'].count().unstack().plot.area()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-51d600d76b12> in <module>()
----> 1 nonloans.merge(t,how='left',left_on='fromCountry',right_on='country',suffixes=('','Country'))[(nonloans.season>1994)&(nonloans['category']!='WTF')].groupby(['season','categoryCountry'])['id'].count().unstack().plot.area()

NameError: name 't' is not defined

In [91]:
transfers[transfers.season>1994].groupby('season')[['duration','durationWithLoans']].mean() #.plot.line()


Out[91]:
duration durationWithLoans
season
1995.0 753.199219 711.217668
1996.0 739.529543 692.462487
1997.0 716.974171 685.197794
1998.0 723.825450 683.028843
1999.0 696.709514 644.470947
2000.0 704.939108 651.704998
2001.0 726.451448 672.095318
2002.0 751.084050 690.670323
2003.0 744.191820 674.670266
2004.0 732.657691 657.059183
2005.0 693.240520 615.167929
2006.0 681.911796 599.275364
2007.0 660.797215 576.776976
2008.0 636.359997 554.113707
2009.0 617.524194 535.018812
2010.0 616.688243 531.324944
2011.0 609.097860 525.996771
2012.0 599.201683 518.497293
2013.0 586.571652 507.756664
2014.0 589.456897 505.400147
2015.0 588.278417 503.978023

In [94]:
503/366.


Out[94]:
1.3743169398907105

In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
from urllib2 import urlopen, Request
from bs4 import BeautifulSoup
from pandas import DataFrame
import pandas as pd
import numpy as np
import re
import os.path 
import datetime 
import math
import matplotlib.pyplot as plt

transfers = pd.read_csv('../data/merged/transfers_q.csv')
loans = pd.read_csv('../data/merged/loans.csv')
nonloans = pd.read_csv('../data/merged/nonloans.csv')
players=pd.read_csv('../data/merged/players.csv')
leagueTables=pd.read_csv('../data/merged/leagueTables.csv')

leagueCodeMapping = {'GB1':'Premier League (England)','FR1':'Ligue 1 (France)',
                                                         'ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)',
                                                         'PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}

teamIdMap = pd.concat([transfers[['toTeamId','toTeamName','season']].rename(columns={'toTeamId':'teamId','toTeamName':'teamName'}),
    transfers[['fromTeamId','fromTeamName','season']].rename(columns={'fromTeamId':'teamId','fromTeamName':'teamName'})],ignore_index=True) \
    .sort_values('season',ascending=False).drop_duplicates('teamId').set_index('teamId')['teamName'].to_dict()
    
agentIdMap = transfers[['playersAgentId','playersAgent']].sort_values('playersAgent',ascending=False).drop_duplicates('playersAgentId').set_index('playersAgentId')['playersAgent'].to_dict()

leagueList = ['GB1','ES1','L1','IT1','FR1','PO1','NL1','TR1','RU1','BE1','MLS1','UKR1','GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','DK1','SE1','NO1','LI1','SC1','ISR1','PL1','UNG1','A1','C1','MAL1','KAS1','TS1','SLO1','SL1','FI1','LUX1','AZ1','IR1']

countryInfo=pd.read_excel('../data/reference/countries.xls').set_index('country')
countryInfo.marketTier=np.where(countryInfo.marketTier==4,3,countryInfo.marketTier)

top20clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:20].index
top50clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:50].index
top100clubids=transfers.groupby('toTeamId')[['feeValue']].sum().merge(DataFrame({'teamName':teamIdMap}), left_index=True,right_index=True,how='left').sort_values('feeValue',ascending=False).iloc[:100].index

uefa=countryInfo[countryInfo.federation=='UEFA'].index
caf=countryInfo[countryInfo.federation=='CAF'].index
concacaf=countryInfo[countryInfo.federation=='CONCACAF'].index
concacaf_no_uscan=countryInfo[countryInfo.federation=='CONCACAF'].drop(['United States','Canada']).index
conmebol=countryInfo[countryInfo.federation=='CONMEBOL'].index
afc=countryInfo[countryInfo.federation=='AFC'].index
afc_no_anzac=countryInfo[countryInfo.federation=='AFC'].drop(['Australia','New Zealand']).index

country_coeff=pd.read_csv('../data/reference/country_coefficient2.csv').reset_index(drop=True).set_index('Country').rename(index={
        'Bosnia and Herzegovina': 'Bosnia-Herzegovina',
        'Republic of Ireland': 'Ireland',
        'FYR Macedonia': 'Macedonia',
        'Faroe Islands':'Faroe Islands'
    })

natteam_coeff=pd.read_table('../data/reference/national_team_coefficient.csv').reset_index(drop=True).drop('Rank',axis=1)
natteam_coeff['Country']=natteam_coeff['Country'].str.strip()
natteam_coeff.Coefficient=natteam_coeff.Coefficient.apply(lambda s: int(s.replace(',','')))
natteam_coeff=natteam_coeff.rename(columns={'Coefficient':'National team coeff'}).set_index('Country')

club_coeff=pd.read_csv('../data/reference/club_coefficients.csv')[['rank','clubId','clubName','coeff12','coeff13','coeff14','coeff15','points']]

gdp=pd.read_table('../data/reference/country_gdp.csv').reset_index(drop=True).drop('2014 Rank',axis=1).rename(columns={'Country ':'Country'})
gdp.Country=gdp.Country.str.strip()
gdp=gdp.set_index('Country')
gdp.columns=['GDP '+c for c in gdp.columns.str.strip()]
for column in gdp.columns:
    gdp[column]=gdp[column].apply(lambda s: float(str(s).replace(',','')))
    
unemployed=pd.read_table('../data/reference/country_unemployment.csv').drop(['Date','Employment rate'],axis=1)
unemployed.columns=['Country','Unemployment']
unemployed['Country']=unemployed['Country'].str.split(' ').apply(lambda arr: ' '.join(arr[:len(arr)/2]))
unemployed=unemployed.set_index('Country').rename(index={'United Kingdom':'England'})

corruption=pd.read_table('../data/reference/country_corruption.csv').drop(['Rank'],axis=1)
corruption.Country=corruption.Country.str.strip()
corruption=corruption.set_index('Country')
corruption=corruption[corruption.columns[:1]].rename(columns={'2015 score':'Corruption'}).rename(index={'United Kingdom':'England'})

population=pd.read_table('../data/reference/country_population.csv').drop(['Rank'],axis=1)
population.columns=population.columns.str.strip()
population.Country=population.Country.str.split('[',expand=True)[0].str.strip()
population['Population']=population['Population'].apply(lambda s: float(str(s).replace(',','')))
population=population[['Country','Population','Growth %']].set_index('Country').rename(index={'United Kingdom':'England'})

barometer=pd.read_table('../data/reference/country_corruption_barometer.csv').reset_index(drop=True)
barometer.columns=['Corruption Rank','Country','Bribe %']
barometer.Country=barometer.Country.str.strip()
barometer=barometer.set_index('Country').rename(index={'United Kingdom':'England'})

countryInfo=natteam_coeff.join(country_coeff).join(gdp).join(unemployed).join(corruption).join(population).join(countryInfo).join(barometer)


/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (0,38,40,41,45,47,49,55) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (45,47) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (40,45,47,51) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
/Users/sunkevin/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2871: DtypeWarning: Columns (32,34) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)