In [2]:
from pandas import DataFrame
import pandas as pd
import numpy as np
import re
import dateutil.parser
import datetime
import math
import locale
import os
locale.setlocale(locale.LC_ALL,'en_US')
def getTransfersFile(league,season):
filename = league + "_" + str(season)
result = pd.read_csv("../data/transfers/" + filename + ".csv")
result['filename'] = filename
return result
def getPersonalFile(league,season):
filename = league + "_" + str(season)
result = pd.read_csv("../data/personal/" + filename + ".csv")
result['filename'] = filename
return result
def getClubFile(group,season):
filename = group + "_" + str(season)
result = pd.read_csv("../data/clubs/" + filename + ".csv")
result['filename'] = filename
result['season'] = season
return result
def getLeagueTableFile(league):
return pd.read_csv("../data/tables/" + league + ".csv")
def mvFix(mv):
if 'Mill.' in mv:
return float(mv.split('Mill.')[0].replace(',','.'))*1000000
elif "Th." in mv:
return float(mv.split('Th.')[0].replace(',','.'))*1000
elif "-" in mv or "?"==mv:
return np.nan
else:
return int('0' + ''.join([d for d in mv.replace(',','.') if d in '1234567890.']))
def feeValue(mv):
try:
mv = str(mv)
r= mvFix(mv.replace('Loan fee:',''))
return r
except Exception as e:
print mv, e
return -666
def heightFix(h):
r = float('0' + ''.join([d for d in str(h).replace(',','.') if d in '1234567890.']))
return r if r > 0 else np.nan
def getAge(born, today):
return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
def getAgeWithSeasonAdjustment(born,today,season):
if type(today) == int:
return (today.year if today.year in [season,season+1] else season) - born.year - ((today.month, today.day) < (born.month, born.day))
else:
return season - born.year
def getAgeFloatWithSeasonAdjustment(born,today,season):
if type(today) == int:
return (today if today.year in [season,season+1] else datetime.datetime(season,today.month,today.day)) - born
else:
return datetime.datetime(season,1,1) - born
def parsedate(d):
try:
if d=="-":
return np.nan
return dateutil.parser.parse(str(d)) if pd.notnull(d) else np.nan
except Exception as e:
t=str(d).split('.')
d='.'.join([str(int(t[0])-1)] + t[1:])
try:
return dateutil.parser.parse(str(d))
except Exception as e2:
print d
#print e2
return np.nan
def parsebirthdate(d):
return parsedate(str(d).replace(' Happy Birthday','')) if pd.notnull(d) else np.nan
def basicparse(d):
try:
return dateutil.parser.parse(d)
except Exception as e:
#print e
return np.nan
def seasonFix(season):
year = int(season[-5:].split('/')[0])
return year+2000 if year < 20 else year+1900
def camel(s):
words = [''.join([c for c in w if c.isalnum()]) for w in s.split(' ')]
result = [words[0].lower()]
result.extend([w.title() for w in words[1:]])
return ''.join(result)
def agentIdToInt(i):
if 'nan' in str(i):
return -1
try:
return int(str(i).split('.')[0])
except Exception:
return -18
#leagueList = ['GB1','ES1','L1','IT1','FR1','PO1','NL1','TR1','RU1','BE1','MLS1','UKR1','GR1','RO1','ZYP1','KR1','SER1','BOS1','BU1','DK1','SE1','NO1','LI1','SC1','ISR1','PL1','UNG1','A1','C1','MAL1','KAS1','TS1','SLO1','SL1','FI1','LUX1','AZ1','IR1']
leagueList =tier1=["GB1", "ES1", "IT1", "L1", "FR1", "TR1", "RU1", "PO1", "NL1", "BE1", "MLS1",
"UKR1", "GR1", "C1", "RO1", "KR1", "A1", "TS1", "DK1", "PL1", "SE1", "SER1", "NO1", "ISR1", "SC1", "BU1", "ZYP1"]\
+ ["KAS1", "BOS1", "UNG1", "WER1", "SL1", "SLO1", "GEO1", "AZ1", "ALB1", "MOL1",
"FI1", "MAZ1", "MNE1", "LI1", "ARM1", "EST1", "LUX1", "LET1", "MAL1", "IS1", "IR1", "WAL1","NIR1", "FARO"]
In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
In [ ]:
#TODO
get national team id -> nation team name mapping
defunct competitions
check actual season ranges of competitions
In [4]:
leagueTables=pd.concat([getLeagueTableFile(league) for league in leagueList])[['league','season','rank','clubId','clubName','matches','wins','draws','losses','goalsFor','goalsAgainst','goalDiff','points','pointsRaw','error']]
leagueTables=leagueTables[(leagueTables.error.isnull())&(leagueTables.season>=1995)].drop('error',axis=1).reset_index(drop=True)
leagueTables['pointsRaw']=leagueTables['pointsRaw'].apply(int)
for column in leagueTables.columns:
if leagueTables[column].dtype=='float64':
leagueTables[column]=leagueTables[column].apply(int)
In [87]:
#t=leagueTables.groupby(['league','season']).describe()['rank'].unstack()[['count','min','max']]
leagueTables[leagueTables['rank'].isnull()]
Out[87]:
In [75]:
t=leagueTables[leagueTables['pointsRaw']!=leagueTables['points']].groupby(['league','season']).sum()[['points','pointsRaw']]
t['pointDiff']=t['pointsRaw']-t['points']
t.sort_values('pointDiff',ascending=False)
#leagueTables[(leagueTables['season']==2015)&(leagueTables['league']=='SER1')]
Out[75]:
In [4]:
players0 = pd.concat([getPersonalFile(league,season) for league in leagueList for season in range(2001,2016) if os.path.isfile("../data/personal/" + league + "_" + str(season) + ".csv")],ignore_index=True)
players = players0[pd.notnull(players0['Display name'])].copy()
players = players.drop_duplicates('id') #27081
players = players.drop(['error','birthDate','filename','name'],axis=1,errors='ignore')
players['Date of birth']=players['Date of birth'].apply(parsebirthdate)
players['Height']=players['Height'].apply(heightFix)
players['Nationality 2']=players['Nationality'].str.split(' \xc2\xa0\xc2\xa0',expand=True)[1]
players['Nationality']=players['Nationality'].str.split(' \xc2\xa0\xc2\xa0',expand=True)[0]
players['Position group']=players["Position"].str.split(" - ",expand=True)[0]
players['Position']=players["Position"].str.split(" - ",expand=True)[1]
players['Intl caps']=players['Intl caps/goals'].str.split('/',expand=True)[0]
players['Intl goals']=players['Intl caps/goals'].str.split('/',expand=True)[1]
del players['Intl caps/goals']
players['In the team since']=players['In the team since'].apply(parsedate)
players['Contract until']=players['Contract until'].apply(parsedate)
players['Contract there until']=players['Contract there until'].apply(parsedate)
players['2nd club']=players['2nd club'].str.replace('\(#\d+\)','').str.strip()
players['Date of last contract extension']=players['Date of last contract extension'].apply(parsedate)
players['Main position']=players['Detailed position'].str.split('Side position:',expand=True)[0].str.slice(start=14)
players['Side position']=players['Detailed position'].str.split('Side position:',expand=True)[1]
del players['Detailed position']
players['Glove']=players['Glove'].str.split(' since ',expand=True)[0]
players['Shoe model']=players['Shoe model'].str.split(' since ',expand=True)[0]
players["Shoe size"]=players["Shoe size"].apply(lambda s:float(str(s).replace(",",".")))
players['day of death']=players['day of death'].str.slice(stop=10).apply(parsedate)
players['id'].apply(int)
c = ['id','Display name', 'Name in home country','Date of birth','Place of birth','Country of birth',
'Age','Height','Nationality','Nationality 2','Position group','Position','Main position','Side position',
'Foot','National team id', 'Intl caps','Intl goals','Player\'s agent','Player\'s agent id',
'Current club', 'Current club id','In the team since', 'Contract until','on loan from', 'on loan from club id', 'Contract there until']
players = players[(c + [x for x in players.columns if not x in c])]
players.index = players['id']
In [8]:
clubs0 = pd.concat([getClubFile(group,season) for group in ['top','bottom','other','americas','world','deeper'] for season in range(1992,2016)],ignore_index=True)
clubs=clubs0.copy()
clubs = clubs[~((clubs['league']=='GB2')&(clubs['season']<2004))].copy()
clubs = clubs[~((clubs['league']=='GB3')&(clubs['season']<2004))]
clubs = clubs[~((clubs['league']=='ENSD')&(clubs['season'].isin([2002,2003])))]
clubs = clubs[~((clubs['league']=='EFD2')&(clubs['season'].isin([2001,2002,2003])))]
clubs = clubs[~((clubs['league']=='IT2')&(clubs['season']<1993))]
clubs = clubs[~((clubs['league']=='IT3A')&(clubs['season']<2006))]
clubs = clubs[~((clubs['league']=='IT3B')&(clubs['season']<2007))]
clubs = clubs[~((clubs['league']=='IT3C')&(clubs['season']<2014))]
clubs = clubs[~((clubs['league']=='ITJ1')&(clubs['season']<2009))]
clubs = clubs[~((clubs['league']=='ITJ2')&(clubs['season']<2009))]
clubs = clubs[~((clubs['league']=='ITJ3')&(clubs['season']<2008))]
clubs = clubs[~((clubs['league']=='L3')&(clubs['season']<2008))]
clubs = clubs[~((clubs['league'].isin(["RLW3","RLSW","RLN3","RLN4","RLB3"]))&(clubs['season']<2012))]
clubs = clubs[~((clubs['league']=='BE1')&(clubs['season']<2000))]
clubs = clubs[~((clubs['league']=='BE2')&(clubs['season']<2005))]
clubs = clubs[~((clubs['league']=='FR2')&(clubs['season']<1995))]
clubs = clubs[~((clubs['league']=='FR2')&(clubs['season']<2001))]
clubs = clubs[~((clubs['league']=='PO1')&(clubs['season']<2003))]
clubs = clubs[~((clubs['league']=='PO2')&(clubs['season']<2004))]
clubs = clubs[~((clubs['league']=='RU1')&(clubs['season']<2004))]
clubs = clubs[~((clubs['league']=='RU2')&(clubs['season']<2006))]
clubs = clubs[~((clubs['league']=='MLS1')&(clubs['season']<1995))]
clubs = clubs[~((clubs['league']=='NAFC')&(clubs['season']<2012))]
clubs = clubs[~((clubs['league']=='BRA1')&(clubs['season']<2004))]
clubs = clubs[~((clubs['league']=='BRA2')&(clubs['season']<2007))]
clubs = clubs[~((clubs['league']=='TR2')&(clubs['season']<2001))]
clubs = clubs[~((clubs['league']=='ES3A')&(clubs['season']<1998))]
clubs = clubs[~((clubs['league']=='ES3B')&(clubs['season']<2000))]
clubs = clubs[~((clubs['league']=='ES3C')&(clubs['season']<2006))]
clubs = clubs[~((clubs['league']=='ES3D')&(clubs['season']<2011))]
clubs = clubs[~((clubs['league']=='510')&(clubs['season']<2010))]
clubs = clubs[~((clubs['league']=='511')&(clubs['season']<2010))]
clubs = clubs[~((clubs['league']=='TR3A')&(clubs['season']<2010))]
clubs = clubs[~((clubs['league']=='TR3B')&(clubs['season']<2010))]
clubs = clubs[~((clubs['league']=='TR4A')&(clubs['season']<2012))]
clubs = clubs[~((clubs['league']=='TR4B')&(clubs['season']<2012))]
clubs = clubs[~((clubs['league']=='TR4C')&(clubs['season']<2013))]
clubs = clubs[~((clubs['league']=='GB4')&(clubs['season']<2004))]
clubs = clubs[~((clubs['league']=='CNAT')&(clubs['season']<2010))]
clubs = clubs[~((clubs['league']=='EFD2')&(clubs['season']<2001))]
clubs = clubs[~((clubs['league']=='EFD2')&(clubs['season']>2003))]
clubs = clubs[~((clubs['league']=='ENSD')&~(clubs['season'].isin([2002,2003])))]
clubs = clubs[~((clubs['clubId']==2457)&(clubs['season']==2006)&(clubs['league']=='PO2'))]
clubs = clubs[~((clubs['clubId'].isin([11126,11127]))&(clubs['season']==2011)&(clubs['league']=='RU1'))]
#clubs
In [6]:
clubs[(clubs.league=='GB1')&(clubs.season==2014)]
Out[6]:
In [9]:
transfers0 = pd.concat([getTransfersFile(league,season) for league in leagueList for season in range(2001,2016) if os.path.isfile("../data/transfers/" + league + "_" + str(season) + ".csv")],ignore_index=True)
transfers = transfers0[pd.notnull(transfers0['playerName'])].copy()
transfers['date'] = transfers['date'].apply(basicparse)
transfers['season']=transfers['season'].apply(seasonFix)
transfers=transfers.merge(clubs.rename(columns={'league': 'fromLeague','clubId':'fromTeamId'})[['fromTeamId','season','fromLeague']],how='left',on=['season','fromTeamId']) \
.merge(clubs.rename(columns={'league': 'toLeague','clubId':'toTeamId'})[['toTeamId','season','toLeague']],how='left',on=['season','toTeamId'])
transfers=transfers[['id','playerName','date','season','fromCountry','fromLeague','fromTeamId','fromTeamName','toCountry','toLeague','toTeamId','toTeamName','mv','fee']]
transfers=transfers.merge(players,how='left',on='id')
transfers['mv'] = transfers['mv'].apply(mvFix)
transfers['feeValue'] = transfers['fee'].apply(feeValue)
transfers=transfers.drop_duplicates()
#transfers=transfers[(transfers['fee']!='End of loan')]
transfers['isLoan']=(transfers['fee'].str.startswith('Loan')).fillna(False)
#outofrangeIds = transfers[(transfers['season']<1992)]['id'].drop_duplicates().tolist()
#transfers=transfers[~transfers['id'].isin(outofrangeIds)]
transfers=transfers[transfers['season']<2016]
transfers['toCountry']=np.where(transfers['fromTeamId'].isin([75,123,515,2113,2077]),transfers['fromCountry'],transfers['toCountry'])
transfers['toCountry']=np.where(transfers['toTeamName'].str.startswith('Monaco'),'France',transfers['toCountry'])
transfers['fromCountry']=np.where(transfers['fromTeamId'].isin([75,123,515,2113,2077]),np.nan,transfers['fromCountry'])
transfers['fromCountry']=np.where(transfers['fromTeamName'].str.startswith('Monaco'),'France',transfers['fromCountry'])
transfers['toCountry']=np.where(transfers['fromTeamId'].isin([464,12604]),transfers['fromCountry'],transfers['toCountry'])
transfers['fromLeague']=np.where(pd.isnull(transfers['fromLeague']),'Other '+transfers['fromCountry'],transfers['fromLeague'])
transfers['toLeague']=np.where(pd.isnull(transfers['toLeague']),'Other '+transfers['toCountry'],transfers['toLeague'])
transfers['fromCountry']=np.where(transfers['fromLeague']=='MLS1','United States',transfers['fromCountry'])
transfers['toCountry']=np.where(transfers['toLeague']=='MLS1','United States',transfers['toCountry'])
transfers['fromCountry']=np.where(transfers['fromLeague']=='IR1','Ireland',transfers['fromCountry'])
transfers['toCountry']=np.where(transfers['toLeague']=='IR1','Ireland',transfers['toCountry'])
transfers['fromCountry']=np.where(transfers['fromCountry'].str.startswith('Liechten').fillna(False),'Switzerland',transfers['fromCountry'])
transfers['toCountry']=np.where(transfers['toCountry'].str.startswith('Liechten').fillna(False),'Switzerland',transfers['toCountry'])
transfers['age']=transfers[['Date of birth','date','season']].apply(lambda x:getAgeWithSeasonAdjustment(*x),axis=1)
transfers['ageFloat']=transfers[['Date of birth','date','season']].apply(lambda x:getAgeFloatWithSeasonAdjustment(*x),axis=1)/ np.timedelta64(1, 'Y')
transfers['period']=pd.cut(transfers['season'],[1995,2000,2005,2010,2015])
transfers=transfers.rename(columns={x:camel(x) for x in transfers.columns if ' ' in x})
transfers.toTeamId=transfers.toTeamId.apply(int)
transfers.fromTeamId=transfers.fromTeamId.apply(int)
transfers.toTeamName=transfers.toTeamName.str.strip()
transfers.fromTeamName=transfers.fromTeamName.str.strip()
transfers.intlCaps=transfers.intlCaps.fillna(0).apply(int)
transfers.intlGoals=transfers.intlGoals.fillna(0).apply(int)
transfers.currentClubId=transfers.currentClubId.apply(int)
transfers.nationalTeamId=transfers.nationalTeamId.fillna(-1).apply(int)
transfers.playersAgentId=transfers.playersAgentId.apply(agentIdToInt)
transfers=transfers.iloc[::-1]
loans=transfers[transfers['isLoan']]
nonloans=transfers[~transfers['isLoan']]
In [51]:
def addMvDelta(s):
s['mvDelta']=s.mv-s.mv.shift(1).fillna(method='pad')
return s
transfers=transfers.groupby('id').apply(addMvDelta)
In [47]:
dateutil.parser.parse('Jan 1, 2009 ')
Out[47]:
In [7]:
transfers.to_csv('../data/merged/transfers.csv',encoding='utf-8',index=False)
loans.to_csv('../data/merged/loans.csv',encoding='utf-8',index=False)
nonloans.to_csv('../data/merged/nonloans.csv',encoding='utf-8',index=False)
players.to_csv('../data/merged/players.csv',encoding='utf-8',index=False)
leagueTables.to_csv('../data/merged/leagueTables.csv',encoding='utf-8',index=False)
In [5]:
leagueTables.to_csv('../data/merged/leagueTables.csv',encoding='utf-8',index=False)
In [313]:
{c:str(transfers[c].dtype) for c in transfers.columns if c.endswith('Id')}
Out[313]:
In [6]:
players['Contract until']
Out[6]:
In [309]:
def agentIdToInt(i):
if 'nan' in str(i):
return -1
try:
return int(str(i).split('.')[0])
except Exception:
return -18
transfers[['id','playersAgentId','playersAgent']]
x=transfers[['id','playersAgentId','playersAgent']].join(transfers['playersAgentId'].apply(agentIdToInt),rsuffix='_new')
x[x.playersAgentId_new<0].drop_duplicates('playersAgentId')
Out[309]:
In [268]:
players[players['id']==17340 ]['Player\'s agent id'].apply(str).apply(int)
In [255]:
a='asdasd'
'12a1212'.isdigit()
Out[255]:
In [162]:
fromIds=transfers.sort_values('season',ascending=False).drop_duplicates('fromTeamId')[['fromTeamId','fromTeamName']]
toIds=transfers.sort_values('season',ascending=False).drop_duplicates('fromTeamId')[['toTeamId','toTeamName']]
Out[162]:
In [176]:
teamIdMap = pd.concat([transfers[['toTeamId','toTeamName','season']].rename(columns={'toTeamId':'teamId','toTeamName':'teamName'}),
transfers[['fromTeamId','fromTeamName','season']].rename(columns={'fromTeamId':'teamId','fromTeamName':'teamName'})],ignore_index=True) \
.sort_values('season',ascending=False).drop_duplicates('teamId').set_index('teamId')['teamName'].to_dict()
In [332]:
x=transfers.groupby('playersAgentId')['playersAgent'].nunique().sort_values()
transfers[transfers['playersAgentId'].isin([y for y in x[x>1].index])][['playersAgentId','playersAgent']].sort_values('playersAgent',ascending=False).drop_duplicates('playersAgentId')
Out[332]:
In [338]:
agentIdMap = transfers[['playersAgentId','playersAgent']].sort_values('playersAgent',ascending=False).drop_duplicates('playersAgentId').set_index('playersAgentId')['playersAgent'].to_dict()
In [87]:
transfers[transfers['Date of birth'].isnull()].count() #.apply(lambda x: x.day).value_counts().sort_index().plot.bar()
Out[87]:
In [93]:
transfers['Place of birth'].str.strip().drop_duplicates().sort_values().count()
Out[93]:
In [102]:
In [111]:
{x:camel(x) for x in transfers.columns if ' ' in x}
Out[111]:
In [125]:
transfers #['fromTeamId']
Out[125]:
In [50]:
players[players['id']== 146494 ]
Out[50]:
In [41]:
players
Out[41]:
In [18]:
transfers['Age']=transfers[['Date of birth','date','season']].apply(lambda x:getAgeWithSeasonAdjustment(*x),axis=1)/ np.timedelta64(1, 'Y')
transfers[transfers['date'].apply(lambda x:x.year)<transfers['season']][['Date of birth','date','season','Age']]
Out[18]:
In [17]:
transfers.columns
Out[17]:
In [11]:
transfers[transfers['Age']==4]
Out[11]:
In [24]:
pd.concat(,axis=1)
In [10]:
print transfers['id'].nunique()
print transfers['id'].count()
print loans['id'].count()
print nonloans['id'].count()
# with GB2:
#25443
#206201
#57085
#149116
#without GB2:
#23454
#188907
#49859
#139048
In [99]:
transfers[transfers['feeValue']>=50000000]#['id'].nunique()
Out[99]:
In [116]:
players
In [88]:
t=(transfers.groupby('id')['feeValue'].max()+1).apply(math.log10).apply(math.floor).value_counts().sort_index(ascending=False).drop(0)
t.index = t.index.map(lambda x: "over "+locale.format("%d",int(math.pow(10,x)),grouping=True))
t.plot.bar(figsize=(6,6))
Out[88]:
In [111]:
t=(nonloans['mv']+1).apply(math.log10).apply(math.floor).value_counts().sort_index(ascending=False)#.drop(3)
t.index = t.index.map(lambda x: locale.format("%d",int(math.pow(10,x) if x>1 else 0),grouping=True)+'+')
t.plot.pie(figsize=(6,6))
Out[111]:
In [115]:
t=(nonloans['feeValue']+1).apply(math.log10).apply(math.floor).value_counts().sort_index(ascending=False).drop([3,8])
t.index = t.index.map(lambda x: locale.format("%d",int(math.pow(10,x) if x>1 else 0),grouping=True)+'+')
t.plot.pie(figsize=(6,6))
Out[115]:
In [36]:
transfers = pd.read_csv('../data/merged/transfers.csv')
loans=transfers[transfers['isLoan']]
nonloans=transfers[~transfers['isLoan']]
In [94]:
transfers[transfers['fromTeamId'].isin([464,12604])]
#transfers[pd.notnull(transfers['newToCountry'])]
Out[94]:
In [38]:
#transfers['toCountry'] =
transfers['newToCountry']=np.where(transfers['fromTeamId'].isin([75,123,515,2113,2077]),transfers['fromCountry'],transfers['toCountry'])
In [5]:
transfers[transfers['toTeamId']==515]
Out[5]:
In [95]:
nonloans['PO1'==nonloans['toLeague']].groupby('fromLeague').count().sort_values('id',ascending=False)['id'][:10].plot.bar()
Out[95]:
In [96]:
nonloans['PO1'==nonloans['toLeague']].groupby('fromLeague').sum().sort_values('feeValue',ascending=False)['feeValue'][:10].plot.bar()
Out[96]:
In [489]:
pre2001ids = transfers[transfers['season']<2001]['id'].drop_duplicates().tolist()
transfers[transfers['id'].apply(lambda i: not i in pre2001ids)]
Out[489]:
In [483]:
transfers['id'].drop_duplicates().count()-8513
Out[483]:
In [501]:
players.to_csv('../data/merged/players.csv',encoding='utf-8',index=False)
In [97]:
# irregular transfers (where transfer date is outside of season years) are negligible
transfers.join(transfers['date'].apply(lambda d:d.year),rsuffix='year').groupby(['season','dateyear']).count()['id'].plot.bar(figsize=(40,10))
Out[97]:
In [469]:
transfers.merge(clubs.rename(columns={'league': 'fromLeague','clubId':'fromTeamId'})[['fromTeamId','season','fromLeague']],how='left',on=['season','fromTeamId']) \
.merge(clubs.rename(columns={'league': 'toLeague','clubId':'toTeamId'})[['toTeamId','season','toLeague']],how='left',on=['season','toTeamId'])
Out[469]:
In [459]:
transfers
Out[459]:
In [123]:
players.columns
Out[123]:
In [74]:
players[(pd.isnull(players['Display name']))].groupby('error').count()
Out[74]:
In [51]:
transfers
Out[51]:
In [196]:
t=transfers[['fromTeamId','fromTeamName','id']].drop_duplicates(['fromTeamId','fromTeamName'])
s=t.groupby('fromTeamId').count().sort_values('id')
t[t['fromTeamId'].apply(lambda x:x in s[s['id']>1].index)].sort_values('fromTeamId')
Out[196]:
In [197]:
t=transfers[['toTeamId','toTeamName','id']].drop_duplicates(['toTeamId','toTeamName'])
s=t.groupby('toTeamId').count().sort_values('id')
t[t['toTeamId'].apply(lambda x:x in s[s['id']>1].index)].sort_values('toTeamId')
Out[197]:
In [1]:
teams = pd.concat([DataFrame(nonloans[['fromTeamId','fromTeamName','fromCountry']].values,columns=['teamId','teamName','country']),
DataFrame(nonloans[['toTeamId','toTeamName','toCountry']].values,columns=['teamId','teamName','country'])],ignore_index=True).drop_duplicates('teamId')
teams.groupby('country').count()
In [251]:
import math
math.log10(10)
Out[251]:
In [ ]:
transfers[pd.isnull(transfers['playerName'])]
In [162]:
grouped = transfers.groupby(['id','date'])['playerName'].count()
grouped[grouped==2].count()
Out[162]:
In [234]:
int('0' + ''.join([d for d in 'asds121' if d in '1234567890.']))
Out[234]:
In [235]:
transfers[['id','fee','playerName']][~((transfers['fee'].str.contains('Th.'))|(transfers['fee'].str.contains('Mill.')))].drop_duplicates(subset='fee')
transfers[['id','fee','playerName']][~((transfers['fee'].str.contains('Th.'))|(transfers['fee'].str.contains('Mill.')))].groupby('fee').count().sort_values('id',ascending=False)
#transfers[['mvfixed','mv']].describe()
Out[235]:
In [280]:
(transfers.groupby('fromCountry')['id'].nunique() + transfers.groupby('toCountry')['id'].nunique()).sort_values(ascending=False)[:40].index
Out[280]:
In [115]:
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'L1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['id'].count().sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[115]:
In [114]:
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'L1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['id'].count().sort_values(ascending=False).plot.pie(figsize=(6,6))
Out[114]:
In [122]:
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'L1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].median().sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[122]:
In [116]:
transfers[((transfers['toLeague'] != transfers['fromLeague']))&(transfers['toLeague'] == 'NL1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').groupby('fromLeague')['feeValue'].sum().sort_values(ascending=False).plot.pie(figsize=(6,6))
Out[116]:
In [124]:
transfers[((transfers['fromLeague'] == 'Other Paraguay'))&(transfers['toLeague'] == 'L1')&(transfers['season'] > 2000)].drop_duplicates(subset=['id'],keep='last').sort_values('feeValue',ascending=False)
Out[124]:
In [126]:
transfers[transfers['id']==28944]
Out[126]:
In [240]:
transfers[(transfers['toCountry'] == 'France') & (transfers['date'] >= datetime.date(1980,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('France').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[240]:
In [209]:
transfers[(transfers['toCountry'] == 'France') & (transfers['date'] < datetime.date(2010,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop(['France','Monaco']).sort_values(ascending=False).plot.pie(figsize=(6,6))
Out[209]:
In [185]:
transfers[(transfers['toCountry'] == 'Portugal') & (transfers['date'] >= datetime.date(1900,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Portugal').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[185]:
In [205]:
transfers[(transfers['toCountry'] == 'Portugal') & (transfers['date'] >= datetime.date(2010,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Portugal').sort_values(ascending=False).plot.pie(figsize=(6,6))
Out[205]:
In [193]:
transfers[(transfers['toCountry'] == 'Germany') & (transfers['date'] < datetime.date(2020,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Germany').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[193]:
In [202]:
transfers[(transfers['toCountry'] == 'Germany') & (transfers['date'] < datetime.date(2010,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Germany').sort_values(ascending=False).plot.pie(figsize=(6,6))
Out[202]:
In [197]:
transfers[(transfers['toCountry'] == 'Netherlands') & (transfers['date'] >= datetime.date(2010,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Netherlands').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[197]:
In [199]:
transfers[(transfers['toCountry'] == 'Netherlands') & (transfers['date'] < datetime.date(2010,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Netherlands').sort_values(ascending=False).plot.pie(figsize=(6,6))
Out[199]:
In [268]:
transfers[(transfers['toCountry'] == 'Italy') & (transfers['date'] < datetime.date(2020,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Italy').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[268]:
In [271]:
transfers[(transfers['toCountry'] == 'Italy') & (transfers['date'] < datetime.date(2020,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('fromCountry')['id'].count().drop('Italy').sort_values(ascending=False).plot.pie(figsize=(6,6))
Out[271]:
In [40]:
transfers[(transfers['fromCountry'] == 'Brazil') & (transfers['date'] < datetime.date(2020,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('toCountry')['id'].count().drop('Brazil').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[40]:
In [41]:
transfers[(transfers['fromCountry'] == 'Japan') & (transfers['date'] < datetime.date(2020,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('toCountry')['id'].count().drop('Japan').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[41]:
In [43]:
transfers[(transfers['fromCountry'] == 'Belgium') & (transfers['date'] < datetime.date(2020,1,1))].drop_duplicates(subset=['id'],keep='last').groupby('toCountry')['id'].count().drop('Belgium').sort_values(ascending=False).plot.bar(figsize=(20,5))
Out[43]:
In [36]:
transfers.groupby('id')['playerName'].count().hist()
transfers.groupby('id')['playerName'].count().describe()
Out[36]:
In [37]:
loans.groupby('id')['playerName'].count().hist()
loans.groupby('id')['playerName'].count().describe()
Out[37]:
In [35]:
transfers['date'][transfers['date']>datetime.date(2002,8,1)].apply(lambda d: d.month).hist()
#.groupby('date').
Out[35]:
In [214]:
transfers[(transfers['toCountry'] == 'France') &(transfers['fromCountry'] == 'England')].sort_values('date')
Out[214]:
In [122]:
DataFrame(transfers[transfers['toCountry']!=transfers['fromCountry']]).groupby('season')['id'].count().plot.line()
Out[122]:
In [123]:
transfers.groupby('Age')['id'].count().plot.line()
Out[123]:
In [117]:
nonloans[(nonloans['toCountry']!=nonloans['fromCountry'])&(nonloans['fromLeague'].isin(['GB1','ES1','L1','IT1','FR1','NL1','PO1']))] \
.rename(columns={'Age':'age','fromLeague':'origin'}).groupby(['age','origin'])['id'].count().unstack() \
.rolling(window=3,center=False).mean().rename(columns={'GB1':'Premier League (England)','FR1':'Ligue 1 (France)','ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)','PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}).plot.line(figsize=(9,6),title='Number of International Transfers, by Age & Origin')
Out[117]:
In [103]:
nonloans[(nonloans['toLeague']!=nonloans['fromLeague'])&(nonloans['fromLeague'].isin(['GB1','ES1','L1','IT1','FR1','NL1','PO1']))] \
.rename(columns={'Age':'age','fromLeague':'origin'}).groupby(['age','origin'])['feeValue'].sum().unstack() \
.rolling(window=3,center=False).mean().rename(columns={'GB1':'Premier League (England)','FR1':'Ligue 1 (France)','ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)','PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}).plot.line(figsize=(9,6),title='Fees Paid for Inter-League Transfers, by Age & Origin')
Out[103]:
In [115]:
nonloans[(nonloans['toCountry']!=nonloans['fromCountry'])&(nonloans['toLeague'].isin(['GB1','ES1','L1','IT1','FR1','NL1','PO1']))] \
.rename(columns={'Age':'age','toLeague':'destination'}).groupby(['age','destination'])['id'].count().unstack() \
.rolling(window=3,center=False).mean().rename(columns={'GB1':'Premier League (England)','FR1':'Ligue 1 (France)','ES1':'La Liga (Spain)','IT1':'Serie A (Italy)','L1':'Bundesliga (Germany)','PO1':'Primeira Liga (Portugal)','NL1':'Eredivisie (Netherlands)'}).plot.line(figsize=(9,6),title='Number of International Transfers, by Age & Destination')
Out[115]:
In [71]:
nonloans.groupby(['Age','Position group'])['id'].count().unstack().plot.line(figsize=(10,5))
Out[71]:
In [39]:
nonloans.groupby('Age')['feeValue'].sum().plot.bar(figsize=(10,5))
Out[39]:
In [54]:
nonloans[nonloans['Age']<15][transfers.columns[:12].tolist()+['Age']]
Out[54]:
In [210]:
leagues=['GB1','ES1','L1','IT1','FR1','PO1','NL1','MLS1']
#DataFrame(nonloans[(nonloans['toLeague'].isin(leagues))]).groupby(['season','toLeague'])['feeValue'].sum().unstack().plot.line(figsize=(10,10))
DataFrame(nonloans[(nonloans['toLeague']!=nonloans['fromLeague'])&(nonloans['toLeague'].isin(leagues))]).groupby(['season','toLeague'])['feeValue'].sum().unstack().plot.line(figsize=(20,10))
#DataFrame(nonloans[(nonloans['toCountry']!=nonloans['fromCountry'])&(nonloans['toLeague'].isin(leagues))]).groupby(['season','toLeague'])['id'].count().unstack().plot.line(figsize=(10,10))
Out[210]:
In [218]:
#DataFrame(nonloans[(nonloans['toLeague'].isin(leagues))]).groupby(['season','toLeague'])['feeValue'].sum().unstack().plot.line(figsize=(10,10))
DataFrame(nonloans[(nonloans['toLeague']!=nonloans['fromLeague'])&(nonloans['toLeague']=='ES1')]).groupby(['season','toTeamName'])['feeValue'].sum().unstack().plot.line(figsize=(20,10))
#DataFrame(nonloans[(nonloans['toCountry']!=nonloans['fromCountry'])&(nonloans['toLeague'].isin(leagues))]).groupby(['season','toLeague'])['id'].count().unstack().plot.line(figsize=(10,10))
Out[218]:
In [139]:
t=DataFrame(nonloans[nonloans['toCountry']==nonloans['fromCountry']]['season'])
t["id"]=t.index
t.groupby('season').count().plot.line()
#t.groupby('date').count()[-10:-2].plot.line()
Out[139]:
In [138]:
t=DataFrame(loans[loans['toCountry']==loans['fromCountry']]['season'])
t["id"]=t.index
t.groupby('season').count().plot.line()
#t.groupby('date').count()[-10:-2].plot.line()
Out[138]: