This notebook contains all the code required to scrape football player information from Transfermarkt.com.
It automatically scrapes the European, American, and Asian leagues whose estimated total market value exceeds 200M euros. It collects every club in those leagues, then downloads the HTML page of each of their players.
The player pages are then parsed, and a DataFrame summarizing all the player information (name, current club, market value, birthdate, list of transfers, etc.) is created and saved as a JSON file.
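For reference, each record in the resulting players.json roughly follows the shape below (the values are illustrative; the field names are the ones produced by the parsing code further down):

{"href": "/some-player/profil/spieler/0000", "name": "Some Player", "number": "10",
 "player_id": "0000", "position": "Centre-Forward", "birthdate": "Jan 1, 1990",
 "nationality": "France", "current_club": "Some FC",
 "transfers": [{"player": "0000", "date": "Jul 1, 2017", "from": "...", "to": "...",
                "fee": "25,00 Mill. €", "amount": 25000000.0}]}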
In [1]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
import pylab as pl
import requests
import json
import os
import time
%matplotlib inline
In [2]:
base_url = "https://www.transfermarkt.com"
In [3]:
def parseContinent(ref):
    # Fetch a continent overview page and keep the leagues whose
    # estimated total market value exceeds 200M euros
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + "/" + ref
    r = requests.get(url, headers=HEADERS)
    response = BeautifulSoup(r.text, 'html.parser')
    leagues = []
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"}) + table.find_all("tr", {"class": "even"})
    for row in rows:
        # Market values use a comma as decimal separator, e.g. "1,05 Bill. €"
        val = row.find("td", {"class": "rechts hauptlink"}).text
        val = ".".join(val.split(","))
        rest = val.split(" ")[1]
        val = val.split(" ")[0]
        if "Bill" in rest:
            val = float(val) * 10**9
        elif "Mill" in rest:
            val = float(val) * 10**6
        else:
            val = 0
        if val > 200 * 10**6:
            league = {}
            league["href"] = row.findAll('a')[1]['href']
            league["name"] = row.find("img")["title"]
            league["country"] = row.find("td", {"class": "zentriert"}).find("img")["title"]
            league["tot_value"] = val
            leagues.append(league)
    return leagues
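To make the value-conversion step above concrete, here is a small self-contained sketch that mirrors the inline logic on illustrative strings (`_parse_value` is a hypothetical helper, not part of the scraper; the sample strings assume Transfermarkt's "1,05 Bill. €" formatting of that era):
In [ ]:
# Hypothetical helper mirroring the value parsing in parseContinent,
# run on illustrative market-value strings
def _parse_value(text):
    text = ".".join(text.split(","))   # comma decimal separator -> dot
    amount, unit = text.split(" ")[0], text.split(" ")[1]
    if "Bill" in unit:
        return float(amount) * 10**9
    if "Mill" in unit:
        return float(amount) * 10**6
    return 0

for sample in ["1,05 Bill. €", "850,00 Mill. €", "95,00 Mill. €"]:
    print(sample, "->", _parse_value(sample))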
In [4]:
def getPlayers(club_page):
    # Extract the name, id and profile href of every player
    # listed in a club's squad table
    players = []
    players_infos = club_page.find("div", {"id": "yw1"}).find("table", {"class": "items"}).find("tbody").find_all("tr", recursive=False)
    for player_info in players_infos:
        player = {}
        player_info = player_info.find("a", {"class": "spielprofil_tooltip"})
        player["name"] = player_info["title"]
        player["id"] = player_info["id"]
        player["href"] = player_info["href"]
        players.append(player)
    return players
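As a quick sanity check, getPlayers can be exercised on a hand-written fragment that mimics the squad-table markup it expects (the real pages are much larger; the player below is made up):
In [ ]:
# Made-up HTML fragment mimicking the squad table structure assumed by getPlayers
fake_club_page = BeautifulSoup("""
<div id="yw1"><table class="items"><tbody>
<tr><td><a class="spielprofil_tooltip" id="0000" title="Some Player"
href="/some-player/profil/spieler/0000">Some Player</a></td></tr>
</tbody></table></div>
""", "html.parser")
getPlayers(fake_club_page)   # -> [{'name': 'Some Player', 'id': '0000', 'href': '/some-player/profil/spieler/0000'}]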
In [5]:
def get_leagues_clubs(league_ref):
    # Return the href of every club listed on a league's overview page
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + league_ref
    r = requests.get(url, headers=HEADERS)
    response = BeautifulSoup(r.text, 'html.parser')
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"}) + table.find_all("tr", {"class": "even"})
    clubs = []
    for row in rows:
        try:
            clubs.append(row.findAll("td", {"class": "zentriert"})[1].find('a')['href'])
        except (AttributeError, IndexError, KeyError, TypeError):
            # Some rows carry no club link; skip them
            continue
    return clubs
In [6]:
def parse_league(league_ref):
    # Parse a league page: for each club, fetch its page and collect
    # name, href, squad size, market value, stadium and player list
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + league_ref
    clubs = []
    r = requests.get(url, headers=HEADERS)
    response = BeautifulSoup(r.text, 'html.parser')
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"}) + table.find_all("tr", {"class": "even"})
    for row in rows:
        url_club = base_url + row.findAll("td", {"class": "zentriert"})[1].find('a')['href']
        r_club = requests.get(url_club, headers=HEADERS)
        response_club = BeautifulSoup(r_club.text, 'html.parser')
        # The stadium name precedes the seat-count digits in this field
        stadium_info = response_club.find("div", {"id": "main"}).findAll("span", {"class": "dataValue"})[4].text
        stadium_info = stadium_info.replace(u'\xa0', u'').replace(u'\n', u'')
        split_stadium = re.split(r'(\d+)', stadium_info)
        stadium = split_stadium[0]
        # num_seats = float(split_stadium[1] + '.' + split_stadium[3])
        club = {}
        club["name"] = row.findAll("td", {"class": "zentriert"})[1].find('a')['title']
        club["href"] = row.findAll("td", {"class": "zentriert"})[1].find('a')['href']
        club["squad"] = row.findAll("td", {"class": "zentriert"})[1].text
        club["market_value"] = row.find("td", {"class": "rechts show-for-small show-for-pad nowrap"}).text
        club["stadium"] = stadium
        club["players"] = getPlayers(response_club)  # reuse the already-parsed club page
        # club["stadium_seats"] = num_seats
        clubs.append(club)
    return clubs
In [7]:
def getPlayersPage(player_ref):
    # Download the raw HTML of a player's profile page
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + player_ref
    r = requests.get(url, headers=HEADERS)
    return r.text
In [8]:
'''
leagues = parseContinent("wettbewerbe/europa")
leagues += parseContinent("wettbewerbe/amerika")
leagues += parseContinent("wettbewerbe/asien")
with open("data/leagues.json", "w") as out:
    json.dump(leagues, out)
'''
In [9]:
with open("data/leagues.json", "r") as in_file:
leagues = json.load(in_file)
print("Number of leagues: " + str(len(leagues)))
for league in leagues:
print(league["name"])
In [10]:
'''
leaguesData = []
for league in leagues:
    leaguesData.append(get_leagues_clubs(league['href']))
club_leagues = {}
for i in range(len(leagues)):
    for club in leaguesData[i]:
        club_leagues[club.split("/")[4]] = leagues[i]["name"]
with open("data/clubs_leauges.json", "w") as out:
    json.dump(club_leagues, out)
'''
In [11]:
'''
leaguesData = []
clubs = []
for league in leagues:
    clubs += parse_league(league["href"])
with open("data/clubs.json", "w") as out:
    json.dump(clubs, out)
'''
In [12]:
with open("data/clubs.json", "r") as in_file:
clubs = json.load(in_file)
In [13]:
clubDict = {}
for club in clubs:
    club_id = club["href"].split("/")[4]   # club id is the 5th path segment
    if club_id not in clubDict:
        clubDict[club_id] = club
with open("../scraper/data/dictClubs.json", "w") as out:
    json.dump(clubDict, out)
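For reference, the club id used as the dictionary key is the 5th path segment of a club href; a made-up example (not from the scraped data):
In [ ]:
# Illustrative href: split("/")[4] yields the club id
"/fc-example/startseite/verein/9999/saison_id/2017".split("/")[4]   # -> '9999'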
In [14]:
with open("../scraper/data/dictClubs.json", "r") as in_file:
clubDict = json.load(in_file)
In [15]:
'''
player_list = []
for club in clubs:
    players = club["players"]
    for player in players:
        player_list.append(player["href"])
with open("data/players_ref.json", "w") as out:
    json.dump(player_list, out)
'''
In [16]:
with open("data/players_ref.json", "r") as in_file:
players_list = json.load(in_file)
In [ ]:
# Download the page of every player and store it on disk,
# skipping players whose page has already been fetched
for player_ref in players_list:
    player_id = player_ref.split("/")[-1]
    directory = 'data/players/' + player_id + "/"
    fname = directory + "page.html"
    if not os.path.isfile(fname):
        if not os.path.exists(directory):
            os.makedirs(directory)
        page = getPlayersPage(player_ref)
        # The HTML is stored as a JSON-encoded string, matching the
        # json.load used when the pages are parsed below
        with open(fname, "w") as out:
            json.dump(page, out)
        time.sleep(0.5)  # be polite to the server
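The loop above gives up on the first network error. A possible hardening, sketched here and not part of the original run, is to wrap getPlayersPage with a few retries (the retry count and back-off are illustrative, not tuned):
In [ ]:
# Sketch: retry transient network failures before giving up on a player page
def fetch_with_retries(player_ref, retries=3, wait=2.0):
    for attempt in range(retries):
        try:
            return getPlayersPage(player_ref)
        except requests.RequestException:
            time.sleep(wait * (attempt + 1))   # linear back-off between attempts
    return None

# usage inside the loop: page = fetch_with_retries(player_ref)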
In [18]:
with open("data/players_ref.json", "r") as in_file:
players_ref_list = json.load(in_file)
In [19]:
# Parse a player's downloaded page into a dict of attributes
# and a list of transfers
def getPlayerData(player_ref):
    playerID = player_ref.split("/")[-1]
    with open("data/players/" + playerID + "/page.html") as in_file:
        player_page = json.load(in_file)
    response = BeautifulSoup(player_page, 'html.parser')
    playerInfos = str(response.find("table", {"class": "auflistung"}))
    player = {}
    player["href"] = player_ref
    try:
        player["number"] = response.find("span", {"class": "dataRN"}).text
    except AttributeError:
        player["number"] = None
    player["name"] = response.find("h1", {"itemprop": "name"}).text
    player["player_id"] = playerID
    position = BeautifulSoup(playerInfos.split("Position")[1], 'html.parser').find("td").text
    reg = re.compile("[a-zA-Z -]")
    player["position"] = "".join(reg.findall(position))
    try:
        player["birthdate"] = BeautifulSoup(playerInfos.split("Date of birth")[1], 'html.parser').find("td").text
    except (AttributeError, IndexError):
        player["birthdate"] = None
    player["nationality"] = BeautifulSoup(playerInfos.split("Nationality")[1], 'html.parser').find("td").find("img")["title"]
    player["current_club"] = BeautifulSoup(playerInfos.split("Current club")[1], 'html.parser').find("td").find_all("a")[-1].text
    try:
        transfers = []
        trans = response.find("div", {"class": "box transferhistorie"}).find("table").find("tbody").find_all("tr", {"class": "zeile-transfer"})
        for t in trans:
            transfer = {}
            transfer["player"] = playerID
            transfer["date"] = t.find_all("td", {"class": "zentriert hide-for-small"})[1].text
            transfer["from"] = t.find_all("td", {"class": "no-border-rechts vereinswappen"})[0].find("a")["id"]
            transfer["to"] = t.find_all("td", {"class": "no-border-rechts vereinswappen"})[1].find("a")["id"]
            fee_text = t.find("td", {"class": "zelle-abloese"}).text
            if fee_text == "End of loan" or fee_text == "Loan":
                # Loans carry no fee: fall back to the market value column
                transfer["fee"] = t.find("td", {"class": "zelle-mw"}).text
            else:
                transfer["fee"] = fee_text
            transfers.append(transfer)
    except (AttributeError, IndexError, KeyError, TypeError):
        transfers = None   # player with no transfer history
    return player, transfers
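Example usage, assuming the player pages have already been downloaded by the loop above (the index 0 is arbitrary):
In [ ]:
# Parse a single player as a smoke test (requires the downloaded pages)
player, transfers = getPlayerData(players_ref_list[0])
player["name"], player["current_club"], len(transfers or [])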
In [20]:
# Convert a transfer fee string to a number of euros
def getTransferAmount(fee):
    try:
        if fee is None or fee in ("-", "?", "draft"):
            return 0
        if "free" in fee or "Free" in fee:
            return 0
        val = ".".join(fee.split(","))   # comma decimal separator -> dot
        rest = val.split(" ")[1]
        val = val.split(" ")[0]
        if "Mill" in rest:
            return float(val) * 10**6
        if "Th" in rest:
            return float(val) * 10**3
        return 0
    except (IndexError, ValueError, TypeError):
        return 0
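A few spot checks on representative fee strings (the examples are illustrative of the formats handled above):
In [ ]:
# Sanity-check the fee parsing on a handful of illustrative strings
for fee in ["25,00 Mill. €", "300 Th. €", "Free transfer", "-", "?"]:
    print(repr(fee), "->", getTransferAmount(fee))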
In [ ]:
# Load each player, parse the page to retrieve their infos, then compute
# the amount of each transfer from the fee strings and save the whole
# DataFrame as a JSON file
players_data = []
players_transfers = []
for player_ref in players_ref_list:
    player, transfers = getPlayerData(player_ref)
    players_data.append(player)
    players_transfers.append(transfers)
playersDF = pd.DataFrame(players_data)
playersDF["transfers"] = players_transfers
# Convert string fees to numbers
for idx, player in playersDF.iterrows():
    if player["transfers"] is not None:
        for transfer in player["transfers"]:
            transfer["amount"] = getTransferAmount(transfer["fee"])
playersDF.to_json("data/players.json")
In [23]:
playersDF = pd.read_json("data/players.json")
In [24]:
print("Total number of players: " + str(len(playersDF)))
print("Total number of transfers: " + str(playersDF["transfers"].map(lambda x: len(x) if x is not None else 0).sum()))