Transfermarkt Scraper

This notebook contains all the code required to scrape football player information from Transfermarkt.com.

It automatically scrapes information about the European, American, and Asian leagues whose estimated total market value exceeds 200M euros. It scrapes all the clubs from those leagues, then downloads the HTML pages of all their players.

The player pages are then parsed, and a DataFrame summarizing all the information about the players (name, current club, market value, birthdate, list of transfers, etc.) is created and saved as a JSON file.
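
For illustration, a single row of the final DataFrame looks roughly like the sketch below (all values are made up; only the keys reflect what the parser extracts):

In [ ]:
# Hypothetical example of one player record (made-up values)
example_player = {
    "name": "John Doe",
    "player_id": "123456",
    "href": "/john-doe/profil/spieler/123456",
    "number": "#10",
    "position": "Central Midfield",
    "birthdate": "Jan 1, 1995",
    "nationality": "England",
    "current_club": "Example FC",
    "transfers": [{"player": "123456", "date": "Jul 1, 2018",
                   "from": "1", "to": "2",
                   "fee": "25,00 Mill. €", "amount": 25e6}],
}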


In [1]:
from urllib.request import *
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
import pylab as pl
import requests
import json
import os
import time
%matplotlib inline

In [2]:
base_url = "https://www.transfermarkt.com"

In [3]:
def parseContinent(ref):
    # Fetch a continent overview page and keep the leagues worth more than 200M €
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + "/" + ref

    r = requests.get(url, headers=HEADERS)
    response = BeautifulSoup(r.text, 'html.parser')
    
    leagues = []
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"})
    rows += table.find_all("tr", {"class": "even"})

    for row in rows:
        # Total market value, e.g. "1,25 Bill. €" (comma as decimal separator)
        val = row.find("td", {"class": "rechts hauptlink"}).text
        val = ".".join(val.split(","))
        rest = val.split(" ")[1]
        val = val.split(" ")[0]
        if "Bill" in rest:
            val = float(val) * 10**9
        elif "Mill" in rest:
            val = float(val) * 10**6
        else:
            val = 0
        if val > 200*10**6:
            league = {}
            league["href"] = row.findAll('a')[1]['href']
            league["name"] = row.find("img")["title"]
            league["country"] = row.find("td",{"class","zentriert"}).find("img")["title"]
            league["tot_value"] = val
            leagues.append(league)
            
    return leagues
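
Transfermarkt displays market values with a comma as the decimal separator and a "Bill."/"Mill." suffix. The conversion used above can be checked in isolation (the strings below are assumed examples of that format):

In [ ]:
# Quick check of the value-parsing logic on sample strings
for s in ["1,25 Bill. €", "950,00 Mill. €"]:
    val, rest = ".".join(s.split(",")).split(" ")[:2]
    if "Bill" in rest:
        print(s, "->", float(val) * 10**9)
    elif "Mill" in rest:
        print(s, "->", float(val) * 10**6)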

In [4]:
def getPlayers(club_page):
    # Extract name, id and profile href for every player in a club's squad table
    players = []
    players_infos = club_page.find("div", {"id":"yw1"}).find("table", {"class":"items"}).find("tbody").find_all("tr", recursive=False)
    for player_info in players_infos:
        player = {}
        player_info = player_info.find("a", {"class":"spielprofil_tooltip"})
        player["name"] = player_info["title"]
        player["id"] = player_info["id"]
        player["href"] = player_info["href"]
        players.append(player)
    return players
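
getPlayers expects an already-parsed club page rather than a URL, which is how parse_league below uses it. A minimal standalone call would look like this (the club href is a made-up example):

In [ ]:
# Standalone use of getPlayers (hypothetical club href)
r = requests.get(base_url + "/example-fc/startseite/verein/0000",
                 headers={'User-Agent': 'Mozilla/5.0'})
squad = getPlayers(BeautifulSoup(r.text, 'html.parser'))
# -> [{"name": ..., "id": ..., "href": "/.../profil/spieler/<id>"}, ...]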

In [5]:
def get_leagues_clubs(league_ref):  # get the hrefs of all clubs in a league

    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + league_ref

    r = requests.get(url, headers=HEADERS)
    response = BeautifulSoup(r.text, 'html.parser')
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"})
    rows += table.find_all("tr", {"class": "even"})

    clubs = []
    for row in rows:
        try:
            clubs.append(row.find_all("td", {"class": "zentriert"})[1].find('a')['href'])
        except (AttributeError, IndexError, KeyError, TypeError):
            pass  # skip rows without a club link
    
    return clubs

In [6]:
def parse_league(league_ref):  # parse a league page into a list of club dicts

    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + league_ref
    clubs = []
    
    r = requests.get(url, headers=HEADERS)
    response = BeautifulSoup(r.text, 'html.parser')
    table = response.find("table", {"class": "items"})
    rows = table.find_all("tr", {"class": "odd"})
    rows += table.find_all("tr", {"class": "even"})
    
    for row in rows:

        url_club = base_url + row.find_all("td", {"class": "zentriert"})[1].find('a')['href']
        r_club = requests.get(url_club, headers=HEADERS)
        response_club = BeautifulSoup(r_club.text, 'html.parser')

        # The stadium name (and seat count) sits in the 5th "dataValue" span of the club header
        stadium_info = response_club.find("div", {"id": "main"}).find_all("span", {"class": "dataValue"})[4].text
        stadium_info = stadium_info.replace(u'\xa0', u'')
        stadium_info = stadium_info.replace(u'\n', u'')

        split_stadium = re.split(r'(\d+)', stadium_info)
        stadium = split_stadium[0]
#         num_seats = float(split_stadium[1] + '.' + split_stadium[3])

        club = {}
        club["name"] = row.find_all("td", {"class": "zentriert"})[1].find('a')['title']
        club["href"] = row.find_all("td", {"class": "zentriert"})[1].find('a')['href']
        club["squad"] = row.find_all("td", {"class": "zentriert"})[1].text
        club["market_value"] = row.find("td", {"class": "rechts show-for-small show-for-pad nowrap"}).text
        club["stadium"] = stadium

        club["players"] = getPlayers(response_club)  # reuse the already-parsed club page
#         club["stadium_seats"] = num_seats
        clubs.append(club)
    
    return clubs
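
Each club ends up as a dict like the following (all values are hypothetical; market_value is kept as a raw string at this stage):

In [ ]:
# Shape of one parse_league() entry (made-up values)
example_club = {
    "name": "Example FC",
    "href": "/example-fc/startseite/verein/0000",
    "squad": "28",
    "market_value": "450,00 Mill. €",
    "stadium": "Example Arena",
    "players": [{"name": "John Doe", "id": "123456",
                 "href": "/john-doe/profil/spieler/123456"}],
}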

In [7]:
def getPlayersPage(player_ref):
    # Download the raw HTML of a player's profile page
    HEADERS = {'User-Agent': 'Mozilla/5.0'}
    url = base_url + player_ref

    r = requests.get(url, headers=HEADERS)
    return r.text

In [8]:
'''
leagues = parseContinent("wettbewerbe/europa")
leagues += parseContinent("wettbewerbe/amerika")
leagues += parseContinent("wettbewerbe/asien")
with open("data/leagues.json", "w") as out:
    json.dump(leagues, out)
'''



In [9]:
with open("data/leagues.json", "r") as in_file:
    leagues = json.load(in_file)
    
print("Number of leagues: " + str(len(leagues)))
for league in leagues:
    print(league["name"])


Number of leagues: 25
Premier League
Serie A
Ligue 1
Liga NOS
Eredivisie
Super League
Raiffeisen Super League
LaLiga
1.Bundesliga
Süper Lig
Premier Liga
Jupiler Pro League
Premier Liga
HET Liga
Campeonato Brasileiro Série A
Liga MX Clausura
Major League Soccer
Copa MX Clausura
Primera División
Liga MX Apertura
Liga Águila I
Copa MX Apertura
Liguilla Apertura
Chinese Super League
J1 League

In [10]:
'''
leaguesData = []
for league in leagues:
    leaguesData.append(get_leagues_clubs(league['href']))
    
club_leagues = {}
for i in range(len(leagues)):
    for club in leaguesData[i]:
        club_leagues[club.split("/")[4]] = leagues[i]["name"]
        
with open("data/clubs_leauges.json", "w") as out:
    json.dump(club_leagues, out)
'''



In [11]:
'''
leaguesData = []
clubs = []
for league in leagues:
    clubs += parse_league(league["href"])

with open("data/clubs.json", "w") as out:
    json.dump(clubs, out)
'''



In [12]:
with open("data/clubs.json", "r") as in_file:
    clubs = json.load(in_file)

Clean redundant clubs

There are two Mexican leagues with overlapping clubs; we keep each club only once.


In [13]:
clubDict = {}
for club in clubs:
    club_id = club["href"].split("/")[4]
    if club_id not in clubDict:
        clubDict[club_id] = club
        
with open("../scraper/data/dictClubs.json", "w") as out:
    json.dump(clubDict, out)

In [14]:
with open("../scraper/data/dictClubs.json", "r") as in_file:
    clubDict = json.load(in_file)

Create players list


In [15]:
'''
player_list = []

for club in clubs:
    players = club["players"]
    for player in players:
        player_list.append(player["href"])

with open("data/players_ref.json", "w") as out:
    json.dump(player_list, out)
'''



In [16]:
with open("data/players_ref.json", "r") as in_file:
    players_list = json.load(in_file)

In [ ]:
# Download the page of each player and store it on disk (skipping already-downloaded pages)
for player_ref in players_list:
    player_id = player_ref.split("/")[-1]
     
    directory = 'data/players/' + player_id + "/"
    fname = directory + "page.html"

    if not os.path.isfile(fname):
        if not os.path.exists(directory):
            os.makedirs(directory)
        page = getPlayersPage(player_ref)
        with open(fname, "w") as out:
            json.dump(page, out)
        time.sleep(0.5)  # be polite to the server between requests
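
As a quick sanity check that the download loop completed, we can count the pages on disk, assuming the data/players/<id>/page.html layout used above:

In [ ]:
# Compare the number of pages on disk with the number of players expected
downloaded = sum(os.path.isfile(os.path.join("data/players", d, "page.html"))
                 for d in os.listdir("data/players"))
print(str(downloaded) + " / " + str(len(players_list)) + " pages downloaded")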

Creating players dataset

Now that all the player pages have been scraped, we can load and parse them to extract all the necessary information. The resulting DataFrame is then saved as a JSON file.


In [18]:
with open("data/players_ref.json", "r") as in_file:
    players_ref_list = json.load(in_file)

In [19]:
# Parse a player's cached page (given the player href) into an info dict plus a transfer list
def getPlayerData(player_ref):    
    playerID = player_ref.split("/")[-1]
    
    with open("data/players/" + playerID + "/page.html") as in_file:
        player_page = json.load(in_file)
    
    response = BeautifulSoup(player_page, 'html.parser')
    
    # The profile table is easiest to slice as raw HTML around its field labels
    playerInfos = str(response.find("table", {"class":"auflistung"}))
    player = {}
    player["href"] = player_ref
    try:
        player["number"] = response.find("span", {"class":"dataRN"}).text
    except:
        player["number"] = None
    player["name"] = response.find("h1", {"itemprop":"name"}).text
    player["player_id"] = player_ref.split("/")[-1]
    position = BeautifulSoup(playerInfos.split("Position")[1], 'html.parser').find("td").text
    reg = re.compile("[a-zA-Z -]")  # keep only letters, spaces and dashes
    player["position"] = "".join(reg.findall(position))
    try:
        player["birthdate"] = BeautifulSoup(playerInfos.split("Date of birth")[1], 'html.parser').find("td").text
    except:
        player["birthdate"] = None
    player["nationality"] = BeautifulSoup(playerInfos.split("Nationality")[1], 'html.parser').find("td").find("img")["title"]
    player["current_club"] = BeautifulSoup(playerInfos.split("Current club")[1], 'html.parser').find("td").find_all("a")[-1].text

    try:
        transfers = []
        trans = response.find("div",{"class" : "box transferhistorie"}).find("table").find("tbody").find_all("tr", {"class":"zeile-transfer"})

        for t in trans:
            transfer = {}
            transfer["player"] = player_ref.split("/")[-1]
            transfer["date"] = t.find_all("td", {"class":"zentriert hide-for-small"})[1].text
            transfer["from"] = t.find_all("td", {"class":"no-border-rechts vereinswappen"})[0].find("a")["id"]
            transfer["to"] = t.find_all("td", {"class":"no-border-rechts vereinswappen"})[1].find("a")["id"]
            fee_text = t.find("td", {"class":"zelle-abloese"}).text
            if fee_text == "End of loan" or fee_text == "Loan":
                # For loans, record the market value column instead of the fee
                transfer["fee"] = t.find("td", {"class":"zelle-mw"}).text
            else:
                transfer["fee"] = fee_text
                        
            transfers.append(transfer)
    except:  # player has no (parsable) transfer-history table
        transfers = None
        
    return player, transfers

In [20]:
# Parse a transfer fee string (e.g. "25,00 Mill. €") into a numeric amount in euros
def getTransferAmount(fee):
    try:
        if fee == "-" or fee == "?" or fee == "draft":
            return 0
        if "free" in fee or "Free" in fee:
            return 0
        if fee is not None:
            val = ".".join(fee.split(","))
            rest = val.split(" ")[1]
            val = val.split(" ")[0]

            if "Mill" in rest:
                return float(val) * 10**6
            else:
                if "Th" in rest: 
                    return float(val) * 10**3
                else:
                    return 0
        else:
            return 0
    except:
        return 0
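
A few sample fee strings (formats assumed from Transfermarkt) and what the function returns for them:

In [ ]:
for fee in ["25,00 Mill. €", "750 Th. €", "Free transfer", "-", "?"]:
    print(fee, "->", getTransferAmount(fee))
# 25,00 Mill. € -> 25000000.0
# 750 Th. € -> 750000.0
# Free transfer -> 0
# - -> 0
# ? -> 0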

In [ ]:
# Load each player's cached page, parse it to retrieve the infos, then compute the amount
# of each transfer from the fee strings and save the whole DataFrame as a JSON file
players_data = []
players_transfers = []
for player_id in players_ref_list:
    player = getPlayerData(player_id)
    print(player)
    players_data.append(player[0])
    players_transfers.append(player[1])
    
playersDF = pd.DataFrame(players_data)
playersDF["transfers"] = players_transfers

# Convert string fees to integers
for idx, player in playersDF.iterrows():
    if player["transfers"] is not None:
        for transfer in player["transfers"]:
            transfer["amount"] = getTransferAmount(transfer["fee"])

playersDF.to_json("data/players.json")

In [23]:
playersDF = pd.read_json("data/players.json")

In [24]:
print("Total number of players: " + str(len(playersDF)))
print("Total number of transfers: " + str(playersDF["transfers"].map(lambda x: len(x) if x is not None else 0).sum()))


Total number of players: 12075
Total number of transfers: 73558