Crawling Whoscored Web Site

  1. https://www.whoscored.com/
  2. Crawling
    • League Team
    • Player Summary
    • Player Defensive
    • Player Offensive
    • Player Passing

Import Package


In [1]:
import os
import pickle
import time

import pandas as pd
from selenium import webdriver

0. Useful Common Function


In [2]:
def replace_pd(df):
    """
    replace every cell equal to "-" with 0

    parameter ---------------------------------------------------------------
    df : pandas dataframe that may hold "-" placeholder cells

    return ------------------------------------------------------------------
    the dataframe returned by df.replace; df itself is not modified

    """

    # one {old value: new value} mapping, shared by every column label
    placeholder_map = {'-': 0}
    per_column = dict.fromkeys(df.columns, placeholder_map)

    return df.replace(per_column)

1. Crawling League Team Data Function


In [3]:
def crawling_league_teams(team_id, api_delay_term=5):
    """
    crawl team_id and team_name of every entry in the "#teams" selector
    of a whoscored team page

    parameter ---------------------------------------------------------------
    team_id : id of one team of the league you want, int or str
    api_delay_term : seconds to sleep after the page is requested; use a
                     bigger number if your network is slow

    return ------------------------------------------------------------------
    pandas dataframe, columns=team_id, team_name, one row per
    "#teams option" element, with "-" cells replaced by 0 (replace_pd)

    """

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()

    try:
        driver.get(url)

        # wait get league team datas
        time.sleep(api_delay_term)

        # get team datas; the option value is split on "/" and the third
        # piece is kept as the id (presumably "/Teams/<id>/..." - verify)
        rows = []
        for option in driver.find_elements_by_css_selector("#teams option"):
            rows.append({
                "team_id": option.get_attribute("value").split("/")[2],
                "team_name": option.text,
            })
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window) and runs even when the scraping above raises
        driver.quit()

    # make pandas dataframe
    team_df = pd.DataFrame(rows, columns=["team_id", "team_name"])

    return replace_pd(team_df)

2. Crawling Players Summary Data Function


In [4]:
def crawling_player_summary(team_id, api_delay_term=5):
    """
    crawl the player summary table of a whoscored team page

    parameter -------------------------------------------------------------------
    team_id : team id whose players you want, int or str
    api_delay_term : seconds to sleep after the page is requested; use a bigger
                     number if your network speed is slow

    return ----------------------------------------------------------------------
    pandas dataframe, one row per "#player-table-statistics-body tr" element
    columns : player_number, flag, name, age, position, tall, weight, full_time,
              half_time, mins, goals, asists, yel, red, spg, ps, motm, aw, rating
    cells equal to "-" are replaced by 0 (replace_pd)

    """

    columns = [
        "player_number", "flag", "name", "age", "position",
        "tall", "weight", "full_time", "half_time", "mins",
        "goals", "asists", "yel", "red", "spg", "ps", "motm",
        "aw", "rating",
    ]

    # (column name, index of the <td> whose text is stored unchanged)
    text_cells = (
        ("tall", 3), ("weight", 4), ("mins", 6), ("goals", 7),
        ("asists", 8), ("yel", 9), ("red", 10), ("spg", 11),
        ("ps", 12), ("aw", 13), ("motm", 14), ("rating", 15),
    )

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()

    try:
        driver.get(url)

        # wait for getting data
        time.sleep(api_delay_term)

        # get player summary datas
        rows = []
        table_rows = driver.find_elements_by_css_selector("#player-table-statistics-body tr")
        for table_row in table_rows:

            # look the cells up once per row instead of once per field
            cells = table_row.find_elements_by_css_selector("td")

            # split full time games and half time games: the text before
            # "(" is full_time, the text inside the parentheses (when
            # there is one) is half_time, otherwise half_time is 0
            games = cells[5].text.split("(")
            full_time = games[0]
            half_time = games[1].replace(")", "") if len(games) > 1 else 0

            name_link = cells[2].find_elements_by_css_selector("a")[0]
            name_spans = cells[2].find_elements_by_css_selector("span")
            flag_span = cells[1].find_elements_by_css_selector("span")[0]

            player_dict = {
                # fifth "/"-separated piece of the player link
                # (presumably the numeric player id - verify)
                "player_number": name_link.get_attribute("href").split("/")[4],
                # third "-"-separated piece of the span's class attribute
                "flag": flag_span.get_attribute("class").split("-")[2],
                "name": name_link.text,
                "age": name_spans[0].text,
                # the first character of the position text is dropped
                "position": name_spans[1].text[1:],
                "full_time": full_time,
                "half_time": half_time,
            }
            for column, cell_index in text_cells:
                player_dict[column] = cells[cell_index].text

            rows.append(player_dict)
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window) and runs even when the scraping above raises
        driver.quit()

    # make pandas dataframe
    player_summary_df = pd.DataFrame(rows, columns=columns)

    return replace_pd(player_summary_df)

3. Crawling Players Defensive Data Function


In [5]:
def crawling_player_defensive(team_id, api_delay_term=5):
    """
    crawl the player defensive table of a whoscored team page

    parameter -------------------------------------------------------------------
    team_id : team id whose players you want, int or str
    api_delay_term : seconds to sleep after the page is requested and again
                     after the table tab is clicked; use a bigger number if
                     your network speed is slow

    return ----------------------------------------------------------------------
    pandas dataframe, one row per table row found under
    "#team-squad-stats-defensive #player-table-statistics-body"
    columns : player_number, tackles, inter, fouls, offsides, clear, drb,
              blocks, owng
    cells equal to "-" are replaced by 0 (replace_pd)

    """

    # (column name, index of the <td> whose text is stored unchanged)
    text_cells = (
        ("tackles", 7), ("inter", 8), ("fouls", 9), ("offsides", 10),
        ("clear", 11), ("drb", 12), ("blocks", 13), ("owng", 14),
    )
    columns = ["player_number"] + [column for column, _ in text_cells]

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()

    try:
        driver.get(url)

        # wait for getting data
        time.sleep(api_delay_term)

        # click event for getting defensive data (first detailed-view tab)
        tabs = driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")
        tabs[0].find_element_by_css_selector("a").click()

        # wait for getting data
        time.sleep(api_delay_term)

        # get player defensive datas
        rows = []
        table_rows = driver.find_elements_by_css_selector("#team-squad-stats-defensive #player-table-statistics-body tr")
        for table_row in table_rows:

            # look the cells up once per row instead of once per field
            cells = table_row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]

            player_dict = {
                # fifth "/"-separated piece of the player link
                # (presumably the numeric player id - verify)
                "player_number": player_link.get_attribute("href").split("/")[4],
            }
            for column, cell_index in text_cells:
                player_dict[column] = cells[cell_index].text

            rows.append(player_dict)
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window) and runs even when the scraping above raises
        driver.quit()

    # make pandas dataframe
    player_defensive_df = pd.DataFrame(rows, columns=columns)

    return replace_pd(player_defensive_df)

4. Crawling Players Offensive Data Function


In [6]:
def crawling_player_offensive(team_id, api_delay_term=5):
    """
    crawl the player offensive table of a whoscored team page

    parameter -------------------------------------------------------------------
    team_id : team id whose players you want, int or str
    api_delay_term : seconds to sleep after the page is requested and again
                     after the table tab is clicked; use a bigger number if
                     your network speed is slow

    return ----------------------------------------------------------------------
    pandas dataframe, one row per table row found under
    "#statistics-table-offensive #player-table-statistics-body"
    columns : player_number, keyp, fouled, off, disp, unstch
    cells equal to "-" are replaced by 0 (replace_pd)

    """

    # (column name, index of the <td> whose text is stored unchanged);
    # cell 11 is skipped on purpose, exactly as before
    text_cells = (
        ("keyp", 10), ("fouled", 12), ("off", 13), ("disp", 14),
        ("unstch", 15),
    )
    columns = ["player_number"] + [column for column, _ in text_cells]

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()

    try:
        driver.get(url)

        # wait for getting data
        time.sleep(api_delay_term)

        # click event for getting offensive data (second detailed-view tab)
        tabs = driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")
        tabs[1].find_element_by_css_selector("a").click()

        # wait for getting data
        time.sleep(api_delay_term)

        # get player offensive datas
        rows = []
        table_rows = driver.find_elements_by_css_selector("#statistics-table-offensive #player-table-statistics-body tr")
        for table_row in table_rows:

            # look the cells up once per row instead of once per field
            cells = table_row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]

            player_dict = {
                # fifth "/"-separated piece of the player link
                # (presumably the numeric player id - verify)
                "player_number": player_link.get_attribute("href").split("/")[4],
            }
            for column, cell_index in text_cells:
                player_dict[column] = cells[cell_index].text

            rows.append(player_dict)
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window) and runs even when the scraping above raises
        driver.quit()

    # make pandas dataframe
    player_offensive_df = pd.DataFrame(rows, columns=columns)

    return replace_pd(player_offensive_df)

5. Crawling Players Passing Data Function


In [7]:
def crawling_player_passing(team_id, api_delay_term=5):
    """
    crawl the player passing table of a whoscored team page

    parameter -------------------------------------------------------------------
    team_id : team id whose players you want, int or str
    api_delay_term : seconds to sleep after the page is requested and again
                     after the table tab is clicked; use a bigger number if
                     your network speed is slow

    return ----------------------------------------------------------------------
    pandas dataframe, one row per table row found under
    "#statistics-table-passing #player-table-statistics-body"
    columns : player_number, avgp, ps, crosses, longb, thrb
    cells equal to "-" are replaced by 0 (replace_pd)

    """

    # (column name, index of the <td> whose text is stored unchanged)
    text_cells = (
        ("avgp", 8), ("ps", 9), ("crosses", 10), ("longb", 11),
        ("thrb", 12),
    )
    columns = ["player_number"] + [column for column, _ in text_cells]

    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()

    try:
        driver.get(url)

        # wait for getting data
        time.sleep(api_delay_term)

        # click event for getting passing data (third detailed-view tab)
        tabs = driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")
        tabs[2].find_element_by_css_selector("a").click()

        # wait for getting data
        time.sleep(api_delay_term)

        # get player passing datas
        rows = []
        table_rows = driver.find_elements_by_css_selector("#statistics-table-passing #player-table-statistics-body tr")
        for table_row in table_rows:

            # look the cells up once per row instead of once per field
            cells = table_row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]

            player_dict = {
                # fifth "/"-separated piece of the player link
                # (presumably the numeric player id - verify)
                "player_number": player_link.get_attribute("href").split("/")[4],
            }
            for column, cell_index in text_cells:
                player_dict[column] = cells[cell_index].text

            rows.append(player_dict)
    finally:
        # quit() ends the whole browser session (close() only closes the
        # window) and runs even when the scraping above raises
        driver.quit()

    # make pandas dataframe
    player_passing_df = pd.DataFrame(rows, columns=columns)

    return replace_pd(player_passing_df)

6. Crawl and Save Team Data


In [8]:
def save_league_teams(league_name, team_id):
    """
    crawl the league's team ids / team names and save them to
    ./league/<league_name>.csv (written without the index column)

    the ./league directory is created when it does not exist yet

    parameter --------------------------------------------------
    league_name : used as the csv file name (without ".csv")
    team_id : team_id of one team that belongs to the league

    return -----------------------------------------------------
    pandas dataframe : columns are team_id, team_name

    """

    # make sure the output directory exists before spending time crawling
    league_dir = "./league"
    if not os.path.isdir(league_dir):
        os.makedirs(league_dir)

    league_teams = crawling_league_teams(team_id)
    league_teams.to_csv(league_dir + "/" + league_name + ".csv", index=False)
    return league_teams

In [12]:
# execute save_league_teams for the Turkish league, starting from team id 133
save_league_teams(league_name="TurkeyLeague", team_id=133)


Out[12]:
team_id team_name
0 5729 Akhisar Belediyespor
1 227 Antalyaspor
2 133 Besiktas
3 221 Bursaspor
4 2864 Eskisehirspor
5 223 Fenerbahce
6 294 Galatasaray
7 220 Gaziantepspor
8 156 Genclerbirligi
9 2201 Istanbul Basaksehir
10 3475 Kasimpasa
11 2200 Kayserispor
12 2006 Konyaspor
13 2192 Mersin
14 2005 Osmanlispor FK
15 750 Rizespor
16 2003 Sivasspor
17 226 Trabzonspor

7. Crawl and Save Player Data


In [10]:
def make_player_info(team_id, team_name):
    """
    crawl the four player tables (summary, defensive, offensive, passing)
    of one team and join them into a single dataframe

    parameter ----------------------------------------------
    team_id : int or str, team id whose players you want
    team_name : str, written into the team_name column of every row

    return -------------------------------------------------
    pandas dataframe merged on player_number with the default merge
    settings, plus a team_name column
    NOTE: "ps" exists in both the summary and the passing table, so the
    merge keeps both under pandas' default "_x" / "_y" suffixes

    """

    # run every crawler first, in the same order as the merge below
    crawlers = (
        crawling_player_summary,
        crawling_player_defensive,
        crawling_player_offensive,
        crawling_player_passing,
    )
    frames = [crawl(team_id) for crawl in crawlers]

    # fold the tables together, left to right, on the player id
    merged_data = frames[0]
    for frame in frames[1:]:
        merged_data = merged_data.merge(frame, on="player_number")

    # add team name
    return merged_data.assign(team_name=team_name)
    

def save_player(league):
    """
    crawl the players of every team of a league and save one csv per team
    to ./player/<league>/<team_name>.csv

    the team list is read from ./league/<league>.csv (columns team_id,
    team_name), the file written by save_league_teams
    the ./player/<league> directory is created when it does not exist yet

    parameter -----------------------------------------------------
    league : league name, used for the input file and output directory

    """

    # a crawl that returns no rows is repeated, up to this many crawls in total
    max_tries = 3

    # get league team_id team_name dataframe
    league_team_df = pd.read_csv("./league/" + league + ".csv")

    # make sure the output directory exists before spending time crawling
    player_dir = "./player/" + league
    if not os.path.isdir(player_dir):
        os.makedirs(player_dir)

    # for one of league teams
    for team in league_team_df.itertuples(index=False):
        print("Make Player {0} Start.".format(team.team_name))

        for attempt in range(max_tries):
            if attempt > 0:
                print("Make Player Try Again!")
            players_df = make_player_info(team.team_id, team.team_name)
            if len(players_df) > 0:
                break

        # write once, after the retries; as before the file is written even
        # when every attempt came back empty
        players_df.to_csv(player_dir + "/" + team.team_name + ".csv")

        print("The number of saved players : {0}".format(len(players_df)))
        print("Make Player {0} Done".format(team.team_name))
        print("-" * 35)

    print(league + " Save Players Done!")

In [13]:
# execute save_player for every team listed in ./league/TurkeyLeague.csv
league = "TurkeyLeague"
save_player(league=league)


Make Player Akhisar Belediyespor Start.
The number of saved players : 29
Make Player Akhisar Belediyespor Done
-----------------------------------
Make Player Antalyaspor Start.
The number of saved players : 39
Make Player Antalyaspor Done
-----------------------------------
Make Player Besiktas Start.
The number of saved players : 26
Make Player Besiktas Done
-----------------------------------
Make Player Bursaspor Start.
The number of saved players : 34
Make Player Bursaspor Done
-----------------------------------
Make Player Eskisehirspor Start.
The number of saved players : 36
Make Player Eskisehirspor Done
-----------------------------------
Make Player Fenerbahce Start.
Make Player Try Again!
The number of saved players : 26
Make Player Fenerbahce Done
-----------------------------------
Make Player Galatasaray Start.
The number of saved players : 27
Make Player Galatasaray Done
-----------------------------------
Make Player Gaziantepspor Start.
The number of saved players : 32
Make Player Gaziantepspor Done
-----------------------------------
Make Player Genclerbirligi Start.
The number of saved players : 31
Make Player Genclerbirligi Done
-----------------------------------
Make Player Istanbul Basaksehir Start.
The number of saved players : 24
Make Player Istanbul Basaksehir Done
-----------------------------------
Make Player Kasimpasa Start.
The number of saved players : 24
Make Player Kasimpasa Done
-----------------------------------
Make Player Kayserispor Start.
The number of saved players : 27
Make Player Kayserispor Done
-----------------------------------
Make Player Konyaspor Start.
The number of saved players : 28
Make Player Konyaspor Done
-----------------------------------
Make Player Mersin Start.
The number of saved players : 33
Make Player Mersin Done
-----------------------------------
Make Player Osmanlispor FK Start.
Make Player Try Again!
The number of saved players : 29
Make Player Osmanlispor FK Done
-----------------------------------
Make Player Rizespor Start.
The number of saved players : 23
Make Player Rizespor Done
-----------------------------------
Make Player Sivasspor Start.
The number of saved players : 31
Make Player Sivasspor Done
-----------------------------------
Make Player Trabzonspor Start.
The number of saved players : 38
Make Player Trabzonspor Done
-----------------------------------
TurkeyLeague Save Players Done!

In [ ]:


In [ ]:


In [ ]: