In [1]:
import pandas as pd
import time
import pickle
from selenium import webdriver
In [2]:
def replace_pd(df):
    """
    Swap every "-" cell of a pandas dataframe for 0.

    parameter ---------------------------------------------
    df : pandas dataframe to clean
    return ------------------------------------------------
    result of df.replace : cells equal to "-" become 0, other cells are kept
    """
    # one shared "-" -> 0 substitution table, applied to every column label
    dash_to_zero = {'-': 0}
    return df.replace(dict.fromkeys(df.columns, dash_to_zero))
In [3]:
def crawling_league_teams(team_id, api_delay_term=5):
    """
    Crawl the team_id and team name of every team offered in the "#teams"
    selector of a team page.

    parameter ---------------------------------------------------------------
    team_id : int or str, id of any team that belongs to the league you want
    api_delay_term : seconds to sleep after opening the page before reading it;
                     if your network speed is slow, set a bigger number
    return ------------------------------------------------------------------
    pandas dataframe, columns=team_id, team_name (one row per "#teams option"),
    passed through replace_pd
    """
    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # wait for the league team data to load
        time.sleep(api_delay_term)
        # collect plain records and build the dataframe once at the end;
        # the option value is split on "/" and its third piece is kept as the id
        records = [
            {
                "team_id": option.get_attribute("value").split("/")[2],
                "team_name": option.text,
            }
            for option in driver.find_elements_by_css_selector("#teams option")
        ]
    finally:
        # quit() ends the whole driver session (close() only closes the current
        # window), and the finally block runs even when crawling raises
        driver.quit()
    team_df = pd.DataFrame(records, columns=["team_id", "team_name"])
    return replace_pd(team_df)
In [4]:
def crawling_player_summary(team_id, api_delay_term=5):
    """
    Crawl the summary statistics of every player listed on a team page.

    parameter -------------------------------------------------------------------
    team_id : int or str, id of the team whose players you want
    api_delay_term : seconds to sleep after opening the page before reading it;
                     if your network speed is slow, set a bigger number
    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, passed through replace_pd, columns=
    player_number, flag, name, age, position, tall, weight, full_time, half_time
    , mins, goals, asists, yel, red, spg, ps, motm, aw, rating
    """
    columns = [
        "player_number", "flag", "name", "age", "position"
        , "tall", "weight", "full_time", "half_time", "mins"
        , "goals", "asists", "yel", "red", "spg", "ps", "motm"
        , "aw", "rating",
    ]
    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # wait for getting data
        time.sleep(api_delay_term)
        # get player summary data
        records = []
        for row in driver.find_elements_by_css_selector("#player-table-statistics-body tr"):
            # look the cells up once per row instead of once per field
            cells = row.find_elements_by_css_selector("td")
            player_cell = cells[2]
            player_link = player_cell.find_elements_by_css_selector("a")[0]
            player_spans = player_cell.find_elements_by_css_selector("span")
            # split "full(half)" into full time games and half time games;
            # half_time stays 0 when there is no "(" part
            games = cells[5].text.split("(")
            full_time = games[0]
            half_time = games[1].replace(")", "") if len(games) > 1 else 0
            records.append({
                # fifth "/"-separated piece of the player link
                "player_number": player_link.get_attribute("href").split("/")[4],
                # third "-"-separated piece of the flag span's class attribute
                "flag": cells[1].find_elements_by_css_selector("span")[0].get_attribute("class").split("-")[2],
                "name": player_link.text,
                "age": player_spans[0].text,
                # the first character of the position text is dropped
                "position": player_spans[1].text[1:],
                "tall": cells[3].text,
                "weight": cells[4].text,
                "full_time": full_time,
                "half_time": half_time,
                "mins": cells[6].text,
                "goals": cells[7].text,
                "asists": cells[8].text,
                "yel": cells[9].text,
                "red": cells[10].text,
                "spg": cells[11].text,
                "ps": cells[12].text,
                "aw": cells[13].text,
                "motm": cells[14].text,
                "rating": cells[15].text,
            })
    finally:
        # quit() ends the whole driver session (close() only closes the current
        # window), and the finally block runs even when crawling raises
        driver.quit()
    # records are keyed by column name, so `columns` only fixes the column order
    player_summary_df = pd.DataFrame(records, columns=columns)
    return replace_pd(player_summary_df)
In [5]:
def crawling_player_defensive(team_id, api_delay_term=5):
    """
    Crawl the defensive statistics of every player listed on a team page.

    parameter -------------------------------------------------------------------
    team_id : int or str, id of the team whose players you want
    api_delay_term : seconds to sleep after opening the page and again after
                     clicking the tab; if your network speed is slow, set a
                     bigger number
    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, passed through replace_pd, columns=
    player_number, tackles, inter, fouls, offsides, clear, drb, blocks, owng
    """
    columns = [
        "player_number", "tackles", "inter", "fouls", "offsides", "clear", "drb", "blocks", "owng"
    ]
    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # wait for getting data
        time.sleep(api_delay_term)
        # click event for getting defensive data (first detailed-view entry)
        tabs = driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")
        tabs[0].find_element_by_css_selector("a").click()
        # wait for getting data
        time.sleep(api_delay_term)
        # get player defensive data
        records = []
        for row in driver.find_elements_by_css_selector("#team-squad-stats-defensive #player-table-statistics-body tr"):
            # look the cells up once per row instead of once per field
            cells = row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            records.append({
                # fifth "/"-separated piece of the player link
                "player_number": player_link.get_attribute("href").split("/")[4],
                "tackles": cells[7].text,
                "inter": cells[8].text,
                "fouls": cells[9].text,
                "offsides": cells[10].text,
                "clear": cells[11].text,
                "drb": cells[12].text,
                "blocks": cells[13].text,
                "owng": cells[14].text,
            })
    finally:
        # quit() ends the whole driver session (close() only closes the current
        # window), and the finally block runs even when crawling raises
        driver.quit()
    player_defensive_df = pd.DataFrame(records, columns=columns)
    return replace_pd(player_defensive_df)
In [6]:
def crawling_player_offensive(team_id, api_delay_term=5):
    """
    Crawl the offensive statistics of every player listed on a team page.

    parameter -------------------------------------------------------------------
    team_id : int or str, id of the team whose players you want
    api_delay_term : seconds to sleep after opening the page and again after
                     clicking the tab; if your network speed is slow, set a
                     bigger number
    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, passed through replace_pd, columns=
    player_number, keyp, fouled, off, disp, unstch
    """
    columns = ["player_number", "keyp", "fouled", "off", "disp", "unstch"]
    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # wait for getting data
        time.sleep(api_delay_term)
        # click event for getting offensive data (second detailed-view entry)
        tabs = driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")
        tabs[1].find_element_by_css_selector("a").click()
        # wait for getting data
        time.sleep(api_delay_term)
        # get player offensive data
        records = []
        for row in driver.find_elements_by_css_selector("#statistics-table-offensive #player-table-statistics-body tr"):
            # look the cells up once per row instead of once per field
            cells = row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            records.append({
                # fifth "/"-separated piece of the player link
                "player_number": player_link.get_attribute("href").split("/")[4],
                "keyp": cells[10].text,
                # cell 11 is not read by this crawler
                "fouled": cells[12].text,
                "off": cells[13].text,
                "disp": cells[14].text,
                "unstch": cells[15].text,
            })
    finally:
        # quit() ends the whole driver session (close() only closes the current
        # window), and the finally block runs even when crawling raises
        driver.quit()
    player_offensive_df = pd.DataFrame(records, columns=columns)
    return replace_pd(player_offensive_df)
In [7]:
def crawling_player_passing(team_id, api_delay_term=5):
    """
    Crawl the passing statistics of every player listed on a team page.

    parameter -------------------------------------------------------------------
    team_id : int or str, id of the team whose players you want
    api_delay_term : seconds to sleep after opening the page and again after
                     clicking the tab; if your network speed is slow, set a
                     bigger number
    return ----------------------------------------------------------------------
    pandas dataframe, one row per player, passed through replace_pd, columns=
    player_number, avgp, ps, crosses, longb, thrb
    """
    columns = [
        "player_number", "avgp", "ps", "crosses", "longb", "thrb"
    ]
    # connect webdriver
    url = "https://www.whoscored.com/Teams/" + str(team_id)
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # wait for getting data
        time.sleep(api_delay_term)
        # click event for getting passing data (third detailed-view entry)
        tabs = driver.find_elements_by_css_selector("#team-squad-stats-options .in-squad-detailed-view")
        tabs[2].find_element_by_css_selector("a").click()
        # wait for getting data
        time.sleep(api_delay_term)
        # get player passing data
        records = []
        for row in driver.find_elements_by_css_selector("#statistics-table-passing #player-table-statistics-body tr"):
            # look the cells up once per row instead of once per field
            cells = row.find_elements_by_css_selector("td")
            player_link = cells[2].find_elements_by_css_selector("a")[0]
            records.append({
                # fifth "/"-separated piece of the player link
                "player_number": player_link.get_attribute("href").split("/")[4],
                "avgp": cells[8].text,
                "ps": cells[9].text,
                "crosses": cells[10].text,
                "longb": cells[11].text,
                "thrb": cells[12].text,
            })
    finally:
        # quit() ends the whole driver session (close() only closes the current
        # window), and the finally block runs even when crawling raises
        driver.quit()
    player_passing_df = pd.DataFrame(records, columns=columns)
    return replace_pd(player_passing_df)
In [8]:
def save_league_teams(league_name, team_id):
    """
    Crawl the league's team ids and team names and save them as
    ./league/<league_name>.csv (without the index column).
    The ./league directory is created when it does not exist yet.

    parameter --------------------------------------------------
    league_name : str, used as the csv file name inside ./league
    team_id : int or str, id of a team that belongs to the league
    return -----------------------------------------------------
    pandas dataframe : columns are team_id, team_name
    """
    # local import: only needed to prepare the output directory
    import os

    # make sure the target directory exists before spending time on the crawl
    if not os.path.isdir("./league"):
        os.makedirs("./league")
    league_teams = crawling_league_teams(team_id)
    league_teams.to_csv("./league/" + league_name + ".csv", index=False)
    return league_teams
In [12]:
# execute save_league_teams function:
# crawl the teams reachable from team 133 and write ./league/TurkeyLeague.csv
save_league_teams("TurkeyLeague", 133)
Out[12]:
In [10]:
def make_player_info(team_id, team_name):
    """
    Crawl the four player tables of a team (summary, defensive, offensive,
    passing) and join them into one dataframe.

    parameter ----------------------------------------------
    team_id : int or str, id of the team whose players you want
    team_name : str, team name that belongs to team_id
    return -------------------------------------------------
    pandas dataframe : the four tables merged on player_number, plus a
    team_name column holding the given team_name in every row
    """
    # run every crawler first (summary leads), merging only starts afterwards
    crawlers = (
        crawling_player_summary,
        crawling_player_defensive,
        crawling_player_offensive,
        crawling_player_passing,
    )
    stat_frames = [crawl(team_id) for crawl in crawlers]
    # fold the remaining tables into the summary table, left to right;
    # NOTE(review): the summary and passing crawlers both declare a "ps" column,
    # so the merge is expected to suffix them (pandas default _x / _y) - confirm
    merged_data = stat_frames[0]
    for stat_frame in stat_frames[1:]:
        merged_data = merged_data.merge(stat_frame, on="player_number")
    # add team name
    merged_data["team_name"] = team_name
    return merged_data
def save_player(league):
    """
    Crawl the players of every team of a league and save one csv per team
    as ./player/<league>/<team_name>.csv.
    The ./player/<league> directory is created when it does not exist yet.

    parameter -----------------------------------------------------
    league : str, league name; ./league/<league>.csv must already exist and
             provide the team_id and team_name columns
    return --------------------------------------------------------
    None, progress is printed
    """
    # local import: only needed to prepare the output directory
    import os

    # get league team_id team_name dataframe
    league_team_df = pd.read_csv("./league/" + league + ".csv")

    # make sure the output directory exists before the (slow) crawling starts
    output_dir = "./player/" + league
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # crawl one team and write its csv (written on every attempt, even if empty)
    def get_player_df(league, team_id, team_name):
        players_df = make_player_info(team_id, team_name)
        players_df.to_csv("./player/" + league + "/" + team_name + ".csv")
        return players_df

    # for each of the league teams
    for _, row in league_team_df.iterrows():
        attempts = 0
        print("Make Player {0} Start.".format(row.team_name))
        players_df = []
        # crawl until some player data comes back, at most 3 attempts in total
        while len(players_df) == 0 and attempts < 3:
            if attempts > 0:
                print("Make Player Try Again!")
            attempts += 1
            players_df = get_player_df(league, row.team_id, row.team_name)
        print("The number of saved players : {0}".format(len(players_df)))
        print("Make Player {0} Done".format(row.team_name))
        print("-" * 35)
    print(league + " Save Players Done!")
In [13]:
# execute save_player function:
# reads ./league/TurkeyLeague.csv and crawls the players of every listed team
league = "TurkeyLeague"
save_player(league)
In [ ]:
In [ ]:
In [ ]: