In [1]:
# import packages
import pandas as pd
import time
import pickle
from selenium import webdriver
import glob
import os
import csv
In [2]:
api_delay_term = 5  # seconds to wait after each click / page load
In [6]:
# collect the detail-page URL of every hitter for the selected season and team
def get_players_url(season_id, team_id):
    # open the KBO hitter record page
    url = "http://www.koreabaseball.com/Record/Player/HitterBasic/Basic1.aspx"
    driver = webdriver.Firefox()
    driver.get(url)
    # select the season from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
        find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)
    # select the team from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
        find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)
    # count the pager links; with more than one page the pager also contains
    # prev/next arrows, so subtract those two to get the real page count
    page_elements = driver.find_elements_by_css_selector(".paging02 a")
    page_number = len(page_elements)
    if page_number > 1:
        page_number = page_number - 2
    # empty dataframe to collect the URLs
    url_df = pd.DataFrame(columns=['url'])
    # single page: read the result table directly
    if page_number == 1:
        elements = driver.find_elements_by_css_selector(".record_result tr")
        elements = elements[1:]  # skip the header row
        for element in elements:
            tmp_dict = {
                "url": element.find_element_by_css_selector("a").get_attribute("href")
            }
            url_df.loc[len(url_df)] = tmp_dict
    # multiple pages: click through each page of the pager
    if page_number > 1:
        for page in range(1, page_number + 1):
            driver.find_element_by_css_selector('#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
            time.sleep(api_delay_term)
            elements = driver.find_elements_by_css_selector(".record_result tr")
            elements = elements[1:]  # skip the header row
            for element in elements:
                tmp_dict = {
                    "url": element.find_element_by_css_selector("a").get_attribute("href")
                }
                url_df.loc[len(url_df)] = tmp_dict
    return url_df
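A possible refinement, not used in the original notebook: the fixed `time.sleep(api_delay_term)` pauses are the simplest way to let each postback finish, but Selenium's explicit waits are usually faster and more robust. A minimal sketch, assuming the result rows are the element worth waiting for; `wait_for_rows` is a hypothetical helper, not part of the code above:

In [ ]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_rows(driver, timeout=10):
    # block until at least one result row is present instead of sleeping a fixed interval
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".record_result tr"))
    )

Each `time.sleep(api_delay_term)` after a click could then be replaced by `wait_for_rows(driver)`.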
In [7]:
def crawling_player_info(season_id, team_id):
    # collect each player's detail-page URL
    url_df = get_players_url(season_id, team_id)
    # drop retired players (their detail-page URL contains "Retire")
    is_retired = url_df['url'].str.contains("Retire")
    url_df = url_df.drop(url_df.index[is_retired]).reset_index()
    print("get url_df successfully")
    # empty dataframe for the scraped player info
    player_info_df = pd.DataFrame(columns=[
        "p_number", "name", "salary"
    ])
    driver1 = webdriver.Firefox()
    # visit each player's page and scrape the basic-info list
    for number in range(0, len(url_df)):
        player_url = url_df['url'][number]
        driver1.get(player_url)
        time.sleep(api_delay_term)
        player_info = driver1.find_elements_by_css_selector(".player_basic ul li")
        tmp_dict = {
            "p_number": player_url.split("=")[1],  # player id taken from the query string after "="
            "name": player_info[0].text.split(": ")[1],
            "salary": player_info[7].text.split(": ")[1],
        }
        player_info_df.loc[len(player_info_df)] = tmp_dict
    return player_info_df
In [8]:
nexen_hsalary_df = crawling_player_info(33,1)
In [9]:
nexen_hsalary_df.to_csv("nexen_hsalary.csv",encoding='utf-8')
In [10]:
start_time = time.time()
doosan_hsalary_df = crawling_player_info(33,2)
lotte_hsalary_df = crawling_player_info(33,3)
samsung_hsalary_df = crawling_player_info(33,4)
hanhwa_hsalary_df = crawling_player_info(33,5)
KIA_hsalary_df = crawling_player_info(33,6)
KT_hsalary_df = crawling_player_info(33,7)
LG_hsalary_df = crawling_player_info(33,8)
NC_hsalary_df = crawling_player_info(33,9)
SK_hsalary_df = crawling_player_info(33,10)
end_time = time.time()
end_time - start_time  # elapsed crawling time in seconds for the remaining nine teams
Out[10]:
In [11]:
doosan_hsalary_df.to_csv("doosan_hsalary.csv",encoding='utf-8')
lotte_hsalary_df.to_csv("lotte_hsalary.csv",encoding='utf-8')
samsung_hsalary_df.to_csv("samsung_hsalary.csv",encoding='utf-8')
hanhwa_hsalary_df.to_csv("hanhwa_hsalary.csv",encoding='utf-8')
KIA_hsalary_df.to_csv("KIA_hsalary.csv",encoding='utf-8')
KT_hsalary_df.to_csv("KT_hsalary.csv",encoding='utf-8')
LG_hsalary_df.to_csv("LG_hsalary.csv",encoding='utf-8')
NC_hsalary_df.to_csv("NC_hsalary.csv",encoding='utf-8')
SK_hsalary_df.to_csv("SK_hsalary.csv",encoding='utf-8')
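The per-team hitter files can later be stitched back into a single table; `glob` is already imported above, so a sketch along these lines would work (it assumes the only `*_hsalary.csv` files in the working directory are the ones written in the previous cell):

In [ ]:
# combine the per-team hitter salary CSVs into one dataframe (sketch)
hsalary_files = glob.glob("*_hsalary.csv")
hitter_salary_df = pd.concat(
    (pd.read_csv(path, index_col=0) for path in hsalary_files),
    ignore_index=True
)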
In [12]:
# redefine get_players_url to point at the pitcher record page,
# so crawling_player_info below walks the pitcher list instead
def get_players_url(season_id, team_id):
    # open the KBO pitcher record page
    url = "http://www.koreabaseball.com/Record/Player/PitcherBasic/Basic1.aspx"
    driver = webdriver.Firefox()
    driver.get(url)
    # select the season from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
        find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)
    # select the team from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
        find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)
    # count the pager links; with more than one page the pager also contains
    # prev/next arrows, so subtract those two to get the real page count
    page_elements = driver.find_elements_by_css_selector(".paging02 a")
    page_number = len(page_elements)
    if page_number > 1:
        page_number = page_number - 2
    # empty dataframe to collect the URLs
    url_df = pd.DataFrame(columns=['url'])
    # single page: read the result table directly
    if page_number == 1:
        elements = driver.find_elements_by_css_selector(".record_result tr")
        elements = elements[1:]  # skip the header row
        for element in elements:
            tmp_dict = {
                "url": element.find_element_by_css_selector("a").get_attribute("href")
            }
            url_df.loc[len(url_df)] = tmp_dict
    # multiple pages: click through each page of the pager
    if page_number > 1:
        for page in range(1, page_number + 1):
            driver.find_element_by_css_selector('#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
            time.sleep(api_delay_term)
            elements = driver.find_elements_by_css_selector(".record_result tr")
            elements = elements[1:]  # skip the header row
            for element in elements:
                tmp_dict = {
                    "url": element.find_element_by_css_selector("a").get_attribute("href")
                }
                url_df.loc[len(url_df)] = tmp_dict
    return url_df
In [13]:
nexen_psalary_df = crawling_player_info(33,1)
In [14]:
nexen_psalary_df.to_csv("nexen_psalary.csv",encoding='utf-8')
In [15]:
start_time = time.time()
doosan_psalary_df = crawling_player_info(33,2)
lotte_psalary_df = crawling_player_info(33,3)
samsung_psalary_df = crawling_player_info(33,4)
hanhwa_psalary_df = crawling_player_info(33,5)
KIA_psalary_df = crawling_player_info(33,6)
KT_psalary_df = crawling_player_info(33,7)
LG_psalary_df = crawling_player_info(33,8)
NC_psalary_df = crawling_player_info(33,9)
SK_psalary_df = crawling_player_info(33,10)
end_time = time.time()
end_time - start_time  # elapsed crawling time in seconds for the remaining nine teams
Out[15]:
In [16]:
doosan_psalary_df.to_csv("doosan_psalary.csv",encoding='utf-8')
lotte_psalary_df.to_csv("lotte_psalary.csv",encoding='utf-8')
samsung_psalary_df.to_csv("samsung_psalary.csv",encoding='utf-8')
hanhwa_psalary_df.to_csv("hanhwa_psalary.csv",encoding='utf-8')
KIA_psalary_df.to_csv("KIA_psalary.csv",encoding='utf-8')
KT_psalary_df.to_csv("KT_psalary.csv",encoding='utf-8')
LG_psalary_df.to_csv("LG_psalary.csv",encoding='utf-8')
NC_psalary_df.to_csv("NC_psalary.csv",encoding='utf-8')
SK_psalary_df.to_csv("SK_psalary.csv",encoding='utf-8')
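`pickle` is imported at the top but not used in this section; one plausible follow-up is to bundle the pitcher DataFrames into a dict and persist them in a single file. A sketch, where the dict keys and the file name `psalary_dfs.pkl` are arbitrary choices and not from the original notebook:

In [ ]:
# bundle the pitcher salary dataframes and pickle them (sketch)
psalary_dfs = {
    "doosan": doosan_psalary_df, "lotte": lotte_psalary_df,
    "samsung": samsung_psalary_df, "hanhwa": hanhwa_psalary_df,
    "KIA": KIA_psalary_df, "KT": KT_psalary_df,
    "LG": LG_psalary_df, "NC": NC_psalary_df, "SK": SK_psalary_df,
}
with open("psalary_dfs.pkl", "wb") as f:
    pickle.dump(psalary_dfs, f)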
In [ ]: