In [1]:
# import package
import pandas as pd
import time
import pickle
from selenium import webdriver
import glob
import os
In [2]:
# Seconds passed to time.sleep() after each click in the browser (season, team
# and pager buttons), giving the page time to refresh before it is read again.
api_delay_term = 3
In [6]:
# crawling_runner
# Column order of the scraped table; the i-th <td> of a row maps to the i-th name.
_RUNNER_COLUMNS = [
    'rank', 'name', 'team', 'G', 'SBA', 'SB', 'CS', 'SB%', 'OOB', 'PKO'
]


def _read_runner_rows(driver):
    """Return one dict per data row of the record table currently shown by ``driver``."""
    table_rows = driver.find_elements_by_css_selector(".record_result tr")
    records = []
    # Skip the first <tr> (presumably the header row, which has no data cells).
    for table_row in table_rows[1:]:
        # Look the cells up once per row instead of once per column.
        cells = table_row.find_elements_by_css_selector('td')
        if len(cells) < len(_RUNNER_COLUMNS):
            # Not a complete data row; indexing it would raise IndexError.
            continue
        records.append(
            {column: cell.text for column, cell in zip(_RUNNER_COLUMNS, cells)}
        )
    return records


def crawling_runner(season_id, team_id, driver=None, delay=None):
    """Scrape the base-running record table for one season and one team.

    Loads the runner record page of koreabaseball.com, selects a season and a
    team from the two drop-downs by option index, visits every result page and
    collects the table rows.

    Parameters
    ----------
    season_id : int
        Index of the <option> to click in the season drop-down.
        NOTE(review): the original notes give both "0 ~ 34" and
        "0 : 2002 ~ 14 : 2016" for this index -- confirm against the live page.
    team_id : int
        Index of the <option> to click in the team drop-down (1 ~ 10).
        Mapping taken from the original notes; it can differ between seasons:
        1 Nexen Heroes, 2 Doosan, 3 Lotte, 4 Samsung, 5 Hanwha, 6 KIA, 7 KT,
        8 LG Twins, 9 NC Dinos, 10 SK Wyverns.
    driver : optional
        An already running Selenium WebDriver to reuse. When omitted, a new
        Firefox instance is started and is always closed before returning,
        even if scraping fails. A driver that was passed in is left open for
        the caller.
    delay : int or float, optional
        Seconds to sleep after each click. Defaults to the module-level
        ``api_delay_term``.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row, with the cell texts (strings) in the
        columns 'rank', 'name', 'team', 'G', 'SBA', 'SB', 'CS', 'SB%', 'OOB',
        'PKO'. Empty (but with these columns) when no data rows are found.
    """
    if delay is None:
        delay = api_delay_term

    # Only close the browser if this call opened it.
    owns_driver = driver is None
    if owns_driver:
        driver = webdriver.Firefox()

    records = []
    try:
        url = "http://www.koreabaseball.com/Record/Player/Runner/Basic.aspx"
        driver.get(url)

        # click season
        season_options = driver.find_element_by_css_selector(
            '#cphContainer_cphContents_ddlSeason_ddlSeason'
        ).find_elements_by_css_selector('option')
        season_options[season_id].click()
        time.sleep(delay)

        # click team
        team_options = driver.find_element_by_css_selector(
            '#cphContainer_cphContents_ddlTeam_ddlTeam'
        ).find_elements_by_css_selector('option')
        team_options[team_id].click()
        time.sleep(delay)

        # get page number
        # With more than one pager link, two of them are not counted as pages
        # (presumably the previous/next navigation links -- confirm on the page).
        link_count = len(driver.find_elements_by_css_selector(".paging02 a"))
        page_number = link_count - 2 if link_count > 1 else link_count

        if page_number <= 1:
            # Single page -- or no usable pager at all: read the table that is
            # already displayed instead of silently returning nothing.
            records.extend(_read_runner_rows(driver))
        else:
            for page in range(1, page_number + 1):
                driver.find_element_by_css_selector(
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page)
                ).click()
                time.sleep(delay)
                records.extend(_read_runner_rows(driver))
    finally:
        if owns_driver:
            # Without this every call leaves a Firefox process running.
            driver.quit()

    # Build the frame once instead of appending row by row.
    return pd.DataFrame(records, columns=_RUNNER_COLUMNS)
In [5]:
# Trial run for a single team: season option 13, team option 1
# (Nexen according to the crawling_runner notes).
nexen_running_df = crawling_runner(season_id=13, team_id=1)
In [7]:
# Persist the Nexen table as UTF-8 CSV in the current working directory.
nexen_running_df.to_csv(
    path_or_buf="nexen_running.csv",
    encoding='utf-8',
)
In [8]:
# Scrape the remaining nine teams (team options 2..10, in that order) for
# season option 13 and measure how long the whole run takes.
start_time = time.time()
(
    doosan_running_df,
    lotte_running_df,
    samsung_running_df,
    hanhwa_running_df,
    KIA_running_df,
    KT_running_df,
    LG_running_df,
    NC_running_df,
    SK_running_df,
) = [crawling_runner(13, team_id) for team_id in range(2, 11)]
end_time = time.time()
# Last expression of the cell: elapsed wall-clock seconds.
end_time - start_time
Out[8]:
In [9]:
# Write each team's table to its own UTF-8 CSV file, in the same order as before.
team_csv_outputs = [
    ("doosan_running.csv", doosan_running_df),
    ("lotte_running.csv", lotte_running_df),
    ("samsung_running.csv", samsung_running_df),
    ("hanhwa_running.csv", hanhwa_running_df),
    ("KIA_running.csv", KIA_running_df),
    ("KT_running.csv", KT_running_df),
    ("LG_running.csv", LG_running_df),
    ("NC_running.csv", NC_running_df),
    ("SK_running.csv", SK_running_df),
]
for csv_name, team_df in team_csv_outputs:
    team_df.to_csv(csv_name, encoding='utf-8')
In [ ]: