In [1]:
# import package
import glob
import os
import pickle
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
In [2]:
# Seconds passed to time.sleep() after each click in crawling_defense
# (season select, team select, page button) — presumably to give the page
# time to reload before the table is read.
api_delay_term = 3
In [3]:
# crawling_defense
# Column order of the defense record table; also the column order of the
# DataFrame returned by crawling_defense.
_DEFENSE_COLUMNS = (
    'rank', 'name', 'team', 'POS', 'G', 'GS', 'IP', 'E', 'PKO', 'PO',
    'A', 'DP', 'FPCT', 'PB', 'SB', 'CS', 'CS%',
)


def _select_option(driver, dropdown_selector, option_index):
    """Click the <option> at option_index inside the drop-down, then wait api_delay_term seconds."""
    dropdown = driver.find_element(By.CSS_SELECTOR, dropdown_selector)
    dropdown.find_elements(By.CSS_SELECTOR, 'option')[option_index].click()
    time.sleep(api_delay_term)


def _read_defense_rows(driver):
    """Return one {column: cell text} dict per data row of the table currently displayed."""
    table_rows = driver.find_elements(By.CSS_SELECTOR, ".record_result tr")
    records = []
    # The first <tr> is skipped (presumably the header row — it was skipped
    # by the previous implementation as well).
    for table_row in table_rows[1:]:
        # Fetch the cells once per row instead of once per column.
        cells = table_row.find_elements(By.CSS_SELECTOR, 'td')
        if len(cells) < len(_DEFENSE_COLUMNS):
            # Not a full player row (e.g. a placeholder row); indexing it
            # column by column would raise IndexError, so skip it.
            continue
        records.append(
            {column: cell.text for column, cell in zip(_DEFENSE_COLUMNS, cells)}
        )
    return records


def crawling_defense(season_id, team_id):
    """
    Scrape the KBO "Defense / Basic" player record table for one season and team.

    A Firefox WebDriver session is opened, the season and team are chosen by
    clicking the <option> at the given index in each drop-down, and every
    result page is read. The browser is always closed before returning, even
    when scraping raises.

    season_id = 0 ~ 14
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 2002 ~ 14 : 2016
    <team_id> ==> The mapping can differ from season to season.
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns

    Returns
    -------
    pandas.DataFrame
        One row per scraped player row, with the columns of _DEFENSE_COLUMNS.
        Every value is the cell text exactly as shown on the page (a string).
        Table rows with fewer cells than columns are skipped.
    """
    driver = webdriver.Firefox()
    try:
        url = "http://www.koreabaseball.com/Record/Player/Defense/Basic.aspx"
        driver.get(url)

        # click season
        _select_option(driver, '#cphContainer_cphContents_ddlSeason_ddlSeason', season_id)
        # click team
        _select_option(driver, '#cphContainer_cphContents_ddlTeam_ddlTeam', team_id)

        # get page number
        page_number = len(driver.find_elements(By.CSS_SELECTOR, ".paging02 a"))
        if page_number > 1:
            # Presumably two of the pager links are navigation arrows rather
            # than page numbers — TODO confirm against the live page.
            page_number -= 2

        records = []
        if page_number <= 1:
            # Zero or one page: the table already on screen is the whole
            # result. (A count of 0 used to skip scraping entirely.)
            records.extend(_read_defense_rows(driver))
        else:
            for page in range(1, page_number + 1):
                driver.find_element(
                    By.CSS_SELECTOR,
                    '#cphContainer_cphContents_ucPager_btnNo' + str(page),
                ).click()
                time.sleep(api_delay_term)
                records.extend(_read_defense_rows(driver))
    finally:
        # Release the browser process no matter how scraping ended.
        driver.quit()

    # Build the frame once from all records instead of growing it row by row.
    return pd.DataFrame(records, columns=list(_DEFENSE_COLUMNS))
In [4]:
# Scrape season option 13, team option 1 (Nexen per the crawling_defense docstring).
nexen_defense_df = crawling_defense(season_id=13, team_id=1)
In [6]:
# Save the Nexen table next to the notebook as a UTF-8 encoded CSV file.
nexen_defense_df.to_csv(path_or_buf="nexen_defense.csv", encoding="utf-8")
In [7]:
# Scrape the remaining nine teams (team options 2-10) for season option 13,
# one browser session per call, and time the whole batch.
start_time = time.time()
doosan_defense_df = crawling_defense(season_id=13, team_id=2)
lotte_defense_df = crawling_defense(season_id=13, team_id=3)
samsung_defense_df = crawling_defense(season_id=13, team_id=4)
hanhwa_defense_df = crawling_defense(season_id=13, team_id=5)
KIA_defense_df = crawling_defense(season_id=13, team_id=6)
KT_defense_df = crawling_defense(season_id=13, team_id=7)
LG_defense_df = crawling_defense(season_id=13, team_id=8)
NC_defense_df = crawling_defense(season_id=13, team_id=9)
SK_defense_df = crawling_defense(season_id=13, team_id=10)
end_time = time.time()
# Elapsed wall-clock seconds; shown as the cell output.
end_time - start_time
Out[7]:
In [8]:
# Write each team's table to its own UTF-8 CSV file, in the same order as before.
for csv_name, team_df in [
    ("doosan_defense.csv", doosan_defense_df),
    ("lotte_defense.csv", lotte_defense_df),
    ("samsung_defense.csv", samsung_defense_df),
    ("hanhwa_defense.csv", hanhwa_defense_df),
    ("KIA_defense.csv", KIA_defense_df),
    ("KT_defense.csv", KT_defense_df),
    ("LG_defense.csv", LG_defense_df),
    ("NC_defense.csv", NC_defense_df),
    ("SK_defense.csv", SK_defense_df),
]:
    team_df.to_csv(csv_name, encoding='utf-8')
In [ ]: