In [1]:
# import packages
import pandas as pd
import time
import pickle
from selenium import webdriver
import glob
import os
In [2]:
api_delay_term = 3
In [3]:
# crawling_hitter_basic
def crawling_hitter_basic(season_id, team_id):
    """
    season_id = 0 ~ 34
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 1982, 1 : 1983, ..., 34 : 2016
    <team_id>
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns
    """
    url = "http://www.koreabaseball.com/Record/Player/HitterBasic/Basic1.aspx"
    driver = webdriver.PhantomJS()
    driver.get(url)

    # select the season from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
        find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)

    # select the team from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
        find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)

    # grab the result rows, skipping the header row
    elements = driver.find_elements_by_css_selector(".record_result tr")[1:]

    hitter_basic_df = pd.DataFrame(columns=[
        'rank', 'name', 'team', 'AVG', 'G', 'PA', 'AB', 'R', 'H', '2B',
        '3B', 'HR', 'TB', 'RBI', 'SAC', 'SF'
    ])
    for element in elements:
        # fetch all cells of the row once instead of re-querying per column
        tds = element.find_elements_by_css_selector('td')
        tmp_dict = {
            'rank': tds[0].text,
            'name': tds[1].text,
            'team': tds[2].text,
            'AVG': tds[3].text,
            'G': tds[4].text,
            'PA': tds[5].text,
            'AB': tds[6].text,
            'R': tds[7].text,
            'H': tds[8].text,
            '2B': tds[9].text,
            '3B': tds[10].text,
            'HR': tds[11].text,
            'TB': tds[12].text,
            'RBI': tds[13].text,
            'SAC': tds[14].text,
            'SF': tds[15].text,
        }
        hitter_basic_df.loc[len(hitter_basic_df)] = tmp_dict

    # shut down the PhantomJS process (quit() closes the window and ends the session)
    driver.quit()
    return hitter_basic_df
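The function above depends on PhantomJS and the find_element_by_* helpers, both of which are gone from current Selenium releases (PhantomJS support was dropped, and Selenium 4 removed the find_element_by_* methods). A minimal sketch of the same season/team selection and row extraction with Selenium 4 and headless Chrome, assuming a working Chrome/chromedriver setup; the CSS selectors are copied from the function above and may have changed on the live site.

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
driver.get("http://www.koreabaseball.com/Record/Player/HitterBasic/Basic1.aspx")

# select the 2015 season (index 33) and Nexen (index 1), as in crawling_hitter_basic(33, 1)
driver.find_element(By.CSS_SELECTOR, '#cphContainer_cphContents_ddlSeason_ddlSeason').\
    find_elements(By.CSS_SELECTOR, 'option')[33].click()
time.sleep(api_delay_term)
driver.find_element(By.CSS_SELECTOR, '#cphContainer_cphContents_ddlTeam_ddlTeam').\
    find_elements(By.CSS_SELECTOR, 'option')[1].click()
time.sleep(api_delay_term)

# same row/column extraction, skipping the header row
for row in driver.find_elements(By.CSS_SELECTOR, ".record_result tr")[1:]:
    print([td.text for td in row.find_elements(By.CSS_SELECTOR, 'td')])

driver.quit()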
In [4]:
nexen_hb_15_df = crawling_hitter_basic(33,1)
In [5]:
nexen_hb_15_df.to_csv("nexen_hb_15.csv",encoding='utf-8')
In [6]:
start_time = time.time()
doosan_hb_15_df = crawling_hitter_basic(33,2)
lotte_hb_15_df = crawling_hitter_basic(33,3)
samsung_hb_15_df = crawling_hitter_basic(33,4)
hanhwa_hb_15_df = crawling_hitter_basic(33,5)
KIA_hb_15_df = crawling_hitter_basic(33,6)
KT_hb_15_df = crawling_hitter_basic(33,7)
LG_hb_15_df = crawling_hitter_basic(33,8)
NC_hb_15_df = crawling_hitter_basic(33,9)
SK_hb_15_df = crawling_hitter_basic(33,10)
end_time = time.time()
end_time - start_time
Out[6]:
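With api_delay_term = 3, each crawling_hitter_basic call sleeps twice, so the nine calls above spend at least 9 × 2 × 3 = 54 seconds in time.sleep alone; the measured elapsed time also includes PhantomJS startup and page loads for each call.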
In [7]:
doosan_hb_15_df.to_csv("doosan_hb_15.csv",encoding='utf-8')
lotte_hb_15_df.to_csv("lotte_hb_15.csv",encoding='utf-8')
samsung_hb_15_df.to_csv("samsung_hb_15.csv",encoding='utf-8')
hanhwa_hb_15_df.to_csv("hanhwa_hb_15.csv",encoding='utf-8')
KIA_hb_15_df.to_csv("KIA_hb_15.csv",encoding='utf-8')
KT_hb_15_df.to_csv("KT_hb_15.csv",encoding='utf-8')
LG_hb_15_df.to_csv("LG_hb_15.csv",encoding='utf-8')
NC_hb_15_df.to_csv("NC_hb_15.csv",encoding='utf-8')
SK_hb_15_df.to_csv("SK_hb_15.csv",encoding='utf-8')
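The nine crawling_hitter_basic calls and the nine to_csv calls above repeat one pattern per team, so they could be collapsed into a single loop. A sketch, where the id-to-name mapping is taken from the function docstring and the filenames mirror the ones written above:

team_names = {
    2: "doosan", 3: "lotte", 4: "samsung", 5: "hanhwa", 6: "KIA",
    7: "KT", 8: "LG", 9: "NC", 10: "SK",
}
hb_15_dfs = {}
for team_id, name in team_names.items():
    # crawl the 2015 season (season_id 33) for this team and save it right away
    hb_15_dfs[name] = crawling_hitter_basic(33, team_id)
    hb_15_dfs[name].to_csv("{}_hb_15.csv".format(name), encoding='utf-8')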
In [8]:
# crawling_hitter_detail
def crawling_hitter_detail(season_id, team_id):
    """
    season_id = 0 ~ 34
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 1982, 1 : 1983, ..., 34 : 2016
    <team_id>
    1 : Nexen Heroes
    2 : Doosan
    3 : Lotte
    4 : Samsung
    5 : Hanwha
    6 : KIA
    7 : KT
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns
    """
    url = "http://www.koreabaseball.com/Record/Player/HitterBasic/Detail1.aspx"
    driver = webdriver.PhantomJS()
    driver.get(url)

    # select the season from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
        find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)

    # select the team from the dropdown
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
        find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)

    # grab the result rows, skipping the header row
    elements = driver.find_elements_by_css_selector(".record_result tr")[1:]

    hitter_detail_df = pd.DataFrame(columns=[
        'rank', 'name', 'team', 'AVG', 'XBH', 'GO', 'AO', 'GO/AO', 'GW_RBI',
        'BB/K', 'P/PA', 'ISOP', 'XR', 'GPA'
    ])
    for element in elements:
        # fetch all cells of the row once instead of re-querying per column
        tds = element.find_elements_by_css_selector('td')
        tmp_dict = {
            'rank': tds[0].text,
            'name': tds[1].text,
            'team': tds[2].text,
            'AVG': tds[3].text,
            'XBH': tds[4].text,
            'GO': tds[5].text,
            'AO': tds[6].text,
            'GO/AO': tds[7].text,
            'GW_RBI': tds[8].text,
            'BB/K': tds[9].text,
            'P/PA': tds[10].text,
            'ISOP': tds[11].text,
            'XR': tds[12].text,
            'GPA': tds[13].text,
        }
        hitter_detail_df.loc[len(hitter_detail_df)] = tmp_dict

    # shut down the PhantomJS process (quit() closes the window and ends the session)
    driver.quit()
    return hitter_detail_df
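crawling_hitter_detail differs from crawling_hitter_basic only in the page URL and the column list, so both could be thin wrappers around one parameterized crawler. A sketch using the same legacy Selenium/PhantomJS API this notebook already depends on; crawling_hitter_table is a hypothetical helper name:

def crawling_hitter_table(url, columns, season_id, team_id):
    driver = webdriver.PhantomJS()
    driver.get(url)
    # select season and team, waiting for the page to refresh after each click
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
        find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
        find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)
    # one DataFrame row per result row, skipping the header
    df = pd.DataFrame(columns=columns)
    for row in driver.find_elements_by_css_selector(".record_result tr")[1:]:
        tds = row.find_elements_by_css_selector('td')
        df.loc[len(df)] = {col: tds[i].text for i, col in enumerate(columns)}
    driver.quit()
    return df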
In [9]:
nexen_hd_15_df = crawling_hitter_detail(33,1)
In [10]:
nexen_hd_15_df.to_csv("nexen_hd_15.csv",encoding='utf-8')
In [11]:
start_time = time.time()
doosan_hd_15_df = crawling_hitter_detail(33,2)
lotte_hd_15_df = crawling_hitter_detail(33,3)
samsung_hd_15_df = crawling_hitter_detail(33,4)
hanhwa_hd_15_df = crawling_hitter_detail(33,5)
KIA_hd_15_df = crawling_hitter_detail(33,6)
KT_hd_15_df = crawling_hitter_detail(33,7)
LG_hd_15_df = crawling_hitter_detail(33,8)
NC_hd_15_df = crawling_hitter_detail(33,9)
SK_hd_15_df = crawling_hitter_detail(33,10)
end_time = time.time()
end_time - start_time
Out[11]:
In [12]:
doosan_hd_15_df.to_csv("doosan_hd_15.csv",encoding='utf-8')
lotte_hd_15_df.to_csv("lotte_hd_15.csv",encoding='utf-8')
samsung_hd_15_df.to_csv("samsung_hd_15.csv",encoding='utf-8')
hanhwa_hd_15_df.to_csv("hanhwa_hd_15.csv",encoding='utf-8')
KIA_hd_15_df.to_csv("KIA_hd_15.csv",encoding='utf-8')
KT_hd_15_df.to_csv("KT_hd_15.csv",encoding='utf-8')
LG_hd_15_df.to_csv("LG_hd_15.csv",encoding='utf-8')
NC_hd_15_df.to_csv("NC_hd_15.csv",encoding='utf-8')
SK_hd_15_df.to_csv("SK_hd_15.csv",encoding='utf-8')
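glob, os, and pickle are imported at the top but never used in this section; presumably the saved CSVs are meant to be reloaded and combined later. A minimal sketch of merging the hitter-basic files back into one DataFrame, assuming the "*_hb_15.csv" filename pattern used above:

# read every per-team hitter-basic CSV written above and stack them into one frame
csv_paths = sorted(glob.glob("*_hb_15.csv"))
hb_15_all_df = pd.concat(
    [pd.read_csv(path, index_col=0) for path in csv_paths],
    ignore_index=True
)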
In [ ]: