In [1]:
# import package
import pandas as pd
import time
import pickle
from selenium import webdriver
import glob
import os
In [2]:
# Seconds to pause after each dropdown click so the page can reload its table
# before we scrape it (used by both crawling functions below).
api_delay_term = 3
In [3]:
# crawling_pitcher_basic
def crawling_pitcher_basic(season_id, team_id):
    """Crawl KBO pitcher *basic* stats for one season/team into a DataFrame.

    Parameters
    ----------
    season_id : int
        Index into the season dropdown, 0 ~ 34 (0 : 1982 ... 34 : 2016).
    team_id : int
        Index into the team dropdown, 1 ~ 10:
        1 Nexen Heroes, 2 Doosan, 3 Lotte, 4 Samsung, 5 Hanwha,
        6 KIA, 7 KT, 8 LG Twins, 9 NC Dinos, 10 SK Wyverns.

    Returns
    -------
    pandas.DataFrame
        One row per pitcher with the columns listed below, all values as
        raw cell text (strings).
    """
    columns = [
        'rank', 'name', 'team', 'ERA', 'G', 'W', 'L', 'SV', 'HLD', 'WPCT',
        'IP', 'H', 'HR', 'BB', 'HBP', 'SO', 'R', 'ER', 'WHIP'
    ]
    url = "http://www.koreabaseball.com/Record/Player/PitcherBasic/Basic1.aspx"
    # NOTE(review): PhantomJS is deprecated in recent selenium releases;
    # a headless Chrome/Firefox driver is the modern replacement.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # click season
        driver.find_element_by_css_selector(
            '#cphContainer_cphContents_ddlSeason_ddlSeason'
        ).find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)
        # click team
        driver.find_element_by_css_selector(
            '#cphContainer_cphContents_ddlTeam_ddlTeam'
        ).find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)
        # first <tr> is the header row, so skip it
        rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
        pitcher_basic_df = pd.DataFrame(columns=columns)
        for row in rows:
            # fetch all cells once instead of re-querying the DOM per column
            cells = row.find_elements_by_css_selector('td')
            pitcher_basic_df.loc[len(pitcher_basic_df)] = {
                col: cells[i].text for i, col in enumerate(columns)
            }
    finally:
        # always release the browser process, even when scraping raises
        driver.quit()
    return pitcher_basic_df
In [9]:
# season_id 33 -> 2015 season (per the mapping in the docstring), team_id 1 -> Nexen
nexen_pb_15_df = crawling_pitcher_basic(33,1)
In [10]:
# persist the scraped table; utf-8 keeps the Korean player/team names intact
nexen_pb_15_df.to_csv("nexen_pb_15.csv",encoding='utf-8')
In [15]:
# Crawl the 2015 (season_id 33) pitcher-basic table for the remaining nine
# teams, timing the whole run (each call sleeps 2 * api_delay_term seconds).
start_time = time.time()
doosan_pb_15_df = crawling_pitcher_basic(33,2)
lotte_pb_15_df = crawling_pitcher_basic(33,3)
samsung_pb_15_df = crawling_pitcher_basic(33,4)
hanhwa_pb_15_df = crawling_pitcher_basic(33,5)
KIA_pb_15_df = crawling_pitcher_basic(33,6)
KT_pb_15_df = crawling_pitcher_basic(33,7)
LG_pb_15_df = crawling_pitcher_basic(33,8)
NC_pb_15_df = crawling_pitcher_basic(33,9)
SK_pb_15_df = crawling_pitcher_basic(33,10)
end_time = time.time()
# elapsed seconds for the whole crawl (cell output)
end_time-start_time
Out[15]:
In [19]:
# Persist each 2015 pitcher-basic table as <team>_pb_15.csv (utf-8 for
# the Korean player/team names).
for team_df, team_name in [
    (doosan_pb_15_df, "doosan"),
    (lotte_pb_15_df, "lotte"),
    (samsung_pb_15_df, "samsung"),
    (hanhwa_pb_15_df, "hanhwa"),
    (KIA_pb_15_df, "KIA"),
    (KT_pb_15_df, "KT"),
    (LG_pb_15_df, "LG"),
    (NC_pb_15_df, "NC"),
    (SK_pb_15_df, "SK"),
]:
    team_df.to_csv(team_name + "_pb_15.csv", encoding='utf-8')
In [5]:
# crawling_pitcher_detail
def crawling_pitcher_detail(season_id, team_id):
    """Crawl KBO pitcher *detail* stats for one season/team into a DataFrame.

    Parameters
    ----------
    season_id : int
        Index into the season dropdown, 0 ~ 34 (0 : 1982 ... 34 : 2016).
    team_id : int
        Index into the team dropdown, 1 ~ 10:
        1 Nexen Heroes, 2 Doosan, 3 Lotte, 4 Samsung, 5 Hanwha,
        6 KIA, 7 KT, 8 LG Twins, 9 NC Dinos, 10 SK Wyverns.

    Returns
    -------
    pandas.DataFrame
        One row per pitcher with the columns listed below, all values as
        raw cell text (strings).
    """
    columns = [
        'rank', 'name', 'team', 'ERA', 'GS', 'Wgs', 'Wgr', 'GF', 'SVO', 'TS',
        'GDP', 'GO', 'AO', 'GO/AO'
    ]
    url = "http://www.koreabaseball.com/Record/Player/PitcherBasic/Detail1.aspx"
    # NOTE(review): PhantomJS is deprecated in recent selenium releases;
    # a headless Chrome/Firefox driver is the modern replacement.
    driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        # click season
        driver.find_element_by_css_selector(
            '#cphContainer_cphContents_ddlSeason_ddlSeason'
        ).find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)
        # click team
        driver.find_element_by_css_selector(
            '#cphContainer_cphContents_ddlTeam_ddlTeam'
        ).find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)
        # first <tr> is the header row, so skip it
        rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
        pitcher_detail_df = pd.DataFrame(columns=columns)
        for row in rows:
            # fetch all cells once instead of re-querying the DOM per column
            cells = row.find_elements_by_css_selector('td')
            pitcher_detail_df.loc[len(pitcher_detail_df)] = {
                col: cells[i].text for i, col in enumerate(columns)
            }
    finally:
        # always release the browser process, even when scraping raises
        driver.quit()
    return pitcher_detail_df
In [24]:
# season_id 33 -> 2015 season (per the mapping in the docstring), team_id 1 -> Nexen
nexen_pd_15_df = crawling_pitcher_detail(33,1)
In [25]:
# persist the scraped table; utf-8 keeps the Korean player/team names intact
nexen_pd_15_df.to_csv("nexen_pd_15.csv",encoding='utf-8')
In [27]:
# Crawl the 2015 (season_id 33) pitcher-detail table for the remaining nine
# teams, timing the whole run (each call sleeps 2 * api_delay_term seconds).
start_time = time.time()
doosan_pd_15_df = crawling_pitcher_detail(33,2)
lotte_pd_15_df = crawling_pitcher_detail(33,3)
samsung_pd_15_df = crawling_pitcher_detail(33,4)
hanhwa_pd_15_df = crawling_pitcher_detail(33,5)
KIA_pd_15_df = crawling_pitcher_detail(33,6)
KT_pd_15_df = crawling_pitcher_detail(33,7)
LG_pd_15_df = crawling_pitcher_detail(33,8)
NC_pd_15_df = crawling_pitcher_detail(33,9)
SK_pd_15_df = crawling_pitcher_detail(33,10)
end_time = time.time()
# elapsed seconds for the whole crawl (cell output)
end_time-start_time
Out[27]:
In [29]:
# Persist each 2015 pitcher-detail table as <team>_pd_15.csv (utf-8 for
# the Korean player/team names).
for team_df, team_name in [
    (doosan_pd_15_df, "doosan"),
    (lotte_pd_15_df, "lotte"),
    (samsung_pd_15_df, "samsung"),
    (hanhwa_pd_15_df, "hanhwa"),
    (KIA_pd_15_df, "KIA"),
    (KT_pd_15_df, "KT"),
    (LG_pd_15_df, "LG"),
    (NC_pd_15_df, "NC"),
    (SK_pd_15_df, "SK"),
]:
    team_df.to_csv(team_name + "_pd_15.csv", encoding='utf-8')
In [ ]: