In [1]:
# import package
import pandas as pd
import time
import pickle
from selenium import webdriver
import glob
import os

In [2]:
# Seconds to wait after each dropdown click — the KBO site reloads the page
# via a postback, so we give it time to settle before reading the table.
api_delay_term = 3

In [3]:
# crawling_pitcher_basic
def crawling_pitcher_basic(season_id, team_id):
    """Scrape the KBO pitcher basic-stat table for one season and one team.

    Parameters
    ----------
    season_id : int
        Index into the season dropdown: 0 = 1982, ..., 34 = 2016.
    team_id : int
        Index into the team dropdown:
        1 : Nexen heroes
        2 : Doosan
        3 : Lotte
        4 : Samsung
        5 : Hanhwa
        6 : KIA
        7 : KT
        8 : LG twins
        9 : NC dinos
        10 : SK wyverns

    Returns
    -------
    pandas.DataFrame
        One row per pitcher, columns rank..WHIP; values are the raw cell
        strings as scraped from the page.
    """
    url = "http://www.koreabaseball.com/Record/Player/PitcherBasic/Basic1.aspx"
    columns = [
        'rank', 'name', 'team', 'ERA', 'G', 'W', 'L', 'SV', 'HLD', 'WPCT',
        'IP', 'H', 'HR', 'BB', 'HBP', 'SO', 'R', 'ER', 'WHIP'
    ]

    driver = webdriver.PhantomJS()
    try:
        driver.get(url)

        # click season (page reloads after each selection, hence the sleeps)
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
                find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)
        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
                find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # first <tr> is the table header -> skip it
        rows = driver.find_elements_by_css_selector(".record_result tr")[1:]

        # Query the <td> cells once per row (instead of once per column, which
        # fired 19 DOM lookups per row), and collect records in a list so the
        # DataFrame is built in one shot (row-by-row .loc growth is quadratic).
        records = []
        for row in rows:
            cells = row.find_elements_by_css_selector('td')
            records.append({col: cell.text for col, cell in zip(columns, cells)})
    finally:
        # quit() (not close()) so the PhantomJS process itself is terminated
        # even when scraping raises mid-way; close() alone leaks the process.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [9]:
# 2015 season (season_id=33), Nexen heroes (team_id=1) — basic pitcher stats
nexen_pb_15_df = crawling_pitcher_basic(33,1)

In [10]:
# utf-8 so the Korean player/team names round-trip through the CSV
nexen_pb_15_df.to_csv("nexen_pb_15.csv",encoding='utf-8')

In [15]:
# scrape the remaining nine teams for 2015 and time the whole run
start_time = time.time()
doosan_pb_15_df = crawling_pitcher_basic(33,2)
lotte_pb_15_df = crawling_pitcher_basic(33,3)
samsung_pb_15_df = crawling_pitcher_basic(33,4)
hanhwa_pb_15_df = crawling_pitcher_basic(33,5)
KIA_pb_15_df = crawling_pitcher_basic(33,6)
KT_pb_15_df = crawling_pitcher_basic(33,7)
LG_pb_15_df = crawling_pitcher_basic(33,8)
NC_pb_15_df = crawling_pitcher_basic(33,9)
SK_pb_15_df = crawling_pitcher_basic(33,10)
end_time = time.time()

# total elapsed seconds (~25 s/team: two 3 s sleeps plus page loads)
end_time-start_time


Out[15]:
221.34296488761902

In [19]:
# persist each team's basic-stat table to its own CSV
doosan_pb_15_df.to_csv("doosan_pb_15.csv",encoding='utf-8')
lotte_pb_15_df.to_csv("lotte_pb_15.csv",encoding='utf-8')
samsung_pb_15_df.to_csv("samsung_pb_15.csv",encoding='utf-8')
hanhwa_pb_15_df.to_csv("hanhwa_pb_15.csv",encoding='utf-8')
KIA_pb_15_df.to_csv("KIA_pb_15.csv",encoding='utf-8')
KT_pb_15_df.to_csv("KT_pb_15.csv",encoding='utf-8')
LG_pb_15_df.to_csv("LG_pb_15.csv",encoding='utf-8')
NC_pb_15_df.to_csv("NC_pb_15.csv",encoding='utf-8')
SK_pb_15_df.to_csv("SK_pb_15.csv",encoding='utf-8')

In [5]:
# crawling_pitcher_detail
def crawling_pitcher_detail(season_id, team_id):
    """Scrape the KBO pitcher detail-stat table for one season and one team.

    Parameters
    ----------
    season_id : int
        Index into the season dropdown: 0 = 1982, ..., 34 = 2016.
    team_id : int
        Index into the team dropdown:
        1 : Nexen heroes
        2 : Doosan
        3 : Lotte
        4 : Samsung
        5 : Hanhwa
        6 : KIA
        7 : KT
        8 : LG twins
        9 : NC dinos
        10 : SK wyverns

    Returns
    -------
    pandas.DataFrame
        One row per pitcher, columns rank..GO/AO; values are the raw cell
        strings as scraped from the page.
    """
    url = "http://www.koreabaseball.com/Record/Player/PitcherBasic/Detail1.aspx"
    columns = [
        'rank', 'name', 'team', 'ERA', 'GS', 'Wgs', 'Wgr', 'GF', 'SVO', 'TS',
        'GDP', 'GO', 'AO', 'GO/AO'
    ]

    driver = webdriver.PhantomJS()
    try:
        driver.get(url)

        # click season (page reloads after each selection, hence the sleeps)
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
                find_elements_by_css_selector('option')[season_id].click()
        time.sleep(api_delay_term)
        # click team
        driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
                find_elements_by_css_selector('option')[team_id].click()
        time.sleep(api_delay_term)

        # first <tr> is the table header -> skip it
        rows = driver.find_elements_by_css_selector(".record_result tr")[1:]

        # Query the <td> cells once per row (instead of once per column, which
        # fired 14 DOM lookups per row), and collect records in a list so the
        # DataFrame is built in one shot (row-by-row .loc growth is quadratic).
        records = []
        for row in rows:
            cells = row.find_elements_by_css_selector('td')
            records.append({col: cell.text for col, cell in zip(columns, cells)})
    finally:
        # quit() (not close()) so the PhantomJS process itself is terminated
        # even when scraping raises mid-way; close() alone leaks the process.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [24]:
# 2015 season (season_id=33), Nexen heroes (team_id=1) — detail pitcher stats
nexen_pd_15_df = crawling_pitcher_detail(33,1)

In [25]:
# utf-8 so the Korean player/team names round-trip through the CSV
nexen_pd_15_df.to_csv("nexen_pd_15.csv",encoding='utf-8')

In [27]:
# scrape the remaining nine teams for 2015 and time the whole run
start_time = time.time()
doosan_pd_15_df = crawling_pitcher_detail(33,2)
lotte_pd_15_df = crawling_pitcher_detail(33,3)
samsung_pd_15_df = crawling_pitcher_detail(33,4)
hanhwa_pd_15_df = crawling_pitcher_detail(33,5)
KIA_pd_15_df = crawling_pitcher_detail(33,6)
KT_pd_15_df = crawling_pitcher_detail(33,7)
LG_pd_15_df = crawling_pitcher_detail(33,8)
NC_pd_15_df = crawling_pitcher_detail(33,9)
SK_pd_15_df = crawling_pitcher_detail(33,10)
end_time = time.time()

# total elapsed seconds (~23 s/team: two 3 s sleeps plus page loads)
end_time-start_time


Out[27]:
208.31250405311584

In [29]:
# persist each team's detail-stat table to its own CSV
doosan_pd_15_df.to_csv("doosan_pd_15.csv",encoding='utf-8')
lotte_pd_15_df.to_csv("lotte_pd_15.csv",encoding='utf-8')
samsung_pd_15_df.to_csv("samsung_pd_15.csv",encoding='utf-8')
hanhwa_pd_15_df.to_csv("hanhwa_pd_15.csv",encoding='utf-8')
KIA_pd_15_df.to_csv("KIA_pd_15.csv",encoding='utf-8')
KT_pd_15_df.to_csv("KT_pd_15.csv",encoding='utf-8')
LG_pd_15_df.to_csv("LG_pd_15.csv",encoding='utf-8')
NC_pd_15_df.to_csv("NC_pd_15.csv",encoding='utf-8')
SK_pd_15_df.to_csv("SK_pd_15.csv",encoding='utf-8')

In [ ]: