In [1]:
# import packages
import glob
import os
import pickle
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
api_delay_term = 3  # seconds to pause after each page interaction
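
api_delay_term is a fixed pause so each postback has time to finish before the next click. A hedged alternative is Selenium's explicit waits, which return as soon as the element of interest appears; a minimal sketch, assuming the stats table always matches the .record_result selector used in crawling_runner below (wait_for_results is a hypothetical helper, not part of the original notebook):

# optional sketch: wait for the stats table instead of sleeping a fixed term
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_results(driver, timeout=10):
    # block until ".record_result" is present in the DOM, up to `timeout` seconds
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".record_result"))
    )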

In [6]:
# crawling_runner
def crawling_runner(season_id, team_id):
    """
    Crawl one team's base-running stats for one season from koreabaseball.com.

    season_id = 0 ~ 14
    team_id = 1 ~ 10
    ------------------------------------------------------------------------------------
    <season_id>
    0 : 2002 ~ 14 : 2016

    <team_id> ==> The mapping can differ between seasons.
    1 : Nexen Heroes
    2 : Doosan Bears
    3 : Lotte Giants
    4 : Samsung Lions
    5 : Hanwha Eagles
    6 : KIA Tigers
    7 : KT Wiz
    8 : LG Twins
    9 : NC Dinos
    10 : SK Wyverns
    """
    driver = webdriver.Firefox()
    url = "http://www.koreabaseball.com/Record/Player/Runner/Basic.aspx"
    driver.get(url)

    # select the season in the dropdown
    driver.find_element(By.CSS_SELECTOR, '#cphContainer_cphContents_ddlSeason_ddlSeason').\
            find_elements(By.CSS_SELECTOR, 'option')[season_id].click()
    time.sleep(api_delay_term)

    # select the team in the dropdown
    driver.find_element(By.CSS_SELECTOR, '#cphContainer_cphContents_ddlTeam_ddlTeam').\
            find_elements(By.CSS_SELECTOR, 'option')[team_id].click()
    time.sleep(api_delay_term)

    # count the result pages; when there is more than one page the pager
    # also holds two extra non-page links, hence the -2
    page_elements = driver.find_elements(By.CSS_SELECTOR, ".paging02 a")
    page_number = len(page_elements)
    if page_number > 1:
        page_number = page_number - 2
        
    # make an empty dataframe to collect the rows
    columns = ['rank', 'name', 'team', 'G', 'SBA', 'SB', 'CS', 'SB%', 'OOB', 'PKO']
    runner_df = pd.DataFrame(columns=columns)

    def scrape_current_page():
        # every row but the header; map each cell onto its column name
        rows = driver.find_elements(By.CSS_SELECTOR, ".record_result tr")
        for row in rows[1:]:
            cells = row.find_elements(By.CSS_SELECTOR, 'td')
            runner_df.loc[len(runner_df)] = {
                column: cell.text for column, cell in zip(columns, cells)
            }

    # only one page of results
    if page_number == 1:
        scrape_current_page()

    # several pages: click each page button in turn
    if page_number > 1:
        for page in range(1, page_number + 1):
            driver.find_element(
                By.CSS_SELECTOR,
                '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
            time.sleep(api_delay_term)
            scrape_current_page()

    # close the browser before returning
    driver.quit()
    return runner_df

In [5]:
nexen_running_df = crawling_runner(13, 1)

In [7]:
nexen_running_df.to_csv("nexen_running.csv", encoding='utf-8')

In [8]:
start_time = time.time()
doosan_running_df = crawling_runner(13, 2)
lotte_running_df = crawling_runner(13, 3)
samsung_running_df = crawling_runner(13, 4)
hanhwa_running_df = crawling_runner(13, 5)
KIA_running_df = crawling_runner(13, 6)
KT_running_df = crawling_runner(13, 7)
LG_running_df = crawling_runner(13, 8)
NC_running_df = crawling_runner(13, 9)
SK_running_df = crawling_runner(13, 10)
end_time = time.time()

end_time - start_time


Out[8]:
231.96908617019653
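
The nine near-identical calls above could also be collapsed into one loop over the team indices documented in crawling_runner's docstring; a minimal sketch that crawls season_id 13 (2015 under that mapping) and writes each CSV in one pass (team_names is a hypothetical mapping, not part of the original notebook):

# sketch: crawl and save every team for one season in a single loop
team_names = {
    1: "nexen", 2: "doosan", 3: "lotte", 4: "samsung", 5: "hanhwa",
    6: "KIA", 7: "KT", 8: "LG", 9: "NC", 10: "SK",
}
for team_id, name in team_names.items():
    crawling_runner(13, team_id).to_csv(name + "_running.csv", encoding='utf-8')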

In [9]:
doosan_running_df.to_csv("doosan_running.csv", encoding='utf-8')
lotte_running_df.to_csv("lotte_running.csv", encoding='utf-8')
samsung_running_df.to_csv("samsung_running.csv", encoding='utf-8')
hanhwa_running_df.to_csv("hanhwa_running.csv", encoding='utf-8')
KIA_running_df.to_csv("KIA_running.csv", encoding='utf-8')
KT_running_df.to_csv("KT_running.csv", encoding='utf-8')
LG_running_df.to_csv("LG_running.csv", encoding='utf-8')
NC_running_df.to_csv("NC_running.csv", encoding='utf-8')
SK_running_df.to_csv("SK_running.csv", encoding='utf-8')
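
The glob import from the first cell can then gather the per-team files back into one frame; a minimal sketch, assuming all of the *_running.csv files above were written to the working directory:

# sketch: combine the per-team CSVs into a single season-level dataframe
running_files = glob.glob("*_running.csv")
season_running_df = pd.concat(
    (pd.read_csv(path, index_col=0) for path in running_files),
    ignore_index=True,
)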
