In [1]:
# import packages
import pandas as pd
import time
import pickle
from selenium import webdriver
import glob
import os
import csv

In [2]:
api_delay_term = 5  # seconds to wait after each click so the page can reload
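
Every interaction below (choosing a season, choosing a team, moving to another page) is followed by the same fixed wait so the table has time to reload. A tiny wrapper like the one here could keep that click-then-sleep pattern in one place; it is only a convenience sketch, and the cells below simply repeat the pair inline.

In [ ]:
# hypothetical helper -- the notebook's own cells call element.click() and
# time.sleep(api_delay_term) directly instead of using this
def click_and_wait(element, delay=api_delay_term):
    element.click()
    time.sleep(delay)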

In [6]:
# collect the detail-page URL of every player for the selected season and team (hitter table)
def get_players_url(season_id, team_id):
    # connect url
    url = "http://www.koreabaseball.com/Record/Player/HitterBasic/Basic1.aspx"
    driver = webdriver.Firefox()
    driver.get(url)
    
    # click season
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
                find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)
    
    # click team
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
                find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)
    
    # get the page count; when the results span several pages the pager also
    # contains two navigation links, which are subtracted here
    page_elements = driver.find_elements_by_css_selector(".paging02 a")
    page_number = len(page_elements)
    if page_number > 1:
        page_number = page_number - 2
    
    # make empty dataframe
    url_df = pd.DataFrame(columns=['url'])
    
    # single page: scrape the result table directly
    if page_number == 1:
        elements = driver.find_elements_by_css_selector(".record_result tr")
        elements = elements[1:]  # skip the header row
    
        for element in elements:
            tmp_dict = {
                "url" : element.find_element_by_css_selector("a").get_attribute("href")
                }
            url_df.loc[len(url_df)] = tmp_dict
            
    # several pages: click through each page number
    if page_number > 1:
        for page in range(1, page_number+1):
            driver.find_element_by_css_selector('#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
            time.sleep(api_delay_term)
            
            elements = driver.find_elements_by_css_selector(".record_result tr")
            elements = elements[1:]  # skip the header row
            
            for element in elements:
                tmp_dict = {
                    "url" : element.find_element_by_css_selector("a").get_attribute("href")
                    }
                url_df.loc[len(url_df)] = tmp_dict

    driver.quit()
    return url_df
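
The season_id and team_id arguments are positions in the season and team dropdown lists, not literal years or team codes, so it is worth spot-checking what a single call returns before crawling every team. A minimal sketch (the indexes 33 and 1 simply mirror the calls made later in this notebook):

In [ ]:
# quick sanity check of the URL collector; 33 and 1 are dropdown option
# positions, not a year or a team code
sample_url_df = get_players_url(33, 1)
print(len(sample_url_df), "player pages found")
sample_url_df.head()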

In [7]:
def crawling_player_info(season_id, team_id):
    # get each player's url
    url_df = get_players_url(season_id, team_id)
    
    # drop retired players (their detail pages sit under a "Retire" path)
    is_retired = url_df['url'].str.contains("Retire")
    url_df = url_df.drop(url_df.index[is_retired]).reset_index(drop=True)
    print("get url_df successfully")
    
    # make empty dataframe
    player_info_df = pd.DataFrame(columns=[
                    "p_number", "name", "salary"
            ])
    driver1 = webdriver.Firefox()
    
    # visit each player page and scrape the basic profile fields
    for number in range(len(url_df)):
        player_url = url_df['url'][number]
        driver1.get(player_url)
        time.sleep(api_delay_term)
        
        player_info = driver1.find_elements_by_css_selector(".player_basic ul li")

        # the profile list holds "label: value" pairs; the name is the first
        # item and the salary the eighth
        tmp_dict = {
                    "p_number" : player_url.split("=")[1],
                    "name" : player_info[0].text.split(": ")[1],
                    "salary" : player_info[7].text.split(": ")[1],
                    }
        player_info_df.loc[len(player_info_df)] = tmp_dict

    driver1.quit()
    return player_info_df

In [8]:
nexen_hsalary_df = crawling_player_info(33,1)


get url_df successfully

In [9]:
nexen_hsalary_df.to_csv("nexen_hsalary.csv",encoding='utf-8')
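
The salary column is stored exactly as it appears on the player page. If it follows the site's usual "amount + unit" text pattern (an assumption here, e.g. "10000만원"), a small helper can pull out the numeric part for later analysis; this is only a sketch, and parse_salary is not part of the crawl above.

In [ ]:
import re

def parse_salary(raw):
    # best-effort: take the first run of digits (commas stripped) from the
    # scraped salary string; returns None when no number is present
    match = re.search(r'\d+', str(raw).replace(',', ''))
    return int(match.group()) if match else None

nexen_hsalary_df['salary_value'] = nexen_hsalary_df['salary'].apply(parse_salary)
nexen_hsalary_df.head()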

In [10]:
start_time = time.time()
doosan_hsalary_df = crawling_player_info(33,2)
lotte_hsalary_df = crawling_player_info(33,3)
samsung_hsalary_df = crawling_player_info(33,4)
hanhwa_hsalary_df = crawling_player_info(33,5)
KIA_hsalary_df = crawling_player_info(33,6)
KT_hsalary_df = crawling_player_info(33,7)
LG_hsalary_df = crawling_player_info(33,8)
NC_hsalary_df = crawling_player_info(33,9)
SK_hsalary_df = crawling_player_info(33,10)
end_time = time.time()

end_time - start_time


get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
Out[10]:
1981.678078174591

In [11]:
doosan_hsalary_df.to_csv("doosan_hsalary.csv",encoding='utf-8')
lotte_hsalary_df.to_csv("lotte_hsalary.csv",encoding='utf-8')
samsung_hsalary_df.to_csv("samsung_hsalary.csv",encoding='utf-8')
hanhwa_hsalary_df.to_csv("hanhwa_hsalary.csv",encoding='utf-8')
KIA_hsalary_df.to_csv("KIA_hsalary.csv",encoding='utf-8')
KT_hsalary_df.to_csv("KT_hsalary.csv",encoding='utf-8')
LG_hsalary_df.to_csv("LG_hsalary.csv",encoding='utf-8')
NC_hsalary_df.to_csv("NC_hsalary.csv",encoding='utf-8')
SK_hsalary_df.to_csv("SK_hsalary.csv",encoding='utf-8')

In [12]:
# collect player urls again, this time from the pitcher table
# (this redefinition replaces the hitter version of get_players_url)
def get_players_url(season_id, team_id):
    # connect url
    url = "http://www.koreabaseball.com/Record/Player/PitcherBasic/Basic1.aspx"
    driver = webdriver.Firefox()
    driver.get(url)
    
    # click season
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
                find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)
    
    # click team
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
                find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)
    
    # get the page count; when the results span several pages the pager also
    # contains two navigation links, which are subtracted here
    page_elements = driver.find_elements_by_css_selector(".paging02 a")
    page_number = len(page_elements)
    if page_number > 1:
        page_number = page_number - 2
    
    # make empty dataframe
    url_df = pd.DataFrame(columns=['url'])
    
    # single page: scrape the result table directly
    if page_number == 1:
        elements = driver.find_elements_by_css_selector(".record_result tr")
        elements = elements[1:]  # skip the header row
    
        for element in elements:
            tmp_dict = {
                "url" : element.find_element_by_css_selector("a").get_attribute("href")
                }
            url_df.loc[len(url_df)] = tmp_dict
            
    # several pages: click through each page number
    if page_number > 1:
        for page in range(1, page_number+1):
            driver.find_element_by_css_selector('#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
            time.sleep(api_delay_term)
            
            elements = driver.find_elements_by_css_selector(".record_result tr")
            elements = elements[1:]  # skip the header row
            
            for element in elements:
                tmp_dict = {
                    "url" : element.find_element_by_css_selector("a").get_attribute("href")
                    }
                url_df.loc[len(url_df)] = tmp_dict

    driver.quit()
    return url_df
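
The pitcher collector above is identical to the hitter version except for the list-page URL, and redefining get_players_url in place is what lets crawling_player_info switch tables without any changes. One alternative, sketched here rather than used in the original run, is a single collector that takes the list page as a parameter; it assumes the same Selenium API and page structure as the cells above.

In [ ]:
# hypothetical refactor: one collector parameterized by the list-page URL
HITTER_LIST_URL = "http://www.koreabaseball.com/Record/Player/HitterBasic/Basic1.aspx"
PITCHER_LIST_URL = "http://www.koreabaseball.com/Record/Player/PitcherBasic/Basic1.aspx"

def get_players_url_from(list_url, season_id, team_id):
    driver = webdriver.Firefox()
    driver.get(list_url)

    # select season and team by option position, waiting after each click
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlSeason_ddlSeason').\
                find_elements_by_css_selector('option')[season_id].click()
    time.sleep(api_delay_term)
    driver.find_element_by_css_selector('#cphContainer_cphContents_ddlTeam_ddlTeam').\
                find_elements_by_css_selector('option')[team_id].click()
    time.sleep(api_delay_term)

    # page count, adjusted for the pager's two navigation links
    page_number = len(driver.find_elements_by_css_selector(".paging02 a"))
    if page_number > 1:
        page_number = page_number - 2

    url_df = pd.DataFrame(columns=['url'])
    for page in range(1, page_number + 1):
        if page_number > 1:
            driver.find_element_by_css_selector(
                '#cphContainer_cphContents_ucPager_btnNo' + str(page)).click()
            time.sleep(api_delay_term)
        rows = driver.find_elements_by_css_selector(".record_result tr")[1:]
        for row in rows:
            url_df.loc[len(url_df)] = {
                "url": row.find_element_by_css_selector("a").get_attribute("href")}

    driver.quit()
    return url_df

# e.g. get_players_url_from(PITCHER_LIST_URL, 33, 1) would mirror the pitcher crawl above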

In [13]:
nexen_psalary_df = crawling_player_info(33,1)


get url_df successfully

In [14]:
nexen_psalary_df.to_csv("nexen_psalary.csv",encoding='utf-8')

In [15]:
start_time = time.time()
doosan_psalary_df = crawling_player_info(33,2)
lotte_psalary_df = crawling_player_info(33,3)
samsung_psalary_df = crawling_player_info(33,4)
hanhwa_psalary_df = crawling_player_info(33,5)
KIA_psalary_df = crawling_player_info(33,6)
KT_psalary_df = crawling_player_info(33,7)
LG_psalary_df = crawling_player_info(33,8)
NC_psalary_df = crawling_player_info(33,9)
SK_psalary_df = crawling_player_info(33,10)
end_time = time.time()

end_time - start_time


get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
get url_df successfully
Out[15]:
1265.9138431549072

In [16]:
doosan_psalary_df.to_csv("doosan_psalary.csv",encoding='utf-8')
lotte_psalary_df.to_csv("lotte_psalary.csv",encoding='utf-8')
samsung_psalary_df.to_csv("samsung_psalary.csv",encoding='utf-8')
hanhwa_psalary_df.to_csv("hanhwa_psalary.csv",encoding='utf-8')
KIA_psalary_df.to_csv("KIA_psalary.csv",encoding='utf-8')
KT_psalary_df.to_csv("KT_psalary.csv",encoding='utf-8')
LG_psalary_df.to_csv("LG_psalary.csv",encoding='utf-8')
NC_psalary_df.to_csv("NC_psalary.csv",encoding='utf-8')
SK_psalary_df.to_csv("SK_psalary.csv",encoding='utf-8')
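
With every team written out, the per-team files can be pulled back into one frame for analysis. This is just a sketch of one way to do it, using the glob and os modules imported at the top; it assumes the CSVs sit in the working directory with the filenames used above.

In [ ]:
# stack all saved CSVs, tagging each row with the team and position group
# recovered from the filename (e.g. "nexen_hsalary.csv" -> nexen, hitter)
frames = []
for path in glob.glob("*_hsalary.csv") + glob.glob("*_psalary.csv"):
    team, kind = os.path.basename(path).replace(".csv", "").split("_")
    df = pd.read_csv(path, index_col=0)
    df["team"] = team
    df["role"] = "hitter" if kind == "hsalary" else "pitcher"
    frames.append(df)

salary_df = pd.concat(frames, ignore_index=True)
salary_df.head()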

In [ ]: