Install Selenium with either pip or conda:

pip install selenium
conda install -c conda-forge selenium

You also need ChromeDriver. On macOS it can be installed with Homebrew; if the binary is not on your PATH, pass its location explicitly (the path should point at the chromedriver binary, not at Chrome itself):

brew install chromedriver
webdriver.Chrome(executable_path="driver/path/chromedriver.exe")
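Note that Selenium 4 removed the executable_path argument. If you are on Selenium 4, the equivalent setup wraps the driver path in a Service object; a minimal sketch, with the path as a placeholder:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 style: pass the chromedriver location via a Service object
driver = webdriver.Chrome(service=Service("driver/path/chromedriver"))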
In [1]:
import time
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
In [2]:
# Open an arbitrary video link first so we can click through the Flash permission
# prompt when the browser launches. Only click to allow Flash; keep the browser open.
link = "http://www.tvcf.co.kr/YCf/V.asp?Code=A000301056"
driver = webdriver.Chrome()
time.sleep(2)
driver.get(link)
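The fixed time.sleep(2) is a blunt instrument; an explicit wait blocks only until the page is actually ready. A minimal sketch, assuming h2.player_title (the element parsed later on) is a reasonable readiness signal:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the player title to render instead of sleeping blindly
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "h2.player_title"))
)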
In [3]:
CATEGORIES = [3, 6]    # category 3 is automotive, 6 is household/lifestyle
PERIOD = [1995, 2017]  # some listings don't show up otherwise, so filter by year and crawl year by year
crawled_data = []
for category in CATEGORIES:
    for year in range(PERIOD[0], PERIOD[1] + 1):
        # fetch page 1 first just to read the total page count for this year
        first_page_for_max_count = (
            f"http://www.tvcf.co.kr/MovieK/List.asp?pumone={category}"
            f"&Date1={year}-01-01&Date2={year}-12-31&Page=1"
        )
        tmp_html = requests.get(first_page_for_max_count).text
        tmp_soup = BeautifulSoup(tmp_html, 'html.parser')
        max_page = int(tmp_soup.select('div#pageNavi li')[0].text.split('/')[1])
        # crawl this year from page 1 through the last page
        for page_num in range(1, max_page + 1):
            page_link = (
                f"http://www.tvcf.co.kr/MovieK/List.asp?pumone={category}"
                f"&Date1={year}-01-01&Date2={year}-12-31&Page={page_num}"
            )
            page_html = requests.get(page_link).text
            page_soup = BeautifulSoup(page_html, 'html.parser')
            for anchor in page_soup.select('div.thumWrapfix > a'):
                video_link = 'http://www.tvcf.co.kr' + anchor.get('href')
                driver.get(video_link)
                video_html = driver.page_source
                video_soup = BeautifulSoup(video_html, 'html.parser')
                tmp_title = video_soup.select_one('h2.player_title').text.strip()
                tmp_script = video_soup.select_one('div#copyArea').text.strip()
                tmp_date = video_soup.select_one('div.onair').text.strip()
                tmp_tags = [tag.text for tag in video_soup.select('div#tagAreaWrap a')]
                # open the maker-info (만든이정보) tab so its table gets rendered
                driver.find_element(By.CSS_SELECTOR, 'div[title="만든이정보"]').click()
                summary_html = driver.page_source
                summary_soup = BeautifulSoup(summary_html, 'html.parser')
                tmp_summary = []
                for tr in summary_soup.select('table#makerSummary tr'):
                    tds = tr.select('td')
                    tmp_tr = {}
                    tmp_tr['key'] = tds[0].text.strip()
                    tmp_val = re.sub('\nX', '', tds[1].text.strip())  # drop the trailing "X" button text
                    tmp_val = re.sub(r'[\n]+', ' ', tmp_val)          # collapse newlines into spaces
                    tmp_tr['val'] = tmp_val
                    tmp_summary.append(tmp_tr)
                tmp_video_info = {}
                tmp_video_info['title'] = tmp_title
                tmp_video_info['script'] = tmp_script
                tmp_video_info['date'] = tmp_date
                tmp_video_info['tags'] = tmp_tags
                tmp_video_info['summary'] = tmp_summary
                crawled_data.append(tmp_video_info)
                print('=================')
                print('title: ', tmp_title)
                print('copy_script: ', tmp_script)
                print('video_date: ', tmp_date)
                print('tags: ', tmp_tags)
                print('maker_summary: ', tmp_summary)
                print('=================\n')
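The loop above can run for a long time and crawled_data lives only in memory, so it is worth writing it to disk once the run finishes. A minimal sketch that saves it as JSON (the filename tvcf_crawled.json is arbitrary; ensure_ascii=False keeps the Korean text human-readable):

import json

with open("tvcf_crawled.json", "w", encoding="utf-8") as f:
    json.dump(crawled_data, f, ensure_ascii=False, indent=2)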
In [4]:
for data in crawled_data:
    print(data)
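For tabular exploration, the list of dicts converts straight into a pandas DataFrame; a sketch, assuming pandas is installed (the tags and summary columns stay as nested Python objects):

import pandas as pd

df = pd.DataFrame(crawled_data)  # columns: title, script, date, tags, summary
print(df.head())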