In [ ]:
import requests, math, re, csv, pandas
from bs4 import BeautifulSoup
In [ ]:
def get_day(year, month):
    """Return the number of days in the given month of the given year.

    Args:
        year: Calendar year (Gregorian), e.g. 2016.
        month: Month number, 1 (January) through 12 (December).

    Returns:
        31, 30, 29 or 28 for a valid month; None when ``month`` is not in
        the range 1-12 (kept for backward compatibility with the previous
        implicit fall-through).
    """
    # Local import keeps this block self-contained; the file's top-level
    # import line does not bring in `calendar`.
    import calendar

    if month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    if month in (4, 6, 9, 11):
        return 30
    if month == 2:
        # Apply the full Gregorian leap-year rule instead of a hard-coded
        # list of years, so the function stays correct outside 2012-2017.
        return 29 if calendar.isleap(year) else 28
    return None
In [ ]:
# HTTP headers sent with every search request: a desktop Chrome User-Agent.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}

# Inclusive range of years to crawl.
START_YEAR, END_YEAR = 2012, 2017

# Running counter of processed result pages, used for progress output.
cnt = 1
In [ ]:
# Search URL template. The five placeholders are, in order: date_from,
# date_to, the two dates inside the `nso` range filter, and the 1-based
# index of the first result on the requested page.
_SEARCH_URL = "https://search.naver.com/search.naver?date_from={}&date_option=8&date_to={}&dup_remove=1&ie=utf8&nso=p%3Afrom{}to{}&post_blogurl=&post_blogurl_without=&query=송도%20-포항%20-부산&sm=tab_pge&srchby=all&st=sim&where=post&start={}"

# The pagination arithmetic below assumes 10 results per page.
_POSTS_PER_PAGE = 10

# Seconds to wait on a request before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30

# Matches the result counter text, e.g. "1-10 / 57건"; the total may contain
# thousands separators, which are stripped before conversion.
_TOTAL_RE = re.compile(r"[0-9]+-[0-9]+ / ([0-9,]+)건")


def _fetch_soup(query_date, start):
    """Download one result page for `query_date` and return it parsed."""
    url = _SEARCH_URL.format(query_date, query_date, query_date, query_date, start)
    response = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'lxml')


def _count_posts(soup):
    """Return the total number of results reported on a result page.

    Returns 0 when the counter element is missing (presumably how a day
    without any results is rendered -- TODO confirm against a live page).

    Raises:
        ValueError: the counter element exists but its text does not have
            the expected "<from>-<to> / <total>건" shape.
    """
    counter = soup.select_one('span.title_num')
    if counter is None:
        return 0
    match = _TOTAL_RE.search(counter.text)
    if match is None:
        raise ValueError(
            "Unrecognised result counter text: {!r}".format(counter.text))
    return int(match.group(1).replace(',', ''))


# Crawl every day of every month in the configured year range and write one
# CSV file per month with a [title, date, description] row per post.
for year in range(START_YEAR, END_YEAR + 1):
    for month in range(1, 13):
        data = []
        # Hard stop: nothing from November 2017 onwards is collected.
        if year == 2017 and month == 11:
            break
        for day in range(1, get_day(year, month) + 1):
            date = "{}{:02d}{:02d}".format(year, month, day)

            # The first page tells us how many results exist for this day;
            # it is reused as page 1 below rather than downloaded twice.
            first_page = _fetch_soup(date, 1)
            total_posts_num = _count_posts(first_page)
            max_page = math.ceil(total_posts_num / _POSTS_PER_PAGE)

            for page in range(1, max_page + 1):
                if page == 1:
                    soup = first_page
                else:
                    soup = _fetch_soup(date, _POSTS_PER_PAGE * page - 9)

                for post in soup.select('li.sh_blog_top'):
                    title = post.dl.dt.a.text.strip()
                    # Kept in its own name: reusing `date` here would
                    # corrupt the query date used for the following pages.
                    post_date = post.dl.dd.text.strip()
                    desc = post.dl.select_one('dd.sh_blog_passage').text
                    data.append([title, post_date, desc])

                print("{} page complete.".format(cnt))
                cnt += 1

        file_name = "{}-{}.csv".format(year, month)
        # newline='' is what the csv module requires to avoid blank rows on
        # Windows; UTF-8 is explicit because the scraped text is Korean and
        # the platform default encoding may not be able to represent it.
        with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
            w = csv.writer(csvfile, delimiter=',')
            w.writerows(data)
In [ ]:
# Load the January 2012 output (the file has no header row, so columns get
# integer labels) and show its first five rows as a quick sanity check.
d = pandas.read_csv(filepath_or_buffer='2012-1.csv', header=None)
d.head(n=5)
In [ ]: