notebook.community

Edit and run



In [ ]:

    
import requests, math, re, csv, pandas
from bs4 import BeautifulSoup



In [ ]:

    
def get_day(year, month):
    """Return the number of days in ``month`` of ``year``.

    Args:
        year: Gregorian calendar year, e.g. 2016.
        month: Month number, 1 (January) through 12 (December).

    Returns:
        int: 28, 29, 30 or 31.

    Raises:
        ValueError: If ``month`` is not in the range 1..12.
    """
    if month in (1, 3, 5, 7, 8, 10, 12):
        return 31
    if month in (4, 6, 9, 11):
        return 30
    if month == 2:
        # Apply the full Gregorian leap-year rule instead of a hard-coded
        # list of years, so the function stays correct outside 2012-2017.
        is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
        return 29 if is_leap else 28
    # An unknown month used to fall through and return None implicitly,
    # which only surfaced later as a confusing TypeError in the caller.
    raise ValueError("month must be in 1..12, got {!r}".format(month))



In [ ]:

    
# Request headers sent with every search request; only a User-Agent is set.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
}

# First and last year (both inclusive) iterated by the crawl loop.
START_YEAR, END_YEAR = 2012, 2017

# Running counter of processed result pages, used for progress messages.
cnt = 1



In [ ]:

    
# Crawl Naver blog search results one day at a time and write one CSV file
# per month. Each CSV row is [title, date text, description] of one post.

# Search URL template. The four leading placeholders all receive the same
# YYYYMMDD date; the last one is the 1-based index of the first result shown.
SEARCH_URL = "https://search.naver.com/search.naver?date_from={}&date_option=8&date_to={}&dup_remove=1&ie=utf8&nso=p%3Afrom{}to{}&post_blogurl=&post_blogurl_without=&query=송도%20-포항%20-부산&sm=tab_pge&srchby=all&st=sim&where=post&start={}"
# Number of posts per result page, used to turn a page number into `start`.
POSTS_PER_PAGE = 10


def _fetch_soup(search_date, start):
    """Download one search result page and return it parsed by BeautifulSoup.

    Args:
        search_date: Day to search, formatted as YYYYMMDD.
        start: 1-based index of the first result on the requested page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = SEARCH_URL.format(search_date, search_date, search_date, search_date, start)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=HEADERS, timeout=30)
    # Fail loudly on HTTP errors so a rejected request is never mistaken for
    # a day without posts by the guard further down.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


for year in range(START_YEAR, END_YEAR+1):
    for month in range(1, 13):
        data = []
        # Hard-coded end of the crawl: November and December 2017 are skipped.
        if year == 2017 and month == 11:
            break
        for day in range(1, get_day(year, month) + 1):
            search_date = "{}{:02d}{:02d}".format(year, month, day)
            first_soup = _fetch_soup(search_date, 1)

            # The counter text looks like "1-10 / 123건"; the captured group is
            # the total number of posts. Commas are tolerated in case large
            # totals are rendered with thousands separators (assumption).
            navi = first_soup.select_one('span.title_num')
            count_match = None
            if navi is not None:
                count_match = re.search(r"[0-9]+-[0-9]+ / ([0-9,]+)건", navi.text)
            if count_match is None:
                # No counter on the page: skip this day instead of aborting
                # the whole crawl with AttributeError/IndexError.
                print("{}: no result counter found, skipping.".format(search_date))
                continue
            total_posts_num = int(count_match.group(1).replace(",", ""))
            max_page = math.ceil(total_posts_num / POSTS_PER_PAGE)

            for page in range(1, max_page + 1):
                if page == 1:
                    # Page 1 was already downloaded to read the counter.
                    soup = first_soup
                else:
                    soup = _fetch_soup(search_date, POSTS_PER_PAGE * (page - 1) + 1)
                for post in soup.select('li.sh_blog_top'):
                    title = post.dl.dt.a.text.strip()
                    # Kept in its own variable: reusing the search date's name
                    # here would corrupt the URL of every following page.
                    post_date = post.dl.dd.text.strip()
                    desc = post.dl.select_one('dd.sh_blog_passage').text
                    data.append([title, post_date, desc])
                print("{} page complete.".format(cnt))
                cnt += 1

        file_name = "{}-{}.csv".format(year, month)
        # newline='' is what the csv module requires to avoid blank rows on
        # Windows; utf-8 keeps the Korean text identical on every platform.
        with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
            csv.writer(csvfile, delimiter=',').writerows(data)



In [ ]:

    
# Load the CSV written for January 2012 (the file has no header row) and
# preview its first five rows.
d = pandas.read_csv(filepath_or_buffer='2012-1.csv', header=None)
d.head(n=5)



In [ ]: