In [1]:
import requests
import pandas as pd
import time
import json
import pickle

from bs4 import BeautifulSoup

Article


In [2]:
category_dict = {
    "100":950203, # 정치
    "101":949986, # 경제
    "102":949987, # 사회
    "103":949988, # 생활/문화
    "104":949990, # 세계
    "105":949984, # IT/과학
}
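
The keys are Naver's sid1 section codes (100-105) and the values are the componentId that the mainNews.nhn list endpoint expects. Below is a minimal sketch of the same request built with requests' params argument instead of string concatenation; the date and page are placeholders, and it is assumed the endpoint accepts the URL-encoded space in the date.

In [ ]:
# hedged sketch: build the list URL with params= instead of manual concatenation
params = {
    "componentId": category_dict["100"],  # politics
    "date": "2016-06-01 00:00:00",        # placeholder date
    "page": 1,
}
response = requests.get("http://news.naver.com/main/mainNews.nhn", params=params)
response.json()["pagerInfo"]["page"]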

In [3]:
def last_page(category, date):
    """Return the last page number of the main-news list for a category on a given date."""
    component_id = category_dict[str(category)]
    url = "http://news.naver.com/main/mainNews.nhn?componentId=" + str(component_id) + "&date=" + date + " 00:00:00&page=100"
    response = requests.get(url)
    return response.json()["pagerInfo"]["page"]
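
A quick sanity check for the helper above, assuming the endpoint is still reachable; the date is a placeholder.

In [ ]:
# hedged usage sketch: number of list pages for the politics section on a sample date
last_page("100", "2016-06-01")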

In [4]:
# like count via the likeIt JSONP endpoint
def get_likeit(aid, oid):
    url = "http://news.like.naver.com/likeIt/likeItContent.jsonp?_callback=window.__jindo2_callback._7105&serviceId=NEWS&displayId=NEWS&contentsId=ne_" + str(oid) + "_" + str(aid) + "&lang=ko&viewType=recommend"
    response = requests.get(url)
    # the response is JSONP, so pull the likeItCount value out of the raw text
    return response.text.split('likeItCount":')[1].split(",")[0]

# comment count, like count and body text via BeautifulSoup
def get_content(path):

    response = requests.get(path)
    dom = BeautifulSoup(response.content, "html.parser")

    # pages without the comment counter are skipped
    if len(dom.select("#articleTitleCommentCount .lo_txt")) == 0:
        return 0, 0, "-"

    comment = dom.select_one("#articleTitleCommentCount .lo_txt").text
    content = dom.select_one("#articleBodyContents").text.replace("\n", "").replace("\r", "").replace("\t", "")
    aid = path.split("aid=")[1]
    oid = path.split("oid=")[1].split("&")[0]
    likeit = get_likeit(aid, oid)

    return comment, likeit, content
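
A usage sketch for the two helpers above; the oid/aid pair in the link is a placeholder, and the call assumes the read.nhn URL layout used throughout this notebook (a page without the expected selectors simply returns 0, 0, "-").

In [ ]:
# hedged usage sketch: the oid/aid values below are placeholders, not a real article
sample_link = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=100&oid=001&aid=0000000001"
comment, likeit, content = get_content(sample_link)
comment, likeit, content[:50]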

In [5]:
def one_page_df(category, date, page):
    """ excute time about 5 ~ 6 sec """

    url = "http://news.naver.com/main/mainNews.nhn?componentId=" + str(category_dict[str(category)]) + "&date=" + date + " 00:00:00&page=" + str(page)
    response = requests.get(url)
    article_list = response.json()["itemList"]
    
    result_df = pd.DataFrame(columns=["newsid", "oid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"])

    for article in article_list:
        link = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=" + str(category) + "&oid=" + article["officeId"] + "&aid=" + article["articleId"]        
        comment, likeit, content = get_content(link)
        
        tmp_dict = {
            "newsid": article["articleId"],
            "oid": article["officeId"],
            "newspaper": article["officeName"],
            "title": article["title"],
            "link": link,
            "comment": comment,
            "likeit": likeit,
            "content": content.split("▶")[0],
            "date": date,
            "category": str(category-100),
        }
        # skip articles whose body text is too short to be useful
        if len(tmp_dict["content"]) < 100:
            continue
            
        result_df.loc[len(result_df)] = tmp_dict
        
    return result_df
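
A usage sketch for a single list page; the date is a placeholder and, as the docstring notes, each page takes a few seconds because every article body is fetched.

In [ ]:
# hedged usage sketch: first list page of the politics section on a sample date
sample_page_df = one_page_df(100, "2016-06-01", 1)
sample_page_df.head()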

In [6]:
def one_day_df(category, date):
    """ excute time about 60 sec / 10 page """
    
    last_page_number = int(last_page(category, date))
    
    print("last page : {} / {} / {}".format(last_page_number, category, date))
    
    df_list = []
    
    for page in range(1, last_page_number + 1):
        df = one_page_df(category, date, page)
        df_list.append(df)
        time.sleep(0.5)
        
    return pd.concat(df_list).reset_index(drop=True)

In [7]:
def day_news(date):
    
    print("day_news : {}".format(date))
    
    df_list = []
    
    for category in range(100, 106):
        day_df = one_day_df(category, date)
        df_list.append(day_df)
    
    return pd.concat(df_list).reset_index(drop=True)

def get_monthly_article(month, startday, lastday):
    month_str = str(month).zfill(2)  # zero-pad the month once, outside the loop
    for day in range(startday, lastday + 1):
        date = "2016-" + month_str + "-" + str(day).zfill(2)
        df = day_news(date)
        df.to_csv("./news/" + date + ".csv", index=False, encoding="utf-8")
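
get_monthly_article writes one CSV per day into ./news/, so that directory has to exist before the first run; a minimal sketch follows (the June range is a placeholder and the full call is left commented out because it runs for a long time).

In [ ]:
# hedged sketch: make sure the output directory exists before crawling a whole month
import os
os.makedirs("./news", exist_ok=True)
# get_monthly_article(6, 1, 30)   # placeholder range: all of June 2016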

Comments


In [8]:
headers = {
    "Accept":"*/*",
    "Accept-Encoding":"gzip, deflate, sdch",
    "Accept-Language":"ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4",
    "Cache-Control":"max-age=0",
    "Connection":"keep-alive",
    "Cookie":"npic=yGOyjNw7Q1cCoicfWB0wV+Tr4fwcQPxyluH+SKnsOSsV5NS2FNfNvLUSv1UqBJYBCA==; NNB=53ANW3TAEVWVO; nx_ssl=2; nid_iplevel=1; nid_inf=1295365988; NID_AUT=qctX1xOK/J8QR13lto6P3+2ewP5SxBjr4+qhVLq5pDGsWRg6lY34IJcuIP2/PB7n; NID_SES=AAABYqaaTl9N4w74I/Ek86u0fjTVd/4jnbUWV1SAuw8H0KAopRsMrf86acCTIqaQ8JX2cZS7Yj9fGO4PiWHMEzHh9NebkGJbEIWlAcpXrM8g37v9YCj+IAFlUzYfOkfMW394RLvn8ruFceiLEbCSpmUkSEtiIKVnM9+neHGEzbYgtzTqUBFwsDDVq7A5iCYAoXANuXP3qxsx49iOtqSl4fk6k4f9OwKaa5lcjnOuBAyJfH9P2d/GbOddJ81e0CDQKoqZvvNhESf/r+NQFu35fpEmEeR18hhjDSjKB5L4YJIRG1KtGXisehyM3gyEIDDFzE6MZu6Z79gzkCU8tabNi2Rd1HSqpGGJAaWtFtALYXSYbHNj7LKqYxpGIXNjCuZvjRpKefNweu5c64NI8gK3ow0Gf9IzaJgpewMeNQGuZi/qCx9mgfWnSTrMRQhYfnmYcp+mRnzNGzklOcUrPwpM16/y0yN2sI5mmiEnp1Fb9B9W3ILT; page_uid=SGE/2wpydfZsscDq0tZssssssv8-387743; _naver_usersession_=flkyahsVLGlU207+MCWYpg==",
    "Host":"m.cafe.naver.com",
    "Referer":"http://m.cafe.naver.com/ArticleSearchList.nhn?search.query=%EB%A7%A5%EB%B6%81&search.menuid=&search.searchBy=0&search.sortBy=sim&search.clubid=10050146",
    "Upgrade-Insecure-Requests":"1",
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
}

In [9]:
def get_one_article_comments(link):
    aid = link.split("aid=")[1]
    oid = link.split("oid=")[1].split("&")[0]
    category = link.split("sid1=")[1].split("&")[0]
    url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json?ticket=news&templateId=default_world&pool=cbox5&_callback=window.__cbox_jindo_callback._9381&lang=ko&country=KR&objectId=news" + oid + "," + aid + "&pageSize=10000&indexSize=10&page=1&sort=new"
    response = requests.get(url, headers=headers)

    result_df = pd.DataFrame(columns=["category", "aid", "oid", "userIdNo", "userName", "good", "bad", "contents", "regTime"])

    # check the status code before trying to parse the payload
    if response.status_code != 200:
        print("response fail : {}".format(response))
        return result_df

    # strip the JSONP callback wrapper and parse the remaining JSON
    result = response.text.replace("window.__cbox_jindo_callback._9381(", "")[:-2]
    result_json = json.loads(result)
    comments = result_json["result"]["commentList"]
    
    for comment in comments:
    
        tmp_dict = {
            "category":  str(int(category)-100),
            "aid": int(aid),
            "oid": oid,
            "userIdNo": comment["userIdNo"],
            "userName": comment["userName"],
            "contents": comment["contents"],
            "regTime": comment["regTime"].replace("T", " ").split("+")[0],
            "good": comment["sympathyCount"],
            "bad": comment["antipathyCount"],
        }
        
        result_df.loc[len(result_df)] = tmp_dict

    return result_df
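
A usage sketch for the comment fetcher, assuming the sample_page_df built in the earlier sketch is available and non-empty; in a real crawl the links come from the "link" column of the article DataFrame.

In [ ]:
# hedged usage sketch: comments for the first article collected above
sample_comments_df = get_one_article_comments(sample_page_df["link"].iloc[0])
sample_comments_df.head()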

In [10]:
def get_comments(article_df):
    
    links = list(article_df["link"])
    
    df_list = []

    for idx, link in enumerate(links):
        
        if idx%100 == 0:
            print(idx)
        
        comment_df = get_one_article_comments(link)
        df_list.append(comment_df)
        time.sleep(0.1)
        
    return pd.concat(df_list).reset_index(drop=True)

In [11]:
def auto_crawling(date):

    # crawl one day of articles
    article_oneday_df = day_news(date)

    # save the article dataframe
    with open("./data/article_" + date + ".pkl", 'wb') as file:
        pickle.dump(article_oneday_df, file)

    # reload it as a sanity check
    with open("./data/article_" + date + ".pkl", 'rb') as file:
        df = pickle.load(file)
        print("article : {}".format(len(df)))

    # crawl the comments for every article
    comments_df = get_comments(article_oneday_df)

    # save the comment dataframe
    with open("./data/comment_" + date + ".pkl", 'wb') as file:
        pickle.dump(comments_df, file)

    # reload it as a sanity check
    with open("./data/comment_" + date + ".pkl", 'rb') as file:
        df = pickle.load(file)
        print("comment : {}".format(len(df)))

In [ ]:
%time auto_crawling("2016-06-01")