In [1]:
import requests
import pandas as pd
import time
import json
import pickle
from bs4 import BeautifulSoup
In [2]:
category_dict = {
    "100": 950203,  # Politics
    "101": 949986,  # Economy
    "102": 949987,  # Society
    "103": 949988,  # Life/Culture
    "104": 949990,  # World
    "105": 949984,  # IT/Science
}
In [3]:
def last_page(category, date):
    componentId = category_dict[str(category)]
    url = "http://news.naver.com/main/mainNews.nhn?componentId=" + str(componentId) + "&date=" + date + " 00:00:00&page=100"
    response = requests.get(url)
    return response.json()["pagerInfo"]["page"]
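A quick sanity check (a sketch; it assumes the mainNews.nhn endpoint still answers and that pagerInfo carries the total page count as above):

# live request: number of main-news list pages for politics (100) on a given day
politics_pages = last_page(100, "2016-06-01")
print(politics_pages)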
In [4]:
# using json
def get_likeit(aid, oid):
    url = "http://news.like.naver.com/likeIt/likeItContent.jsonp?_callback=window.__jindo2_callback._7105&serviceId=NEWS&displayId=NEWS&contentsId=ne_" + str(oid) + "_" + str(aid) + "&lang=ko&viewType=recommend"
    response = requests.get(url)
    # pull likeItCount out of the JSONP payload without parsing the whole wrapper
    return response.text.split('likeItCount":')[1].split(",")[0]
# using bs4
def get_content(path):
    response = requests.get(path)
    dom = BeautifulSoup(response.content, "html.parser")
    # fall back when the article page has no comment-count element
    if len(dom.select("#articleTitleCommentCount .lo_txt")) == 0:
        return 0, 0, "-"
    comment = dom.select_one("#articleTitleCommentCount .lo_txt").text
    content = dom.select_one("#articleBodyContents").text.replace("\n", "").replace("\r", "").replace("\t", "")
    aid = path.split("aid=")[1]
    oid = path.split("oid=")[1].split("&")[0]
    likeit = get_likeit(aid, oid)
    return comment, likeit, content
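A single article can be fetched directly (a sketch; the oid/aid pair below is hypothetical and only illustrates the expected read.nhn URL shape):

# hypothetical oid/aid; substitute a real pair for an actual request
sample_link = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=100&oid=001&aid=0000000001"
comment, likeit, content = get_content(sample_link)
print(comment, likeit, content[:50])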
In [5]:
def one_page_df(category, date, page):
    """ execution time: about 5-6 sec per page """
    url = "http://news.naver.com/main/mainNews.nhn?componentId=" + str(category_dict[str(category)]) + "&date=" + date + " 00:00:00&page=" + str(page)
    response = requests.get(url)
    article_list = response.json()["itemList"]
    result_df = pd.DataFrame(columns=["newsid", "oid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"])
    for article in article_list:
        link = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=" + str(category) + "&oid=" + article["officeId"] + "&aid=" + article["articleId"]
        comment, likeit, content = get_content(link)
        tmp_dict = {
            "newsid": article["articleId"],
            "oid": article["officeId"],
            "newspaper": article["officeName"],
            "title": article["title"],
            "link": link,
            "comment": comment,
            "likeit": likeit,
            "content": content.split("▶")[0],
            "date": date,
            "category": str(category - 100),
        }
        # skip articles whose body is too short to be meaningful
        if len(tmp_dict["content"]) < 100:
            continue
        result_df.loc[len(result_df)] = tmp_dict
    return result_df
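One list page of a section can be pulled on its own (a sketch; this hits the live endpoints, so expect the 5-6 second latency noted in the docstring):

# first page of politics (100) for 2016-06-01
page_df = one_page_df(100, "2016-06-01", 1)
print(len(page_df))
page_df.head()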
In [6]:
def one_day_df(category, date):
    """ execution time: about 60 sec per 10 pages """
    last_page_number = int(last_page(category, date))
    print("last page : {} / {} / {}".format(last_page_number, category, date))
    df_list = []
    for page in range(1, last_page_number + 1):
        df = one_page_df(category, date, page)
        df_list.append(df)
        time.sleep(0.5)
    return pd.concat(df_list).reset_index(drop=True)
In [7]:
def day_news(date):
    print("day_news : {}".format(date))
    df_list = []
    for category in range(100, 106):
        day_df = one_day_df(category, date)
        df_list.append(day_df)
    return pd.concat(df_list).reset_index(drop=True)

def get_monthly_article(month, startday, lastday):
    # zero-pad the month once, outside the loop, so the int comparison is not repeated on a string
    month = str(month).zfill(2)
    for day in range(startday, lastday + 1):
        date = "2016-" + month + "-" + str(day).zfill(2)
        df = day_news(date)
        df.to_csv("./news/" + date + ".csv", index=False, encoding="utf-8")
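Crawling a whole month then reduces to a single call (a sketch; long-running, and it writes one CSV per day into a ./news/ directory that must already exist):

# every day of June 2016
get_monthly_article(6, 1, 30)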
In [8]:
headers = {
"Accept":"*/*",
"Accept-Encoding":"gzip, deflate, sdch",
"Accept-Language":"ko-KR,ko;q=0.8,en-US;q=0.6,en;q=0.4",
"Cache-Control":"max-age=0",
"Connection":"keep-alive",
"Cookie":"npic=yGOyjNw7Q1cCoicfWB0wV+Tr4fwcQPxyluH+SKnsOSsV5NS2FNfNvLUSv1UqBJYBCA==; NNB=53ANW3TAEVWVO; nx_ssl=2; nid_iplevel=1; nid_inf=1295365988; NID_AUT=qctX1xOK/J8QR13lto6P3+2ewP5SxBjr4+qhVLq5pDGsWRg6lY34IJcuIP2/PB7n; NID_SES=AAABYqaaTl9N4w74I/Ek86u0fjTVd/4jnbUWV1SAuw8H0KAopRsMrf86acCTIqaQ8JX2cZS7Yj9fGO4PiWHMEzHh9NebkGJbEIWlAcpXrM8g37v9YCj+IAFlUzYfOkfMW394RLvn8ruFceiLEbCSpmUkSEtiIKVnM9+neHGEzbYgtzTqUBFwsDDVq7A5iCYAoXANuXP3qxsx49iOtqSl4fk6k4f9OwKaa5lcjnOuBAyJfH9P2d/GbOddJ81e0CDQKoqZvvNhESf/r+NQFu35fpEmEeR18hhjDSjKB5L4YJIRG1KtGXisehyM3gyEIDDFzE6MZu6Z79gzkCU8tabNi2Rd1HSqpGGJAaWtFtALYXSYbHNj7LKqYxpGIXNjCuZvjRpKefNweu5c64NI8gK3ow0Gf9IzaJgpewMeNQGuZi/qCx9mgfWnSTrMRQhYfnmYcp+mRnzNGzklOcUrPwpM16/y0yN2sI5mmiEnp1Fb9B9W3ILT; page_uid=SGE/2wpydfZsscDq0tZssssssv8-387743; _naver_usersession_=flkyahsVLGlU207+MCWYpg==",
"Host":"m.cafe.naver.com",
"Referer":"http://m.cafe.naver.com/ArticleSearchList.nhn?search.query=%EB%A7%A5%EB%B6%81&search.menuid=&search.searchBy=0&search.sortBy=sim&search.clubid=10050146",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36",
}
In [9]:
def get_one_article_comments(link):
    aid = link.split("aid=")[1]
    oid = link.split("oid=")[1].split("&")[0]
    category = link.split("sid1=")[1].split("&")[0]
    url = "https://apis.naver.com/commentBox/cbox/web_naver_list_jsonp.json?ticket=news&templateId=default_world&pool=cbox5&_callback=window.__cbox_jindo_callback._9381&lang=ko&country=KR&objectId=news" + oid + "," + aid + "&pageSize=10000&indexSize=10&page=1&sort=new"
    response = requests.get(url, headers=headers)
    result_df = pd.DataFrame(columns=["category", "aid", "oid", "userIdNo", "userName", "good", "bad", "contents", "regTime"])
    # bail out before parsing if the request itself failed
    if response.status_code != 200:
        print("response fail : {}".format(response))
        return result_df
    # strip the JSONP wrapper to get plain JSON
    result = response.text.replace("window.__cbox_jindo_callback._9381(", "")[:-2]
    result_json = json.loads(result)
    comments = result_json["result"]["commentList"]
    for comment in comments:
        tmp_dict = {
            "category": str(int(category) - 100),
            "aid": int(aid),
            "oid": oid,
            "userIdNo": comment["userIdNo"],
            "userName": comment["userName"],
            "contents": comment["contents"],
            "regTime": comment["regTime"].replace("T", " ").split("+")[0],
            "good": comment["sympathyCount"],
            "bad": comment["antipathyCount"],
        }
        result_df.loc[len(result_df)] = tmp_dict
    return result_df
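Given one of the read.nhn links built in one_page_df, the comment frame can be inspected like this (a sketch; the oid/aid pair is hypothetical, and the cbox endpoint plus the headers above are assumed to still be accepted):

# hypothetical link; substitute one from an article DataFrame for real output
sample_link = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=100&oid=001&aid=0000000001"
comments = get_one_article_comments(sample_link)
print(len(comments))
comments.head()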
In [10]:
def get_comments(article_df):
    links = list(article_df["link"])
    df_list = []
    for idx, link in enumerate(links):
        if idx % 100 == 0:
            print(idx)
        comment_df = get_one_article_comments(link)
        df_list.append(comment_df)
        time.sleep(0.1)
    return pd.concat(df_list).reset_index(drop=True)
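get_comments simply maps get_one_article_comments over the link column of an article frame; a short trial run might look like this (a sketch; slicing to ten links keeps the live requests brief):

article_df = day_news("2016-06-01")
sample_comments_df = get_comments(article_df[:10])
print(len(sample_comments_df))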
In [11]:
def auto_crawling(date):
    # crawl one day of articles
    article_oneday_df = day_news(date)
    # save article df
    with open("./data/article_" + date + ".pkl", 'wb') as file:
        pickle.dump(article_oneday_df, file)
    with open("./data/article_" + date + ".pkl", 'rb') as file:
        df = pickle.load(file)
    print("article : {}".format(len(df)))
    # crawl comments
    comments_df = get_comments(article_oneday_df[:])
    # save comment df
    with open("./data/comment_" + date + ".pkl", 'wb') as file:
        pickle.dump(comments_df, file)
    with open("./data/comment_" + date + ".pkl", 'rb') as file:
        df = pickle.load(file)
    print("comment : {}".format(len(df)))
In [ ]:
%time auto_crawling("2016-06-01")