In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from dateutil.parser import parse
In [16]:
def __get_novel_ID(genre, level):
    """
    genre = 101~108
    level = webnovel, best, challenge
    Collects the ID, name, and total episode count of the finished and
    unfinished novels in one genre.
    """
    if level == "webnovel":
        number = 2
        df = pd.DataFrame(columns=["level", "genre", "ID", "name", "episode_total", "is_fin"])
    else:
        number = 1
        df = pd.DataFrame(columns=["level", "genre", "ID", "name", "episode_total"])
    for fin in ["", "&order=Read&finish=true"][0:number]:  # unfinished, then finished; the finished list exists only for webnovel
        url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page=1000".format(level, genre, fin)
        response = requests.get(url)
        dom = BeautifulSoup(response.content, "lxml")
        end_page = dom.select_one("div.paging")  # find the last page number
        if end_page:
            end_page = int(end_page.select_one("strong").contents[0])
        else:
            end_page = 1
        for page in range(1, end_page + 1):  # iterate over every list page
            url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page={3}".format(level, genre, fin, page)
            response = requests.get(url)
            dom = BeautifulSoup(response.content, "lxml")
            list_item = dom.select("a.list_item")
            for item in list_item:  # collect one novel's info from the page
                novel_ID = item["href"].split("=")[1]
                novel_name = item.select_one("span.ellipsis").text
                novel_episode_total = item.select_one("span.num_total").text.split(" ")[1][:-1]  # "총 N화" -> "N"
                if level == "webnovel":
                    if fin == "":
                        df.loc[len(df)] = level, genre, novel_ID, novel_name, novel_episode_total, 0
                    else:
                        df.loc[len(df)] = level, genre, novel_ID, novel_name, novel_episode_total, 1
                else:
                    df.loc[len(df)] = level, genre, novel_ID, novel_name, novel_episode_total
    return df
def make_genre_df(level):
    """
    level = webnovel, best, challenge
    Collects the basic information of every novel in every genre of the given level.
    """
    if "data" not in os.listdir("."):
        os.mkdir("data")
    if level not in os.listdir("data"):
        os.mkdir(os.path.join("data", level))
    genres = [101, 102, 103, 104, 106, 108]
    if level == "best":  # best additionally has history/war (105)
        genres += [105]
    if level == "challenge":  # challenge additionally has history/war (105) and fanfic (107)
        genres += [105, 107]
    genre_data = map(__get_novel_ID, genres, [level] * len(genres))
    genre_df = pd.concat(genre_data).reset_index(drop=True)
    genre_df.drop_duplicates(inplace=True)
    like_url = "http://novel.naver.com/likeCountJson.nhn?contentsIds=" \
        + (",").join(genre_df["ID"])  # like counts come from a separate endpoint
    genre_df["main_likeit"] = [
        i["likeItCount"]
        for i in requests.get(like_url).json()["result"]["contents"]
    ]
    genre_df["episode_total"] = genre_df["episode_total"].astype("int")
    genre_df["genre"] = genre_df["genre"].astype("int")
    genre_df.to_csv(os.path.join("data", level, "genre_df.csv"))
    return genre_df
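A minimal usage sketch, assuming the 2016-era novel.naver.com endpoints above still respond; the call is network-bound and writes data/best/genre_df.csv as a side effect:
In [ ]:
best_genre_df = make_genre_df("best")  # one row per novel: level, genre, ID, name, episode_total, main_likeit
best_genre_df.head()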
In [17]:
def get_comment_count(ID, level, get_main=True, number=0):
    """
    get_main=True : fetch the total comment count of the whole novel
    get_main=False: fetch the comment count of episode `number`
    """
    if level == "webnovel":
        ticket = "novel01"
    else:
        ticket = "novel02"
    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}
    if get_main:  # total comments for the whole novel
        data = {
            "ticket": ticket,
            "object_id": "novel-{ID}".format(ID=ID),
            "_ts": "1469461095606",
            "page_size": "10",
            "page_no": "1",
            "sort": "newest"
        }
    else:  # comments for a single episode
        data = {
            "ticket": ticket,
            "object_id": "{ID}-{number}".format(ID=ID, number=number),
            "_ts": "1469461095606",
            "page_size": "10",
            "page_no": "1",
            "sort": "newest"
        }
    comment_response = requests.post(
        "http://novel.naver.com/comments/list_comment.nhn",
        headers=headers,
        data=data
    )
    total_count = json.loads(comment_response.text.replace("\\'", "'"))["total_count"]
    return total_count
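A hedged sketch of both modes, using a hypothetical novel ID (123456 is illustrative, not a real work):
In [ ]:
total_comments = get_comment_count("123456", "webnovel")  # whole-novel total
episode_comments = get_comment_count("123456", "webnovel", get_main=False, number=1)  # episode 1 only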
In [22]:
def get_novel_data(ID, level):
    url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}".format(level=level, ID=ID)
    response = requests.get(url)
    dom = BeautifulSoup(response.content, "lxml")
    main_score = float(dom.select_one("p.grade_area").select_one("em").text)  # main star rating
    concern_count = int(dom.select_one("span#concernCount").text.replace(",", ""))  # favorite ("concern") count
    comments_count = get_comment_count(ID, level)
    return ID, main_score, concern_count, comments_count

def make_novel_df(genre_df):
    """
    Fetches each novel's overall rating, comment count, and favorite count.
    """
    data = list(map(get_novel_data, genre_df["ID"], genre_df["level"]))
    novel_df = pd.DataFrame(data, columns=["ID", "main_score", "concern_count", "comments_count"])
    novel_df.drop_duplicates(inplace=True)
    novel_df.reset_index(drop=True, inplace=True)
    novel_df.to_csv("data/{level}/novel_df.csv".format(level=genre_df["level"].iloc[0]))
    return novel_df
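The notebook imports time but never throttles its requests; below is a minimal sketch of a polite wrapper around get_novel_data (fetch_politely is a hypothetical helper, not part of the original pipeline):
In [ ]:
def fetch_politely(ID, level, delay=0.5, retries=3):
    # Illustrative only: pause between requests and retry transient failures
    # so a long crawl does not die on one network hiccup.
    for attempt in range(retries):
        try:
            result = get_novel_data(ID, level)
            time.sleep(delay)  # be gentle with the server
            return result
        except requests.exceptions.RequestException:
            time.sleep(delay * (attempt + 1))  # simple linear back-off
    return ID, np.nan, np.nan, np.nan  # keep the row shape so the DataFrame still aligns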
In [23]:
def make_main_df(level):
    """
    level = webnovel, best, challenge
    Runs make_genre_df and make_novel_df as needed (or loads their cached
    CSVs) and merges the results.
    """
    if "genre_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            genre_df = pd.read_csv(
                os.path.join("data", level, "genre_df.csv"),
                index_col=0
            )
        except UnicodeDecodeError:
            genre_df = pd.read_csv(
                os.path.join("data", level, "genre_df.csv"),
                index_col=0,
                encoding="cp949"
            )
    else:
        genre_df = make_genre_df(level)
    if "novel_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            novel_df = pd.read_csv(
                os.path.join("data", level, "novel_df.csv"),
                index_col=0
            )
        except UnicodeDecodeError:
            novel_df = pd.read_csv(
                os.path.join("data", level, "novel_df.csv"),
                index_col=0,
                encoding="cp949"
            )
    else:
        novel_df = make_novel_df(genre_df)
    main_df = genre_df.merge(novel_df, on="ID")
    main_df.to_csv(os.path.join("data", level, "main_df.csv"))
    return main_df
In [24]:
best_df = make_main_df("best")
In [25]:
webnovel_df = make_main_df("webnovel")
In [35]:
def get_text(episode_url):
    """
    episode_url = URL of a single episode
    Returns the episode body text and the number of star-score votes.
    """
    episode_response = requests.get(episode_url)
    dom = BeautifulSoup(episode_response.content, "lxml")
    score_count = int(dom.select_one("span#currentStarScoreCount").text.replace(",", "")[:-1])  # drop separators and the trailing counter suffix
    text = dom.select_one("div.detail_view_content").text.replace("\r\n", "")
    return [score_count, text]
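A hedged sketch with a hypothetical novelId/volumeNo pair (the URL pattern matches the detail.nhn links built below):
In [ ]:
url = "http://novel.naver.com/webnovel/detail.nhn?novelId=123456&volumeNo=1"  # illustrative IDs
score_count, text = get_text(url)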
In [36]:
def get_novel_episode(ID, level, episode_total, reset=False):
    file_name = "{ID}-{level}.pickle".format(ID=ID, level=level)
    if "episode" not in os.listdir("data"):
        os.mkdir(os.path.join("data", "episode"))
    if (file_name in os.listdir(os.path.join("data", "episode"))) and not reset:
        return pd.read_pickle(os.path.join("data", "episode", file_name))
    # 10 episodes per list page
    pages = (episode_total // 10) + 2 if episode_total % 10 > 0 else (episode_total // 10) + 1
    titles = []
    volumes = []
    times = []
    scores = []
    hits = []
    print(ID)  # progress indicator
    for page in range(1, pages):
        episode_url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}&page={page}".format(
            level=level,
            ID=ID,
            page=page
        )
        episode_response = requests.get(episode_url)
        dom = BeautifulSoup(episode_response.content, "lxml")
        titles += [
            i.text
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("p.subj.v3")
            if i.text != "게시 보류중"  # skip episodes whose publication is on hold
        ]
        volumes += [
            i["href"].split("=")[-1]
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("a.list_item.NPI=a:list")
            if i
        ]
        times += [
            # same-day posts show a time instead of a date; substitute the crawl date
            i.text[:-1] if len(i.text) > 8 else "2016.07.30"
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.date")
        ]
        scores += [
            float(i.text)
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.score_area em")
        ]
        if level != "webnovel":
            hits += [
                int(i.text.replace(",", ""))
                for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.favorite em")
            ]
    like_url = "http://novel.naver.com/likeCountJson.nhn?contentsIds=" + str(ID) + "_" + ("," + str(ID) + "_").join(volumes)
    likeits = [
        i["likeItCount"]
        for i in requests.get(like_url).json()["result"]["contents"]
    ]
    comments_count = [
        get_comment_count(ID, level, False, i)
        for i in volumes
    ]
    episode_urls = [
        "http://novel.naver.com/{level}/detail.nhn?novelId={ID}&volumeNo={episode}".format(
            level=level,
            ID=ID,
            episode=i
        )
        for i in volumes
    ]
    score_count, text = zip(
        *[
            get_text(url)
            for url in episode_urls
        ]
    )
    episodes = list(range(1, len(volumes) + 1))[::-1]  # the site lists newest first
    if level != "webnovel":
        episode_df = pd.DataFrame(
            list(zip(titles, volumes, episodes, times, likeits, hits, comments_count, scores, score_count, text)),
            columns=["title", "volume", "episode", "time", "likeit", "hit", "comments_count", "score", "score_count", "text"]
        )
    else:
        episode_df = pd.DataFrame(
            list(zip(titles, volumes, episodes, times, likeits, comments_count, scores, score_count, text)),
            columns=["title", "volume", "episode", "time", "likeit", "comments_count", "score", "score_count", "text"]
        )
    episode_df["level"] = level
    episode_df["ID"] = ID
    episode_df.to_pickle(os.path.join("data", "episode", file_name))
    return episode_df
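A hedged sketch crawling one novel's episode list directly, with a hypothetical ID and episode count; the result is also cached as a pickle under data/episode/:
In [ ]:
one_novel = get_novel_episode("123456", "webnovel", 25)  # illustrative arguments
one_novel[["episode", "title", "score", "comments_count"]].head()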
In [37]:
def make_episode_df(level, reset=False):
    if "main_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            main_df = pd.read_csv(os.path.join("data", level, "main_df.csv"), index_col=0)
        except UnicodeDecodeError:
            main_df = pd.read_csv(os.path.join("data", level, "main_df.csv"), index_col=0, encoding="cp949")
    else:
        main_df = make_main_df(level)
    main_df = main_df[main_df["episode_total"] != 0]
    data = list(map(get_novel_episode, main_df["ID"], main_df["level"], main_df["episode_total"], [reset] * len(main_df)))
    episode_df = pd.concat(data)
    episode_df.reset_index(drop=True, inplace=True)
    episode_df.to_csv(os.path.join("data", main_df["level"].iloc[0], "episode_df.csv"))
    return episode_df
In [ ]:
episode_df = make_episode_df("webnovel", reset=True)
In [15]:
def get_novel_comments(ID, comments_count, get_main, level, reset, number=0):
    file_name = "{ID}-{number}.pickle".format(ID=ID, number=number)
    if "comment" not in os.listdir("data"):
        os.mkdir(os.path.join("data", "comment"))
    if (file_name in os.listdir(os.path.join("data", "comment"))) and not reset:
        return pd.read_pickle(os.path.join("data", "comment", file_name))
    if level == "webnovel":
        novel = "novel01"
    else:
        novel = "novel02"
    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}
    comment_list = []
    pages = ((comments_count - 1) // 100) + 1  # 100 comments per request
    for page in range(1, pages + 1):
        if get_main:
            data = {
                "ticket": novel,
                "object_id": "novel-{ID}".format(ID=ID),
                "_ts": "1469461095606",
                "page_size": "100",
                "page_no": page,
                "sort": "newest"
            }
        else:
            data = {
                "ticket": novel,
                "object_id": "{ID}-{number}".format(ID=ID, number=number),
                "_ts": "1469461095606",
                "page_size": "100",
                "page_no": page,
                "sort": "newest"
            }
        comment_response = requests.post(
            "http://novel.naver.com/comments/list_comment.nhn",
            headers=headers,
            data=data
        )
        comment_list += json.loads(comment_response.text.replace("\\'", "'"))["comment_list"]
    df = pd.DataFrame(comment_list)
    df.drop(
        [
            "comment_no",
            "enc_writer_id",
            "enc_writer_profile_user_id",
            "group_no",
            "reply_level",
            "status",
            "is_mine",
            "is_reply",
            "parent_comment_no",
            "deleted_yn",
            "is_yozm",
            "is_me2day",
            "visible_yn",
            "object_url",
            "writer_profile_user_id"
        ],
        axis=1,
        inplace=True
    )
    df.to_pickle(os.path.join("data", "comment", file_name))
    return df
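A hedged sketch fetching the main comment board of one hypothetical novel (the ID and count are illustrative):
In [ ]:
comments = get_novel_comments("123456", 150, True, "webnovel", reset=False)
comments.head()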
In [16]:
def make_episode_novel_comment(episode_df, reset=False):
    episode_df = episode_df[episode_df["comments_count"] != 0]
    df_list = list(map(
        get_novel_comments,
        episode_df["ID"],
        episode_df["comments_count"],
        [False] * len(episode_df),
        episode_df["level"],
        [reset] * len(episode_df),
        episode_df["volume"],
    ))
    episode_comments_df = pd.concat(df_list).reset_index(drop=True)
    episode_comments_df.to_csv(os.path.join("data", episode_df["level"].iloc[0], "episode_comments.csv"))
    return episode_comments_df
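A usage sketch, assuming the episode_df built by make_episode_df above is in scope:
In [ ]:
episode_comments_df = make_episode_novel_comment(episode_df)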
In [17]:
def make_main_novel_comment(main_df, reset=False):
    main_df = main_df[main_df["comments_count"] != 0]
    df_list = list(map(
        get_novel_comments,
        main_df["ID"],
        main_df["comments_count"],
        [True] * len(main_df),
        main_df["level"],
        [reset] * len(main_df)
    ))
    main_comment_df = pd.concat(df_list).reset_index(drop=True)
    main_comment_df.to_csv(os.path.join("data", main_df["level"].iloc[0], "main_comments.csv"))
    return main_comment_df
In [30]:
main_comments_df = make_main_novel_comment(webnovel_df)