In [1]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
from dateutil.parser import parse

Novel Class

  • 101 Romance (로맨스)
  • 102 SF & Fantasy (SF & 판타지)
  • 103 Martial Arts (무협)
  • 104 Mystery (미스터리)
  • 105 History & War (역사&전쟁; best and challenge leagues only)
  • 106 Light Novel (라이트노벨)
  • 107 Fanfic (팬픽; challenge league only)
  • 108 Fusion (퓨전)
  • webnovel: Today's Web Novel (오늘의 웹소설)
  • best: Best League (베스트리그)
  • challenge: Challenge League (챌린지리그)
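
For quick reference, the codes above can be kept in a small lookup table. A minimal sketch (this dict is illustrative only; the crawler below does not use it):

In [ ]:
# Genre codes and league levels on novel.naver.com, transcribed from the list above.
GENRES = {
    101: "Romance",
    102: "SF & Fantasy",
    103: "Martial Arts",
    104: "Mystery",
    105: "History & War",   # best & challenge leagues only
    106: "Light Novel",
    107: "Fanfic",          # challenge league only
    108: "Fusion",
}
LEVELS = ["webnovel", "best", "challenge"]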

Crawling all novel IDs (level, genre, ID, title, total episode count)


In [16]:
def __get_novel_ID(genre, level):
    """
    genre = 101~108
    level = webnovel, best, challenge
    Collects the ID, title, and total episode count of every novel
    (ongoing and completed) in one genre.
    """
    if level == "webnovel":    # webnovel also has a separate completed listing, tracked via is_fin
        number = 2
        df = pd.DataFrame(columns=["level", "genre", "ID", "name", "episode_total", "is_fin"])
    else:
        number = 1
        df = pd.DataFrame(columns=["level", "genre", "ID", "name", "episode_total"])

    for fin in ["", "&order=Read&finish=true"][0:number]:    # ongoing first, then completed; the completed listing only exists for webnovel
        url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page=1000".format(level, genre, fin)
        response = requests.get(url)
        dom = BeautifulSoup(response.content, "lxml")
        end_page = dom.select_one("div.paging")    # grab the last page number from the pager
        if end_page:
            end_page = int(end_page.select_one("strong").contents[0])
        else:
            end_page = 1

        for page in range(1, end_page+1):    # iterate over every listing page
            url = "http://novel.naver.com/{0}/genre.nhn?genre={1}{2}&page={3}".format(level, genre, fin, page)
            response = requests.get(url)
            dom = BeautifulSoup(response.content, "lxml")
            list_item = dom.select("a.list_item")

            for item in list_item:    # collect one novel's info from the page
                novel_ID = item["href"].split("=")[1]
                novel_name = item.select_one("span.ellipsis").text
                novel_episode_total = item.select_one("span.num_total").text.split(" ")[1][:-1]    # e.g. "총 120화" -> "120"
                
                if level == "webnovel":
                    is_fin = 0 if fin == "" else 1    # 1 = came from the completed listing
                    df.loc[len(df)] = level, genre, novel_ID, novel_name, novel_episode_total, is_fin
                else:
                    df.loc[len(df)] = level, genre, novel_ID, novel_name, novel_episode_total

    return df

def make_genre_df(level):
    """
    level = webnovel, best, challenge
    Collects each novel's basic info across all genres of the given level.
    """
    if "data" not in os.listdir("."):
        os.mkdir("data")
    if level not in os.listdir("data"):
        os.mkdir(os.path.join("data", level))
    
    genres = [101, 102, 103, 104, 106, 108]
    if level == "best":    # best의 경우 역사/전쟁이 추가
        genres += [105]
    if level == "challenge":    # challenge의 경우 역사/전쟁에 이어 팬픽이 추가
        genres += [105, 107]
    
    genre_data = map(__get_novel_ID, genres, [level] * len(genres))
    genre_df = pd.concat(genre_data).reset_index(drop=True)
    genre_df.drop_duplicates(inplace=True)
    
    like_url = "http://novel.naver.com/likeCountJson.nhn?contentsIds=" \
                + (",").join(genre_df["ID"])    #like 점수는 다른 곳에서 가져와야 함.
    genre_df["main_likeit"] = [
        i["likeItCount"]
        for i in requests.get(like_url).json()["result"]["contents"]
    ]
    
    genre_df["episode_total"] = genre_df["episode_total"].astype("int")
    genre_df["genre"] = genre_df["genre"].astype("int")
    genre_df.to_csv(os.path.join("data", level, "genre_df.csv"))
    
    return genre_df
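
For reference, the likeCountJson.nhn parsing above implies a response of roughly this shape; this is inferred from the parsing code, not from any official documentation:

In [ ]:
# Assumed likeCountJson.nhn response structure, reconstructed from the parsing code:
# {
#     "result": {
#         "contents": [
#             {"likeItCount": 123},    # one entry per requested contentsId, in request order
#             ...
#         ]
#     }
# }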

Crawling novel scores (rating, favorite count, comment count)


In [17]:
def get_comment_count(ID, level, get_main=True, number=0):
    """
    Returns the total comment count of a novel (get_main=True)
    or of one episode (get_main=False, with number = volume no.).
    """
    if level == "webnovel":    # the comment API identifies the league by ticket, not by level
        ticket = "novel01"
    else:
        ticket = "novel02"
    
    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}
    if get_main:    # all comments on the novel
        object_id = "novel-{ID}".format(ID=ID)
    else:    # comments on one episode
        object_id = "{ID}-{number}".format(ID=ID, number=number)
    data = {
        "ticket": ticket,
        "object_id": object_id,
        "_ts": "1469461095606",
        "page_size": "10",
        "page_no": "1",
        "sort": "newest"
    }
        
    comment_response = requests.post(
        "http://novel.naver.com/comments/list_comment.nhn", 
        headers=headers, 
        data=data
    )
    total_count = json.loads(comment_response.text.replace("\\'", "\'"))['total_count']
    
    return total_count
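
A usage sketch; the ID comes from the crawl output later in this notebook (466391), and the call hits the live comment endpoint, so treat it as illustrative rather than guaranteed:

In [ ]:
total = get_comment_count("466391", "webnovel")                               # all comments on the novel
episode1 = get_comment_count("466391", "webnovel", get_main=False, number=1)  # comments on episode 1 (volume assumed to exist)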

In [22]:
def get_novel_data(ID, level):  
    url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}".format(level=level, ID=ID)
    response = requests.get(url)
    dom = BeautifulSoup(response.content, "lxml")
    
    main_score = float(dom.select_one("p.grade_area").select_one("em").text)    # overall star rating
    concern_count = int(dom.select_one("span#concernCount").text.replace(",", ""))    # favorite ("concern") count
    comments_count = get_comment_count(ID, level)
    
    return ID, main_score, concern_count, comments_count

def make_novel_df(genre_df):
    """
    Fetches each novel's overall rating, comment count, and favorite count.
    """
    data = list(map(get_novel_data, genre_df["ID"], genre_df["level"]))
    novel_df = pd.DataFrame(data, columns=["ID", "main_score", "concern_count", "comments_count"])
    novel_df.drop_duplicates(inplace=True)
    novel_df.reset_index(drop=True, inplace=True)
    novel_df.to_csv(os.path.join("data", genre_df["level"].iloc[0], "novel_df.csv"))
    
    return novel_df

Creating the merged novel crawl data


In [23]:
def make_main_df(level):
    """
    level = webnovel, best, challenge
    Runs (or loads from cache) make_genre_df and make_novel_df above and merges the results.
    """

    if "genre_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            genre_df = pd.read_csv(
                os.path.join("data", level, "genre_df.csv"), 
                index_col=0
            )
        except UnicodeDecodeError:    # older dumps were saved as cp949
            genre_df = pd.read_csv(
                os.path.join("data", level, "genre_df.csv"), 
                index_col=0, 
                encoding="cp949"
            )
    else:
        genre_df = make_genre_df(level)

    if "novel_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            novel_df = pd.read_csv(
                os.path.join("data", level, "novel_df.csv"), 
                index_col=0
            )
        except UnicodeDecodeError:    # older dumps were saved as cp949
            novel_df = pd.read_csv(
                os.path.join("data", level, "novel_df.csv"), 
                index_col=0,
                encoding="cp949"
            )
    else:
        novel_df = make_novel_df(genre_df)
    
    main_df = genre_df.merge(novel_df, on="ID")
    main_df.to_csv(os.path.join("data", level, "main_df.csv"))
    
    return main_df

In [24]:
best_df = make_main_df("best")

In [25]:
webnovel_df = make_main_df("webnovel")

Crawling novel episodes (title, url_volume, episode number, date, like count, (hits), comment count, rating, rating count, text)


In [35]:
def get_text(episode_url):
    """
    episode_url = URL of one novel episode
    Returns the episode's star-rating count and body text.
    """
    episode_response = requests.get(episode_url)
    dom = BeautifulSoup(episode_response.content, "lxml")
    score_count = int(dom.select_one("span#currentStarScoreCount").text.replace(",", "")[:-1])    # strip thousands commas and the trailing suffix character
    text = dom.select_one("div.detail_view_content").text.replace("\r\n", "")
    
    return [score_count, text]
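
A hedged spot check of get_text, built from the detail.nhn URL pattern used in get_novel_episode below and the ID printed later in this notebook (volume 1 is assumed to exist):

In [ ]:
url = "http://novel.naver.com/webnovel/detail.nhn?novelId=466391&volumeNo=1"
score_count, text = get_text(url)
print(score_count, text[:100])    # rating count and the first 100 characters of the episode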

In [36]:
def get_novel_episode(ID, level, episode_total, reset=False):

    if "episode" not in os.listdir("data"):
        os.mkdir(os.path.join("data", "episode"))

    file_name = "{ID}-{level}.pickle".format(ID=ID, level=level)
    if file_name in os.listdir(os.path.join("data", "episode")) and not reset:    # reuse the cached pickle
        return pd.read_pickle(os.path.join("data", "episode", file_name))

    pages = (episode_total + 9) // 10    # 10 episodes per listing page, rounded up

    titles = []
    volumes = []
    times = []
    scores = []
    hits = []
    
    print(ID)    # progress indicator
    for page in range(1, pages + 1):    # iterate over every listing page
        
        episode_url = "http://novel.naver.com/{level}/list.nhn?novelId={ID}&page={page}".format(
            level = level,
            ID = ID,
            page = page
        )
        
        episode_response = requests.get(episode_url)
        dom = BeautifulSoup(episode_response.content, "lxml")
        titles += [
            i.text
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("p.subj.v3")
            if i.text != "게시 보류중"
        ]
        volumes += [
            i["href"].split("=")[-1]
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("a.list_item.NPI=a:list")
            if i
        ]
        times += [
            i.text[:-1] if len(i.text) > 8 else "2016.07.30"    # short relative dates (fresh episodes) fall back to the crawl date
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.date")
        ]
        scores += [
            float(i.text)
            for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.score_area em")
        ]
        if level != "webnovel":
            hits += [
                int(i.text.replace(",", ""))
                for i in dom.select_one("ul.list_type2.v3.NE=a:lst").select("span.favorite em")
            ]

    like_url = "http://novel.naver.com/likeCountJson.nhn?contentsIds=" + str(ID) + "_" + ("," + str(ID) + "_").join(volumes)
    likeits = [
        i["likeItCount"]
        for i in requests.get(like_url).json()["result"]["contents"]
    ]

    comments_count = [
        get_comment_count(ID, level, False, i)
        for i in volumes
    ]

    episode_urls = [
        "http://novel.naver.com/{level}/detail.nhn?novelId={ID}&volumeNo={episode}".format(
            level = level, 
            ID = ID, 
            episode = i
        )
        for i in volumes
    ]

    score_count, text = zip(
        *[
            get_text(url)
            for url in episode_urls
        ]
    )
    
    episodes = list(range(len(volumes), 0, -1))    # newest episode is listed first

    if level != "webnovel":
        episode_df = pd.DataFrame(
            list(zip(titles, volumes, episodes, times, likeits, hits, comments_count, scores, score_count, text)), 
            columns=["title", "volume", "episode", "time", "likeit", "hit", "comments_count", "score", "score_count", "text"]
        )
    else:
        episode_df = pd.DataFrame(
            list(zip(titles, volumes, episodes, times, likeits, comments_count, scores, score_count, text)), 
            columns=["title", "volume", "episode", "time", "likeit", "comments_count", "score", "score_count", "text"]
        ) 
        
    episode_df["level"] = level
    episode_df["ID"] = ID
    
    
    episode_df.to_pickle(os.path.join("data", "episode", file_name))
    
    return episode_df
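
time is imported at the top of the notebook but never used; a polite crawler would throttle its requests. A minimal sketch (the wrapper and the 0.5-second delay are assumptions of mine, not part of the original crawl):

In [ ]:
def get_text_throttled(episode_url, delay=0.5):
    """Like get_text, but pauses between requests so we do not hammer the server."""
    time.sleep(delay)    # arbitrary politeness delay; tune as needed
    return get_text(episode_url)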

In [37]:
def make_episode_df(level, reset=False):
    
    if "main_df.csv" in os.listdir(os.path.join("data", level)):
        try:
            main_df = pd.read_csv(os.path.join('data', level, "main_df.csv"), index_col=0)
        except UnicodeDecodeError:    # older dumps were saved as cp949
            main_df = pd.read_csv(os.path.join("data", level, "main_df.csv"), index_col=0, encoding="cp949")
    else:
        main_df = make_main_df(level)
    main_df = main_df[main_df["episode_total"] != 0]    # skip novels with no published episodes
    
    data = list(map(get_novel_episode, main_df["ID"], main_df["level"], main_df["episode_total"], [reset]*len(main_df)))
    episode_df = pd.concat(data)
    episode_df.reset_index(drop=True, inplace=True)
    episode_df.to_csv(os.path.join("data", main_df["level"].iloc[0], "episode_df.csv"))    # .iloc: label 0 may have been filtered out
    
    return episode_df

In [ ]:
episode_df = make_episode_df("webnovel", reset=True)


466391

Novel comment crawling


In [15]:
def get_novel_comments(ID, comments_count, get_main, level, reset, number=0):
    """
    Fetches every comment of a novel (get_main=True) or of one episode
    (get_main=False, with number = volume no.), 100 comments per request.
    """
    file_name = "{ID}-{number}.pickle".format(ID=ID, number=number)
    
    if "comment" not in os.listdir("data"):
        os.mkdir(os.path.join("data", "comment"))
    
    if file_name in os.listdir(os.path.join("data", "comment")) and not reset:    # reuse the cached pickle
        return pd.read_pickle(os.path.join("data", "comment", file_name))
    
    if level == "webnovel":    # the comment API identifies the league by ticket
        novel = "novel01"
    else:
        novel = "novel02"
    
    headers = {"Referer": "http://novel.naver.com/{level}/list.nhn?".format(level=level)}
    
    comment_list = []
    pages = (comments_count + 99) // 100    # 100 comments per page, rounded up
    for page in range(1, pages+1):
        
        if get_main:
            object_id = "novel-{ID}".format(ID=ID)    # all comments on the novel
        else:
            object_id = "{ID}-{number}".format(ID=ID, number=number)    # one episode's comments
        data = {
            "ticket": novel,
            "object_id": object_id,
            "_ts": "1469461095606",
            "page_size": "100",
            "page_no": page,
            "sort": "newest"
        }

        comment_response = requests.post(
            "http://novel.naver.com/comments/list_comment.nhn", 
            headers=headers, 
            data=data
        )
        
        comment_list += json.loads(comment_response.text.replace("\\'", "\'"))["comment_list"]
    
    df = pd.DataFrame(comment_list)
    df.drop(    # drop internal bookkeeping columns we do not need
        [
            "comment_no", 
            "enc_writer_id", 
            "enc_writer_profile_user_id",
            "group_no",
            "reply_level",
            "status",
            "is_mine",
            "is_reply",
            "parent_comment_no",
            "deleted_yn",
            "is_yozm",
            "is_me2day",
            "visible_yn",
            "object_url",
            "writer_profile_user_id"
        ], 
        axis=1, 
        inplace=True
    )
    df.to_pickle(os.path.join("data", "comment", file_name))
    
    return df

In [16]:
def make_episode_novel_comment(episode_df, reset=False):
    
    episode_df = episode_df[episode_df["comments_count"] != 0]    # skip episodes with no comments
    
    df_list = list(map(
            get_novel_comments,
            episode_df["ID"],
            episode_df["comments_count"],
            [False] * len(episode_df), 
            episode_df["level"],
            [reset] * len(episode_df),
            episode_df["volume"],
        ))
        
    episode_comments_df = pd.concat(df_list).reset_index(drop=True)
    episode_comments_df.to_csv(os.path.join("data", episode_df["level"].iloc[0], "episode_comments.csv"))    # .iloc: label 0 may have been filtered out
    
    return episode_comments_df
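
make_episode_novel_comment is never invoked below (only the main-comment variant is); a hedged usage sketch, assuming the episode_df produced by make_episode_df above:

In [ ]:
episode_comments_df = make_episode_novel_comment(episode_df)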

In [17]:
def make_main_novel_comment(main_df, reset=False):
    
    main_df = main_df[main_df["comments_count"] != 0]    # skip novels with no comments
    
    df_list = list(map(
            get_novel_comments,
            main_df["ID"],
            main_df["comments_count"],
            [True] * len(main_df),
            main_df["level"],
            [reset] * len(main_df)
        ))
        
    main_comment_df = pd.concat(df_list).reset_index(drop=True)
    main_comment_df.to_csv(os.path.join("data", main_df["level"].iloc[0], "main_comments.csv"))    # .iloc: label 0 may have been filtered out
    
    return main_comment_df

In [30]:
main_comments_df = make_main_novel_comment(webnovel_df)    # pass the main DataFrame, not the level string



In [ ]: