// DevTools console snippet: collect Google Play review nodes as raw HTML.
// NOTE(review): the class "d15Mdf bAhLNe" is auto-generated and changes
// with Play Store redesigns — re-check the selector before reuse.
var reviews = document.querySelectorAll('div[class="d15Mdf bAhLNe"]')
var data = []
reviews.forEach(v => data.push({body: v.outerHTML}))
// Bug fix: the collected array is `data`; `temp1` is only a DevTools
// auto-variable that may not exist — copy the right one to the clipboard.
copy(data)
위 코드를 실행하면 클립보드로 데이터가 들어감.
클립보드 내용을 review-data.json 파일로 저장.
In [1]:
# Standard library
import json
import re
import warnings

# Third-party
import numpy as np  # needed later for np.squeeze / np.asarray (was missing)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import CountVectorizer

warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
In [2]:
# Load the review data that was exported via the DevTools console snippet.
df = pd.read_json("review-data.json")
In [3]:
# Sanity check: one 'body' column holding each review's outer HTML string.
df.head(2)
Out[3]:
In [4]:
# Matches runs of digits (used to pull the score out of the aria-label text).
p = re.compile(r'\d+')

def parser(body):
    """Parse one review's outer HTML into (user_name, date, rating, review_text).

    Class names / jsname attributes are Play-Store-generated and may change
    between redesigns — TODO confirm they still match before re-running.
    """
    soup = BeautifulSoup(body, 'html.parser')
    user_name = soup.find('span', class_='X43Kjb').text
    date = soup.find('span', class_='p2TkOb').text
    # The star rating lives in the aria-label of a role="img" div; we take
    # the last digit-run as the score (assumes the score is the final
    # number in the label — TODO confirm for the locale scraped).
    aria_label = soup.find('div', {'role': 'img'})['aria-label']
    rating = p.findall(aria_label)[-1]
    review_text = soup.find('span', {'jsname': 'bN97Pc'}).text
    return user_name, date, rating, review_text
In [5]:
%%time
# Parse every review's HTML in one pass; zip(*...) transposes the list of
# 4-tuples into four column-wise tuples assigned to four new columns.
df['user_name'], df['date'], df['rating'], df['review_text'] = zip(*df['body'].map(parser))
In [6]:
# The raw HTML is no longer needed once parsed.
del df["body"]
In [7]:
# Dates are Korean-formatted strings, e.g. "2018년 5월 1일".
df['date'] = pd.to_datetime(df['date'], format='%Y년 %m월 %d일')
In [8]:
df.head(2)
Out[8]:
In [9]:
# Newest reviews first. Bug fix: the original chained .reindex() with no
# arguments, which only re-conforms the frame to its existing (shuffled)
# index — effectively a no-op copy. reset_index(drop=True) is what yields
# a clean 0..n-1 index after sorting.
df = df.sort_values(by='date', ascending=False).reset_index(drop=True)
In [10]:
# Report the date range covered by the scraped reviews.
unique_dates = df['date'].value_counts().index
print("최소 :", unique_dates.min())
print("최대 :", unique_dates.max())
In [11]:
# seaborn renamed factorplot() to catplot() in 0.9 and later removed it;
# data-variable names are also keyword-only (x=) in current versions.
sns.catplot(x='rating', kind='count', data=df)
Out[11]:
In [12]:
df['rating'].value_counts()
Out[12]:
In [13]:
low_rate_review = df[df['rating'] <= '3']['review_text']
In [14]:
len(low_rate_review)
Out[14]:
In [15]:
low_rate_review[:10]
Out[15]:
In [19]:
# Keep only Hangul syllables, whitespace, and digits (drops punctuation,
# Latin letters, emoji). Fix: use a raw string for the pattern — a plain
# string containing '\s' raises an invalid-escape DeprecationWarning on
# newer Pythons; the compiled regex is byte-identical.
low_rate_review = low_rate_review.apply(lambda x: re.sub(r'[^가-힣\s\d]', "", x))
In [20]:
low_rate_review[:10]
Out[20]:
In [21]:
# Korean morphological analyzer. NOTE(review): Twitter was renamed Okt in
# KoNLPy 0.4.5 and is deprecated — switch to `from konlpy.tag import Okt`
# when upgrading.
tagger = Twitter()
In [23]:
def get_word(sentence):
    """Extract nouns of length >= 2 from a Korean sentence.

    Single-character nouns are dropped because they are mostly noise
    for keyword counting. Uses the module-level `tagger`.
    """
    extracted = tagger.nouns(sentence)
    return [word for word in extracted if len(word) >= 2]
In [24]:
# Bag-of-words over extracted nouns, capped at the 300 most frequent terms.
cv = CountVectorizer(tokenizer=get_word, max_features=300)
tdf = cv.fit_transform(low_rate_review)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer get_feature_names_out() with a fallback for old versions.
words = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
         else cv.get_feature_names())
In [26]:
# Peek at the first few vocabulary terms.
words[:5]
Out[26]:
In [27]:
# Column-wise sums: total occurrences of each vocabulary word
# (a 1 x n_words matrix from the sparse term-document matrix).
count_mat = tdf.sum(axis=0)
count_mat
Out[27]:
In [28]:
# Bug fix: the original called np.squeeze/np.asarray but `np` (numpy) was
# never imported in this notebook, so a fresh Restart & Run All raises
# NameError. count_mat is a 1 x n matrix; tolist()[0] flattens it to a
# plain list of per-word counts without needing numpy.
count = count_mat.tolist()[0]
word_count = list(zip(words, count))
# Most frequent nouns first.
word_count = sorted(word_count, key=lambda t: t[1], reverse=True)
In [29]:
# Top-15 most frequent nouns in the low-rated reviews.
word_count[:15]
Out[29]:
In [ ]: