예: http://dic.daum.net/word/view.do?wordid=ekw000078303&q=help
저장 형식은 자유. 단, 가능한 한 보기 좋게 내용을 편집하여 저장하자.
예)
단어 1
1. 뜻풀이
2. 뜻풀이
===
단어 2
1. 뜻풀이
In [ ]:
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
# Crawl 100 randomly chosen Daum dictionary pages and collect, per page, the
# <span> tags holding the headword, its meanings and its pronunciations.
# The three lists are index-aligned: entry i of each belongs to the same page.
word_list = []
meaning_list = []
pronounce_list = []

# randint's upper bound is exclusive, so 100001 keeps the inclusive range
# 1..100000 that the deprecated np.random.random_integers(1, 100000, 100) drew.
word_index = np.random.randint(1, 100001, 100)

start = "http://dic.daum.net/word/view.do?wordid=ekw"
end = "&q="

for index in word_index:
    # Word ids are zero-padded to nine digits, e.g. ekw000078303.
    url = start + str(int(index)).zfill(9) + end
    try:
        with urllib.request.urlopen(url) as response:
            doc = response.read()
    except OSError as err:
        # urllib's URLError/HTTPError derive from OSError. A single bad id
        # should not abort the whole crawl, so report it and move on.
        print("skipped " + url + ": " + str(err))
        continue

    soup = BeautifulSoup(doc, "html.parser")
    word = soup.find_all("span", class_="txt_cleanword")
    meaning = soup.find_all("span", class_="txt_mean")
    pro = soup.find_all("span", class_="txt_pronounce")

    # Append to all three lists together so they stay aligned.
    word_list.append(word)
    meaning_list.append(meaning)
    pronounce_list.append(pro)
In [ ]:
def pronounce(usa):
    """Return the pronunciation text enclosed in square brackets.

    The argument is converted with ``str()``, the text between the first
    ``[`` and the first ``]`` that follows it is taken, and every
    ``<daum:pron>``, ``</daum:pron>``, ``<daum:italic>`` and
    ``</daum:italic>`` tag is removed from it, regardless of letter case.

    Args:
        usa: Any object whose ``str()`` form contains a ``[...]`` section
            (the caller passes a parsed ``<span>`` tag).

    Returns:
        The bracketed text with the Daum markup tags stripped.

    Raises:
        ValueError: If the text has no ``[`` or no ``]`` after it.
    """
    text = str(usa)
    begin = text.index('[') + 1
    # Search for the closing bracket only after the opening one, so a stray
    # ']' earlier in the markup cannot produce an empty slice.
    finish = text.index(']', begin)
    # One case-insensitive substitution removes every opening and closing
    # tag, whatever casing each occurrence uses.
    return re.sub(r'</?daum:(?:pron|italic)>', '', text[begin:finish],
                  flags=re.IGNORECASE)
In [ ]:
import re
# Print every crawled entry: numbered headword, numbered meanings and, when
# available, the pronunciations.
# Pages without a headword span are dropped up front so that word[0] cannot
# raise IndexError and the numbering stays consecutive.
found_entries = (
    entry
    for entry in zip(word_list, meaning_list, pronounce_list)
    if entry[0]
)

# w_index: running number of the printed word.
for w_index, (word, meaning, pro) in enumerate(found_entries, start=1):
    print(w_index, ". " + word[0].text)

    # m_index: running number of the printed meaning.
    for m_index, txt in enumerate(meaning, start=1):
        print('\t(' + str(m_index) + "). " + txt.text)

    if pro:
        print("발음")
        print("미국식 = ", pronounce(pro[0]))
        # Only print the second line when a second pronunciation span was
        # found; indexing pro[1] unconditionally raised IndexError otherwise.
        if len(pro) > 1:
            print("영국식 = ", pronounce(pro[1]))

    print('\n')
In [ ]: