In [ ]:
# 웹문서 (html) Crawling
# Crawling
# Scraping -> 링크를 계속 타고 가서 특정정보를 걸러내는거
# parsing -> 의미있는 정보를 뽑는 과정
In [ ]:
# Crawling -> urllib, urllib2, requests
# scraping -> 직접 크롤링한 결과 a태그를 찾아서 다시 크롤링 // scrapy //
# parsing -> 정규표현식, BeautifulSoup, lxml
In [2]:
import requests
from bs4 import BeautifulSoup
In [69]:
# Naver search URL used by the cells below; the query value is already
# percent-encoded, so it is passed to requests as-is.
base_url = "https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query=%EB%AC%B8%EC%9E%AC%EC%9D%B8"
In [76]:
# Fetch the search page. requests applies no timeout by default, so one is
# set explicitly to keep a stalled connection from hanging the cell.
# NOTE: the recorded run of this notebook received HTTP 403 for this request.
response = requests.get(base_url, timeout=10)
In [77]:
# Display the HTTP status code of the response (403 in the recorded run).
response.status_code
Out[77]:
403
In [78]:
# Display the decoded response body to inspect what the server actually sent back.
response.text
Out[78]:
u'<!doctype html> <html lang="ko"> <head> <meta charset="utf-8"> <meta name="referrer" content="always"> <title>\ub124\uc774\ubc84 \uac80\uc0c9</title> <link rel="stylesheet" type="text/css" href=""https://ssl.pstatic.net/sstatic/search/css/2011/common.css""> <link rel="shortcut icon" href="https://ssl.pstatic.net/sstatic/search/favicon/favicon_140327.ico"> <style type="text/css"> body{margin:0;padding:0;text-align:center} body,td,div{color:#404040;font-size:12px} img{border:none} label{cursor:hand;cursor:pointer} *{margin:0;padding:0} li{list-style-type:none} fieldset{border:none} legend{display:none} /* 120110 */ .blind{visibility:hidden;overflow:hidden;position:absolute;top:0;left:0;width:1px;height:1px;font-size:0;line-height:0} .logo a{display:block;width:93px;height:17px;background:url(https://ssl.pstatic.net/sstatic/search/img3/sp_block2.gif) 0 0 no-repeat}/* 120209 */ .info dt.share_pc{margin-right:10px;_margin-right:7px} .btn a{display:inline-block;margin-right:1px;background:url(https://ssl.pstatic.net/sstatic/search/img3/sp_block2.gif) no-repeat}/* 120209 */ .btn a.home{width:64px;height:24px;background-position:0 -180px} .btn a.btn_service{width:87px;height:24px;background-position:0 -150px} .captcha .input .ty_img{margin:1px 0 0 -1px} .captcha .desc a{color:#004790;text-decoration:underline} /* \uae00\ub85c\ubc8c \uc5d0\ub7ec\uba54\uc2dc\uc9c0 */ #login_message{width:660px;margin:0 auto;font-family:\'\ub3cb\uc6c0\', Dotum, AppleGothic, sans-serif;text-align:left} #login_message img{vertical-align:top} .logo{margin:65px 0 10px 10px} .conts{border:none;background:url(https://ssl.pstatic.net/sstatic/search/citats/rnd01_bg.gif) repeat-y} .rnd_t{overflow:hidden;clear:both;width:auto;_width:100%;padding:61px 30px 55px 54px;_padding:61px 30px 48px 54px;background:url(https://ssl.pstatic.net/sstatic/search/citats/rnd02_t.gif) no-repeat} .rnd_b{clear:both;width:100%;height:7px;background:url(https://ssl.pstatic.net/sstatic/search/citats/rnd01_b.gif) 0 100% 
no-repeat} .notice_img{float:left;width:53px;height:46px;margin:0;background:url(https://ssl.pstatic.net/sstatic/search/img3/sp_block2.gif) 0 -30px no-repeat}/* 120209 */ .notice_txt{display:inline;float:left;width:477px;margin:0 0 0 17px} .notice_txt h3{height:24px;margin:0}/* 120210 */ .notice_txt p{line-height:18px}/* 120210 */ .notice_txt a{color:#004790;text-decoration:underline} .info01{margin:0 0 13px}/* 120210 */ .info02{margin:0 0 16px} .captcha dl.info{clear:both;width:100%;padding:12px 0 3px;color:#333}/* 120210 */ .captcha .info dt{float:left;clear:both;margin-top:2px;margin-right:3px;_margin-right:0}/* 120209 */ .captcha .info dd{overflow:hidden;height:auto;margin:0 0 6px 5px;line-height:16px;zoom:1}/* 120209 */ .btn{margin-top:14px;padding-top:12px;border-top:1px solid #efefef}/* 120210 */ .btn .btn_captcha{width:74px;height:24px;background-position:0 -120px}/* 120209 */ .btn .btn_captcha.on{background-position:0 -90px} .captcha{display:none;margin:3px 0 0;margin:4px 0 0\\0;*margin:7px 0 0}/* 120209 */ .captcha.on{display:block} .captcha fieldset{overflow:hidden;position:relative;width:450px;padding:14px 14px 10px 14px;background-color:#f7f7f7} .captcha dl{margin:0;padding:9px 0 0;*padding:8px 0 0}/* 120210 */ .captcha dt{float:none;height:17px;margin:0;padding:0;color:#333;font-weight:bold} .captcha dd{height:21px;margin:0;padding:0;color:#555} .captcha .img{float:left} .captcha .img img{display:block;margin:1px 0 5px;*margin:1px 0 3px} .captcha .img a{margin-left:50px;padding-left:12px;background:url(https://ssl.pstatic.net/sstatic/search/img3/sp_block2.gif) 0 -210px no-repeat;color:#666;font-size:11px;letter-spacing:-1px;text-decoration:underline}/* 120209 */ .captcha .img a:hover{text-decoration:underline} .captcha .input{float:right;height:30px} .captcha .input *{vertical-align:top} .captcha .input .input_text{position:relative;_top:-1px;width:188px;width:188px;height:19px;margin:1px 0 0;padding:2px 0 2px 6px;border:1px solid 
#c9cbcf;line-height:19px} .captcha .error{display:none;float:right;width:237px;margin-top:3px} .captcha .error.on{display:block} .captcha .error em{color:#ff5b00;font-style:normal} .captcha .desc{position:relative;margin:0 0 -8px;_margin:0 0 -5px;padding:13px 0 0;*padding:11px 0 0;color:#888;font-size:11px;letter-spacing:-1px;line-height:16px} #footer{clear:both;margin:0;padding:20px 0 28px;text-align:center} #footer *{margin:0;padding:0;color:#444;font-family:\'\ub3cb\uc6c0\', Dotum, \'\uad74\ub9bc\', Gulim, AppleGothic, Sans-serif;font-size:12px;line-height:normal;list-style:none} #footer a{color:#444;text-decoration:none} #footer a:visited{color:#666;text-decoration:none} #footer a:hover{text-decoration:underline} #footer ul{margin:0 0 14px} #footer ul li{display:inline;position:relative;padding:0 1px 0 6px;background:url(https://ssl.pstatic.net/sstatic/search/citats/line_v_footer10.gif) no-repeat 0 0;font-size:11px;white-space:nowrap} #footer ul li *{font-size:11px} #footer ul li.first{padding:0 1px 0 0;background:none} #footer ul a{letter-spacing:-1px} #footer address{margin:0 14px 0 0} #footer address{font:9px Verdana} #footer address a{color:#00535d;font:bold 9px Tahoma} /* 120209 */ .btn2{margin-top:-5px;padding-top:0;border-top:none}/* 120210 */ .notice_txt p.info{margin-top:17px;*margin-top:15px;padding-top:15px;border-top:1px solid #efefef}/* 120210 */ </style> </head> <script type="text/javascript"> var g_ssc = "tab.blog.post" ; var g_query = "%EB%AC%B8%EC%9E%AC%EC%9D%B8" ; var g_query_cr = "%B9%AE%C0%E7%C0%CE" ; var g_puid = "TfzMGdpVuFKssvw/jD4ssssss04-101368" ; var g_suid = "GjtHm6HVPeP+0DbG3BgP1A==" ; var g_tab = "blog" ; var g_stab = "post" ; var g_crt = "" ; var g_D = 0 ; function urlencode (q) { return escape(q).replace(/\\+/g, "%2B") ; } function cpip () { var evt, sx = sy = px = py = -1 ; try { evt = window.event ; } catch (e) {} try { sx=evt.clientX-document.body.clientLeft, sy=evt.clientY-document.body.clientTop ; } catch (e) {} try { 
px=document.body.scrollLeft+(sx<0?0:sx), py=document.documentElement.scrollTop+(sy<0?0:sy) ; } catch (e) {} try { if (evt.pageX) px=evt.pageX ; if (evt.pageY) py=evt.pageY ; } catch (e) {} return "px="+px+"&py="+py+"&sx="+sx+"&sy="+sy ; } function nxGetCommonCRParam () { return "p="+g_puid+"&q="+g_query_cr+"&ssc="+g_ssc+"&f="+g_tab+"&w="+g_stab+"&s="+g_suid+"&time="+(new Date()).getTime()+g_crt ; } function nxGetCRURL (m, a, b, c, d, e) { var p = "" ; var u ; if (c==undefined && d==undefined && e==undefined) { p = (a==undefined?"":"&"+a) + (b==undefined || a.indexOf("u=")==0 || a.indexOf("&u=") > 0 ?"":"&u="+urlencode(b)) ; u = b ; } else { p = (a==undefined ? "" : "&a="+urlencode(a)) + (b==undefined ? "" : "&r="+urlencode(b)) + (c==undefined ? "" : "&i="+urlencode(c)) + (d==undefined ? "" : "&u="+urlencode(d)) + (e==undefined ? "" : "&"+e) ; u = d ; } if (! p) return null ; if (m != 0) m = g_D>1 && (u && u.indexOf("://")>=0 && u.search(/^\\w*:\\/\\/([^:/?]*\\.|)*(?!(ad)?cr\\.)[^.:/?]+\\.+naver\\.com(:\\d*)?(\\/|$)/)<0) ? 2 : ((m<0) ? 0 : 1) ; return ((window.location.protocol&&window.location.protocol.indexOf("https:")==0)?"/p/cr/":"http://cr.naver.com/")+(g_D>1?"nr":"rd")+"?m="+m+"&"+cpip()+"&"+nxGetCommonCRParam()+p ; } function tCR (a, b, c, d, e) { var p = "" ; if (arguments.length < 3) p = (a==undefined?"":"&"+a)+(b==undefined?"":"&u="+urlencode(b)) ; else p = (a==undefined?"":"&a="+urlencode(a))+(b==undefined?"":"&r="+urlencode(b))+(c==undefined?"":"&i="+urlencode(c))+(d==undefined?"":"&u="+urlencode(d))+(e==undefined?"":"&"+e) ; if (! 
p) return ; var l = "http://cr.naver.com/rd?m=0&"+cpip()+"&"+nxGetCommonCRParam()+p ; if (document.images) (new Image()).src = l ; else document.location = l ; return false ; } function goCR (o, p, t) { var u = o.href ; if (p.indexOf("u=javascript") >= 0) t = true ; /* no frame/iframe but new window */ var n = (o.ownerDocument==document && o.target && o.target!="_self" && o.target!="_parent" && o.target!="_top") && !(window.location.protocol&&window.location.protocol.indexOf("https:")==0) ; if (!(u && u.indexOf("http://cr.naver.com/")==0) && !(o.getAttribute !== undefined && o.getAttribute("crurl"))) { u = nxGetCRURL(t?0:(n?-1:1), p, u) ; } if (u && !u.match(/m=0&/)) { var a = o.innerHTML ; o.href = u ; if (o.getAttribute !== undefined) o.setAttribute("crurl", "1"); if (o.innerHTML != a) o.innerHTML = a ; } else if (document.images) (new Image()).src = u ; return true ; } function goOtherCR (o, p) { return goCR(o, p, false) ; } var lcs_add = {}; var lcs_bc = {}; var lcs_ver = "v0.5.00"; var lcs_cnt = 0; function lcs_do( etc ) { if (!window.lcs_SerName) { window.lcs_SerName = "lcs.naver.com"; } var rs = ""; var index; var doc = document; var wlt = window.location; try { var lcs_Addr = (wlt.protocol ? wlt.protocol : "http:")+"//" + window.lcs_SerName + "/m?"; } catch(e){ return; } try { rs = lcs_Addr + "u=" + encodeURIComponent(wlt.href) + "&e=" + (doc.referrer ? 
encodeURIComponent(doc.referrer) : ""); } catch(e) { } try { if (typeof lcs_add.i == \'undefined\' ) lcs_add.i = ""; for( var index in lcs_add) { if( typeof lcs_add[index] != \'function\' ) rs += "&" + index + "=" + encodeURIComponent(lcs_add[index]); } for( var index in etc ) { if ( (index.length >= 3 && (typeof etc[index] != \'function\')) || index == \'qy\') { rs += "&" + index + "=" + encodeURIComponent(etc[index]); } } if(lcs_cnt < 1) { lcs_getBrowserCapa(); } for( var index in lcs_bc ) { if( typeof lcs_bc[index] != \'function\' ) rs += "&" + index + "=" + encodeURIComponent(lcs_bc[index]); } var timeStr = (new Date).getTime(); rs += "&ts=" + timeStr; rs += "&EOU"; var obj = document.createElement(\'img\') ; obj.src = rs; obj.onload = function() { obj.onload = null; return; } ; lcs_cnt++; } catch(e) { return; } } function lcs_getBrowserCapa() { lcs_getOS(); lcs_getlanguage(); lcs_getScreen(); lcs_getWindowSize(); lcs_getColorDepth(); lcs_getJavaEnabled(); lcs_getCookieEnabled(); } function lcs_getOS() { var lcs_os = ""; try { (navigator.platform ? lcs_os = navigator.platform : ""); } catch (e) { } lcs_bc["os"] = lcs_os; } function lcs_getlanguage() { var lcs_ln = ""; try { (navigator.userLanguage? lcs_ln = navigator.userLanguage : (navigator.language)? 
lcs_ln = navigator.language : ""); } catch (e) { } lcs_bc["ln"] = lcs_ln; } function lcs_getScreen() { var lcs_sr = ""; var pixelRatio = 0 ; try { if ( window.screen && screen.width && screen.height) { lcs_sr = screen.width + \'x\'+ screen.height; pixelRatio = window.devicePixelRatio; if ( (pixelRatio) && (pixelRatio != 1) ){ lcs_sr = ( screen.width * pixelRatio ) + \'x\' +( screen.height * pixelRatio ) ; } } else if ( window.java || self.java ) { var sr = java.awt.Toolkit.getDefaultToolkit().getScreenSize(); lcs_sr = sr.width + \'x\' + sr.height; } } catch(e) { lcs_sr = ""; } lcs_bc["sr"] = lcs_sr; } function lcs_getWindowSize() { var doc = document; lcs_bc["bw"] = \'\'; lcs_bc["bh"] = \'\'; try { lcs_bc["bw"] = doc.documentElement.clientWidth ? doc.documentElement.clientWidth : doc.body.clientWidth; lcs_bc["bh"] = doc.documentElement.clientHeight ? doc.documentElement.clientHeight : doc.body.clientHeight; } catch(e) { } } function lcs_getColorDepth(){ lcs_bc["c"] = ""; try { if (window.screen) { lcs_bc["c"] = screen.colorDepth ? screen.colorDepth : screen.pixelDepth; } else if (window.java || self.java ) { var c = java.awt.Toolkit.getDefaultToolkit().getColorModel().getPixelSize(); lcs_bc["c"] = c; } } catch (e) { lcs_bc["c"] = ""; } } function lcs_getJavaEnabled() { lcs_bc["j"] = ""; try { lcs_bc["j"]= navigator.javaEnabled() ? "Y":"N"; } catch (e) {} } function lcs_getCookieEnabled() { lcs_bc["k"] = ""; try { lcs_bc["k"]= navigator.cookieEnabled ? 
"Y":"N"; } catch (e) { } } if (window.addEventListener) window.addEventListener("load", function() { lcs_do({"sti": "deny_notice"}) }, false) ; else lcs_do({"sti": "deny_notice"}) ; </script> <body> <div id="login_message"> <div class="logo"><a href="http://www.naver.com/"><span class="blind">naver</span></a></div> <div class="conts"> <div class="rnd_t"> <div class="notice_img"><span class="blind">!</span></div> <div class="notice_txt"> <h3><img src="https://ssl.pstatic.net/sstatic/search/img3/txt_error02.gif" width="233" height="16" alt="\uac80\uc0c9 \uc11c\ube44\uc2a4 \uc774\uc6a9\uc774 \uc81c\ud55c\ub418\uc5c8\uc2b5\ub2c8\ub2e4."></h3> <p class="info01">\uc0ac\uc6a9 \uc911\uc774\uc2e0 PC \ub610\ub294 \ub124\ud2b8\uc6cc\ud06c\uc5d0\uc11c \ub124\uc774\ubc84\uc758 \uc548\uc815\uc801\uc778 \uac80\uc0c9 \uc11c\ube44\uc2a4\ub97c \ubc29\ud574\ud558\ub294 \ub0b4\uc6a9\uc774 \uac10\uc9c0\ub418\uc5c8\uc2b5\ub2c8\ub2e4.</p> <p class="info01">\ub124\uc774\ubc84\ub294 \uc548\uc815\uc801\uc778 \uac80\uc0c9 \uc11c\ube44\uc2a4\ub97c \uc81c\uacf5\ud558\uace0\uc790, \uac80\uc0c9 \uc11c\ube44\uc2a4\ub97c \ubc29\ud574\ud558\ub294 \ube44\uc815\uc0c1\uc801\uc778 \uc6c0\uc9c1\uc784\uc774 \ubc1c\uacac\ub420 \uc2dc \uc2dc\uc2a4\ud15c\uc5d0 \uc758\ud574 \ud574\ub2f9 \ub124\ud2b8\uc6cc\ud06c\uc758 \uac80\uc0c9\uc744 \uc77c\uc2dc\uc801\uc73c\ub85c \uc81c\ud55c\ud558\uace0 \uc788\uc2b5\ub2c8\ub2e4.<br><br> \ubcf4\uc548 \uc808\ucc28\ub97c \ud1b5\uacfc\ud558\uba74 \uac80\uc0c9 \uc11c\ube44\uc2a4\ub97c \uc815\uc0c1\uc801\uc73c\ub85c \uc774\uc6a9\ud558\uc2e4 \uc218 \uc788\uc2b5\ub2c8\ub2e4.<br> \ubcf4\uc548 \uc808\ucc28\ub97c \ud1b5\uacfc\ud558\ub824\uba74 [\uc81c\ud55c \ud574\uc81c] \ubc84\ud2bc\uc744 \ud074\ub9ad\ud558\uc138\uc694.<br><br> \u203b "\ube44\uc815\uc0c1\uc801\uc778 \uac80\uc0c9"\uc774\ub780? 
: \ud504\ub85c\uadf8\ub7a8\ub4f1\uc744 \uc774\uc6a9, \ud2b9\uc815 \ub2e8\uc5b4\ub97c \ubc18\ubcf5\uc801\uc73c\ub85c \ub300\ub7c9\uc73c\ub85c \uc785\ub825\ud558\ub294 \ub4f1 \ub124\uc774\ubc84 \uac80\uc0c9 \uc11c\ube44\uc2a4\uc758 \uc548\uc815\uc131\uc744 \ubc29\ud574\ud558\ub294 \uac80\uc0c9 \ud328\ud134\uc744 \uc758\ubbf8\ud569\ub2c8\ub2e4. </p> <div class="btn"> <a class="btn_captcha on" href="#" onclick="this.href=window.location.href+(window.location.search==\'\'?\'?\':\'&\')+\'ct=1\';"><span class="blind">\uc81c\ud55c\ud574\uc81c</span></a> </div> </div> </div> <div class="rnd_b"></div> </div> <div id="footer"><div class="footer_wrap"> <ul> <li class="first"><a href="http://www.navercorp.com/">\ud68c\uc0ac\uc18c\uac1c</a></li> <li><a href="http://www.naver.com/rules/service.html">\uc774\uc6a9\uc57d\uad00</a></li> <li><a href="http://www.naver.com/rules/privacy.html"><strong>\uac1c\uc778\uc815\ubcf4\ucde8\uae09\ubc29\uce68</strong></a></li> <li><a href="http://www.naver.com/rules/disclaimer.html">\ucc45\uc784\uc758 \ud55c\uacc4\uc640 \ubc95\uc801\uace0\uc9c0</a></li> <li><a href="http://help.naver.com/">\uace0\uac1d\uc13c\ud130</a></li> </ul> <address> Copyright © <a href="http://www.navercorp.com/">NAVER Corp.</a> All Rights Reserved. </address> </div> </div> </body> </html>'
In [79]:
# Parse the raw response bytes into a BeautifulSoup tree using the "html.parser" backend.
dom = BeautifulSoup(response.content, "html.parser")
In [81]:
# Collect every <li> element carrying the CSS class "sh_blog_top".
# Presumably one per blog search result -- verify against the live page markup.
post_elements = dom.select("li.sh_blog_top")
In [91]:
# Display every <dd> element carrying the CSS class "sh_blog_passage"
# (an empty list in the recorded run).
dom.select("dd.sh_blog_passage")
Out[91]:
[]
In [75]:
# Display the elements selected above (an empty list in the recorded run).
post_elements
Out[75]:
[]
In [68]:
# Display how many elements were selected.
len(post_elements)
Out[68]:
0
In [101]:
import requests
from bs4 import BeautifulSoup
# Fetch the search page; the timeout keeps a stalled connection from hanging
# the cell indefinitely.
res = requests.get('https://search.naver.com/search.naver?where=post&sm=tab_jum&ie=utf8&query=%EB%AC%B8%EC%9E%AC%EC%9D%B8', timeout=10)
# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(res.content, 'html.parser')
# Look up the <div> whose class attribute is 'blog section _blogBase'.
body = soup.find('div', attrs = {'class' : 'blog section _blogBase'})
if body is None:
    # find() returns None when nothing matches (for example when the server
    # answers with an error page), so report it instead of failing with an
    # AttributeError on body.find_all.
    print('blog result container not found (HTTP status %s)' % res.status_code)
else:
    # Print the text content of every <dl> inside the container.
    for dl in body.find_all('dl'):
        print(dl.get_text())
In [ ]:
Content source: GD-park/python_basic
Similar notebooks: