In [1]:
# Ordinary string literal: the escape sequence '\n' becomes one newline
# character, so printing it ends with an extra blank line.
a = 'abcdef\n'
print(a)
# Raw string literal: the backslash is kept as-is, so the text ends with the
# two characters '\' and 'n'. Raw strings are the usual way to write regexes.
b = r'abcdef\n'
print(b)
In [2]:
# re.search scans the string and stops at the first match it finds.
import re

match = re.search(r'iii', 'piiig')
print(match)
print(match.group())
# 'piiig' contains only three consecutive 'i's, so this pattern cannot match:
# re.search returns None, which is why .group() is not called here.
match = re.search(r'iiiig', 'piiig')
print(match)
In [3]:
# '.' matches any single character (except a newline): two of them, then 'g'.
m = re.search(r'..g', 'piiig')
print(m.group())
# '\d' matches one digit; three in a row picks out '123'.
m = re.search(r'\d\d\d', 'p123g')
print(m.group())
# The digits are found even when surrounded by non-ASCII text.
m = re.search(r'\d\d\d', '오마이갓123이럴수가')
print(m.group())
# '\w' matches a word character (letter, digit or underscore); the leading
# '@@' is skipped and the first three word characters are returned.
m = re.search(r'\w\w\w', '@@abcd!!')
print(m.group())
# Digits count as word characters too.
m = re.search(r'\w\w\w', '@@ab0!!')
print(m.group())
In [5]:
# E-mail-like pattern: one or more of (word character, '.', '-') on each side
# of '@'. The pattern has no parentheses, hence no capture groups.
m = re.search(r'[\w.-]+@[\w.-]+',
              "My email is jiyong5411@gmail.com")
# Format into a single string so the output is the same whether print is a
# statement (Python 2) or a function (Python 3).
print('%s %s' % (m.group(), type(m.group())))
# With no capture groups in the pattern, groups() is an empty tuple.
print(m.groups())
In [7]:
# Same e-mail pattern, but with the local part and the domain each wrapped in
# parentheses so they are captured as group 1 and group 2.
m = re.search(r'([\w.-]+)@([\w.-]+)',
              "My email is jiyong5411@gmail.com")
# group() with no argument is the whole match.
print(m.group())
print('')
# groups() returns every captured group as a tuple.
print(m.groups())
# Individual groups are numbered from 1, left to right.
print(m.group(1))
print(m.group(2))
In [11]:
import requests
from bs4 import BeautifulSoup
def get_news_content(url):
    """Download a news page and return its article text as UTF-8 bytes.

    The article body is taken from the ``<div id="harmonyContainer">``
    element; the text of every ``<p>`` inside it is concatenated in document
    order with no separator.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The concatenated paragraph text encoded as UTF-8. An empty byte
        string is returned when the page has no ``harmonyContainer`` div.

    Raises:
        requests.RequestException: If the page cannot be fetched.
    """
    response = requests.get(url)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so the parsed tree
    # can differ between machines.
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', attrs={'id': 'harmonyContainer'})
    if container is None:
        # find() returns None when the div is missing; treat that as an
        # article with no text instead of failing on None.find_all.
        return b''
    paragraphs = container.find_all('p')
    return ''.join(p.get_text() for p in paragraphs).encode('utf-8')
# Fetch two articles and print the first e-mail address found in each.
news1 = get_news_content('http://media.daum.net/foreign/newsview?newsid=20160921114543616')
news2 = get_news_content('http://media.daum.net/digital/newsview?newsid=20160920180606199')
pattern = r'([\w.-]+)@([\w.-]+)'
# get_news_content returns UTF-8 encoded bytes, and on Python 3 a text
# pattern cannot be used to search bytes (TypeError), so search with a bytes
# version of the same pattern. On Python 2 this conversion is a no-op.
byte_pattern = pattern.encode('ascii')
for news in (news1, news2):
    found = re.search(byte_pattern, news)
    if found is None:
        # An article need not contain an address; re.search then returns
        # None and calling .group() on it would raise AttributeError.
        print('no e-mail address found')
    else:
        print(found.group().decode('utf-8'))
In [ ]:
In [ ]:
In [ ]: