In [1]:
# normal string: \n is interpreted as a newline escape
a = 'abcdef\n'
print a

# raw string: the backslash is kept literally, no escape processing
b = r'abcdef\n'
print b


abcdef

abcdef\n
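
A raw string keeps the backslash intact, which matters once backslashes start appearing inside regex patterns. A tiny hedged sketch (not part of the original run):

In [ ]:
import re
# hedged sketch: matching a literal backslash followed by n requires the pattern \\n
print re.search(r'\\n', b).group()   # prints \n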

In [2]:
# re.search stops at the first match (returns None if nothing matches)
import re

match = re.search(r'iii', 'piiig')
print match
print match.group()

match = re.search(r'iiiig', 'piiig')
print match


<_sre.SRE_Match object at 0x03859480>
iii
None
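
re.search stops at the first match. A small hedged sketch (not part of the original run): re.findall returns every non-overlapping match as a list.

In [ ]:
# hedged sketch: findall collects all matches instead of stopping at the first one
print re.findall(r'i+', 'piiig piig')   # expected: ['iii', 'ii']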

In [3]:
# . matches any single character except a newline
m = re.search(r'..g', 'piiig')
print m.group()

# \d matches a decimal digit
m = re.search(r'\d\d\d', 'p123g')
print m.group()

# the digits are found even when surrounded by non-ASCII text
m = re.search(r'\d\d\d', '오마이갓123이럴수가')
print m.group()

# \w matches a word character: letter, digit, or underscore
m = re.search(r'\w\w\w', '@@abcd!!')
print m.group()

m = re.search(r'\w\w\w', '@@ab0!!')
print m.group()


iig
123
123
abc
ab0
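
Repeating a class character by character gets tedious; the email patterns below use + for "one or more". A brief hedged sketch of the common quantifiers (not from the original notebook):

In [ ]:
# hedged sketch: quantifiers
print re.search(r'\d+', 'p123g').group()       # + : one or more  -> 123
print re.search(r'pi*g', 'pg').group()         # * : zero or more -> pg
print re.search(r'colou?r', 'color').group()   # ? : zero or one  -> color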

In [5]:
# [\w.-]+ matches one or more word characters, dots, or hyphens
m = re.search(r'[\w.-]+@[\w.-]+',
              "My email is jiyong5411@gmail.com")

print m.group(), type(m.group())
# the pattern has no parentheses, so there are no capture groups
print m.groups()


jiyong5411@gmail.com <type 'str'>
()

In [7]:
# parentheses capture the local part and the domain as separate groups
m = re.search(r'([\w.-]+)@([\w.-]+)',
              "My email is jiyong5411@gmail.com")

print m.group()      # group() / group(0) is the whole match
print ''
print m.groups()     # tuple of all captured groups

print m.group(1)     # first group: local part
print m.group(2)     # second group: domain


jiyong5411@gmail.com

('jiyong5411', 'gmail.com')
jiyong5411
gmail.com
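
Groups can also be referred to by name. A hedged sketch (not part of the original notebook):

In [ ]:
# hedged sketch: named capture groups via (?P<name>...)
m = re.search(r'(?P<user>[\w.-]+)@(?P<host>[\w.-]+)',
              "My email is jiyong5411@gmail.com")
print m.group('user')   # jiyong5411
print m.group('host')   # gmail.com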

In [11]:
import requests
from bs4 import BeautifulSoup

def get_news_content(url):
    # download the article page and parse the HTML
    response = requests.get(url)
    content = response.text

    soup = BeautifulSoup(content, 'html.parser')

    # the Daum article body sits inside <div id="harmonyContainer">
    div = soup.find('div', attrs={'id': 'harmonyContainer'})

    # concatenate the text of every paragraph in the article body
    content = ''
    for paragraph in div.find_all('p'):
        content += paragraph.get_text()

    return content.encode('utf-8')

news1 = get_news_content('http://media.daum.net/foreign/newsview?newsid=20160921114543616')
news2 = get_news_content('http://media.daum.net/digital/newsview?newsid=20160920180606199')


# extract the first email address (the reporter's contact) from each article
pattern = r'([\w.-]+)@([\w.-]+)'
print re.search(pattern, news1).group()
print re.search(pattern, news2).group()


realism@yna.co.kr
wani@hani.co.kr
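
re.search only returns the first address in each article. A hedged sketch (not part of the original run): with capture groups in the pattern, re.findall returns a (local part, domain) tuple for every address it finds.

In [ ]:
# hedged sketch: collect every email address in an article
print re.findall(pattern, news1)   # e.g. [('realism', 'yna.co.kr'), ...]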
