In this step, we scrape the full content of each news article.
First run step 1 (1_Scraping_Google_News_Indonesia.ipynb) to produce list_links_google_news_indonesia.txt,
which will serve as our reference list of links to pull content from.
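To confirm that step 1 actually produced the links file before running the cells below, a quick existence check helps (a minimal sketch; the path assumes the file sits next to this notebook):
In [ ]:
import os

# Sanity check: step 1 must have written the links file first.
assert os.path.exists('list_links_google_news_indonesia.txt'), \
    'Run 1_Scraping_Google_News_Indonesia.ipynb first'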
In [1]:
from goose import Goose
from pprint import pprint
import string

class scrap_news(object):
    def __init__(self, url):
        self.url = url

    def scrap_publisher_news(self):
        g = Goose(
            {
                # 'browser_user_agent': 'Opera/9.80 (Android; Opera Mini/8.0.1807/36.1609; U; en) Presto/2.12.423 Version/12.16',
                'use_meta_language': False,
                'target_language': 'id',
                'enable_image_fetching': False,
                'http_timeout': 2,
            }
        )
        article = g.extract(url=self.url)
        printable = set(string.printable)
        # Keep printable ASCII only; ''.join() is needed on Python 3,
        # where filter() returns an iterator rather than a string.
        content = ''.join(filter(lambda x: x in printable, article.cleaned_text))
        title = ''.join(filter(lambda x: x in printable, article.title))
        if len(content) < 2:
            # Fall back to the AMP version of the page when the main
            # extraction comes back (nearly) empty.
            article = g.extract(url=article.amphtml)
            content = ''.join(filter(lambda x: x in printable, article.cleaned_text))
        content = content.replace('\n', '')
        return (title, content)
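Note that `from goose import Goose` imports the original python-goose package, which only runs on Python 2. On Python 3, the maintained fork goose3 exposes the same interface, so only the import should need to change (the configuration dict above is expected to work as-is):
In [ ]:
# Python 3 alternative: same API, different package name.
from goose3 import Goose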
Detik.com
In [5]:
url = '''https://news.detik.com/berita/3494173/polisi-jl-jend-sudirman-macet-karena-salju-palsu-dari-busa-air-got'''
sn = scrap_news(url)
result = sn.scrap_publisher_news()
print('URL : %s' % url)
print('Title : %s' % result[0])
print('Content : %s' % result[1])
Kumparan
In [8]:
url = '''https://kumparan.com/kita-setara/menyingkirkan-stigma-buruk-hiv-aids'''
sn = scrap_news(url)
result = sn.scrap_publisher_news()
print('URL : %s' % url)
print('Title : %s' % result[0])
print('Content : %s' % result[1])
Okezone
In [7]:
url = '''http://celebrity.okezone.com/read/2017/05/06/33/1684964/el-rumi-rayakan-kelulusan-di-puncak-gunung-penanggungan'''
sn = scrap_news(url)
result = sn.scrap_publisher_news()
print('URL : %s' % url)
print('Title : %s' % result[0])
print('Content : %s' % result[1])
In [9]:
# The links file holds a Python-style list literal written by step 1;
# strip the brackets and quotes, then split on the commas.
with open('list_links_google_news_indonesia.txt', 'r') as f:
    list_google_news = [s.strip() for s in f.read().replace('[', '').replace(']', '').replace("u'", "").replace("'", "").split(',')]
set(list_google_news)
Out[9]:
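The string surgery above assumes the file contains a Python-style list literal (brackets, quotes, u'' prefixes). Under that same assumption, parsing it with ast.literal_eval is sturdier than stripping characters by hand:
In [ ]:
import ast

# Parse the repr()-style list directly; this handles the quoting
# and the u'' prefixes without manual replaces.
with open('list_links_google_news_indonesia.txt', 'r') as f:
    list_google_news = ast.literal_eval(f.read())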
In [10]:
checkType = type(list_google_news)
pprint(checkType)
In [11]:
total_link = len(list_google_news)
pprint(total_link)
In [12]:
for link in list_google_news[:5]:
print(link)
In [13]:
import os

def generate_and_save_to_file(data):
    # data is the (title, content) tuple returned by scrap_publisher_news()
    if len(data[1]) > 0:
        fname = os.path.join('google_news', data[0] + '.txt')
        with open(fname, 'w') as f:
            f.write(data[1])
    else:
        fname = 'CONTENT NOT VALID'
    return fname
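One caveat: the article title goes straight into the file name, so characters such as / or : would break the path. A hypothetical safe_filename helper (not part of the original notebook) could sanitize the title first:
In [ ]:
import re

def safe_filename(title, max_len=100):
    # Replace characters that are illegal or awkward in file names,
    # and cap the length to keep paths manageable.
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', title)
    return cleaned[:max_len].strip() or 'untitled'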
In [14]:
index_link = 1
for link in list_google_news:
    try:
        sn = scrap_news(link)
        result = sn.scrap_publisher_news()
        fname = generate_and_save_to_file(result)
        print('%d / %d : %s' % (index_link, total_link, fname))
    except Exception:
        # Skip links that time out or fail to parse, but keep counting.
        print('%d / %d : %s' % (index_link, total_link, 'ERROR'))
    index_link = index_link + 1
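The loop above fires requests back to back. To be polite to the publishers' servers, you may want to pause between iterations, e.g. by adding a sleep at the end of the loop body (an optional tweak, not in the original run):
In [ ]:
import time

# Add at the end of each loop iteration to throttle the request rate.
time.sleep(1)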
In [ ]:
os.listdir('google_news')