In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import progressbar
In [2]:
# Let's get started: scrape main page
url = "https://daphnecaruanagalizia.com"
response = requests.get(url)
daphne = BeautifulSoup(response.text, 'html.parser')
In [3]:
# Get structural information (selectors identified with the developer tools in Google Chrome): each post teaser sits in a "postmaster" div
posts = daphne.find_all("div", class_="postmaster")
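# Quick sanity check (an added sketch, not an original cell): confirm the
# "postmaster" selector actually matched something before digging in.
len(posts)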
In [4]:
# Explore first entry
posts[0]
Out[4]:
In [5]:
# url
posts[0].a["href"]
Out[5]:
In [6]:
# time stamp
posts[0].find(class_="time").get_text()
Out[6]:
In [7]:
# title of posts
posts[0].a["title"]
Out[7]:
In [8]:
# post id
posts[0].get('data-postid')
Out[8]:
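# The five lookups explored above can be bundled into one helper. A sketch
# only; the cells below inline the same logic instead of calling it:
def parse_teaser(element):
    """Pull URL, title, time stamp and post id out of one postmaster div."""
    return {
        'URL': element.a['href'],
        'Title': element.a['title'].replace('Permanent Link to ', ''),
        'Date': element.find(class_='time').get_text(),
        'ID': element.get('data-postid'),
    }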
In [9]:
# Extract relevant content from main page, loop through posts
new_lst = []
for element in posts:
    url = element.a["href"]
    title = element.a["title"]
    title = title[18:]  # drop the leading "Permanent Link to " (18 characters)
    date = element.find(class_="time").get_text()
    post_id = element.get('data-postid')
    # fetch the full post and pull the article body out of its "entry" div
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.find('div', {'class': 'entry'}).text.strip()
    temp_dict = {'URL': url,
                 'Title': title,
                 'Date': date,
                 'ID': post_id,
                 'Txt': text}
    new_lst.append(temp_dict)
In [11]:
# Preview the first five scraped posts
pd.DataFrame(new_lst)[0:5]
Out[11]:
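# None of the requests in this notebook check the HTTP status code. A more
# defensive fetch (a sketch, using only standard requests behaviour) would be:
response = requests.get(url)
response.raise_for_status()  # raises requests.HTTPError on 4xx/5xx answers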
In [14]:
# Putting everything together: scrape posts from all pages for relevant content
bar = progressbar.ProgressBar()
new_lst = []
# showcase for the first 9 pages; to get all pages, change to range(1, 1443)
for i in bar(range(1, 10)):
    page = "https://daphnecaruanagalizia.com/page/" + str(i)
    response = requests.get(page)
    soup = BeautifulSoup(response.text, 'html.parser')
    posts = soup.find_all("div", class_="postmaster")
    for element in posts:
        url = element.a["href"]
        # the URL path starts with YYYY/MM/, so year and month can be sliced off it
        url_temp = url.replace("https://daphnecaruanagalizia.com/", "")
        date_y = url_temp[:4]
        date_m = url_temp[5:7]
        # one post on page 127 has no time stamp, which raises an AttributeError
        try:
            date_t = element.find(class_="time").get_text()
        except AttributeError:
            date_t = "n.a."
        title = element.a["title"]
        title = title.replace("Permanent Link to ", "")
        post_id = element.get('data-postid')
        # fetch the full post and pull the article body out of its "entry" div
        response = requests.get(url)
        post_soup = BeautifulSoup(response.text, 'html.parser')
        text = post_soup.find('div', {'class': 'entry'}).text.strip()
        text = text.replace('\n', ' ')
        temp_dict = {'Link': url,
                     'Title': title,
                     'Txt': text,
                     'Date_1': date_y,
                     'Date_2': date_m,
                     'Date_3': date_t,
                     'ID_post': post_id,
                     'ID_page': i}
        new_lst.append(temp_dict)
df = pd.DataFrame(new_lst)
df.to_csv('daphne.csv', sep='\t', encoding='utf-16')
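# time is imported at the top but never used. For the full crawl over all
# 1,442 pages, a throttled fetch would be kinder to the server. A minimal
# sketch (the 0.5 s delay is an arbitrary choice, not from the original run):
def polite_get(url, delay=0.5):
    """Fetch a URL, then pause so back-to-back calls are rate-limited."""
    response = requests.get(url)
    time.sleep(delay)
    return response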
In [12]:
pd.DataFrame(new_lst)[0:5]
Out[12]:
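# Round-trip check (sketch): the file written above can be read back with
# the matching separator and encoding; index_col=0 skips the saved index.
df_check = pd.read_csv('daphne.csv', sep='\t', encoding='utf-16', index_col=0)
df_check[0:5]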