In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import progressbar

In [2]:
# Let's get started: download the blog's front page and parse it
url = "https://daphnecaruanagalizia.com"
# A timeout keeps a stalled connection from hanging the kernel indefinitely
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page, which would
# silently yield zero posts in the cells below
response.raise_for_status()
daphne = BeautifulSoup(response.text, 'html.parser')

In [3]:
# Every post teaser on the page is a <div class="postmaster"> (structure
# identified with the developer tools in Google Chrome)
posts = daphne.find_all("div", attrs={"class": "postmaster"})

In [4]:
# Explore the first entry to see the markup a single post teaser is made of
posts[0]


Out[4]:
<div class="postmaster" data-postid="97964">
<p class="column-caption"></p>
<div class="post">
<h1><a href="https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/" rel="bookmark" title="Permanent Link to First things first: do something about that horrendous posture">
First things first: do something about that horrendous posture </a>
</h1>
<div class="entry">
<p>
You can wear the flashiest watch and keep your snazzy shirt-cuff turned up to make …</p>
</div>
<p class="postmetadata"><a href="https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/#respond">Post a comment</a> | <a href="https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/#comments"><span class="dsq-postid" data-dsqidentifier="97964 https://daphnecaruanagalizia.com/?p=97964">Read (4)</span></a> | <span class="time">Monday, 16 October 2:09 pm</span></p>
</div>
</div>

In [5]:
# Permalink of the post: href of the first link inside the teaser
posts[0].find("a")["href"]


Out[5]:
'https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/'

In [6]:
# Publication time stamp: text of the element carrying the "time" class
posts[0].find(attrs={"class": "time"}).get_text()


Out[6]:
'Monday, 16 October 2:09 pm'

In [7]:
# Title of the post: title attribute of the first link inside the teaser
posts[0].find("a")["title"]


Out[7]:
'Permanent Link to First things first: do something about that horrendous posture'

In [8]:
# Post id: stored in the teaser's data-postid attribute
posts[0].attrs.get("data-postid")


Out[8]:
'97964'

In [9]:
# Extract relevant content from the main page: loop through the post teasers
# and download the full text of every post.

TITLE_PREFIX = "Permanent Link to "   # boilerplate in front of every link title
MISSING = "n.a."                      # placeholder for fields a post does not provide
REQUEST_TIMEOUT = 30                  # seconds; avoids hanging on a stalled connection

new_lst = []

for element in posts:

    # The first link inside a teaser is the post's permalink. A dedicated
    # name is used so the site root stored in `url` is not overwritten.
    link = element.a
    post_url = link["href"]

    # Strip the boilerplate only when it is really there, rather than
    # blindly cutting off the first 18 characters.
    title = link["title"]
    if title.startswith(TITLE_PREFIX):
        title = title[len(TITLE_PREFIX):]

    # Not every teaser carries a time stamp; fall back to a placeholder
    # instead of raising AttributeError on None.
    time_tag = element.find(class_="time")
    date = time_tag.get_text() if time_tag is not None else MISSING

    post_id = element.get('data-postid')

    # Fetch the post itself to get the full text
    response = requests.get(post_url, timeout=REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'html.parser')
    entry = soup.find('div', {'class': 'entry'})
    # A page without an entry block must not abort the whole loop
    text = entry.text.strip() if entry is not None else MISSING

    new_lst.append({'URL': post_url,
                    'Title': title,
                    'Date': date,
                    'ID': post_id,
                    'Txt': text})

In [11]:
# Preview the first five scraped posts
pd.DataFrame(new_lst).head()


Out[11]:
Date ID Title Txt URL
0 Monday, 16 October 2:09 pm 97964 First things first: do something about that ho... You can wear the flashiest watch and keep your... https://daphnecaruanagalizia.com/2017/10/first...
1 Sunday, 15 October 10:07 pm 97961 Austria’s new chancellor is 31 – and will have... Exit polls show that Sebastian Kurz, 31, is ab... https://daphnecaruanagalizia.com/2017/10/austr...
2 Sunday, 15 October 7:26 pm 97958 The party leaders and Sunday morning Is it going to be a five-year electoral campai... https://daphnecaruanagalizia.com/2017/10/party...
3 Saturday, 14 October 12:52 am 97955 Looks like Delia is surrounding himself with l... The disgraceful thing is that this man has bee... https://daphnecaruanagalizia.com/2017/10/looks...
4 Saturday, 14 October 12:26 am 97952 Chris Cardona: a one-track mind “I don’t recall any other budget having given ... https://daphnecaruanagalizia.com/2017/10/chris...

In [12]:
# Putting everything together: scrape posts from all pages for relevant content

BASE_URL = "https://daphnecaruanagalizia.com/"
TITLE_PREFIX = "Permanent Link to "   # boilerplate in front of every link title
MISSING = "n.a."                      # placeholder for fields a post does not provide
REQUEST_TIMEOUT = 30                  # seconds; avoids hanging on a stalled connection

# showcase for the first 9 pages / to get all pages set LAST_PAGE to 1442
FIRST_PAGE, LAST_PAGE = 1, 9

bar = progressbar.ProgressBar()

new_lst = []

# One session for the whole run so the connection to the site is reused
with requests.Session() as session:

    # Iterate over the progress bar directly. Zipping it with a second range
    # made zip() stop on the plain range first, so the bar was never advanced
    # past its last item and stayed stuck at "8 of 9".
    for page_no in bar(range(FIRST_PAGE, LAST_PAGE + 1)):

        page = BASE_URL + "page/" + str(page_no)

        response = session.get(page, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Kept under its own name so the `posts` list of the main page,
        # used by earlier cells, is not overwritten.
        page_posts = soup.find_all("div", class_="postmaster")

        for element in page_posts:

            # The first link inside a teaser is the post's permalink
            link = element.a
            post_url = link["href"]

            # Permalinks have the form <base>/YYYY/MM/<slug>/, so year and
            # month can be sliced from the path.
            url_temp = post_url.replace(BASE_URL, "")
            date_y = url_temp[:4]
            date_m = url_temp[5:7]

            # One post on page 127 has no time stamp; use a placeholder
            time_tag = element.find(class_="time")
            date_t = time_tag.get_text() if time_tag is not None else MISSING

            # Strip the boilerplate only where it is a prefix, so the same
            # words inside a real title are left alone.
            title = link["title"]
            if title.startswith(TITLE_PREFIX):
                title = title[len(TITLE_PREFIX):]

            post_id = element.get('data-postid')

            # Fetch the post itself to get the full text
            post_response = session.get(post_url, timeout=REQUEST_TIMEOUT)
            post_soup = BeautifulSoup(post_response.text, 'html.parser')
            entry = post_soup.find('div', {'class': 'entry'})
            if entry is not None:
                text = entry.text.strip().replace('\n', ' ')
            else:
                # A post page without an entry block must not abort a
                # scrape that may already have been running for hours.
                text = MISSING

            new_lst.append({'Link': post_url,
                            'Title': title,
                            'Txt': text,
                            'Date_1': date_y,
                            'Date_2': date_m,
                            'Date_3': date_t,
                            'ID_post': post_id,
                            'ID_page': page_no})


df = pd.DataFrame(new_lst)
df.to_csv('daphne.csv', sep='\t', encoding='utf-16')


 88% (8 of 9) |########################    | Elapsed Time: 0:02:42 ETA: 0:00:20

In [12]:
# Preview the first five rows of the combined scrape
pd.DataFrame(new_lst).iloc[:5]


Out[12]:
Date_1 Date_2 Date_3 ID_page ID_post Link Title Txt
0 2017 10 Monday, 16 October 2:09 pm 1 97964 https://daphnecaruanagalizia.com/2017/10/first... First things first: do something about that ho... You can wear the flashiest watch and keep your...
1 2017 10 Sunday, 15 October 10:07 pm 1 97961 https://daphnecaruanagalizia.com/2017/10/austr... Austria’s new chancellor is 31 – and will have... Exit polls show that Sebastian Kurz, 31, is ab...
2 2017 10 Sunday, 15 October 7:26 pm 1 97958 https://daphnecaruanagalizia.com/2017/10/party... The party leaders and Sunday morning Is it going to be a five-year electoral campai...
3 2017 10 Saturday, 14 October 12:52 am 1 97955 https://daphnecaruanagalizia.com/2017/10/looks... Looks like Delia is surrounding himself with l... The disgraceful thing is that this man has bee...
4 2017 10 Saturday, 14 October 12:26 am 1 97952 https://daphnecaruanagalizia.com/2017/10/chris... Chris Cardona: a one-track mind “I don’t recall any other budget having given ...

In [ ]: