notebook.community

Edit and run



In [1]:

    
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import progressbar



In [2]:

    
# Let's get started: scrape main page
url = "https://daphnecaruanagalizia.com"
response = requests.get(url)
daphne = BeautifulSoup(response.text, 'html.parser')



In [3]:

    
# Get structural information based on developer tools in Google Chrome
posts = daphne.find_all("div", class_="postmaster")



In [4]:

    
# Explore first entry 
posts[0]









    Out[4]:





<div class="postmaster" data-postid="97964">
<p class="column-caption"></p>
<div class="post">
<h1><a href="https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/" rel="bookmark" title="Permanent Link to First things first: do something about that horrendous posture">
First things first: do something about that horrendous posture </a>
</h1>
<div class="entry">
<p>
You can wear the flashiest watch and keep your snazzy shirt-cuff turned up to make …</p>
</div>
<p class="postmetadata"><a href="https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/#respond">Post a comment</a> | <a href="https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/#comments"><span class="dsq-postid" data-dsqidentifier="97964 https://daphnecaruanagalizia.com/?p=97964">Read (4)</span></a> | <span class="time">Monday, 16 October 2:09 pm</span></p>
</div>
</div>



In [5]:

    
# url 
posts[0].a["href"]









    Out[5]:





'https://daphnecaruanagalizia.com/2017/10/first-things-first-something-horrendous-posture/'



In [6]:

    
# time stamp
posts[0].find(class_="time").get_text()









    Out[6]:





'Monday, 16 October 2:09 pm'



In [7]:

    
# title of posts
posts[0].a["title"]









    Out[7]:





'Permanent Link to First things first: do something about that horrendous posture'



In [8]:

    
# post id
posts[0].get('data-postid')









    Out[8]:





'97964'



In [9]:

    
# Extract relevant content from main page, loop through posts

new_lst = []

for element in posts:
    
    url = element.a["href"]
    title = element.a["title"]
    title = title[18:]
    date = element.find(class_="time").get_text()
    post_id = element.get('data-postid')
        
    #print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.find('div', {'class': 'entry'}).text.strip()
    
    temp_dict = {'URL': url,
                'Title': title,
                'Date': date,
                'ID': post_id,
                'Txt': text}
    
    new_lst.append(temp_dict)



In [11]:

    
pd.DataFrame(new_lst)[0:5]









    Out[11]:







  
    
      
      Date
      ID
      Title
      Txt
      URL
    
  
  
    
      0
      Monday, 16 October 2:09 pm
      97964
      First things first: do something about that ho...
      You can wear the flashiest watch and keep your...
      https://daphnecaruanagalizia.com/2017/10/first...
    
    
      1
      Sunday, 15 October 10:07 pm
      97961
      Austria’s new chancellor is 31 – and will have...
      Exit polls show that Sebastian Kurz, 31, is ab...
      https://daphnecaruanagalizia.com/2017/10/austr...
    
    
      2
      Sunday, 15 October 7:26 pm
      97958
      The party leaders and Sunday morning
      Is it going to be a five-year electoral campai...
      https://daphnecaruanagalizia.com/2017/10/party...
    
    
      3
      Saturday, 14 October 12:52 am
      97955
      Looks like Delia is surrounding himself with l...
      The disgraceful thing is that this man has bee...
      https://daphnecaruanagalizia.com/2017/10/looks...
    
    
      4
      Saturday, 14 October 12:26 am
      97952
      Chris Cardona: a one-track mind
      “I don’t recall any other budget having given ...
      https://daphnecaruanagalizia.com/2017/10/chris...



In [12]:

    
# Putting everything together: scrape posts from all pages for relevant content

bar = progressbar.ProgressBar()

new_lst = []

# showcase for the first 9 pages / to get all pages change to range(1,1443)
for elem,i in zip(range(1,10), bar((range(1,10)))):

    page = "https://daphnecaruanagalizia.com/page/" + str(elem)
    
    response = requests.get(page)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    posts = soup.find_all("div", class_="postmaster")
    #soup.find_all('div', {'class':'postmaster'})

    for element in posts:

        url = element.a["href"]
        
        url_temp = url.replace("https://daphnecaruanagalizia.com/", "")
        date_y = url_temp[:4]
        date_m = url_temp[5:7]
        
        # dealing with error message stemming from one post on page 127
        try:
            date_t = element.find(class_="time").get_text()

        except AttributeError:
            date_t = "n.a."
            
        title = element.a["title"]
        title = title.replace("Permanent Link to ", "")
        
        post_id = element.get('data-postid')
        
        response = requests.get(url)
        abc = BeautifulSoup(response.text, 'html.parser')
        text = abc.find('div', {'class': 'entry'}).text.strip()
        text = text.replace('\n', ' ')

        temp_dict = {'Link': url,
                    'Title': title,
                    'Txt': text,
                    'Date_1': date_y,
                    'Date_2': date_m,
                    'Date_3': date_t,
                    'ID_post': post_id,
                    'ID_page': i }

        new_lst.append(temp_dict)
    

df = pd.DataFrame(new_lst)
df.to_csv('daphne.csv', sep='\t', encoding='utf-16')









    



 88% (8 of 9) |########################    | Elapsed Time: 0:02:42 ETA: 0:00:20



In [12]:

    
pd.DataFrame(new_lst)[0:5]









    Out[12]:







  
    
      
      Date_1
      Date_2
      Date_3
      ID_page
      ID_post
      Link
      Title
      Txt
    
  
  
    
      0
      2017
      10
      Monday, 16 October 2:09 pm
      1
      97964
      https://daphnecaruanagalizia.com/2017/10/first...
      First things first: do something about that ho...
      You can wear the flashiest watch and keep your...
    
    
      1
      2017
      10
      Sunday, 15 October 10:07 pm
      1
      97961
      https://daphnecaruanagalizia.com/2017/10/austr...
      Austria’s new chancellor is 31 – and will have...
      Exit polls show that Sebastian Kurz, 31, is ab...
    
    
      2
      2017
      10
      Sunday, 15 October 7:26 pm
      1
      97958
      https://daphnecaruanagalizia.com/2017/10/party...
      The party leaders and Sunday morning
      Is it going to be a five-year electoral campai...
    
    
      3
      2017
      10
      Saturday, 14 October 12:52 am
      1
      97955
      https://daphnecaruanagalizia.com/2017/10/looks...
      Looks like Delia is surrounding himself with l...
      The disgraceful thing is that this man has bee...
    
    
      4
      2017
      10
      Saturday, 14 October 12:26 am
      1
      97952
      https://daphnecaruanagalizia.com/2017/10/chris...
      Chris Cardona: a one-track mind
      “I don’t recall any other budget having given ...



In [ ]:

	Date	ID	Title	Txt	URL
0	Monday, 16 October 2:09 pm	97964	First things first: do something about that ho...	You can wear the flashiest watch and keep your...	https://daphnecaruanagalizia.com/2017/10/first...
1	Sunday, 15 October 10:07 pm	97961	Austria’s new chancellor is 31 – and will have...	Exit polls show that Sebastian Kurz, 31, is ab...	https://daphnecaruanagalizia.com/2017/10/austr...
2	Sunday, 15 October 7:26 pm	97958	The party leaders and Sunday morning	Is it going to be a five-year electoral campai...	https://daphnecaruanagalizia.com/2017/10/party...
3	Saturday, 14 October 12:52 am	97955	Looks like Delia is surrounding himself with l...	The disgraceful thing is that this man has bee...	https://daphnecaruanagalizia.com/2017/10/looks...
4	Saturday, 14 October 12:26 am	97952	Chris Cardona: a one-track mind	“I don’t recall any other budget having given ...	https://daphnecaruanagalizia.com/2017/10/chris...

	Date_1	Date_2	Date_3	ID_page	ID_post	Link	Title	Txt
0	2017	10	Monday, 16 October 2:09 pm	1	97964	https://daphnecaruanagalizia.com/2017/10/first...	First things first: do something about that ho...	You can wear the flashiest watch and keep your...
1	2017	10	Sunday, 15 October 10:07 pm	1	97961	https://daphnecaruanagalizia.com/2017/10/austr...	Austria’s new chancellor is 31 – and will have...	Exit polls show that Sebastian Kurz, 31, is ab...
2	2017	10	Sunday, 15 October 7:26 pm	1	97958	https://daphnecaruanagalizia.com/2017/10/party...	The party leaders and Sunday morning	Is it going to be a five-year electoral campai...
3	2017	10	Saturday, 14 October 12:52 am	1	97955	https://daphnecaruanagalizia.com/2017/10/looks...	Looks like Delia is surrounding himself with l...	The disgraceful thing is that this man has bee...
4	2017	10	Saturday, 14 October 12:26 am	1	97952	https://daphnecaruanagalizia.com/2017/10/chris...	Chris Cardona: a one-track mind	“I don’t recall any other budget having given ...