In [1]:
import requests
from bs4 import BeautifulSoup
import time
import random
In [2]:
time.sleep(3.1)
In [3]:
random.randint(1,5)
Out[3]:
In [4]:
for item in range(10):
    # get page here
    # get url for next page
    # wait a bit
    time.sleep(random.randint(1, 5))
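random.randint(1, 5) returns a whole number from 1 to 5 inclusive, so each pass through the loop above pauses for an integer number of seconds. For a less mechanical-looking delay, random.uniform gives a fractional pause over the same range; a minimal sketch, reusing the imports from In [1]:

# sleep for a random fractional delay between 1 and 5 seconds
time.sleep(random.uniform(1, 5))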
In [4]:
url = 'http://vaping.info/read-testimonials'
In [5]:
resp = requests.get(url)
In [6]:
resp.status_code
Out[6]:
In [7]:
html = BeautifulSoup(resp.text, 'html.parser')
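The second argument picks the parser. 'html.parser' ships with the standard library; if the third-party lxml package happens to be installed, it can be swapped in as a faster drop-in. A minimal sketch, assuming that install:

# assumes lxml is installed (pip install lxml); otherwise keep 'html.parser'
html = BeautifulSoup(resp.text, 'lxml')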
In [8]:
stories = html.find_all('div', {'class': 'pfpItem'})
In [9]:
len(stories)
Out[9]:
In [10]:
stories[2]
Out[10]:
In [11]:
stories[0].find('a', {'class': 'pfpReadMore'}).attrs['href']
Out[11]:
In [12]:
story_urls = []
for story in stories:
    story_urls.append(story.find('a', {'class': 'pfpReadMore'}).attrs['href'])
In [13]:
story_urls
Out[13]:
In [15]:
#story_urls = [story.find('a', {'class': 'pfpReadMore'}).attrs['href']
#              for story in stories]
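The commented-out list comprehension builds the same story_urls list as the append loop above in a single expression; either form is a drop-in for the other.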
In [14]:
html.find('span', {'class': 'pfpNav'}).find('a', text='Older Entries » ').attrs['href']
Out[14]:
In [15]:
html.find('span', {'class': 'pfpNav'}).find('a').text
Out[15]:
In [16]:
def addNums(n1, n2):
    answer = n1 + n2
    return answer

def subNums(n1, n2):
    return n1 - n2

def pwrNum(num, n=2):
    return num**n
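A quick sanity check of the three small functions above; pwrNum squares by default because n=2 is its default argument:

addNums(2, 3)    # 5
subNums(7, 4)    # 3
pwrNum(3)        # 9
pwrNum(2, n=10)  # 1024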
In [17]:
def get_link_page(url):
    '''
    Fetch the web page at the supplied url, check the request
    succeeded, and parse it with BeautifulSoup.
    Expects a url.
    Returns a list of story links and the url of the next page,
    or (None, None) if the page could not be fetched.
    '''
    try:
        resp = requests.get(url)
        assert resp.status_code == 200
    except Exception:
        return None, None
    html = BeautifulSoup(resp.text, 'html.parser')
    # 1. find stories on page
    stories = html.find_all('div', {'class': 'pfpItem'})
    # 2. extract urls to each story
    story_urls = []
    for story in stories:
        story_urls.append(story.find('a', {'class': 'pfpReadMore'}).attrs['href'])
    # 3. find link to Older Entries if it exists
    nav_links = html.find_all('span', {'class': 'pfpNav'})
    try:
        olink = nav_links[-1].find('a')
        older_link = olink.attrs['href']
        assert olink.text.startswith('Older')
    except Exception:
        older_link = None
    return story_urls, older_link
In [18]:
get_link_page('http://vaping.info/read-testimonials/page/11')
Out[18]:
In [19]:
url = 'http://vaping.info/read-testimonials'
links = []
while url is not None:
    print('Processing', url)
    story_links, url = get_link_page(url)
    print('finished - next url', url)
    # a failed fetch returns (None, None); skip the extend in that case
    if story_links is not None:
        links.extend(story_links)
    time.sleep(1)
In [21]:
len(links)
Out[21]:
In [22]:
counter = 10
while counter > 0:
    print(counter)
    counter = counter - 1
In [ ]:
len(links)
In [ ]:
links[0]
In [23]:
def get_page(url):
    '''
    Fetch a single testimonial page and pull out its title and text.
    Expects a url.
    Returns a dict with title, text and url, or None on failure.
    '''
    try:
        resp = requests.get(url)
        assert resp.status_code == 200
    except Exception:
        return None
    html = BeautifulSoup(resp.text, 'html.parser')
    blog = html.find('div', {'class': 'blog'})
    # guard against pages without the expected blog div
    if blog is None:
        return None
    title = blog.find('h4', {'class': 'heading'}).text
    paras = [p.text for p in blog.find_all('p')]
    return {'title': title, 'text': ' '.join(paras), 'url': url}
In [24]:
rows = []
num_of_links = len(links)
for idx, link in enumerate(links):
    print('Processing', idx, 'of', num_of_links, link)
    data = get_page(link)
    if data is not None:
        rows.append(data)
    time.sleep(0.1)
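With a few hundred links, this loop runs for a while, and a dropped connection partway through loses everything collected so far. One precaution, sketched here with the standard json module and a hypothetical checkpoint filename, is to dump rows to disk periodically from inside the loop:

import json

# inside the loop above: save partial results every 50 pages
# ('data/checkpoint.json' is an illustrative name, not from the notebook)
if idx % 50 == 0:
    with open('data/checkpoint.json', 'w') as f:
        json.dump(rows, f)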
In [25]:
import pandas as pd
In [26]:
df = pd.DataFrame.from_dict(rows)
In [27]:
df.head()
Out[27]:
In [28]:
df.to_csv('data/vaping_testimonials.csv')
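By default to_csv also writes the DataFrame's integer index as an unnamed first column; passing index=False keeps the file to just the title, text and url columns:

df.to_csv('data/vaping_testimonials.csv', index=False)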