In [ ]:
import re
import csv
import json
import requests
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup
from selenium import webdriver
In [ ]:
# Helper shared by the bs4 and lxml approaches: download a page and return it
# parsed both as a BeautifulSoup object and as an lxml document tree.
def get_source(url):
    response = requests.get(url)
    page = response.content
    soup = BeautifulSoup(page, "html.parser")
    tree = html.document_fromstring(page)
    return soup, tree

# Helpers for the bs4 approach only.
def get_divs(page, div_class):
    # All <div> containers with the given class (one per video entry).
    divs = page.find_all("div", class_=div_class)
    return divs

def get_text(div, tag, class_name=True, regex=""):
    # Text of the first matching tag; class_name=True matches any class value,
    # and regex optionally filters on the tag's text.
    text = div.find(tag, class_=class_name, text=re.compile(regex)).get_text()
    return text

def get_url(div, tag, class_name=True):
    # href attribute of the first matching tag.
    text = div.find(tag, class_=class_name).get("href")
    return text
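To see what these helpers return without a network request, here is a minimal sketch that runs them against a small inline HTML snippet; the snippet and its class names are invented purely for illustration.
In [ ]:
# Offline sketch of the bs4 helpers above; the HTML and class names are made up.
sample_html = """
<div class="video-card">
  <a class="video-link" href="/watch?v=abc123">Sample title</a>
  <span class="meta">12,345 views</span>
</div>
"""
sample_soup = BeautifulSoup(sample_html, "html.parser")
sample_divs = get_divs(page=sample_soup, div_class="video-card")
print(get_text(div=sample_divs[0], tag="a"))                    # Sample title
print(get_url(div=sample_divs[0], tag="a"))                     # /watch?v=abc123
print(get_text(div=sample_divs[0], tag="span", regex="views"))  # 12,345 views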
In [ ]:
url="https://www.youtube.com/feed/trending"
soup, tree = get_source(url)
In [ ]:
# bs4 approach: each trending video sits in a div with class "yt-lockup-content".
div_class = "yt-lockup-content"
divs = get_divs(page=soup, div_class=div_class)
data_bs4 = []
for i in divs:
    data_bs4.append({
        "title": get_text(div=i, tag="a"),
        "username": get_text(div=i, tag="a", class_name="yt-uix-sessionlink spf-link "),
        "link": get_url(div=i, tag="a"),
        "duration": get_text(div=i, tag="span", class_name="accessible-description"),
        "views": get_text(div=i, tag="li", class_name=False, regex="view")
    })
df_bs4 = pd.DataFrame(data_bs4)
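If YouTube changes its markup and a tag is missing, div.find(...) returns None and .get_text() raises an AttributeError that aborts the whole loop. A defensive variant of the text helper, sketched below with my own naming, returns None for that field instead.
In [ ]:
# Defensive variant of get_text (hypothetical helper, not part of the original code):
# returns None instead of raising when no matching tag is found.
def get_text_safe(div, tag, class_name=True, regex=""):
    node = div.find(tag, class_=class_name, text=re.compile(regex))
    return node.get_text() if node is not None else None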
In [ ]:
# lxml approach: collect each field with an XPath query over the whole tree.
data_xml = {
    "title": tree.xpath("//h3/a/@title"),
    "username": tree.xpath("//a[@class='yt-uix-sessionlink spf-link ']/text()"),
    "link": tree.xpath("//h3/a/@href"),
    "duration": tree.xpath("//h3/span/text()"),
    "views": tree.xpath("//li[contains(text(), 'views')]/text()")
}
df_xml = pd.DataFrame(data_xml)
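One caveat with building the frame from parallel XPath result lists: pd.DataFrame raises a ValueError if the lists come back with different lengths, which happens whenever one selector misses an element that the others find. Printing the list lengths, as sketched below, is a quick way to diagnose that when the constructor fails.
In [ ]:
# Sanity check: every XPath result list should have one entry per video.
lengths = {key: len(values) for key, values in data_xml.items()}
print(lengths)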
In [ ]:
# Selenium approach: drive a real Chrome instance so the JavaScript-rendered
# page is loaded before querying it.
# (Note: Selenium 4 removed the find_elements_by_* helpers in favour of
# find_elements(By.XPATH, ...); the calls below target Selenium 3.)
browser = webdriver.Chrome()
browser.get(url)
data_sel = {
    "title": [i.get_attribute("title") for i in browser.find_elements_by_xpath("//h3/a")],
    "username": [i.text for i in browser.find_elements_by_xpath("//yt-formatted-string[@id='byline']")],
    "link": [i.get_attribute("href") for i in browser.find_elements_by_xpath("//h3/a")],
    "duration": [i.text for i in browser.find_elements_by_xpath("//span[@class='style-scope ytd-thumbnail-overlay-time-status-renderer']")],
    "views": [i.text for i in browser.find_elements_by_xpath("//span[@class='style-scope ytd-video-meta-block'][contains(text(), 'views')]")]
}
browser.quit()  # quit() ends the driver session; close() only closes the window
df_sel = pd.DataFrame(data_sel)
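The Trending page renders its content with JavaScript, so on a slow connection the element queries above can run before the videos appear. A common remedy is an explicit wait; the sketch below (the 10-second timeout is an arbitrary choice) waits for at least one title link before scraping, and uses the find_elements(By.XPATH, ...) form that works on both old and new Selenium releases.
In [ ]:
# Optional: wait for the page to finish rendering before querying elements.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get(url)
WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, "//h3/a"))
)
titles = [i.get_attribute("title") for i in browser.find_elements(By.XPATH, "//h3/a")]
browser.quit()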
In [ ]:
# Cleaning helpers: make relative links absolute and strip the label text from
# the duration and view-count strings. Each function returns its input when
# there is nothing to replace, instead of falling through to None.
def absolute_links(url):
    start = "https://www.youtube.com"
    if start not in url:
        return start + url
    return url

def clean_duration(dur):
    rep = "- Duration:"
    if rep in dur:
        return dur.replace(rep, "").strip()
    return dur.strip()

def clean_views(view):
    rep = "views"
    if rep in view:
        return view.replace(rep, "").strip()
    return view.strip()
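A quick check of the cleaners on made-up sample values (the strings below are illustrative, not actual scraped output):
In [ ]:
# Illustrative inputs only; the real scraped strings vary between the three approaches.
print(absolute_links("/watch?v=abc123"))     # https://www.youtube.com/watch?v=abc123
print(clean_duration(" - Duration: 4:13."))  # 4:13.
print(clean_views("1,234,567 views"))        # 1,234,567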
In [ ]:
dataframes = [df_bs4, df_xml, df_sel]
for i in dataframes:
    i["link"] = i.link.apply(absolute_links)
    i["duration"] = i.duration.apply(clean_duration)
    i["views"] = i.views.apply(clean_views)

# Write each DataFrame to 0.csv/0.json, 1.csv/1.json and 2.csv/2.json.
for index, df in enumerate(dataframes):
    df.to_csv(str(index) + ".csv")
    df.to_json(str(index) + ".json")
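The default to_csv and to_json settings write the DataFrame index as an extra CSV column and produce column-oriented JSON. If row-oriented output is preferred, a common alternative (a matter of preference, not something the rest of the notebook relies on; the "_records" filenames are my own) is sketched below.
In [ ]:
# Optional: drop the index column and write JSON as a list of row objects.
for index, df in enumerate(dataframes):
    df.to_csv(str(index) + "_records.csv", index=False)
    df.to_json(str(index) + "_records.json", orient="records")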