In [ ]:
import re
import csv
import json
import requests
import pandas as pd
from lxml import html
from bs4 import BeautifulSoup
from selenium import webdriver
In [ ]:
# Helper shared by the bs4 and lxml approaches: download a page and return it
# parsed both as a BeautifulSoup object and as an lxml document tree.
def get_source(url):
    response = requests.get(url)
    page = response.content
    soup = BeautifulSoup(page, "html.parser")
    tree = html.document_fromstring(page)
    return soup, tree

# Helpers for the bs4 approach only.
def get_divs(page, div_class):
    # All <div> containers with the given class (one per video entry).
    divs = page.find_all("div", class_=div_class)
    return divs

def get_text(div, tag, class_name=True, regex=""):
    # Text of the first matching tag; class_name=True matches any class value,
    # and regex optionally filters on the tag's text.
    text = div.find(tag, class_=class_name, text=re.compile(regex)).get_text()
    return text

def get_url(div, tag, class_name=True):
    # href attribute of the first matching tag.
    text = div.find(tag, class_=class_name).get("href")
    return text
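To see what these helpers return without a network request, here is a minimal sketch that runs them against a small inline HTML snippet; the snippet and its class names are invented purely for illustration.
In [ ]:
# Offline sketch of the bs4 helpers above; the HTML and class names are made up.
sample_html = """
<div class="video-card">
  <a class="video-link" href="/watch?v=abc123">Sample title</a>
  <span class="meta">12,345 views</span>
</div>
"""
sample_soup = BeautifulSoup(sample_html, "html.parser")
sample_divs = get_divs(page=sample_soup, div_class="video-card")
print(get_text(div=sample_divs[0], tag="a"))                    # Sample title
print(get_url(div=sample_divs[0], tag="a"))                     # /watch?v=abc123
print(get_text(div=sample_divs[0], tag="span", regex="views"))  # 12,345 views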
In [ ]:
url="https://www.youtube.com/feed/trending"
soup, tree = get_source(url)
In [ ]:
# bs4 approach: each trending video sits in a div with class "yt-lockup-content".
div_class = "yt-lockup-content"
divs = get_divs(page=soup, div_class=div_class)
data_bs4 = []
for i in divs:
    data_bs4.append({
        "title": get_text(div=i, tag="a"),
        "username": get_text(div=i, tag="a", class_name="yt-uix-sessionlink spf-link "),
        "link": get_url(div=i, tag="a"),
        "duration": get_text(div=i, tag="span", class_name="accessible-description"),
        "views": get_text(div=i, tag="li", class_name=False, regex="view")
    })
df_bs4 = pd.DataFrame(data_bs4)
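If YouTube changes its markup and a tag is missing, div.find(...) returns None and .get_text() raises an AttributeError that aborts the whole loop. A defensive variant of the text helper, sketched below with my own naming, returns None for that field instead.
In [ ]:
# Defensive variant of get_text (hypothetical helper, not part of the original code):
# returns None instead of raising when no matching tag is found.
def get_text_safe(div, tag, class_name=True, regex=""):
    node = div.find(tag, class_=class_name, text=re.compile(regex))
    return node.get_text() if node is not None else None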
In [ ]:
# lxml approach: collect each field with an XPath query over the whole tree.
data_xml = {
    "title": tree.xpath("//h3/a/@title"),
    "username": tree.xpath("//a[@class='yt-uix-sessionlink spf-link ']/text()"),
    "link": tree.xpath("//h3/a/@href"),
    "duration": tree.xpath("//h3/span/text()"),
    "views": tree.xpath("//li[contains(text(), 'views')]/text()")
}
df_xml = pd.DataFrame(data_xml)
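One caveat with building the frame from parallel XPath result lists: pd.DataFrame raises a ValueError if the lists come back with different lengths, which happens whenever one selector misses an element that the others find. Printing the list lengths, as sketched below, is a quick way to diagnose that when the constructor fails.
In [ ]:
# Sanity check: every XPath result list should have one entry per video.
lengths = {key: len(values) for key, values in data_xml.items()}
print(lengths)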
In [ ]:
# Selenium approach: drive a real Chrome instance so the JavaScript-rendered
# page is loaded before querying it.
# (Note: Selenium 4 removed the find_elements_by_* helpers in favour of
# find_elements(By.XPATH, ...); the calls below target Selenium 3.)
browser = webdriver.Chrome()
browser.get(url)
data_sel = {
    "title": [i.get_attribute("title") for i in browser.find_elements_by_xpath("//h3/a")],
    "username": [i.text for i in browser.find_elements_by_xpath("//yt-formatted-string[@id='byline']")],
    "link": [i.get_attribute("href") for i in browser.find_elements_by_xpath("//h3/a")],
    "duration": [i.text for i in browser.find_elements_by_xpath("//span[@class='style-scope ytd-thumbnail-overlay-time-status-renderer']")],
    "views": [i.text for i in browser.find_elements_by_xpath("//span[@class='style-scope ytd-video-meta-block'][contains(text(), 'views')]")]
}
browser.quit()  # quit() ends the driver session; close() only closes the window
df_sel = pd.DataFrame(data_sel)
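The Trending page renders its content with JavaScript, so on a slow connection the element queries above can run before the videos appear. A common remedy is an explicit wait; the sketch below (the 10-second timeout is an arbitrary choice) waits for at least one title link before scraping, and uses the find_elements(By.XPATH, ...) form that works on both old and new Selenium releases.
In [ ]:
# Optional: wait for the page to finish rendering before querying elements.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get(url)
WebDriverWait(browser, 10).until(
    EC.presence_of_all_elements_located((By.XPATH, "//h3/a"))
)
titles = [i.get_attribute("title") for i in browser.find_elements(By.XPATH, "//h3/a")]
browser.quit()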
In [ ]:
# Cleaning helpers: make relative links absolute and strip the label text from
# the duration and view-count strings. Each function returns its input when
# there is nothing to replace, instead of falling through to None.
def absolute_links(url):
    start = "https://www.youtube.com"
    if start not in url:
        return start + url
    return url

def clean_duration(dur):
    rep = "- Duration:"
    if rep in dur:
        return dur.replace(rep, "").strip()
    return dur.strip()

def clean_views(view):
    rep = "views"
    if rep in view:
        return view.replace(rep, "").strip()
    return view.strip()
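A quick check of the cleaners on made-up sample values (the strings below are illustrative, not actual scraped output):
In [ ]:
# Illustrative inputs only; the real scraped strings vary between the three approaches.
print(absolute_links("/watch?v=abc123"))     # https://www.youtube.com/watch?v=abc123
print(clean_duration(" - Duration: 4:13."))  # 4:13.
print(clean_views("1,234,567 views"))        # 1,234,567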
In [ ]:
dataframes = [df_bs4, df_xml, df_sel]
for i in dataframes:
    i["link"] = i.link.apply(absolute_links)
    i["duration"] = i.duration.apply(clean_duration)
    i["views"] = i.views.apply(clean_views)

# Write each DataFrame to 0.csv/0.json, 1.csv/1.json and 2.csv/2.json.
for index, df in enumerate(dataframes):
    df.to_csv(str(index) + ".csv")
    df.to_json(str(index) + ".json")
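The default to_csv and to_json settings write the DataFrame index as an extra CSV column and produce column-oriented JSON. If row-oriented output is preferred, a common alternative (a matter of preference, not something the rest of the notebook relies on; the "_records" filenames are my own) is sketched below.
In [ ]:
# Optional: drop the index column and write JSON as a list of row objects.
for index, df in enumerate(dataframes):
    df.to_csv(str(index) + "_records.csv", index=False)
    df.to_json(str(index) + "_records.json", orient="records")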