Intro to RegEx

RegEx is a small programming language for working with text. It is hard to learn, but its powerful search abilities can sometimes save the day. It is mainly useful for:

cleaning data,
finding selectors/attributes.

This notebook introduces regex, but first provides the quotes scraper we already know in a slightly changed form, using:

select() instead of find_all,
find_next_sibling() instead of find_all,
attrs instead of providing the class directly,
returning a DataFrame instead of a list.



In [1]:

    
import numpy as np
import pandas as pd
import time

import requests
from bs4 import BeautifulSoup
from pprint import pprint 
import re #regular expressions

from textblob import TextBlob, Word
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords, wordnet



In [49]:

    
#select <=> find_all
#select_one <=> find
def my_scraper(url, timeout=10):
    """Scrape quotes, authors and author links from a page into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` can block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per quote with the columns ``quotes``, ``authors`` and
        ``author_links``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # CSS selector: every <span class="text"> nested inside a <div class="quote">.
    quotes = soup.select("div.quote span.text")
    # Same kind of lookup expressed with find_all and an attrs dictionary.
    authors = soup.find_all("small", attrs={"class": "author"})

    author_links = []
    for author in authors:
        # The tag that follows the author element holds the link to the
        # author's page; tolerate a missing sibling rather than crashing.
        link_tag = author.find_next_sibling()
        author_links.append(link_tag.get("href") if link_tag is not None else None)

    my_output = {"quotes": [quote.get_text() for quote in quotes],
                 "authors": [author.get_text() for author in authors],
                 "author_links": author_links}
    return pd.DataFrame(my_output)



In [50]:

    
# Scrape the landing page of the quotes site into a DataFrame.
my_data = my_scraper(url="http://quotes.toscrape.com/")



In [52]:

    
# Preview the first five scraped rows.
my_data.head(n=5)









    Out[52]:







  
    
      
      author_links
      authors
      quotes
    
  
  
    
      0
      /author/Albert-Einstein
      Albert Einstein
      “The world as we have created it is a process ...
    
    
      1
      /author/J-K-Rowling
      J.K. Rowling
      “It is our choices, Harry, that show what we t...
    
    
      2
      /author/Albert-Einstein
      Albert Einstein
      “There are only two ways to live your life. On...
    
    
      3
      /author/Jane-Austen
      Jane Austen
      “The person, be it gentleman or lady, who has ...
    
    
      4
      /author/Marilyn-Monroe
      Marilyn Monroe
      “Imperfection is beauty, madness is genius and...



In [98]:

    
# Sample sentence used as the search target for the regex examples.
my_text = "I am Jack, I am 37 years old, I am earning $100"



In [56]:

    
# A plain-text pattern: return every occurrence of the literal "I".
re.findall(pattern="I", string=my_text)









    Out[56]:





['I', 'I', 'I']



In [64]:

    
# [0-9] is any single digit and "+" means "one or more", so each run of
# consecutive digits is returned as one match.
re.findall(pattern="[0-9]+", string=my_text)









    Out[64]:





['37', '100']



In [100]:

    
# "." is any character (except a newline) and "*" means "zero or more".
# The first match swallows the whole sentence; because zero characters also
# satisfy the pattern, a second, empty match is reported at the very end.
re.findall(pattern=".*", string=my_text)









    Out[100]:





['I am Jack, I am 37 years old, I am earning $100', '']



In [101]:

    
# Substitute: replace every occurrence of "37" with "73".
re.sub(pattern="37", repl="73", string=my_text)









    Out[101]:





'I am Jack, I am 73 years old, I am earning $100'



In [107]:

    
# Raw string so the backslash in \s reaches the regex engine untouched; in a
# normal string literal it is an invalid escape sequence and triggers a warning
# on recent Python versions.
# Only digits preceded by whitespace are replaced, so "$100" is left alone.
re.sub(r"\s[0-9]+", " 73", my_text)









    Out[107]:





'I am Jack, I am 73 years old, I am earning $100'



In [109]:

    
# Compile the expression once so it can be reused elsewhere. The raw string
# keeps the backslash in \s from being read as an (invalid) string escape.
expres = re.compile(r"\s[0-9]+")



In [110]:

    
# A compiled pattern exposes the same search functions as methods.
expres.findall(my_text)









    Out[110]:





[' 37']



In [112]:

    
# Page whose book prices are extracted in the following examples.
url = "http://books.toscrape.com/"



In [113]:

    
# Download the page; the timeout prevents an unresponsive server from hanging
# the notebook, and raise_for_status() stops us from parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
# Parse the raw bytes directly so `page` only ever refers to the parsed document.
page = BeautifulSoup(response.content, "html.parser")



In [116]:

    
# A compiled regex can stand in for an attribute value: keep every <p> whose
# class contains "price" followed by at least one more character.
page.find_all("p", attrs={"class": re.compile("price.+")})









    Out[116]:





[<p class="price_color">£51.77</p>,
 <p class="price_color">£53.74</p>,
 <p class="price_color">£50.10</p>,
 <p class="price_color">£47.82</p>,
 <p class="price_color">£54.23</p>,
 <p class="price_color">£22.65</p>,
 <p class="price_color">£33.34</p>,
 <p class="price_color">£17.93</p>,
 <p class="price_color">£22.60</p>,
 <p class="price_color">£52.15</p>,
 <p class="price_color">£13.99</p>,
 <p class="price_color">£20.66</p>,
 <p class="price_color">£17.46</p>,
 <p class="price_color">£52.29</p>,
 <p class="price_color">£35.02</p>,
 <p class="price_color">£57.25</p>,
 <p class="price_color">£23.88</p>,
 <p class="price_color">£37.59</p>,
 <p class="price_color">£51.33</p>,
 <p class="price_color">£45.17</p>]



In [119]:

    
# Turn the parsed page back into one string so the plain `re` functions can search it.
page_str = str(page)



In [125]:

    
# Pound sign, one or more digits, a literal decimal point, one or more digits.
# Raw string: the backslash must reach the regex engine unchanged. The point is
# escaped because a bare "." (or \S) would accept any character in that position.
prices = re.findall(r"£[0-9]+\.[0-9]+", page_str)



In [127]:

    
# Drop the currency symbol from every extracted price.
[re.sub(pattern="£", repl="", string=price) for price in prices]









    Out[127]:





['51.77',
 '53.74',
 '50.10',
 '47.82',
 '54.23',
 '22.65',
 '33.34',
 '17.93',
 '22.60',
 '52.15',
 '13.99',
 '20.66',
 '17.46',
 '52.29',
 '35.02',
 '57.25',
 '23.88',
 '37.59',
 '51.33',
 '45.17']



In [156]:

    
# Group 1 captures one non-whitespace character (here the currency sign) and
# group 2 the number that follows. The replacement writes "$" and then \2,
# i.e. whatever group 2 matched.
re.sub(pattern=r"(\S)([0-9].+)", repl=r"$\2", string=prices[0])









    Out[156]:





'$51.77'



In [137]:

    
# Sample sentence containing an e-mail address for the group-substitution example.
my_t = "My name is Hrant, my e-mail is hdavtyan@aua.am"



In [153]:

    
# Group 1 captures everything up to and including "@", group 2 the rest of the
# text. The replacement keeps group 1 (\1) and swaps in a new domain.
re.sub(pattern=r"(\S+@)(.+)", repl=r"\1harvard.edu", string=my_t)









    Out[153]:





'My name is Hrant, my e-mail is hdavtyan@harvard.edu'



In [ ]:

	author_links	authors	quotes
0	/author/Albert-Einstein	Albert Einstein	“The world as we have created it is a process ...
1	/author/J-K-Rowling	J.K. Rowling	“It is our choices, Harry, that show what we t...
2	/author/Albert-Einstein	Albert Einstein	“There are only two ways to live your life. On...
3	/author/Jane-Austen	Jane Austen	“The person, be it gentleman or lady, who has ...
4	/author/Marilyn-Monroe	Marilyn Monroe	“Imperfection is beauty, madness is genius and...