RegEx (regular expressions) is a small programming language for dealing with text. It is hard to learn, but it can sometimes save the day with its smart search abilities. It is mainly useful for:
This notebook introduces regex, but first it presents the quotes scraper we already know in a slightly changed form, using:
In [1]:
import numpy as np
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import re #regular expressions
from textblob import TextBlob, Word
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords, wordnet
In [49]:
#select <=> find_all
#select_one <=> find
def my_scraper(url, timeout=10):
    """Scrape quotes, author names and author links from ``url``.

    Parameters
    ----------
    url : str
        Page to download. Quote texts are read from ``div.quote span.text``
        and author names from ``<small class="author">`` tags.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    pandas.DataFrame
        Columns ``quotes``, ``authors`` and ``author_links``. A link is the
        ``href`` of the tag that follows the author tag, or ``None`` when no
        such tag exists.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Without a timeout, requests can wait on an unresponsive server forever.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of silently parsing them.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # select() takes a CSS selector, find_all() a tag name plus attribute filters.
    quotes = soup.select("div.quote span.text")
    quotes_text = [tag.get_text() for tag in quotes]

    authors = soup.find_all("small", attrs={"class": "author"})
    authors_text = [tag.get_text() for tag in authors]

    # The link is taken from the sibling tag right after each author tag.
    author_links = []
    for tag in authors:
        link = tag.find_next_sibling()
        author_links.append(link.get("href") if link is not None else None)

    return pd.DataFrame({"quotes": quotes_text,
                         "authors": authors_text,
                         "author_links": author_links})
In [50]:
# Scrape the quotes site's landing page; my_scraper returns a DataFrame.
my_data = my_scraper("http://quotes.toscrape.com/")
In [52]:
# Preview the first few rows of the scraped table.
my_data.head()
Out[52]:
In [98]:
# Sample sentence that the regex examples operate on.
my_text = "I am Jack, I am 37 years old, I am earning $100"
In [56]:
# A plain literal pattern: collect every capital "I" (matching is case-sensitive).
re.findall("I", my_text)
Out[56]:
In [64]:
# "[0-9]+" = one or more consecutive digits; each run is returned as one match.
re.findall("[0-9]+", my_text)
Out[64]:
In [100]:
# ".*" = any character (except a newline), zero or more times. Because zero
# repetitions are allowed, the empty string at the end of the text matches too.
re.findall(".*", my_text)
Out[100]:
In [101]:
# Substitute: re.sub returns a new string and leaves my_text itself unchanged.
re.sub("37", "73", my_text)
Out[101]:
In [107]:
# Raw string: "\s" is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning on recent Python versions.
# Replaces each run of digits that is preceded by a whitespace character.
re.sub(r"\s[0-9]+", " 73", my_text)
Out[107]:
In [109]:
# Compile the expression once so it can be reused elsewhere.
# Raw string keeps "\s" from being read as an (invalid) Python string escape.
expres = re.compile(r"\s[0-9]+")
In [110]:
# Reuse the pre-compiled pattern through its own findall method.
expres.findall(my_text)
Out[110]:
In [112]:
# Page fetched for the price-extraction examples.
url = "http://books.toscrape.com/"
In [113]:
# Download the page: time out rather than hang, and fail loudly on HTTP errors.
response = requests.get(url, timeout=10)
response.raise_for_status()
# Parse the downloaded bytes into a searchable tree. `page` only ever holds
# the parsed document, never the raw bytes.
page = BeautifulSoup(response.content, "html.parser")
In [116]:
# A compiled regex works as a class filter: keep <p> tags with a class that
# contains "price" followed by at least one more character.
page.find_all(
    "p",
    class_=re.compile("price.+"),
)
Out[116]:
In [119]:
# Turn the parsed document into a plain string so the re functions can search it.
page_str = str(page)
In [125]:
# Raw string so the backslash reaches the regex engine untouched. "\." matches
# only a literal decimal point, whereas "\S" would accept any non-whitespace
# character in that position, digits included.
prices = re.findall(r"£[0-9]+\.[0-9]+", page_str)
In [127]:
# Remove the pound sign from every extracted price string.
[re.sub("£", "", price) for price in prices]
Out[127]:
In [156]:
# Group 1 is the single non-whitespace character in front of the first digit,
# group 2 is that digit plus everything after it. The replacement keeps only
# group 2 and puts a "$" in front of it.
re.sub(
    r"(\S)([0-9].+)",
    r"$\2",
    prices[0],
)
Out[156]:
In [137]:
# Sample sentence containing an e-mail address for the substitution example.
my_t = "My name is Hrant, my e-mail is hdavtyan@aua.am"
In [153]:
# Group 1 captures the address up to and including "@" and is kept via \1;
# everything after the "@" (group 2) is replaced with the new domain.
re.sub(
    r"(\S+@)(.+)",
    r"\1harvard.edu",
    my_t,
)
Out[153]:
In [ ]: