In [2]:
! pip install beautifulsoup4
Requirement already satisfied: beautifulsoup4 in c:\users\dell\anaconda3\lib\site-packages
In [3]:
from bs4 import BeautifulSoup
In [7]:
! pip install urllib3
Requirement already satisfied: urllib3 in c:\users\dell\anaconda3\lib\site-packages
In [9]:
import urllib.request
In [10]:
# Download the Yelp search-results page for restaurants in Houston, TX.
# The response is opened in a `with` block so the connection is closed as soon
# as the body has been read, and a timeout keeps the cell from hanging forever
# on a stalled connection. `r` holds the raw response body.
with urllib.request.urlopen(
    'https://www.yelp.com/search?find_desc=Restaurants&find_loc=Houston%2C+TX',
    timeout=30,
) as response:
    r = response.read()
In [11]:
# Parse the downloaded page with Beautiful Soup (lxml parser), then display
# the type of the resulting object.
soup = BeautifulSoup(markup=r, features="lxml")
type(soup)
Out[11]:
bs4.BeautifulSoup
In [12]:
# Number of characters in the pretty-printed document.
len(soup.prettify())
Out[12]:
299419
In [13]:
# Keep the pretty-printed HTML as a plain string for offset-based inspection.
a = soup.prettify()
In [14]:
# Offset of the first 'class="snippet"' attribute in the pretty-printed HTML
# (str.find returns -1 when the text is absent).
a.find('class="snippet"')
Out[14]:
170264
In [15]:
# Display a fixed 30,000-character window of the pretty-printed HTML.
# NOTE(review): the offsets are hard-coded; in the recorded run the
# 'class="snippet"' search returned 170264, which lies outside this window -
# confirm this is the range that was meant to be inspected.
a[115000:145000]
Out[15]:
' </label>\n </li>\n </ul>\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="HappyHour"/>\n <span class="filter-label">\n Happy Hour\n </span>\n </label>\n </li>\n </ul>\n </div>\n </div>\n </div>\n <div class="js-toggle-list-item toggle-list-item">\n <h4 class="arrange arrange--middle toggle-list-link js-toggle-list-trigger u-space-b1">\n <span class="arrange_unit u-cursor-pointer">\n <span class="toggle-link-arrow-wrap u-inline-block">\n <span class="toggle-link-arrow" role="presentation">\n </span>\n </span>\n </span>\n <span class="arrange_unit arrange_unit--fill">\n <span class="u-pseudo-link u-decoration-none">\n Meals Served\n </span>\n </span>\n </h4>\n <div class="js-toggle-list-content toggle-list-content">\n <div class="filter-group">\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="GoodForMeal.breakfast"/>\n <span class="filter-label">\n Breakfast\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="GoodForMeal.brunch"/>\n <span class="filter-label">\n Brunch\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="GoodForMeal.lunch"/>\n <span class="filter-label">\n Lunch\n </span>\n </label>\n </li>\n </ul>\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="GoodForMeal.dinner"/>\n <span class="filter-label">\n Dinner\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="GoodForMeal.dessert"/>\n <span class="filter-label">\n Dessert\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="GoodForMeal.latenight"/>\n <span class="filter-label">\n Late Night\n </span>\n </label>\n </li>\n </ul>\n </div>\n 
</div>\n </div>\n <div class="js-toggle-list-item toggle-list-item">\n <h4 class="arrange arrange--middle toggle-list-link js-toggle-list-trigger u-space-b1">\n <span class="arrange_unit u-cursor-pointer">\n <span class="toggle-link-arrow-wrap u-inline-block">\n <span class="toggle-link-arrow" role="presentation">\n </span>\n </span>\n </span>\n <span class="arrange_unit arrange_unit--fill">\n <span class="u-pseudo-link u-decoration-none">\n Music\n </span>\n </span>\n </h4>\n <div class="js-toggle-list-content toggle-list-content">\n <div class="filter-group">\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="Music.dj"/>\n <span class="filter-label">\n DJ\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="Music.jukebox"/>\n <span class="filter-label">\n Juke Box\n </span>\n </label>\n </li>\n </ul>\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="Music.karaoke"/>\n <span class="filter-label">\n Karaoke\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="Music.live"/>\n <span class="filter-label">\n Live\n </span>\n </label>\n </li>\n </ul>\n </div>\n </div>\n </div>\n <div class="js-toggle-list-item toggle-list-item">\n <h4 class="arrange arrange--middle toggle-list-link js-toggle-list-trigger u-space-b1">\n <span class="arrange_unit u-cursor-pointer">\n <span class="toggle-link-arrow-wrap u-inline-block">\n <span class="toggle-link-arrow" role="presentation">\n </span>\n </span>\n </span>\n <span class="arrange_unit arrange_unit--fill">\n <span class="u-pseudo-link u-decoration-none">\n Parking\n </span>\n </span>\n </h4>\n <div class="js-toggle-list-content toggle-list-content">\n <div class="filter-group">\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" 
type="checkbox" value="BusinessParking.street"/>\n <span class="filter-label">\n Street\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="BusinessParking.garage"/>\n <span class="filter-label">\n Garage\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="BusinessParking.valet"/>\n <span class="filter-label">\n Valet\n </span>\n </label>\n </li>\n </ul>\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="BusinessParking.lot"/>\n <span class="filter-label">\n Private Lot\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="BusinessParking.validated"/>\n <span class="filter-label">\n Validated\n </span>\n </label>\n </li>\n </ul>\n </div>\n </div>\n </div>\n <div class="js-toggle-list-item toggle-list-item">\n <h4 class="arrange arrange--middle toggle-list-link js-toggle-list-trigger u-space-b1">\n <span class="arrange_unit u-cursor-pointer">\n <span class="toggle-link-arrow-wrap u-inline-block">\n <span class="toggle-link-arrow" role="presentation">\n </span>\n </span>\n </span>\n <span class="arrange_unit arrange_unit--fill">\n <span class="u-pseudo-link u-decoration-none">\n Wi-Fi\n </span>\n </span>\n </h4>\n <div class="js-toggle-list-content toggle-list-content">\n <div class="filter-group">\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="WiFi.free"/>\n <span class="filter-label">\n Free\n </span>\n </label>\n </li>\n </ul>\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="WiFi.paid"/>\n <span class="filter-label">\n Paid\n </span>\n </label>\n </li>\n </ul>\n </div>\n </div>\n </div>\n <div class="js-toggle-list-item toggle-list-item">\n <h4 class="arrange 
arrange--middle toggle-list-link js-toggle-list-trigger u-space-b1">\n <span class="arrange_unit u-cursor-pointer">\n <span class="toggle-link-arrow-wrap u-inline-block">\n <span class="toggle-link-arrow" role="presentation">\n </span>\n </span>\n </span>\n <span class="arrange_unit arrange_unit--fill">\n <span class="u-pseudo-link u-decoration-none">\n Smoking\n </span>\n </span>\n </h4>\n <div class="js-toggle-list-content toggle-list-content">\n <div class="filter-group">\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="Smoking.no"/>\n <span class="filter-label">\n No\n </span>\n </label>\n </li>\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="Smoking.outdoor"/>\n <span class="filter-label">\n Outdoor Area / Patio Only\n </span>\n </label>\n </li>\n </ul>\n <ul class="column">\n <li>\n <label class="feature radio-check">\n <input name="feature" type="checkbox" value="Smoking.yes"/>\n <span class="filter-label">\n Yes\n </span>\n </label>\n </li>\n </ul>\n </div>\n </div>\n </div>\n </ul>\n </div>\n <div class="filter-set category-filters">\n <h4>\n Category\n </h4>\n <ul class="main">\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="mexican"/>\n <span>\n Mexican\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="hotdogs"/>\n <span>\n Fast Food\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="sandwiches"/>\n <span>\n Sandwiches\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="burgers"/>\n <span>\n Burgers\n </span>\n </label>\n </li>\n </ul>\n <a class="more-link" href="javascript:;">\n More Categories\n </a>\n <div class="more category-more filter-group arrange arrange--30 arrange--equal">\n <ul 
class="arrange_unit">\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="restaurants"/>\n <span>\n Restaurants\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="mexican"/>\n <span>\n Mexican\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="hotdogs"/>\n <span>\n Fast Food\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="sandwiches"/>\n <span>\n Sandwiches\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="food"/>\n <span>\n Food\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="burgers"/>\n <span>\n Burgers\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="pizza"/>\n <span>\n Pizza\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="tradamerican"/>\n <span>\n American (Traditional)\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="nightlife"/>\n <span>\n Nightlife\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="chinese"/>\n <span>\n Chinese\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="chicken_wings"/>\n <span>\n Chicken Wings\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="bars"/>\n <span>\n Bars\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" 
value="seafood"/>\n <span>\n Seafood\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="breakfast_brunch"/>\n <span>\n Breakfast & Brunch\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="italian"/>\n <span>\n Italian\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="tex-mex"/>\n <span>\n Tex-Mex\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="newamerican"/>\n <span>\n American (New)\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="delis"/>\n <span>\n Delis\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="bbq"/>\n <span>\n Barbeque\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="vietnamese"/>\n <span>\n Vietnamese\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="foodtrucks"/>\n <span>\n Food Trucks\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="cajun"/>\n <span>\n Cajun/Creole\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="salad"/>\n <span>\n Salad\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="cafes"/>\n <span>\n Cafes\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="latin"/>\n <span>\n Latin American\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input 
name="category" type="checkbox" value="japanese"/>\n <span>\n Japanese\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="sportsbars"/>\n <span>\n Sports Bars\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="sushi"/>\n <span>\n Sushi Bars\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="bakeries"/>\n <span>\n Bakeries\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="soup"/>\n <span>\n Soup\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="indpak"/>\n <span>\n Indian\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="southern"/>\n <span>\n Southern\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="steak"/>\n <span>\n Steakhouses\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="mediterranean"/>\n <span>\n Mediterranean\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="asianfusion"/>\n <span>\n Asian Fusion\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="buffets"/>\n <span>\n Buffets\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="diners"/>\n <span>\n Diners\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="hotdog"/>\n <span>\n Hot Dogs\n </span>\n </label>\n </li>\n <li>\n <label class="category 
radio-check">\n <input name="category" type="checkbox" value="eventservices"/>\n <span>\n Event Planning & Services\n </span>\n </label>\n </li>\n </ul>\n <ul class="arrange_unit">\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="desserts"/>\n <span>\n Desserts\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="coffee"/>\n <span>\n Coffee & Tea\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="foodstands"/>\n <span>\n Food Stands\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="chickenshop"/>\n <span>\n Chicken Shop\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="tacos"/>\n <span>\n Tacos\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="halal"/>\n <span>\n Halal\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="catering"/>\n <span>\n Caterers\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="wine_bars"/>\n <span>\n Wine Bars\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="thai"/>\n <span>\n Thai\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="vegetarian"/>\n <span>\n Vegetarian\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="juicebars"/>\n <span>\n Juice Bars & Smoothies\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" 
value="soulfood"/>\n <span>\n Soul Food\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="gourmet"/>\n <span>\n Specialty Food\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="gluten_free"/>\n <span>\n Gluten-Free\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="mideastern"/>\n <span>\n Middle Eastern\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="icecream"/>\n <span>\n Ice Cream & Frozen Yogurt\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="salvadoran"/>\n <span>\n Salvadoran\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="greek"/>\n <span>\n Greek\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="cocktailbars"/>\n <span>\n Cocktail Bars\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="caribbean"/>\n <span>\n Caribbean\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="pakistani"/>\n <span>\n Pakistani\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="noodles"/>\n <span>\n Noodles\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="donuts"/>\n <span>\n Donuts\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="korean"/>\n <span>\n Korean\n </span>\n </label>\n </li>\n <li>\n <label class="category 
radio-check">\n <input name="category" type="checkbox" value="arts"/>\n <span>\n Arts & Entertainment\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="french"/>\n <span>\n French\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="gastropubs"/>\n <span>\n Gastropubs\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="comfortfood"/>\n <span>\n Comfort Food\n </span>\n </label>\n </li>\n <li>\n <label class="category radio-check">\n <input name="category" type="checkbox" value="lounges"/>\n <span>\n Lounges\n </span>\n <'
In [17]:
# Let's try to find the list of phone numbers. We note both the HTML tag and its class.
# We use the find_all function to collect every matching <span> element.
letters = soup.find_all("span", class_="biz-phone")
# NOTE(review): the slice starts at index 1, so the first match (index 0) is
# not displayed - confirm that skipping it is intentional.
letters[1:1000]
Out[17]:
[<span class="biz-phone">
(713) 842-7114
</span>, <span class="biz-phone">
(832) 831-9453
</span>, <span class="biz-phone">
(832) 487-9412
</span>, <span class="biz-phone">
(281) 501-3780
</span>, <span class="biz-phone">
(281) 888-1929
</span>, <span class="biz-phone">
(832) 203-5180
</span>, <span class="biz-phone">
(832) 834-4417
</span>, <span class="biz-phone">
(713) 861-6143
</span>, <span class="biz-phone">
(713) 225-8079
</span>, <span class="biz-phone">
(346) 319-5919
</span>]
In [ ]:
Content source: decisionstats/pythonfordatascience
Similar notebooks: