In [34]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [35]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
In [36]:
#%load_ext soup
import requests
from bs4 import BeautifulSoup
from os import path
In [37]:
# Listing page that retrieve_swiss_brands() requests; the results page is chosen
# through the `page` query parameter that function sends along with each request.
url = "http://www.brandsoftheworld.com/logos/countries/ch"
In [47]:
def _fetch_brand_names(listing_url, page, timeout):
    """Return the brand names found on one results page of ``listing_url``.

    Args:
        listing_url: Address of the paginated brand listing.
        page: Zero-based page index, sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of plain ``str`` brand names (empty when the page has none).
    """
    response = requests.get(listing_url, params={'page': page}, timeout=timeout)
    soup_of_brands = BeautifulSoup(response.text, 'html.parser')
    # brand names live in <span class="title"> tags
    title_tags = soup_of_brands.find_all("span", class_="title")
    # get_text() returns a plain str; `.string` would hand back NavigableString
    # objects that keep the whole parse tree alive (and drag it into any pickle),
    # and None for tags with more than one child.
    names = (tag.get_text() for tag in title_tags)
    return [name for name in names if name]


def retrieve_swiss_brands(base_url=None, timeout=30):
    """Collect the brand names from every page of the brand listing.

    Pages are requested in order starting at page 0, and the crawl stops at the
    first page that yields no brand names. One progress line is printed per
    page that contributed names.

    Args:
        base_url: Listing address to crawl. Defaults to the notebook-level
            ``url`` variable, looked up when the function is called.
        timeout: Seconds to wait for each HTTP response; without a timeout a
            stalled connection would block the cell indefinitely.

    Returns:
        A list of brand names as plain ``str`` objects, in page order.

    Raises:
        requests.exceptions.RequestException: If a request fails or times out.
    """
    listing_url = url if base_url is None else base_url
    all_brands = []
    page = 0
    brands = _fetch_brand_names(listing_url, page, timeout)
    while brands:
        # print progress
        print("Finished page "+str(page)+" of swiss brands")
        all_brands += brands
        page += 1
        brands = _fetch_brand_names(listing_url, page, timeout)
    return all_brands
In [48]:
# Runs the scraper: needs network access and issues one HTTP request per results page.
all_swiss_brands = retrieve_swiss_brands()
In [50]:
# Number of brand names collected by the scrape.
len(all_swiss_brands)
Out[50]:
Dump the list of brands into a pickle file
In [52]:
import pickle

# open() does not create missing parent directories, so make sure `data/`
# exists; otherwise this cell fails with FileNotFoundError on a fresh checkout.
os.makedirs('data', exist_ok=True)
with open('data/all_swiss_brands.pickle', 'wb') as fp:
    pickle.dump(all_swiss_brands, fp)
Load the file into a list of brands
In [53]:
# Read the pickled brand names back from disk.
# NOTE: pickle.load can execute arbitrary code, so only load files produced by this notebook.
with open('data/all_swiss_brands.pickle', 'rb') as brands_file:
    brand_list = pickle.load(brands_file)
In [55]:
# Round-trip check: compare with len(all_swiss_brands) from before pickling.
len(brand_list)
Out[55]:
In [ ]: