In [34]:
%load_ext autoreload
%autoreload 2 
%matplotlib inline


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [35]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [36]:
import requests
from bs4 import BeautifulSoup
from os import path

In [37]:
url = "http://www.brandsoftheworld.com/logos/countries/ch"

In [47]:
def retrieve_swiss_brands():
    arguments = {'page': 0}
    all_brands = []
    while True:
        # retrieve the html page containing the current batch of brands
        r = requests.get(url, params=arguments)
        soup_of_brands = BeautifulSoup(r.text, 'html.parser')
        # select the tags containing the brand names
        resulting_brands = soup_of_brands.find_all("span", {"class": "title"})
        # strip the tags and keep only the brand names themselves
        brands = [brand.string for brand in resulting_brands]
        # an empty page means we have gone past the last page of results
        if len(brands) == 0:
            break
        # print progress, collect the batch and move on to the next page
        print("Finished page " + str(arguments['page']) + " of swiss brands")
        all_brands += brands
        arguments['page'] += 1
    return all_brands

In [48]:
all_swiss_brands = retrieve_swiss_brands()


Finished page 0 of swiss brands
Finished page 1 of swiss brands
Finished page 2 of swiss brands
Finished page 3 of swiss brands
Finished page 4 of swiss brands
Finished page 5 of swiss brands
Finished page 6 of swiss brands
Finished page 7 of swiss brands
Finished page 8 of swiss brands
Finished page 9 of swiss brands
Finished page 10 of swiss brands
Finished page 11 of swiss brands
Finished page 12 of swiss brands
Finished page 13 of swiss brands
Finished page 14 of swiss brands
Finished page 15 of swiss brands
Finished page 16 of swiss brands
Finished page 17 of swiss brands
Finished page 18 of swiss brands
Finished page 19 of swiss brands
Finished page 20 of swiss brands
Finished page 21 of swiss brands
Finished page 22 of swiss brands
Finished page 23 of swiss brands
Finished page 24 of swiss brands
Finished page 25 of swiss brands
Finished page 26 of swiss brands
Finished page 27 of swiss brands
Finished page 28 of swiss brands
Finished page 29 of swiss brands
Finished page 30 of swiss brands
Finished page 31 of swiss brands
Finished page 32 of swiss brands
Finished page 33 of swiss brands
Finished page 34 of swiss brands
Finished page 35 of swiss brands
Finished page 36 of swiss brands
Finished page 37 of swiss brands
Finished page 38 of swiss brands
Finished page 39 of swiss brands
Finished page 40 of swiss brands
Finished page 41 of swiss brands
Finished page 42 of swiss brands
Finished page 43 of swiss brands
Finished page 44 of swiss brands
Finished page 45 of swiss brands
Finished page 46 of swiss brands
Finished page 47 of swiss brands
Finished page 48 of swiss brands
Finished page 49 of swiss brands
Finished page 50 of swiss brands
Finished page 51 of swiss brands
Finished page 52 of swiss brands
Finished page 53 of swiss brands
Finished page 54 of swiss brands
Finished page 55 of swiss brands
Finished page 56 of swiss brands
Finished page 57 of swiss brands
Finished page 58 of swiss brands

In [50]:
len(all_swiss_brands)


Out[50]:
1885
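
Paginated listings sometimes repeat an entry across pages, so it can be worth comparing the total count against the number of distinct names (an optional check, not part of the original pipeline):


In [ ]:
# total scraped entries vs. distinct brand names
print(len(all_swiss_brands), len(set(all_swiss_brands)))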

Dump the list of brands into a pickle file


In [52]:
import pickle

with open('data/all_swiss_brands.pickle', 'wb') as fp:
    pickle.dump(all_swiss_brands, fp)

Load the file into a list of brands


In [53]:
with open ('data/all_swiss_brands.pickle', 'rb') as fp:
    brand_list = pickle.load(fp)

In [55]:
len(brand_list)


Out[55]:
1885

In [ ]: