In [1]:
import csv
import os
import urllib
import urllib.request  # `import urllib` alone does not load the request submodule
import webbrowser

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from altair import *  # supplies Chart, X, Bin used in the plotting cell below

%matplotlib inline
In [2]:
# Ensure the output directory exists. exist_ok=True avoids the
# check-then-create race of `if not os.path.exists(...)` and is idempotent,
# so the cell is safe under Restart & Run All.
os.makedirs('../data/', exist_ok=True)
In [3]:
# Open the Wikipedia article in the default browser so the reader can
# inspect the table we are about to scrape (side effect only; the scrape
# below fetches the page itself).
url = 'https://en.wikipedia.org/wiki/List_of_most_popular_websites'
webbrowser.open_new_tab(url)
Out[3]:
wiki_popular_sites.csv
In [4]:
# Scrape the "most popular websites" table from Wikipedia into a CSV file.
# HTML primer:
#   <th> defines a header cell, <tr> a row, <td> a data cell.
soup = BeautifulSoup(urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_most_popular_websites'), "lxml")

# NOTE(review): this collects every <th> on the page, not just the target
# table's header row -- works while the article has a single table; verify
# if the page layout changes.
headers = [header.text for header in soup.findAll('th')]

# Scope the cell search to the row itself with find_all('td'). The original
# find_all_next('td', limit=6) walks the whole document after the row and can
# bleed cells from the following row whenever a row has fewer than 6 cells.
rows = []
for row in soup.find_all('tr')[1:]:
    rows.append([val.text for val in row.find_all('td')])

# newline='' is required by the csv module so rows are not double-spaced on
# Windows; utf-8 keeps non-ASCII site names intact.
with open('../data/wiki_popular_sites.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    # Header-only or empty <tr> rows produce empty lists; skip them.
    writer.writerows(row for row in rows if row)
Reading wiki_popular_sites.csv with pandas
In [5]:
# Load the scraped table back into a DataFrame and preview the first rows.
csv_path = '../data/wiki_popular_sites.csv'
wiki_popular_sites = pd.read_csv(csv_path)
wiki_popular_sites.head(10)
Out[5]:
In [6]:
wiki_popular_sites.columns
Out[6]:
In [7]:
# Replace the scraped header labels with clean, consistent names,
# then echo the columns to confirm the assignment took effect.
clean_labels = [
    "Site",
    "Domain",
    "Alexa top 100 websites",
    "SimilarWeb top 100 websites",
    "Type",
    "Principal country",
]
wiki_popular_sites.columns = clean_labels
wiki_popular_sites.columns
Out[7]:
In [8]:
wiki_popular_sites.describe()
Out[8]:
In [9]:
wiki_popular_sites['Type'].unique()
Out[9]:
In [10]:
wiki_popular_sites['Principal country'].unique()
Out[10]:
Using the pandas `groupby` function to group websites by Type and Principal country
In [11]:
grouped_sites = wiki_popular_sites.groupby(['Type','Principal country']).count()
In [12]:
grouped_sites.iloc[:,0:1]
Out[12]:
In [13]:
# Count sites per country by pivoting on 'Principal country' with the
# 'Type' column as the counted values.
# NOTE(review): pivot_table without an `index` argument -- the result shape
# (Series vs. one-row DataFrame) depends on the pandas version; the later
# .value_counts() call suggests it was a Series when written. Confirm.
by_country = wiki_popular_sites.pivot_table('Type', aggfunc='count', columns='Principal country')
by_country.head(10)
Out[13]:
In [14]:
# Histogram of how many popular sites each country hosts.
fig, ax = plt.subplots(1, figsize=(12, 8))
fig.subplots_adjust(hspace=0.4)
# Draw on the Axes we created: the original created `ax` and then ignored
# it, plotting through the implicit pyplot state machine instead.
ax.hist(by_country, bins=50, color='blue')
ax.set_xticks(np.arange(0, by_country.max() + 1, 2))
ax.set_yticks(np.arange(0, by_country.value_counts().max() + 10, 5))
ax.set_xlabel('Count')
ax.set_ylabel('Frequency')
ax.set_title('Histogram of country frequency in \n List of Popular Websites (Wikipedia)')
plt.show()
In [15]:
by_country_df = pd.DataFrame(by_country)
In [16]:
# Altair (v1-style API) histogram: bin the per-country counts ('Type:Q')
# into up to 50 bins and plot how many countries fall in each bin.
# NOTE(review): Chart/X/Bin come from `from altair import *`; 'count(*)'
# is Altair v1 syntax -- newer Altair versions use 'count()'. Confirm the
# installed Altair version before changing this cell.
Chart(by_country_df).mark_bar().encode(
x=X('Type:Q',
bin=Bin(
maxbins=50.0,
),
),
y='count(*):Q',
)
In this tutorial, we learned how to examine the HTML structure of a webpage and use the Beautiful Soup
module to parse a single table on a webpage into a .csv file. After creating a .csv of the webpage table, we analyzed the .csv using the pandas
module. Lastly, we created visualizations with plotting libraries.
In [17]:
# Record the environment so the analysis is reproducible later.
import sys
import matplotlib
import altair
import bs4
print("System and module version information: \n")
print('Python version:', sys.version_info)
# urllib is part of the standard library and is versioned with Python
# itself; its __version__ attribute was deprecated and later removed, so
# accessing it directly raises AttributeError on modern interpreters.
print('urllib.request version:', getattr(urllib.request, '__version__', 'stdlib (ships with Python)'))
print('numpy version:',np.__version__)
print('pandas version:', pd.__version__)
print('matplotlib version:',matplotlib.__version__)
print('altair version:',altair.__version__)
print('Beautiful Soup version:', bs4.__version__)