In [1]:
import os
import urllib.request
import webbrowser
import pandas as pd
from bs4 import BeautifulSoup
In [2]:
url = 'http://grad-schools.usnews.rankingsandreviews.com/best-graduate-schools/top-humanities-schools/sociology-rankings/page+1'
webbrowser.open_new_tab(url)
Out[2]:
In [3]:
def extract_page_data(table_rows):
    """
    Extract and return the desired information from the <td> elements
    within the given table rows.
    """
    # <tr> tag: defines a row in a table
    # <td> tag: defines a cell in a table
    rows = []
    for row in table_rows:
        rows.append([val.text for val in row.find_all('td')])
    # skip the first row, which holds the <th> column headers rather than data
    return rows[1:]
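As a quick sanity check, the function can be run on a small hand-written table; the HTML string below is made up purely for illustration, but it mirrors the <tr>/<td> structure of the ranking pages:

sample_html = """
<table>
  <tr><th>Rank</th><th>School</th><th>Score</th></tr>
  <tr><td>#1</td><td>Example University</td><td>4.8</td></tr>
</table>
"""
sample_soup = BeautifulSoup(sample_html, "lxml")
extract_page_data(sample_soup.find_all('tr'))
# -> [['#1', 'Example University', '4.8']]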
In [4]:
us_news_rankings = []
errors_list = []
In [5]:
url_template = 'http://grad-schools.usnews.rankingsandreviews.com/best-graduate-schools/top-humanities-schools/sociology-rankings/page+{page_number}'
# for each page from 1 to (and including) 4
for page_number in range(1, 5):
    # Use try/except block to catch and inspect any urls that cause an error
    try:
        # build the webpage url for this page
        url = url_template.format(page_number=page_number)
        # get the html
        html = urllib.request.urlopen(url)
        # create the BeautifulSoup object
        soup = BeautifulSoup(html, "lxml")
        # get the column headers
        headers = [header.text for header in soup.find_all('th')]
        # start extracting rows
        table_rows = soup.find_all('tr')
        school_data = extract_page_data(table_rows)
        # create the dataframe for the current page
        # (the columns are renamed after all pages are concatenated)
        school_df = pd.DataFrame(school_data)
        # append the current dataframe to the list of dataframes
        us_news_rankings.append(school_df)
    except Exception as e:
        # Store the url and the error it causes in a list
        error = [url, e]
        # then append it to the list of errors
        errors_list.append(error)
In [6]:
print(len(errors_list))
errors_list
Out[6]:
In [7]:
us_news_rankings
Out[7]:
In [8]:
us_news_df_raw = pd.concat(us_news_rankings, axis=0)
column_headers = ["rank", "school", "score"]
us_news_df_raw.columns = column_headers
In [9]:
us_news_df_raw.head(10)
Out[9]:
In [10]:
if not os.path.exists('../data/raw_data'):
    os.makedirs('../data/raw_data')
if not os.path.exists('../data/clean_data'):
    os.makedirs('../data/clean_data')
In [11]:
# Write out the raw rankings data to the raw_data folder in the data folder
us_news_df_raw.to_csv("../data/raw_data/us_news_rankings_RAW.csv", index=False)
Pandas
In [12]:
us_news_df_raw = pd.read_csv("../data/raw_data/us_news_rankings_RAW.csv")
us_news_df_raw.head()
Out[12]:
In [13]:
us_news_df_raw["school_location"] = "NaN"
us_news_df_raw["school_name"] = "NaN"
us_news_df_raw["rank"] = us_news_df_raw.loc[:,('rank')].replace(r"\D", "", regex = True)
us_news_df_raw["score"] = us_news_df_raw.loc[:,('score')].str.extract("(\d.\d)", expand=False)
In [14]:
us_news_df_raw.head(10)
Out[14]:
In [15]:
# split the combined "school" cell on the blank line that separates the
# school name from its location
split_school = us_news_df_raw["school"].str.split("\n\n")
for i in range(len(us_news_df_raw)):
    try:
        us_news_df_raw.loc[i, "school_name"] = split_school[i][0]
        us_news_df_raw.loc[i, "school_location"] = split_school[i][1]
    except (IndexError, TypeError):
        us_news_df_raw.loc[i, "school_name"] = "NaN"
        us_news_df_raw.loc[i, "school_location"] = "NaN"
us_news_df_raw["school_name"] = us_news_df_raw["school_name"].replace(r"\n", "", regex=True)
us_news_df_raw["school_location"] = us_news_df_raw["school_location"].replace(r"\n", "", regex=True)
In [16]:
cols = ["rank", "school_name", "school_location", "score"]
us_news_df_raw = us_news_df_raw[cols]
us_news_df_raw.head()
Out[16]:
In [17]:
us_news_df_clean = us_news_df_raw[us_news_df_raw['school_name']!="NaN"]
us_news_df_clean.head()
Out[17]:
In [18]:
us_news_df_clean.to_csv("../data/clean_data/us_news_rankings_clean.csv")
In [19]:
from geopy.geocoders import Nominatim
In [20]:
# newer geopy releases require a user_agent string (any identifying name will do)
geolocator = Nominatim(user_agent="us-news-rankings-tutorial")
# geocode each school's location string
locations = us_news_df_clean['school_location'].apply(lambda x: geolocator.geocode(x))
address, coordinates = zip(*locations)
latitude, longitude = zip(*coordinates)
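Nominatim's usage policy limits request rates, and geocode() returns None for addresses it cannot resolve; the sketch below (assuming a geopy version that ships geopy.extra.rate_limiter) shows one way to handle both:

from geopy.extra.rate_limiter import RateLimiter

# throttle to roughly one request per second
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
locations = us_news_df_clean['school_location'].apply(geocode)
latitude = [loc.latitude if loc is not None else None for loc in locations]
longitude = [loc.longitude if loc is not None else None for loc in locations]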
In [21]:
us_news_df_clean.loc[:, 'latitude'] = latitude
us_news_df_clean.loc[:, 'longitude'] = longitude
us_news_df_clean = us_news_df_clean.apply(pd.to_numeric, errors="ignore")
us_news_df_clean.head()
Out[21]:
In [22]:
us_news_df_clean['quintile'] = pd.qcut(us_news_df_clean['score'], 5, labels=False)
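pd.qcut splits the scores into five equal-sized bins (quintiles) labelled 0 through 4; a toy example with made-up scores illustrates the idea:

pd.qcut(pd.Series([2.1, 2.4, 2.8, 3.0, 3.3, 3.6, 3.9, 4.2, 4.5, 4.8]), 5, labels=False)
# -> [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]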
In [23]:
from altair import Chart, X, Y, Axis, Scale
Chart(us_news_df_clean).mark_circle(
size=100,
opacity=0.6
).encode(
x=X('longitude:Q', axis=Axis(title=' ')),
y=Y('latitude:Q', axis=Axis(title=' ')),
#scale=Scale(domain=(-60, 80))),
color='quintile:N',
).configure_cell(
width=800,
height=350
).configure_axis(
grid=False,
axisWidth=0,
tickWidth=0,
labels=False,
)
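The cell above uses the Altair 1.x API; an equivalent sketch in the Altair 2+ API (alt.Chart with .properties instead of configure_cell) would look roughly like this:

import altair as alt
alt.Chart(us_news_df_clean).mark_circle(size=100, opacity=0.6).encode(
    x=alt.X('longitude:Q', axis=alt.Axis(title=' ')),
    y=alt.Y('latitude:Q', axis=alt.Axis(title=' ')),
    color='quintile:N',
).properties(width=800, height=350)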
In this tutorial, we learned how to examine the HTML structure of a webpage and use a function based on the Beautiful Soup
module to parse tables from multiple webpages into a .csv file. After creating a .csv of all the webpage tables, we analyzed it using the Pandas
module. Lastly, we created visualizations with the Altair library.
In [24]:
import sys
import altair
import bs4
print("System and module version information: \n")
print('Python version:', sys.version_info)
print('urllib.request version:', urllib.request.__version__)
print('pandas version:', pd.__version__)
print('altair version:',altair.__version__)
print('Beautiful Soup version:', bs4.__version__)