In this end-to-end example we will perform a data analysis in Python with Pandas. We will attempt to answer the following questions:
Things we will demonstrate:
- read_html() for basic web scraping
- append() to combine multiple DataFrames together
The iSchool schedule of classes can be found here: https://ischool.syr.edu/classes
In [22]:
import pandas as pd
# Suppress warning output (e.g. pandas deprecation/chained-assignment
# warnings) so it doesn't clutter the notebook display.
import warnings
warnings.filterwarnings('ignore')
In [23]:
# First, figure out how to get the data: read_html() returns a list of
# every HTML table it finds on the page; we want the first one.
website = 'https://ischool.syr.edu/classes/?page=1'
data = pd.read_html(website)
data[0]
Out[23]:
In [24]:
# Build and print the URL for each of the five schedule pages.
website = 'https://ischool.syr.edu/classes/?page='
for page_num in range(1, 6):
    link = f"{website}{page_num}"
    print(link)
In [25]:
# let's read them all and combine them into a single data frame
website = 'https://ischool.syr.edu/classes/?page='
classes = pd.DataFrame()  # accumulator for all pages
for i in range(1, 6):
    link = website + str(i)
    data = pd.read_html(link)
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat is the supported way to stack frames row-wise.
    classes = pd.concat([classes, data[0]], ignore_index=True)
classes.sample(5)
Out[25]:
In [26]:
## let's set the columns
website = 'https://ischool.syr.edu/classes/?page='
classes = pd.DataFrame()
for i in range(1, 6):
    link = website + str(i)
    data = pd.read_html(link)
    # DataFrame.append was removed in pandas 2.0; use pd.concat instead.
    classes = pd.concat([classes, data[0]], ignore_index=True)
# Give the scraped table meaningful column names.
classes.columns = ['Course','Section','ClassNo','Credits','Title','Instructor','Time','Days','Room']
classes.sample(5)
Out[26]:
In [31]:
## this is good stuff. Let's make a function out of it for simplicity
def get_ischool_classes(pages=5):
    """Scrape the iSchool schedule of classes into a single DataFrame.

    Reads the first table from each paginated schedule page and stacks
    them row-wise, then applies friendly column names.

    Parameters
    ----------
    pages : int, default 5
        Number of schedule pages to scrape (pages 1..pages).

    Returns
    -------
    pandas.DataFrame
        One row per class section, columns: Course, Section, ClassNo,
        Credits, Title, Instructor, Time, Days, Room.
    """
    website = 'https://ischool.syr.edu/classes/?page='
    # DataFrame.append was removed in pandas 2.0 — collect the per-page
    # frames and concat once (also faster than repeated appends).
    frames = [pd.read_html(website + str(i))[0] for i in range(1, pages + 1)]
    classes = pd.concat(frames, ignore_index=True)
    classes.columns = ['Course','Section','ClassNo','Credits','Title','Instructor','Time','Days','Room']
    return classes
# main program
classes = get_ischool_classes()
In [32]:
# undergrad classes are 0-499, grad classes are 500 and up but we don't have course numbers!!!! So we must engineer them.
# Preview the split: first three characters are the subject code,
# the rest is the course number.
classes['Course'].str.slice(0, 3).sample(5)
classes['Course'].str.slice(3).sample(5)
Out[32]:
In [33]:
# Derive the Subject and Number columns from the combined Course code.
classes['Subject'] = classes['Course'].str.slice(0, 3)
classes['Number'] = classes['Course'].str.slice(3)
classes.sample(5)
Out[33]:
In [36]:
# and finally we can create the column we need!
# NOTE: the original chained indexing (classes['Type'][mask] = ...)
# triggers SettingWithCopyWarning and silently fails to write under
# pandas copy-on-write (default in 3.0) — .loc assigns reliably.
classes['Type'] = ''
classes.loc[classes['Number'] < '500', 'Type'] = 'UGrad'
classes.loc[classes['Number'] >= '500', 'Type'] = 'Grad'
classes.sample(5)
Out[36]:
In [44]:
# the entire program to retrieve the data and setup the columns looks like this:
# main program
classes = get_ischool_classes()
classes['Subject'] = classes['Course'].str[0:3]
classes['Number'] = classes['Course'].str[3:]
classes['Type'] = ''
# Use .loc rather than chained indexing (classes['Type'][mask] = ...),
# which raises SettingWithCopyWarning and does not write under
# pandas copy-on-write.
classes.loc[classes['Number'] < '500', 'Type'] = 'UGrad'
classes.loc[classes['Number'] >= '500', 'Type'] = 'Grad'
In [45]:
# let's find the number of grad / undergrad courses
classes['Type'].value_counts()
# more grad classes than undergrad
Out[45]:
In [46]:
# how many undergrad classes on a Friday?
# str.find returns the index of 'F' in the Days string, or -1 when absent.
is_ugrad = classes['Type'] == 'UGrad'
meets_friday = classes['Days'].str.find('F') >= 0
friday = classes[is_ugrad & meets_friday]
friday
Out[46]:
In [47]:
# let's get rid of those pesky LAB sections!!!
# how many undergrad classes on a Friday?
not_a_lab = ~friday['Title'].str.startswith('LAB:')
friday_no_lab = friday[not_a_lab]
friday_no_lab
Out[47]:
In [48]:
# Looking for more classes to avoid? How about 8AM classes?
starts_at_eight = classes['Time'].str.startswith('8:00am')
eight_am = classes[starts_at_eight]
eight_am
Out[48]:
In [ ]: