$ whoami
{
"name": "Sang Han",
"website": "http://sanghan.me",
"github": "https://github.com/jjangsangy",
"education": [
{
"school": "University of California: San Diego",
"degree": "B.Sc",
"years": "2007-2011",
"conc": ["Physical Chemistry", "Behavioral Psychology"]
},
{
"school": "Stanford University",
"degree": "M.Eng",
"year": "2016+",
"conc": ["Artificial Intelligence", "Systems Security"]
}
],
"work": {
"company": "Qadium Inc",
"position": "Data Scientist",
"desc": "DARPA Research in Information Innovation and Machine Learning"
}
}
Basically, I scraped the CAPE website using Python (both Python 2 and Python 3 work), with PostgreSQL as the backend. It took about a day to write, and then another day of messing around to get everything to fit the schema, so it was a fun weekend project. The packages required to run the scraper are requests, beautifulsoup4, lxml, pandas, numpy, sqlalchemy, sqlalchemy-utils, and psycopg2.
The Anaconda Python Distribution is probably the easiest way to get all the packages needed if you want to try out the code yourself.
I also use a ThreadPool to make connections concurrently, so that this doesn't take a million years lol.
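The pattern is just the thread-backed Pool from multiprocessing.dummy, which has the same interface as a process Pool; here's a minimal sketch, where scrape_department stands in for the real per-department work defined later:

from multiprocessing.dummy import Pool as ThreadPool

def scrape_department(code):
    # stand-in for the real scrape of one department
    return code

pool = ThreadPool(6)  # 6 worker threads
results = pool.map(scrape_department, ['CHEM', 'CSE', 'MATH'])
pool.close()
pool.join()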
The visualizations here were made with Tableau.
I know it's usually not polite to scrape a service that already provides an API, like Reddit for instance. But when I went looking for one, I couldn't find any, so that gave me the green light to write a scraper. And honestly, ever since I was a student (about 3 years ago), I've been unsatisfied with CAPE, so this is kind of my way of liberating the data so that students can access it more easily.
Currently it only queries about 30-40 different departments and grabs the tables generated for those queries.
However, every single class also has its own page, and since I didn't want to make 20,000 HTTP requests, I only grabbed the front matter.
This opens things up for anyone else, or even future me, to build a service that takes the rest of the data into account. In the scraper itself, I've created a column called link that points to the individual CAPEs for each class, so I've made it really easy for people to do this; see the sketch below.
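For instance, once the scraped frame exists (the df built further down), fetching a handful of those detail pages could look like this sketch (parsing them is left out):

detail_urls = df['link'].dropna().unique()
# Be polite: fetch only a few pages at a time
pages = [requests.get(url).text for url in detail_urls[:10]]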
In [1]:
from __future__ import print_function
import requests
import sys
import itertools
import logging
import string
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from operator import itemgetter
from multiprocessing.dummy import Pool as ThreadPool
try:
from urllib.parse import urljoin
except ImportError:
from urlparse import urljoin
In [2]:
# Sanity check: requests is working
req = requests.get('http://google.com')
print(req.text)
In [3]:
def connect(prot='http', **q):
"""
    Makes a connection to CAPE.
    Requires that at least one query parameter is given.
    Parameters
    ----------
    :param prot: Either 'http' or 'https'
    :param q: Query dictionary
    Returns
    -------
    :return: The HTTP response
    :rtype: requests.Response
"""
host = 'cape.ucsd.edu'
inputs = 'Name', 'courseNumber', 'department'
prot = prot.lower()
base = '%s://%s/responses/Results.aspx' % (prot, host)
assert prot in ['http', 'https']
assert any(val in inputs for val in q)
headers = { "Host": host,
"Accept": ','.join([
"text/html",
"application/xhtml+xml",
"application/xml;q=0.9,*/*;q=0.8"]),
"Accept-Language": "en-US,en;q=0.5",
"User-Agent": ' '.join([
"Mozilla/5.0]",
"(Macintosh; Intel Mac OS X 10_10_2)",
"AppleWebKit/600.3.18",
"(KHTML, like Gecko)",
"Version/8.0.3 Safari/600.3.18"]),
"Cache-Control": "no-cache"
}
queries = '&'.join(
[
'{key}={value}'.format(key=key, value=value)
for key, value in q.items()
if key in inputs
]
)
req = requests.get('?'.join([base, queries]), headers=headers)
if not req.ok:
print("Request didn't make it", file=sys.stderr)
req.raise_for_status()
return req
**q is a variable set of keyword arguments that get applied to the URL:
>>> connect(department='CHEM')
will make a request to http://cape.ucsd.edu/responses/Results.aspx?department=CHEM and return the result.
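As a side note, requests can also assemble and URL-encode the query string itself via the params keyword, so an equivalent request (reusing the same headers) would be roughly:

req = requests.get(
    'http://cape.ucsd.edu/responses/Results.aspx',
    params={'department': 'CHEM'},  # becomes ?department=CHEM
    headers=headers,
)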
In [4]:
# URL: http://cape.ucsd.edu/responses/Results.aspx?department=CHEM
req = connect(department="CHEM")
print(req.text)
BeautifulSoup is an HTML parser.
Let's grab all the department listings inside the <option> tags:
<option value="">Select a Department</option>
<option value="ANTH">ANTH - Anthropology</option>
<option value="BENG">BENG - Bioengineering</option>
<option value="BIOL">BIOL - Biological Sciences</option>
<option value="CAT">CAT - Sixth College</option>
<option value="CENG">CENG - Chemical Engineering</option>
...
...
In [5]:
from bs4 import BeautifulSoup
# Grab the HTML
req = connect(department="CHEM")
# Shove it into BeautifulSoup
soup = BeautifulSoup(req.text, 'lxml')
# Find all Option Tags
options = soup.find_all('option')
# Returns a list of options
options
Out[5]:
In [6]:
# Grab the `value=` attribute of each option
for option in options:
print(option.attrs['value'])
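To turn those option tags into a code → name mapping, a dict comprehension is enough (a sketch; it skips the blank "Select a Department" entry):

dept_map = {
    opt.attrs['value']: opt.text
    for opt in options
    if opt.attrs.get('value')
}
dept_map['ANTH']  # 'ANTH - Anthropology'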
In [7]:
def departments():
"""
    Gets a mapping of all the departments by key.
"""
logging.info('Grabbing a list of Departments')
prototype = connect("http", department="CHEM")
soup = BeautifulSoup(prototype.content, 'lxml')
options = list(reversed(soup.find_all('option')))
options.pop()
# Initial Course Mapping
mapping = dict(option.text.split(' - ') for option in options)
# Cleanup
for dept in ['BIOL', 'SOC', 'HIST', 'LING', 'LIT', 'NENG', 'RSM ', 'SOE', 'THEA']:
mapping.pop(dept)
# Actual Departments
mapping.update({
'BIBC': 'Biology Biochemistry',
'BILD': 'Biology Lower Division',
'BIMM': 'Biology Molecular, Microbiology',
'BIPN': 'Biology Physiology and Neuroscience',
'SOCA': 'Sociology Theory & Methods',
'SOCB': 'Sociology Cult, Lang, & Soc Interact',
'SOCC': 'Sociology Organiz & Institutions',
'SOCD': 'Sociology Comparative & Historical',
'SOCE': 'Sociology Ind Research & Honors Prog',
'SOCI': 'Sociology',
'SOCL': 'Sociology Lower Division',
'HILD': 'History Lower Division',
'HIAF': 'History of Africa',
'HIEA': 'History of East Asia',
'HIEU': 'History of Europe',
'HINE': 'History of Near East',
'HILA': 'History of Latin America',
'HISC': 'History of Science',
'HIUS': 'History of the United States',
'HITO': 'History Topics',
'LTAF': 'Literature African',
'LTAM': 'Literature of the Americas',
'LTCH': 'Literature Chinese',
'LTCS': 'Literature Cultural Studies',
'LTEA': 'Literature East Asian',
'LTEU': 'Literature European/Eurasian',
'LTFR': 'Literature French',
'LTGK': 'Literature Greek',
'LTGM': 'Literature German',
'LTIT': 'Literature Italian',
'LTKO': 'Literature Korean',
'LTLA': 'Literature Latin',
'LTRU': 'Literature Russian',
'LTSP': 'Literature Spanish',
'LTTH': 'Literature Theory',
'LTWL': 'Literature of the World',
'LTWR': 'Literature Writing',
'RELI': 'Literature Study of Religion',
'TWS' : 'Literature Third World Studies',
'NANO': 'Nano Engineering',
'MGT' : 'Rady School of Management',
'ENG' : 'Jacobs School of Engineering',
'LIGN': 'Linguistics',
'TDAC': 'Theatre Acting',
'TDCH': 'Theatre Dance Choreography',
'TDDE': 'Theatre Design',
'TDDR': 'Theatre Directing/Stage Management',
'TDGE': 'Theatre General',
'TDHD': 'Theatre Dance History',
'TDHT': 'Theatre History',
'TDMV': 'Theatre Dance Movement',
'TDPF': 'Theatre Dance Performance',
'TDPW': 'Theatre Playwriting',
'TDTR': 'Theatre Dance Theory',
})
# Create Categorical Series
dep = pd.Series(name='department_name', data=mapping)
# Reindexing
dep = dep.map(lambda x: np.nan if x == '' else x)
dep = dep.dropna()
dep.index.name = 'Departments'
return dep
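Calling it returns a tidy Series of department names indexed by department code:

deps = departments()
deps.head()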
In [8]:
def create_table(courses):
"""
    Generates a pandas DataFrame by querying the UCSD CAPE website.
Parameters
==========
    :param courses: Either a department/course code or a path to an HTML file
Returns
=======
:returns df: Query Results
:rtype: pandas.DataFrame
"""
header = [
'instructor', 'course', 'term', 'enroll', 'evals',
'recommend_class', 'recommend_instructor', 'study_hours_per_week',
'average_grade_expected', 'average_grade_received'
]
first, second = itemgetter(0), itemgetter(1)
print('\nGrabbing Classes: {0}'.format(courses))
# Get Data
base = 'http://cape.ucsd.edu/responses/'
req = (
open(courses).read()
if os.path.isfile(courses)
else connect("http", courseNumber=courses).content
)
html = BeautifulSoup(req, 'lxml')
table = first(html.find_all('table'))
# Create Dataframe
    df = first(pd.read_html(str(table), flavor=None, na_values=['No CAPEs submitted']))
# Data Clean Up
df.columns = header
df['link'] = [
urljoin(base, link.attrs['href']) if link.has_attr('href') else np.nan
for link in table.find_all('a')
]
df['instructor'] = df.instructor.map(
lambda name: (
str.title(name)
            if isinstance(name, str) else 'Unknown, Unknown'
)
)
# Data Extraction
df['first_name'] = df.instructor.map(lambda name: second(name.split(',')).strip('.'))
df['last_name'] = df.instructor.map(lambda name: first(name.split(',')))
    df['class_id'] = df.course.map(lambda course: first(course.split(' - ')))
df['department'] = df.class_id.map(lambda course: first(course.split(' ')))
df['class_name'] = df.course.map(
lambda course: (
second(course.split(' - '))[:-4]
if ' - ' in course else np.nan)
)
# Data Types
df['recommend_class'] = df.recommend_class.map(calculate_percentage)
df['recommend_instructor'] = df.recommend_instructor.map(calculate_percentage)
df['average_grade_expected'] = df.average_grade_expected.map(calculate_grades)
df['average_grade_received'] = df.average_grade_received.map(calculate_grades)
# Reindexing and Transforms
df['section_id'] = df.link.map(calculate_section_id)
df = df.dropna(subset=['section_id'])
df = df.drop_duplicates(subset='section_id')
df['section_id'] = df.section_id.astype(np.int32)
return df.set_index('section_id', drop=True)
def calculate_percentage(element):
    # e.g. '87.5 %' -> 0.875
    if isinstance(element, str):
        return float(element.strip('%').strip()) / 100
    else:
        return np.nan

def calculate_grades(element):
    # e.g. 'B+ (3.33)' -> 3.33
    if isinstance(element, str):
        return float(element[1:].lstrip('+-').lstrip().strip('()'))
    else:
        return np.nan

def calculate_section_id(element):
    # pulls the integer after 'sectionid=' out of a link
    if isinstance(element, str):
        return int(element.lower().rsplit('sectionid=')[-1].strip(string.ascii_letters))
    else:
        return np.nan
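A few worked examples of what those helpers do (the exact link format here is my assumption, based on the sectionid query parameter):

calculate_percentage('87.5 %')   # 0.875
calculate_grades('B+ (3.33)')    # 3.33
calculate_section_id('http://cape.ucsd.edu/responses/CAPEReport.aspx?sectionid=123456')  # 123456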
In [28]:
def to_db(df, table, user='postgres', db='graphucsd', resolve='replace', host='localhost'):
"""
    Helper function to push a DataFrame to a PostgreSQL database.
"""
url = 'postgresql+psycopg2://{user}@{host}/{db}'.format(user=user, db=db, host=host)
if not database_exists(url):
create_database(url)
engine = create_engine(url)
return df.to_sql(table, engine, if_exists=resolve)
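Once a frame has been scraped, persisting it is a one-liner (the table name 'cape' is arbitrary, and this assumes a local postgres user that can create databases):

to_db(df, 'cape')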
In [10]:
df = create_table('CHEM')
In [22]:
def main(threads=6):
"""
    Scrape every department concurrently and return one combined DataFrame.
"""
logging.info('Program is Starting')
# Get Departments
deps = departments()
keys = [department.strip() for department in deps.keys()]
# Run Scraper Concurrently Using ThreadPool
pool = ThreadPool(threads)
logging.info('Initialize Scraper with {} Threads'.format(threads))
table = pool.map(create_table, keys)
logging.info('Scrape Complete')
# Manage ThreadPool
pool.close(); pool.join()
    # Drop duplicate section_ids picked up across department queries
    df = pd.concat(table)
    return df.groupby(level=0).first()
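Outside of the notebook, tying everything together as a script might look like this (again, the 'cape' table name is arbitrary):

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    frame = main(threads=6)
    to_db(frame, 'cape')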
In [24]:
df = main(threads=4)
In [25]:
df
Out[25]: