In [1]:
from bs4 import BeautifulSoup
import csvkit #because Unicode
import os
import wget

In [3]:
def downloadResults(baseurl='http://upcat.up.edu.ph/results/', first_page=1, last_page=150):
    """Fetch the numbered result pages that are not already on disk.

    Page files are named ``page-NNN.html`` (zero-padded to three digits).
    A page is skipped when a file with that name already exists in the
    current working directory, so the function can be re-run to resume an
    interrupted download.

    Args:
        baseurl: URL prefix that the page file name is appended to.
        first_page: Number of the first page to fetch (inclusive).
        last_page: Number of the last page to fetch (inclusive).

    Returns:
        None. Progress is reported on stdout.
    """
    for number in range(first_page, last_page + 1):
        currentpage = "page-%03d.html" % number
        if os.path.isfile(currentpage):
            # Single-argument parenthesised print behaves the same on
            # Python 2 (print statement) and Python 3 (print function).
            print("%s Already exists \n" % currentpage)
            continue
        # Announce the page before fetching so the message precedes any
        # progress output produced by the download itself.
        print("Downloading %s" % currentpage)
        # NOTE(review): relies on wget saving under the URL's file name in
        # the current directory, which is what the isfile() check assumes.
        wget.download(baseurl + currentpage)

In [4]:
def listHTML():
    """Record the path of every ``.html`` file below the current directory.

    Walks ``.`` recursively and writes one path per line (each terminated
    by a newline) to ``html.list.txt`` in the current directory, replacing
    any previous contents. Paths are written in sorted traversal order so
    that repeated runs over the same files produce the same list.

    Returns:
        None.
    """
    # The with-block guarantees the list is flushed and closed before any
    # later step reads it back.
    with open('html.list.txt', 'wt') as htmlList:
        for root, dirs, files in os.walk("."):
            # Sorting dirs in place makes os.walk descend in a stable order.
            dirs.sort()
            for name in sorted(files):
                if name.endswith(".html"):
                    htmlList.write(os.path.join(root, name) + "\n")

In [5]:
def readList():
    """Read ``html.list.txt`` from the current directory.

    Returns:
        A list with one string per line of the file. Line terminators are
        kept, so each entry normally ends with a newline character.
    """
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open('html.list.txt') as htmlFile:
        return htmlFile.readlines()

In [6]:
def process(htmlfile):
    """Append the rows of one result page to ``passers.csv``.

    The first ``<table class="printable">`` in the page is read; its first
    ``<tr>`` is treated as a header and skipped. For every remaining row the
    text of the first three ``<td>`` cells is written as (name, campus,
    course). ``passers.csv`` is created with a ``Name, Campus, Course``
    header row if it does not exist yet; otherwise rows are appended.

    Pages without a printable table are reported on stdout and skipped, and
    table rows with fewer than three ``<td>`` cells are ignored, instead of
    aborting the whole run with an IndexError.

    Args:
        htmlfile: Path of the HTML file to parse.

    Returns:
        None.
    """
    with open(htmlfile) as page:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever
        # one is installed; pin one explicitly once the expected parser is
        # confirmed.
        soup = BeautifulSoup(page)

    tables = soup.findAll('table', {'class': ['printable']})
    if not tables:
        print("No printable table found in %s, skipping" % htmlfile)
        return

    # Parse everything first so a malformed page cannot leave a partially
    # written batch of rows in the CSV.
    passers = []
    for row in tables[0].findAll('tr')[1:]:
        col = row.findAll('td')
        if len(col) < 3:
            continue

        name = col[0].text
        if name.endswith(")"):
            # Drops the last 13 characters of the cell; presumably a
            # fixed-width parenthesised annotation after the name.
            # TODO confirm against the page markup.
            name = name[:-13]
        passers.append((name, col[1].text, col[2].text))

    outfile = 'passers.csv'
    needs_header = not os.path.isfile(outfile)
    # Binary append mode creates the file when missing, so one open() call
    # covers both the "new file" and "existing file" cases.
    with open(outfile, 'ab') as f:
        writer = csvkit.writer(f)
        if needs_header:
            writer.writerow(("Name", "Campus", "Course"))
        for passer in passers:
            writer.writerow(passer)

In [7]:
def cleanURL(url):
    """Turn one line of the HTML file list into a usable relative path.

    Removes the trailing line terminator and a leading ``./`` (or ``.\\``)
    if present. Unlike blind slicing, input that lacks either of them is
    returned intact rather than losing real characters.

    Args:
        url: A path string such as ``"./page-001.html\\n"``.

    Returns:
        The path without the current-directory prefix and line terminator.
    """
    url = url.rstrip("\r\n")
    for prefix in ("./", ".\\"):
        if url.startswith(prefix):
            return url[len(prefix):]
    return url

In [8]:
def getPassers():
    """Run the whole pipeline: download, list, parse, and write the CSV.

    Steps, in order: download any missing result pages, write the list of
    local ``.html`` files, then parse each listed file into ``passers.csv``.

    Returns:
        None.
    """
    downloadResults()
    listHTML()

    # process() appends to passers.csv, so a leftover file from an earlier
    # run would make every row appear twice. Start from a clean file to keep
    # this function safe to re-run.
    if os.path.isfile('passers.csv'):
        os.remove('passers.csv')

    for line in readList():
        process(cleanURL(line))

In [10]:
# Run the full pipeline defined above: fetch missing result pages, then
# write the parsed rows to passers.csv.
getPassers()


page-001.html Already exists 

page-002.html Already exists 

page-003.html Already exists 

page-004.html Already exists 

page-005.html Already exists 

page-006.html Already exists 

page-007.html Already exists 

page-008.html Already exists 

page-009.html Already exists 

page-010.html Already exists 

page-011.html Already exists 

page-012.html Already exists 

page-013.html Already exists 

page-014.html Already exists 

page-015.html Already exists 

page-016.html Already exists 

page-017.html Already exists 

page-018.html Already exists 

page-019.html Already exists 

page-020.html Already exists 

page-021.html Already exists 

page-022.html Already exists 

page-023.html Already exists 

page-024.html Already exists 

page-025.html Already exists 

page-026.html Already exists 

page-027.html Already exists 

page-028.html Already exists 

page-029.html Already exists 

page-030.html Already exists 

page-031.html Already exists 

page-032.html Already exists 

page-033.html Already exists 

page-034.html Already exists 

page-035.html Already exists 

page-036.html Already exists 

page-037.html Already exists 

page-038.html Already exists 

page-039.html Already exists 

page-040.html Already exists 

page-041.html Already exists 

page-042.html Already exists 

page-043.html Already exists 

page-044.html Already exists 

page-045.html Already exists 

page-046.html Already exists 

page-047.html Already exists 

page-048.html Already exists 

page-049.html Already exists 

page-050.html Already exists 

page-051.html Already exists 

page-052.html Already exists 

page-053.html Already exists 

page-054.html Already exists 

page-055.html Already exists 

page-056.html Already exists 

page-057.html Already exists 

page-058.html Already exists 

page-059.html Already exists 

page-060.html Already exists 

page-061.html Already exists 

page-062.html Already exists 

page-063.html Already exists 

page-064.html Already exists 

page-065.html Already exists 

page-066.html Already exists 

page-067.html Already exists 

page-068.html Already exists 

page-069.html Already exists 

page-070.html Already exists 

page-071.html Already exists 

page-072.html Already exists 

page-073.html Already exists 

page-074.html Already exists 

page-075.html Already exists 

page-076.html Already exists 

page-077.html Already exists 

page-078.html Already exists 

page-079.html Already exists 

page-080.html Already exists 

page-081.html Already exists 

page-082.html Already exists 

page-083.html Already exists 

page-084.html Already exists 

page-085.html Already exists 

page-086.html Already exists 

page-087.html Already exists 

page-088.html Already exists 

page-089.html Already exists 

page-090.html Already exists 

page-091.html Already exists 

page-092.html Already exists 

page-093.html Already exists 

page-094.html Already exists 

page-095.html Already exists 

page-096.html Already exists 

page-097.html Already exists 

page-098.html Already exists 

page-099.html Already exists 

page-100.html Already exists 

page-101.html Already exists 

page-102.html Already exists 

page-103.html Already exists 

page-104.html Already exists 

page-105.html Already exists 

page-106.html Already exists 

page-107.html Already exists 

page-108.html Already exists 

page-109.html Already exists 

page-110.html Already exists 

page-111.html Already exists 

page-112.html Already exists 

page-113.html Already exists 

page-114.html Already exists 

page-115.html Already exists 

page-116.html Already exists 

page-117.html Already exists 

page-118.html Already exists 

page-119.html Already exists 

page-120.html Already exists 

page-121.html Already exists 

page-122.html Already exists 

page-123.html Already exists 

page-124.html Already exists 

page-125.html Already exists 

page-126.html Already exists 

page-127.html Already exists 

page-128.html Already exists 

page-129.html Already exists 

page-130.html Already exists 

page-131.html Already exists 

page-132.html Already exists 

page-133.html Already exists 

page-134.html Already exists 

page-135.html Already exists 

page-136.html Already exists 

page-137.html Already exists 

page-138.html Already exists 

page-139.html Already exists 

page-140.html Already exists 

page-141.html Already exists 

page-142.html Already exists 

page-143.html Already exists 

page-144.html Already exists 

page-145.html Already exists 

page-146.html Already exists 

page-147.html Already exists 

page-148.html Already exists 

page-149.html Already exists 

page-150.html Already exists 


In [ ]: