In [3]:
# Use PyCurl to grab the Web page source code.
#
# PyCurl hands the body to the write callback in packets of bytes until all
# data has been transferred. A packet boundary can fall in the middle of a
# multi-byte UTF-8 character, so every packet goes through an incremental
# decoder (which buffers incomplete sequences) instead of being decoded on
# its own. The decoded text fragments are appended to sourceList.
import codecs
import pycurl

sourceList = []
utf8Decoder = codecs.getincrementaldecoder("UTF-8")()


def collectFragment(data):
    ''' Decode one packet of bytes and append the resulting text to sourceList '''
    # Returning None tells PyCurl the whole packet was consumed
    sourceList.append(utf8Decoder.decode(data))


curlObj = pycurl.Curl()
try:
    curlObj.setopt(pycurl.URL, "http://www.pythonchallenge.com/pc/def/ocr.html")
    curlObj.setopt(pycurl.WRITEFUNCTION, collectFragment)
    curlObj.perform()
    # Flush the decoder; raises UnicodeDecodeError if the body ended mid-character
    sourceList.append(utf8Decoder.decode(b"", final=True))
finally:
    # Release the libcurl handle even when the transfer or decoding fails
    curlObj.close()
In [4]:
# Create a single string out of the list of source fragments.
#
# str.join builds the result in one pass and yields "" when no fragments
# were received; functools.reduce without an initial value would raise
# TypeError on an empty list and re-copies the string for every fragment.
import functools, operator  # not needed by the join below; kept in case other cells rely on them
sourceCode = "".join(sourceList)
In [8]:
# With the whole page source held in one string, the next step is to
# pull out the search block. The block sits inside an HTML comment,
# so a regular expression can locate it.
import re
# Non-greedy, so every HTML comment is captured separately.
# DOTALL lets "." match newlines, so multi-line comments are captured whole.
matches = re.compile("<!--(.*?)-->", flags=re.DOTALL).findall(sourceCode)
# The block we want happens to be the second comment found (index 1);
# that position is simply assumed up front. Newlines are dropped as well.
searchSpace = matches[1].replace("\n", "")
In [10]:
# To find the solution, we keep only the characters that are never repeated.
#
# The search space is counted once, then scanned linearly: a character
# survives only if it occurs exactly one time. The resultant string is
# therefore all of the unique characters, in the order they appeared in the
# search space.
#
# This is done iteratively rather than recursively: a recursive scan needs
# one stack frame per step, so a search space with many distinct characters
# would hit Python's recursion limit.
def removeDuplicates(space, soln=""):
    ''' Remove all duplicated characters from the given string

    space -- the string to scan
    soln  -- characters treated as already seen before the scan starts;
             any of them that also occur in space are dropped from the result

    Returns the characters of soln that do not occur in space, followed by
    the characters that occur exactly once in space (and not in soln), in
    order of appearance.
    '''
    from collections import Counter

    occurrences = Counter(space)
    alreadySeen = set(soln)
    # Seed characters survive only if the search space never mentions them
    survivors = [ch for ch in soln if ch not in occurrences]
    # Search-space characters survive only if they are one of a kind
    survivors.extend(ch for ch in space
                     if occurrences[ch] == 1 and ch not in alreadySeen)
    return "".join(survivors)
In [11]:
# Run the duplicate removal over the search space extracted above;
# the returned string is this cell's result
removeDuplicates(searchSpace)
Out[11]:
In [ ]: