In [1]:
import pycurl
import io
In [46]:
# Collects the response chunks; the pycurl write callback appends each chunk
# here after decoding it to str.
buf = []
In [47]:
import codecs

# The response body arrives through the write callback in several chunks, and a
# chunk boundary can fall in the middle of a multi-byte UTF-8 sequence.
# Decoding each chunk independently would then raise UnicodeDecodeError, so an
# incremental decoder is used: it keeps the trailing partial sequence and
# completes it with the next chunk.
utf8Decoder = codecs.getincrementaldecoder("UTF-8")()

curlObj = pycurl.Curl()
curlObj.setopt(pycurl.URL, "http://www.pythonchallenge.com/pc/def/ocr.html")
# list.append returns None, which pycurl treats as "all bytes consumed".
curlObj.setopt(pycurl.WRITEFUNCTION, lambda data: buf.append(utf8Decoder.decode(data)))
In [48]:
# Run the transfer; the WRITEFUNCTION callback registered on curlObj appends
# every received chunk to buf.
# NOTE(review): curlObj is never closed in the visible cells -- consider
# calling curlObj.close() once the transfer is no longer needed.
curlObj.perform()
In [44]:
#print(buffer)
In [ ]:
In [19]:
# Show the first chunk collected by the curl write callback (already decoded to str).
print(buf[0])
In [20]:
# Inspect the element type stored by the write callback.
type(buf[0])
Out[20]:
In [21]:
# First character of the first collected chunk.
buf[0][0]
Out[21]:
In [23]:
# The write callback already decodes every chunk, so the first element of buf
# is text and must not be decoded a second time.
buff_0_str = buf[0]
In [24]:
# Display the first chunk as text.
print(buff_0_str)
In [28]:
# buf holds the already-decoded chunks, so the full page source is simply
# their concatenation.
bufStr = ''.join(buf)
In [50]:
# Sanity check: show the container type, then the type of one collected chunk.
print(type(buf))
print(type(buf[0]))
In [51]:
import operator, functools
# str.join builds the text in a single pass and, unlike reduce() without an
# initial value, also works when buf is empty.
testStr = ''.join(buf)
#testStr2 = str([b.decode(encoding="UTF-8") for b in buffer])
def bytesToStr(byteList):
    """Lazily decode every bytes object in byteList as UTF-8.

    This is a generator: nothing is decoded until the result is iterated,
    and the decoded strings are yielded one at a time in input order.
    """
    yield from (chunk.decode(encoding="UTF-8") for chunk in byteList)
#testStr3 = str([x for x in bytesToStr(buffer)])
# Length of the joined text, then its first eight characters as a quick check.
print(len(testStr))
print(testStr[:8])
#print(len(testStr2)); print(testStr2[:8])
#print(len(testStr3)); print(testStr3[:8])
In [10]:
# Calling bytesToStr only creates the generator; nothing is decoded until it is
# iterated, so the print below shows a generator object rather than text.
# NOTE(review): `buffer` is not defined in any visible cell (the curl callback
# fills `buf` with already-decoded str) -- confirm where it should come from.
bufStr = bytesToStr(buffer)
print(bufStr)
In [32]:
# Drain the iterable of text pieces held in bufStr into one string.
# (The original one-line `for` statement was not valid Python.)
generator = bufStr
bufStr = ''.join(generator)
In [11]:
# The curl callback already stores decoded text chunks in buf, so the page
# source is just their concatenation. str.join also avoids the quadratic cost
# of growing a string with repeated +=.
bufStr = ''.join(buf)
#print(bufStr)
In [23]:
import re
# The quantifier is lazy (.*?) on purpose: a greedy one would produce a single
# match stretching from the first "<!--" to the last "-->", i.e.
# "<!-- (instructions) --> ... <!-- ('rare char' block) -->"
#
# re.DOTALL is required because the page source contains newlines and "."
# does not match them by default.
# Only the text between the comment delimiters is captured by the group.
matches = re.compile("<!--(.*?)-->", re.DOTALL).findall(bufStr)
#for hit in matches:
# print(hit)
# The block of interest happens to be the second hit, which is simply relied
# upon here for convenience. A sturdier alternative would be to locate the
# instructions comment among the matches and take the match after it as the
# 'rare character' block.
block = matches[1]
In [88]:
# Exploratory pass: tally the characters of the block in two parallel ways.
block = matches[1]
occurrences = {}    # char -> (index of latest occurrence, number of occurrences)
occurrences2 = {}   # char -> number of occurrences
for idx, char in enumerate(block):
    occurrences[char] = (idx, 1 + occurrences.get(char, (0, 0))[1])
    occurrences2[char] = 1 + occurrences2.get(char, 0)
print(len(occurrences))
# Keep characters seen exactly once, ignoring the angle brackets.
rare = [
    (char, tup)
    for char, tup in occurrences.items()
    if tup[1] == 1 and char not in ('<', '>')
]
rare2 = [
    char
    for char, total in occurrences2.items()
    if total == 1 and char not in ('<', '>')
]
# Order the (char, (index, count)) pairs by index before printing.
print(sorted(rare, key=lambda entry: entry[1][0]))
print(rare2)
In [96]:
# A pseudo-functional way of solving the problem.
# Every character maps to a pair (<idx>, <count>): idx is the position where
# the character was last seen in the string and count is how many times it
# has appeared.
occurrences = {}
for idx, char in enumerate(block):
    # A first sighting starts the count at 1; later sightings bump the stored count.
    count = occurrences[char][1] + 1 if char in occurrences else 1
    occurrences[char] = (idx, count)
# With every character counted, keep only those that showed up exactly once.
rare = [entry for entry in occurrences.items() if entry[1][1] == 1]
# Order the survivors by their position in the original string so the result
# does not depend on the dict's iteration order.
#
# Joining them is only there to get a human-readable printout.
print(''.join(char for char, _ in sorted(rare, key=lambda entry: entry[1][0])))
In [24]:
# An arguably easier way to solve the problem. This does a greedy
# recursive search into the string, removing duplicates until we've
# exhausted all of the search space.
#
# Since the space is searched linearly and the solution isn't stored in a dict
# like above, we end up with the solution without any additional manipulation
#
# Note: This may not be the most memory efficient. Hopefully all these intermediate strings are
# appropriately cleaned up during the recursive calls, but it's up to the Python kernel.
# Something, something, tail recursion...
# Strip the newlines out of the block (not strictly necessary with this given string)
searchSpace = block.replace("\n", "")
def removeDuplicates(space, soln=''):
    """Recursively reduce space to the characters that are not repeated.

    space -- the text still to be examined
    soln  -- the characters kept so far, in order of appearance

    Returns soln once space has been consumed.
    """
    if len(space) == 0:
        return soln
    head = space[0]
    if head not in soln:
        # New character: keep it provisionally and move on to the rest.
        return removeDuplicates(space[1:], soln + head)
    # head was kept earlier, so it is a duplicate: drop every occurrence of it
    # from both the remaining text and the partial solution, then carry on.
    return removeDuplicates(space.replace(head, ''), soln.replace(head, ''))
# Last expression of the cell, so the notebook displays the returned string.
removeDuplicates(searchSpace)
Out[24]:
In [44]:
# According to the wise people of Stack Overflow
# (http://stackoverflow.com/questions/13591970/does-python-optimize-tail-recursion)
# Python does not do tail call optimization (TCO).
# From that thread, the hack to more-or-less get the tail recursion is to
# wrap the method in a while loop.
#
# Keep in mind that this current stage solves fine, using recursion, without this optimization
def removeDuplicates_tro(space, soln=''):
    """Loop-based counterpart of the recursive duplicate remover.

    space -- the text still to be examined
    soln  -- the characters kept so far, in order of appearance

    Each pass of the loop plays the role of one recursive call: the two
    arguments are rebound instead of being passed to a new frame. Returns
    soln once space is empty.
    """
    while len(space) != 0:
        head = space[0]
        if head in soln:
            # Duplicate: purge every occurrence from both strings.
            space = space.replace(head, '')
            soln = soln.replace(head, '')
        else:
            # First sighting: keep it and advance past it.
            soln += head
            space = space[1:]
    return soln
# Same newline-free search space as for the recursive version.
searchSpace = block.replace("\n", "")
removeDuplicates_tro(searchSpace)
Out[44]:
In [16]:
# Scratch cell: walk a list and report the elements that are also in targets.
test = [1, 2, 3]
targets = [2]
for x in test:
    print(x)
    if x not in targets:
        continue
    print("Found " + str(x) + " in the test list!")
print(test)
In [22]:
# Checking whether an iterable can be changed while it is being iterated.
# Strings are immutable, so the assignment inside the loop only rebinds the
# name `test`; the loop keeps walking the original "abc" object.
# It works in this simple example, but the long story short version
# is: don't do it
test = "abc"
for x in test:
    if x != "b":
        continue
    delTable = {ord(x): None}
    test = test.translate(delTable)
print(test)
In [ ]: