In [1]:
import codecs
import io

import pycurl

In [46]:
# Collects the response body: the curl WRITEFUNCTION callback appends one
# decoded chunk to this list each time it is invoked.
buf = []

In [47]:
# Configure the request for the challenge page.
curlObj = pycurl.Curl()
curlObj.setopt(pycurl.URL, "http://www.pythonchallenge.com/pc/def/ocr.html")

# The body is delivered to the write callback in chunks of bytes, and a chunk
# boundary can fall in the middle of a multi-byte UTF-8 character. Decoding
# every chunk on its own would then raise UnicodeDecodeError, so use an
# incremental decoder that holds dangling bytes until the next chunk arrives.
# A fresh decoder is created whenever this cell is (re)run.
utf8Decoder = codecs.getincrementaldecoder("utf-8")()
curlObj.setopt(pycurl.WRITEFUNCTION, lambda data: buf.append(utf8Decoder.decode(data)))

In [48]:
# Run the transfer; the body is handed to the WRITEFUNCTION callback
# registered on curlObj, which appends it to `buf`.
curlObj.perform()

In [44]:
#print(buffer)

In [ ]:


In [19]:
# NOTE(review): `buffer` is not assigned in any cell visible in this notebook
# (the list filled by the curl callback is `buf`) -- presumably it is left over
# from an earlier kernel state. The captured output shows raw bytes.
print(buffer[0])


b'<html>\n<head>\n  <title>ocr</title>\n  <link rel="stylesheet" type="text/css" href="../style.css">\n</head>\n<body>\n<center><img src="ocr.jpg">\n<br><font color="#c03000">\nrecognize the characters. maybe they are in the book, <br>but MAYBE they \nare in the page source.</center>\n\n<br>\n<br>\n<br>\n\n<font size="-1" color="gold">\nGeneral tips:\n<li>Use the hints. They are helpful, most of the times.</li>\n<li>Investigate the data given to you.</li>\n<li>Avoid looking for spoilers.</li>\n<br>\nForums: <a href="http://www.pythonchallenge.com/forums"/>Python Challenge Forums</a>, \nread before you post.\n<br>\nIRC: irc.freenode.net #pythonchallenge\n<br><br>\nTo see the solutions to the previous level, replace pc with pcc, i.e. go \nto: http://www.pythonchallenge.com/pcc/def/ocr.html\n\n</body>\n</html>\n\n<!--\nfind rare characters in the mess below:\n-->\n\n<!--\n%%$@_$^__#)^)&!_+]!*@&^}@[@%]()%+$&[(_@%+%$*^@$^!+]!&_#)_*}{}}!}_]$[%}@[{_@#_^{*\n@##&{#&{&)*%(]{{([*}@[@&]+!!*{)!}{%+{))])[!^})+)$]#{*+^((@^@}$[**$&^{$!@#$%)!@(&\n+^!{%_$&@^!}$_${)$_#)!({@!)(^}!*^&!$%_&&}&_#&@{)]{+)%*{&*%*&@%$+]!*__(#!*){%&@++\n!_)^$&&%#+)}!@!)&^}**#!_$([$!$}#*^}$+&#[{*{}{((#$]{[$[$$()_#}!@}^@_&%^*!){*^^_$^\n]@}#%[%!^[^_})+@&}{@*!(@$%$^)}[_!}(*}#}#_'

In [20]:
# Check the type of the stored chunk (the captured output reports bytes).
type(buffer[0])


Out[20]:
bytes

In [21]:
# Indexing a bytes object yields an int, not a 1-character string:
# 60 is the code for '<', the first byte of the page.
buffer[0][0]


Out[21]:
60

In [23]:
# Decode the raw bytes chunk into a str, interpreting it as UTF-8.
buff_0_str = buffer[0].decode(encoding="UTF-8")

In [24]:
# Printing the decoded str renders real line breaks instead of the escaped
# \n sequences shown when a bytes object is printed.
print(buff_0_str)


<html>
<head>
  <title>ocr</title>
  <link rel="stylesheet" type="text/css" href="../style.css">
</head>
<body>
<center><img src="ocr.jpg">
<br><font color="#c03000">
recognize the characters. maybe they are in the book, <br>but MAYBE they 
are in the page source.</center>

<br>
<br>
<br>

<font size="-1" color="gold">
General tips:
<li>Use the hints. They are helpful, most of the times.</li>
<li>Investigate the data given to you.</li>
<li>Avoid looking for spoilers.</li>
<br>
Forums: <a href="http://www.pythonchallenge.com/forums"/>Python Challenge Forums</a>, 
read before you post.
<br>
IRC: irc.freenode.net #pythonchallenge
<br><br>
To see the solutions to the previous level, replace pc with pcc, i.e. go 
to: http://www.pythonchallenge.com/pcc/def/ocr.html

</body>
</html>

<!--
find rare characters in the mess below:
-->

<!--
%%$@_$^__#)^)&!_+]!*@&^}@[@%]()%+$&[(_@%+%$*^@$^!+]!&_#)_*}{}}!}_]$[%}@[{_@#_^{*
@##&{#&{&)*%(]{{([*}@[@&]+!!*{)!}{%+{))])[!^})+)$]#{*+^((@^@}$[**$&^{$!@#$%)!@(&
+^!{%_$&@^!}$_${)$_#)!({@!)(^}!*^&!$%_&&}&_#&@{)]{+)%*{&*%*&@%$+]!*__(#!*){%&@++
!_)^$&&%#+)}!@!)&^}**#!_$([$!$}#*^}$+&#[{*{}{((#$]{[$[$$()_#}!@}^@_&%^*!){*^^_$^
]@}#%[%!^[^_})+@&}{@*!(@$%$^)}[_!}(*}#}#_

In [28]:
# In a comprehension the element expression comes first, followed by the
# `for` clause; the reverse order is a SyntaxError.
# NOTE(review): `buffer` is not assigned in any cell visible in this notebook;
# this line assumes it is a list of UTF-8 encoded bytes chunks.
bufStr = ''.join(raw.decode(encoding="UTF-8") for raw in buffer)

In [50]:
# The curl callback decodes each chunk before appending it, so `buf` should be
# a list whose elements are str (not bytes).
print(type(buf))
print(type(buf[0]))


<class 'list'>
<class 'str'>

In [51]:
import operator, functools

# Concatenate the decoded chunks into the full page source. str.join builds the
# result in one pass and returns '' for an empty list, whereas folding with
# operator.concat copies the growing string on every step and raises TypeError
# when the list is empty.
testStr = ''.join(buf)
#testStr2 = str([b.decode(encoding="UTF-8") for b in buffer])

def bytesToStr(byteList):
    '''Lazily decode an iterable of UTF-8 byte chunks, yielding one str per chunk.

    The chunks are treated as consecutive pieces of a single byte stream, so a
    multi-byte character that straddles two chunks is decoded correctly: the
    leading bytes are held back and emitted with the following chunk (the
    yielded str for a chunk can therefore be empty). Joining the yielded
    strings gives the decoded text of the whole stream.

    Raises UnicodeDecodeError if the data is not valid UTF-8, including when
    the stream ends in the middle of a character.
    '''
    decoder = codecs.getincrementaldecoder("utf-8")()
    for chunk in byteList:
        yield decoder.decode(chunk)
    # Flush the decoder so that a truncated trailing character is reported
    # instead of being silently dropped.
    decoder.decode(b"", final=True)
        
#testStr3 = str([x for x in bytesToStr(buffer)])

# Sanity check: total length of the reassembled text and its first 8 characters.
print(len(testStr)); print(testStr[:8])
#print(len(testStr2)); print(testStr2[:8])
#print(len(testStr3)); print(testStr3[:8])


99613
<html>
<

In [10]:
# Calling a generator function only creates a generator object; nothing is
# decoded until it is iterated, which is why the print shows the object's repr
# rather than text.
# NOTE(review): `buffer` is not assigned in any cell visible in this notebook.
bufStr = bytesToStr(buffer)
print(bufStr)


<generator object bytesToStr at 0x10db041f8>

In [32]:
# Drain the generator held in bufStr into a single string. (A one-line `for`
# loop needs a colon before its body; str.join avoids the loop altogether.)
generator = bufStr
bufStr = ''.join(generator)

In [11]:
# Build the page source from `buf`, whose chunks the curl callback has already
# decoded to str; no byte-level decoding (and no `buffer` list, which no
# visible cell defines) is needed here.
bufStr = ''.join(buf)
#print(bufStr)

In [23]:
import re

# The group is non-greedy (.*?): a greedy group would give one single match
# running from the first "<!--" to the last "-->", i.e.
# "<!-- (instructions) --> ... <!-- ('rare char' block) -->"
#
# re.DOTALL is needed because the page source contains newlines, which '.'
# does not match by default. The HTML comment delimiters are kept outside the
# group, so every hit is just the text inside a comment.
commentPattern = re.compile("<!--(.*?)-->", flags=re.DOTALL)
matches = commentPattern.findall(bufStr)
#for hit in matches:
#    print(hit)

# The block we care about happens to be the second hit, so that fact is used
# a priori for convenience. Alternatively we could look for the instructions
# comment among the matches and take the match following it as the
# 'rare character' block.
block = matches[1]

In [88]:
# Tally the characters of the 'rare character' block in two ways and compare.
block = matches[1]

# occurrences:  char -> (index of its latest occurrence, count so far)
# occurrences2: char -> count only
occurrences = {}
occurrences2 = {}
for position, symbol in enumerate(block):
    previous = occurrences.get(symbol, (0, 0))
    occurrences[symbol] = (position, previous[1] + 1)
    occurrences2[symbol] = occurrences2.get(symbol, 0) + 1
print(len(occurrences))

# Keep the characters counted exactly once, leaving out angle brackets.
angleBrackets = ('<', '>')
rare = [
    (symbol, info)
    for (symbol, info) in occurrences.items()
    if info[1] == 1 and symbol not in angleBrackets
]
rare2 = [
    symbol
    for symbol in occurrences2
    if occurrences2[symbol] == 1 and symbol not in angleBrackets
]

# Show the (char, (index, count)) pairs ordered by index, then the plain list.
print(sorted(rare, key=lambda pair: pair[1][0]))
print(rare2)


28
[('e', (2426, 1)), ('q', (26293, 1)), ('u', (33131, 1)), ('a', (44802, 1)), ('l', (62765, 1)), ('i', (84711, 1)), ('t', (86244, 1)), ('y', (87328, 1))]
['y', 'e', 't', 'a', 'l', 'q', 'u', 'i']

In [96]:
# A pseudo-functional way of solving the problem.

# Every character maps to a 2-element tuple (<idx>, <count>), where idx is the
# position of its last occurrence in the block and count is the number of
# times it occurs.
occurrences = {}
for position, symbol in enumerate(block):
    # dict.get(..) supplies a default tuple when the character isn't mapped yet
    previous = occurrences.get(symbol, (0, 0))
    occurrences[symbol] = (position, previous[1] + 1)

# With every character mapped to its count, keep only the rare ones.
rare = [(symbol, info) for (symbol, info) in occurrences.items() if info[1] == 1]

# The dict's iteration order is not relied upon: the rare characters are
# sorted by their position in the original string.
rareInOrder = sorted(rare, key=lambda pair: pair[1][0])

# Joining is only there to get a human-readable printout.
print(''.join(symbol for (symbol, info) in rareInOrder))


equality

In [24]:
# An arguably easier way to solve the problem. This does a greedy
# recursive search into the string, removing duplicates until we've
# exhausted all of the search space.
#
# Since the space is searched linearly and the solution isn't stored in a dict
# like above, we end up with the solution without any additional manipulation
#
# Note: This may not be the most memory efficient. Hopefully all these intermediate strings are
#       appropriately cleaned up during the recursive calls, but it's up to the Python kernel.
#       Something, something, tail recursion...

# Drop the newlines contained in the block (not strictly necessary for this
# particular string).
searchSpace = block.replace("\n", "")

def removeDuplicates(space, soln=''):
    '''Recursively consume `space`, returning the characters left in `soln`.

    `soln` accumulates characters that have been seen once so far. When a
    character shows up again it is deleted from `soln` and every remaining
    occurrence of it is deleted from `space`.
    '''
    # Base case: the search space is used up.
    if len(space) == 0:
        return soln

    head = space[0]
    if head not in soln:
        # First sighting: keep it for now and advance by one character.
        return removeDuplicates(space[1:], soln + head)

    # Repeat sighting: strip the character from both strings and keep going.
    # (Python 3 str.translate takes a {code point: replacement} mapping, which
    # differs from the Python 2 signature.)
    purge = {ord(head): None}
    return removeDuplicates(space.translate(purge), soln.translate(purge))
        
# Bare call as the last expression, so the result is shown as the cell's output.
removeDuplicates(searchSpace)


Out[24]:
'equality'

In [44]:
# According to the wise people of Stack Overflow
#    (http://stackoverflow.com/questions/13591970/does-python-optimize-tail-recursion)
# Python does not do tail call optimization (TCO).
# From that thread, the hack to more-or-less get the tail recursion is to
# wrap the method in a while loop.
#
# Keep in mind that this current stage solves fine, using recursion, without this optimization

def removeDuplicates_tro(space, soln=''):
    '''Loop-based variant of removeDuplicates (no recursion).

    Each pass looks at the first character of `space`: if it is already in
    `soln`, it is deleted from both strings; otherwise it is appended to
    `soln` and dropped from the front of `space`. Returns `soln` once `space`
    is empty.
    '''
    while len(space) != 0:
        head = space[0]
        if head in soln:
            purge = {ord(head): None}
            space, soln = space.translate(purge), soln.translate(purge)
        else:
            soln, space = soln + head, space[1:]
    return soln

# Strip the newlines from the block and run the loop-based variant; the bare
# call is the cell's displayed value.
searchSpace = block.replace("\n", "")
removeDuplicates_tro(searchSpace)


Out[44]:
'equality'

In [16]:
# Walk a list, echoing each element and flagging the ones that are targets.
test = [1, 2, 3]
targets = [2]
for value in test:
    print(value)
    if value not in targets:
        continue
    print("Found " + str(value) + " in the test list!")

# The list itself is untouched by the loop.
print(test)


1
2
Found 2 in the test list!
3
[1, 2, 3]

In [22]:
# Seeing what happens when the thing being iterated over is "modified" inside
# the loop. Strings are immutable, so translate() builds a new string and the
# assignment merely rebinds the name `test`; the for loop keeps walking the
# original "abc" object. That is why it works in this simple example. With a
# mutable container the long story short version is: don't do it.

test = "abc"
for letter in test:
    if letter != "b":
        continue
    test = test.translate({ord(letter): None})

print(test)


ac

In [ ]: