In [18]:
import requests
from scrapy.http import TextResponse
import re
import csv
import collections
import sys

In [34]:
def strip_text(data):
    #return a list from an html string of the html tags
    p = re.compile('<.*?>')
    return p.findall(data)

def strip_tags(data):
    #strips tags from an html string.
    p = re.compile('<.*?>')
    return p.sub("",data)

def strip_meta(tags):
    #strips metadata (classes, attributes etc) from list of html tags
    cleantags = [];
    p = re.compile("""\A\<[a-z | A-Z]*\ """)
    for tag in tags:
        if (tag[1]=="!"):
            pass
        else:
            new_tag = p.findall(tag)
            if new_tag==[]: cleantags.append(tag)
            else: cleantags.append(new_tag[0][:-1]+">")
    return cleantags

def sanitise(raw_tags,codex):
    #take tags and replace by an string character alphabet. codex is "tags4"
    reader = csv.reader(open(codex, 'rb'))
    tag_dict= dict((x[0],x[1]) for x in reader)
    sanitised_list = []
    for item in raw_tags:
        try:
            sanitised = tag_dict[item]
            sanitised_list.append(sanitised)
        except:
            pass
    return "".join(sanitised_list)

In [20]:
def most_common(lst):
    data = collections.Counter(lst)
    return data.most_common(1)[0][0]

In [21]:
#strike a match algorithm!
#http://www.catalysoft.com/articles/StrikeAMatch.html
#http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
def get_bigrams(s):
    '''
    Takes a string and returns a list of bigrams
    '''
    return [s[i:i+2] for i in xrange(len(s) - 1)]

def similarity(str1, str2):
    '''
    Perform bigram comparison between two strings
    and return a percentage match in decimal form
    '''
    if (str1=="" or str2==""): 
        score = 0.0
    else: 
        pairs1 = get_bigrams(str1)
        pairs2 = get_bigrams(str2)
        union  = len(pairs1) + len(pairs2)
        hit_count = 0
        for x in pairs1:
            for y in pairs2:
                if x == y:
                    hit_count += 1
                    pairs2.remove(y)
                    break
        if union == 0:
            score = 0.
        else: 
            score = (2.0 * hit_count) / union
    return score

In [37]:
def layer_report(r):
    structures = []
    sims = []
    for mem in r.xpath("*"):
        raw_tags = strip_meta(strip_text(mem.extract()))
        san = sanitise(raw_tags,'tags4.csv')
        structures.append(san)
    mc = most_common(structures)
    for s in structures:
        sims.append(similarity(s,mc))
    ave_sim = sum(sims)/len(sims)
    node_count = len(r.xpath("*"))
    qual_nodes = sum([similarity_threshold<=x<=1 for x in sims])
    proportion = qual_nodes/float(node_count)
    done = (
        (qual_nodes>=node_threshold) and 
        (proportion>=proportion_threshold) and 
        (ave_sim>=ave_similarity_threshold)
    )
    print("average similarity: " + str(ave_sim))
    print("node count: " + str(node_count))
    print("qualifying nodes: "+str(qual_nodes))
    print("proportion of records similar: " + str(proportion))
    print("Am I done? : " + str(done) )
    return (done,sims)

In [23]:
def select_cube(r):
    cubes = r.xpath("*")
    sizes = [len(c.xpath("descendant::*")) for c in cubes]
    ind = sizes.index(max(sizes))
    print(
        "returning node:" + str(ind) +
        " with descendents: "+ str(sizes[ind])
    )
    return (cubes[ind])

In [30]:
def crush_ice(r):
    (layer,sims) = layer_report(r)
    if not layer:
        return crush_ice(select_cube(r))
    else:
        return serve_drink(r,sims)

In [42]:
def serve_drink(r,sims):
    cubes = r.xpath("*")
    for ind in range(len(cubes)):
        if sims[ind]>=similarity_threshold:
            print("*"*10+"record number: "+str(ind)+"*"*10)
            sys.stdout.write(strip_tags(cubes[ind].extract()))
        #sys.stdout.write(strip_tags(rec.extract()))
        #sys.stdout.write(rec.extract())
        #print("***")
    return("finished")

In [40]:
def export(cube):
    """make record"""
    pass

In [43]:
r = requests.get('http://ora.ox.ac.uk/search/detailed?q=%2A%3A%2A&truncate=450&rows=50&sort=timestamp%20desc')
response = TextResponse(r.url, body=r.text, encoding='utf-8')
similarity_threshold = 0.60
node_threshold = 50
proportion_threshold = 0.70
ave_similarity_threshold = 0.70
crush_ice(response)


average similarity: 0.5
node count: 2
qualifying nodes: 1
proportion of records similar: 0.5
Am I done? : False
returning node:1 with descendents: 2005
average similarity: 0.0
node count: 5
qualifying nodes: 0
proportion of records similar: 0.0
Am I done? : False
returning node:0 with descendents: 1977
average similarity: 0.0
node count: 5
qualifying nodes: 0
proportion of records similar: 0.0
Am I done? : False
returning node:2 with descendents: 1954
average similarity: 0.563717746182
node count: 2
qualifying nodes: 1
proportion of records similar: 0.5
Am I done? : False
returning node:0 with descendents: 1294
average similarity: 0.334956712094
node count: 3
qualifying nodes: 1
proportion of records similar: 0.333333333333
Am I done? : False
returning node:2 with descendents: 1277
average similarity: 0.813790544088
node count: 57
qualifying nodes: 50
proportion of records similar: 0.877192982456
Am I done? : True
**********record number: 4**********
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-43-8eae17fb2447> in <module>()
      5 proportion_threshold = 0.70
      6 ave_similarity_threshold = 0.70
----> 7 crush_ice(response)

<ipython-input-30-8e74b1d2add3> in crush_ice(r)
      2     (layer,sims) = layer_report(r)
      3     if not layer:
----> 4         return crush_ice(select_cube(r))
      5     else:
      6         return serve_drink(r,sims)

<ipython-input-30-8e74b1d2add3> in crush_ice(r)
      2     (layer,sims) = layer_report(r)
      3     if not layer:
----> 4         return crush_ice(select_cube(r))
      5     else:
      6         return serve_drink(r,sims)

<ipython-input-30-8e74b1d2add3> in crush_ice(r)
      2     (layer,sims) = layer_report(r)
      3     if not layer:
----> 4         return crush_ice(select_cube(r))
      5     else:
      6         return serve_drink(r,sims)

<ipython-input-30-8e74b1d2add3> in crush_ice(r)
      2     (layer,sims) = layer_report(r)
      3     if not layer:
----> 4         return crush_ice(select_cube(r))
      5     else:
      6         return serve_drink(r,sims)

<ipython-input-30-8e74b1d2add3> in crush_ice(r)
      2     (layer,sims) = layer_report(r)
      3     if not layer:
----> 4         return crush_ice(select_cube(r))
      5     else:
      6         return serve_drink(r,sims)

<ipython-input-30-8e74b1d2add3> in crush_ice(r)
      4         return crush_ice(select_cube(r))
      5     else:
----> 6         return serve_drink(r,sims)

<ipython-input-42-126f3247e5cf> in serve_drink(r, sims)
      4         if sims[ind]>=similarity_threshold:
      5             print("*"*10+"record number: "+str(ind)+"*"*10)
----> 6             sys.stdout.write(cubes[ind].xpath("text()"))
      7         #sys.stdout.write(strip_tags(rec.extract()))
      8         #sys.stdout.write(rec.extract())

/Library/Python/2.7/site-packages/IPython/kernel/zmq/iostream.pyc in write(self, string)
    211             # Make sure that we're handling unicode
    212             if not isinstance(string, unicode_type):
--> 213                 string = string.decode(self.encoding, 'replace')
    214 
    215             is_child = (self._check_mp_mode() == CHILD)

AttributeError: 'SelectorList' object has no attribute 'decode'

In [ ]:


In [ ]: