In [3]:
import requests
from scrapy.http import TextResponse
import re
import csv

In [4]:
def strip_text(data):
    # discard the text content of an HTML string and return a list of its tags
    p = re.compile('<.*?>')
    return p.findall(data)

In [5]:
def strip_tags(data):
    # replace each HTML tag in a string with "*", leaving only the text content
    p = re.compile('<.*?>')
    return p.sub("*", data)
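
A quick sanity check of the two helpers on a toy snippet (illustrative only, not from the original run): strip_text keeps the tags, strip_tags keeps the text.

In [ ]:
# toy example: the two helpers are complementary
snippet = '<p class="x">hi</p>'
print(strip_text(snippet))   # ['<p class="x">', '</p>']
print(strip_tags(snippet))   # *hi*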

In [6]:
def strip_meta(tags):
    # strip metadata (classes, attributes, etc.) from a list of HTML tags,
    # dropping comments/doctypes (tags whose second character is "!")
    cleantags = []
    p = re.compile(r"\A<[a-zA-Z]+ ")
    for tag in tags:
        if tag[1] == "!":
            pass
        else:
            new_tag = p.findall(tag)
            if new_tag == []:
                cleantags.append(tag)
            else:
                cleantags.append(new_tag[0][:-1] + ">")
    return cleantags
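
For example (illustrative, not from the original run):

In [ ]:
# toy example: attributes are stripped, comments are dropped, closing tags pass through
print(strip_meta(['<div class="rec">', '<!-- note -->', '</div>', '<a href="x">']))
# ['<div>', '</div>', '<a>']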

In [7]:
def sanitise(raw_tags, codex):
    # map each tag to an integer alphabet; codex is a two-column CSV
    # (tag, integer) such as "tags2.csv". Unknown tags are silently dropped.
    reader = csv.reader(open(codex, 'rb'))
    tag_dict = dict((x[0], int(x[1])) for x in reader)
    sanitised_list = []
    for item in raw_tags:
        try:
            sanitised_list.append(tag_dict[item])
        except KeyError:
            pass
    return sanitised_list
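
The codex files themselves are not included in this notebook; a minimal sketch of the expected format, using a hypothetical toy codex, would be:

In [ ]:
# sketch: build a toy codex to show the expected two-column CSV format
# (hypothetical; the real tags2.csv is not shown in this notebook)
with open('toy_codex.csv', 'wb') as f:
    csv.writer(f).writerows([('<div>', 1), ('</div>', 2), ('<a>', 3), ('</a>', 4)])
print(sanitise(['<div>', '<a>', '</a>', '</div>'], 'toy_codex.csv'))
# [1, 3, 4, 2]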

In [22]:
def layer_search(r):
    # descend one level: return the child of r with the most descendants
    members = r.xpath("*")
    big_layer_size = -1
    big_layer_ind = 1
    layer_ind = 0
    next_member = None
    for member in members:
        layer_ind += 1
        layer_size = len(member.xpath("descendant::*"))
        print("descendants of node " + str(layer_ind) + ": " + str(layer_size))
        if layer_size > big_layer_size:
            next_member = member
            big_layer_ind = layer_ind
            big_layer_size = layer_size
    print("returning node " + str(big_layer_ind))
    return next_member

In [143]:
def recurse(func, thing, times):
    # report on the current layer, then apply func and recurse `times` more levels
    layer_report(thing)
    if times == 0:
        return thing
    return recurse(func, func(thing), times - 1)

In [200]:
from collections import Counter

def Most_Common(lst):
    # return the single most frequent element of lst
    data = Counter(lst)
    return data.most_common(1)[0][0]
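
For instance:

In [ ]:
print(Most_Common(['a', 'b', 'a', 'c']))   # a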

In [179]:
#strike a match algorithm!
#http://www.catalysoft.com/articles/StrikeAMatch.html
#http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
def get_bigrams(s):
    '''
    Takes a string and returns a list of bigrams
    '''
    return [s[i:i+2] for i in xrange(len(s) - 1)]

def string_similarity(str1, str2):
    '''
    Perform bigram comparison between two strings
    and return a percentage match in decimal form
    '''
    if (str1=="" or str2==""): 
        score = 0.0
    else: 
        pairs1 = get_bigrams(str1)
        pairs2 = get_bigrams(str2)
        union  = len(pairs1) + len(pairs2)
        hit_count = 0
        for x in pairs1:
            for y in pairs2:
                if x == y:
                    hit_count += 1
                    pairs2.remove(y)
                    break
        if union == 0:
            score = 0.
        else: 
            score = (2.0 * hit_count) / union
    return score
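
A worked check, using the FRANCE/FRENCH example from the Strike a Match article: the bigram lists {FR, RA, AN, NC, CE} and {FR, RE, EN, NC, CH} share two pairs out of ten, giving 2*2/10 = 0.4.

In [ ]:
print(string_similarity('FRANCE', 'FRENCH'))   # 0.4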

In [83]:
def sanitise2(raw_tags, codex):
    # like sanitise, but keeps the codes as strings and joins them into one
    # "structure string"; codex is a two-column CSV such as "tags4.csv".
    # Unknown tags are silently dropped.
    reader = csv.reader(open(codex, 'rb'))
    tag_dict = dict((x[0], x[1]) for x in reader)
    sanitised_list = []
    for item in raw_tags:
        try:
            sanitised_list.append(tag_dict[item])
        except KeyError:
            pass
    return "".join(sanitised_list)

In [191]:
def layer_report(r):
    # encode each child of r as a structure string, then score every child's
    # similarity to the most common structure (uses the global similarity_threshold)
    struct_list = []
    for mem in r.xpath("*"):
        raw_tags = strip_meta(strip_text(mem.extract()))
        san = sanitise2(raw_tags, 'tags4.csv')
        struct_list.append(san)
    bench = Most_Common(struct_list)
    sim_list = []
    for s in struct_list:
        sim_list.append(string_similarity(s, bench))
    average_similarity = sum(sim_list) / len(sim_list)
    number = len(struct_list)
    qualifying_records = sum([similarity_threshold <= x <= 1 for x in sim_list])
    proportion = qualifying_records / float(number)
    print("average similarity is: " + str(average_similarity))
    print("number of nodes is: " + str(number))
    print("number of qualifying records: " + str(qualifying_records))
    print("proportion of similar records: " + str(proportion))

In [27]:
#make a function that counts descendants. This will simplify things greatly.
#for each member in a member "layer" list, get their counts. Don't immediately take the largest.
#Look at them and see if most of them are about the same size. If they are, we have likely found the repeating unit;
#if not, we need to go down into the most "descendant-heavy" member and continue.

#if the number of nodes is "large",
#the descendant counts of the nodes are "similar",
#and the STRUCTURE OF THE NODES has common repeating substrings...
#these three together should guarantee the repeat unit has been found.

#plan shaping up (step 5 is sketched below):
#1: get the most commonly occurring structure
#2: encode the members into letter strings by structure
#3: rank their similarity to the most commonly occurring one
#4: those above a certain threshold are considered records
#5: find the shortest and longest records, then extract the records.
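
Step 5 is not implemented in this notebook; a minimal sketch, reusing the encoding and benchmark machinery from layer_report and assuming the globals defined further down, might look like:

In [ ]:
# sketch of step 5 (not in the original notebook): return the child nodes
# whose structure string is similar enough to the benchmark to count as records
def extract_records(r, codex='tags4.csv'):
    members = r.xpath("*")
    structs = [sanitise2(strip_meta(strip_text(m.extract())), codex) for m in members]
    bench = Most_Common(structs)
    return [m for m, s in zip(members, structs)
            if string_similarity(s, bench) >= similarity_threshold]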

In [131]:
# fetch an ORA search-results page and wrap it in a scrapy TextResponse for XPath work
r = requests.get('http://ora.ox.ac.uk/search/detailed?q=%2A%3A%2A&truncate=450&rows=50&sort=timestamp%20desc')
response = TextResponse(r.url, body=r.text, encoding='utf-8')
body = response.xpath("//body").extract()
raw_tags = strip_meta(strip_text(body[0]))
san_list = sanitise(raw_tags, 'tags2.csv')

In [201]:
# heuristics for deciding when a layer looks like a list of records
similarity_threshold = 0.65
number_threshold = 50
proportion_threshold = 0.70
average_similarity_threshold = 0.80
# descend five levels, reporting on each layer, then show the chosen node's text
b = recurse(layer_search, response.xpath("//body"), 5)
print(strip_tags(b.extract()))


average similarity is: 0.0
number of nodes is: 5
number of qualifying records: 0
proportion of similar records: 0.0
descendants of node 1: 1988
descendants of node 2: 21
descendants of node 3: 2
descendants of node 4: 0
descendants of node 5: 0
returning node 1
average similarity is: 0.0
number of nodes is: 5
number of qualifying records: 0
proportion of similar records: 0.0
descendants of node 1: 0
descendants of node 2: 18
descendants of node 3: 1965
descendants of node 4: 0
descendants of node 5: 0
returning node 3
average similarity is: 0.56335078534
number of nodes is: 2
number of qualifying records: 1
proportion of similar records: 0.5
descendants of node 1: 1313
descendants of node 2: 650
returning node 1
average similarity is: 0.334932056398
number of nodes is: 3
number of qualifying records: 1
proportion of similar records: 0.333333333333
descendants of node 1: 4
descendants of node 2: 10
descendants of node 3: 1296
returning node 3
average similarity is: 0.751784883952
number of nodes is: 57
number of qualifying records: 50
proportion of similar records: 0.877192982456
descendants of node 1: 24
descendants of node 2: 8
descendants of node 3: 8
descendants of node 4: 0
descendants of node 5: 20
descendants of node 6: 22
descendants of node 7: 20
descendants of node 8: 21
descendants of node 9: 28
descendants of node 10: 22
descendants of node 11: 28
descendants of node 12: 20
descendants of node 13: 22
descendants of node 14: 21
descendants of node 15: 28
descendants of node 16: 25
descendants of node 17: 25
descendants of node 18: 20
descendants of node 19: 28
descendants of node 20: 28
descendants of node 21: 20
descendants of node 22: 28
descendants of node 23: 28
descendants of node 24: 28
descendants of node 25: 25
descendants of node 26: 25
descendants of node 27: 22
descendants of node 28: 23
descendants of node 29: 20
descendants of node 30: 20
descendants of node 31: 20
descendants of node 32: 20
descendants of node 33: 20
descendants of node 34: 20
descendants of node 35: 21
descendants of node 36: 20
descendants of node 37: 25
descendants of node 38: 30
descendants of node 39: 25
descendants of node 40: 25
descendants of node 41: 22
descendants of node 42: 25
descendants of node 43: 22
descendants of node 44: 21
descendants of node 45: 25
descendants of node 46: 24
descendants of node 47: 23
descendants of node 48: 25
descendants of node 49: 23
descendants of node 50: 25
descendants of node 51: 22
descendants of node 52: 23
descendants of node 53: 25
descendants of node 54: 19
descendants of node 55: 24
descendants of node 56: 8
descendants of node 57: 0
returning node 38
average similarity is: 0.0
number of nodes is: 6
number of qualifying records: 0
proportion of similar records: 0.0
*

**
**Illuminance flow estimation by regression**
**
*
*
*Abstract*

**We investigate the estimation of illuminance flow using Histograms of Oriented Gradient features (HOGs). In a regression setting, we found for both ridge regression and support vector machines, that the optimal solution shows close resemblance to the gradient based structure tensor (also known as the second moment matrix).**Theoretical results are presented showing in detail how the structure tensor and the HOGs are connected. This relati ... [truncated at 450 characters in length]**
*
*
*Author**
            			    
Stefan M. Karlsson; 
                
Sylvia C. Pont; 
                
Jan J. Koenderink; 
                
                
et al
**
*
*Date*
          
*2010*
*
*

*
*
          
**Article**
*
          
*
****
*
          
*
****
          
**
****
*
*


**
*

In [ ]:
def layer_report_dev(r):
    # like layer_report, but also returns True when the layer passes all three
    # heuristics (uses the global number_threshold, proportion_threshold and
    # average_similarity_threshold), so a caller can stop descending
    struct_list = []
    for mem in r.xpath("*"):
        raw_tags = strip_meta(strip_text(mem.extract()))
        san = sanitise2(raw_tags, 'tags4.csv')
        struct_list.append(san)
    bench = Most_Common(struct_list)
    sim_list = []
    for s in struct_list:
        sim_list.append(string_similarity(s, bench))
    average_similarity = sum(sim_list) / len(sim_list)
    number = len(struct_list)
    qualifying_records = sum([similarity_threshold <= x <= 1 for x in sim_list])
    proportion = qualifying_records / float(number)
    print("average similarity is: " + str(average_similarity))
    print("number of nodes is: " + str(number))
    print("number of qualifying records: " + str(qualifying_records))
    print("proportion of similar records: " + str(proportion))
    return ((number >= number_threshold) and
            (proportion >= proportion_threshold) and
            (average_similarity >= average_similarity_threshold))
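
A sketch of how the boolean return could replace the fixed recursion depth used above (my assumption about the intended use; not in the original notebook):

In [ ]:
# sketch (assumed usage): descend with layer_search until layer_report_dev
# signals that a record-like layer has been found, up to a depth limit
def find_record_layer(node, max_depth=10):
    for _ in range(max_depth):
        if layer_report_dev(node):
            return node
        node = layer_search(node)
        if node is None:
            return None
    return None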