In [3]:
import requests
from scrapy.http import TextResponse
import re
import csv
In [4]:
def strip_text(data):
    # return a list of the html tags found in an html string (the text between tags is discarded)
    p = re.compile('<.*?>')
    return p.findall(data)
In [5]:
def strip_tags(data):
    # replace every html tag in a string with "*", leaving only the text
    p = re.compile('<.*?>')
    return p.sub("*", data)
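A quick check of both helpers on a made-up snippet:

html = "<div class='row'><p>hello</p></div>"
print(strip_text(html))   # ["<div class='row'>", '<p>', '</p>', '</div>']
print(strip_tags(html))   # **hello**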
In [6]:
def strip_meta(tags):
    # strip metadata (classes, attributes etc.) from a list of html tags,
    # keeping only the bare tag name; comments and doctypes ("<!...") are dropped
    cleantags = []
    p = re.compile(r"\A<[a-zA-Z]* ")
    for tag in tags:
        if tag[1] == "!":
            pass
        else:
            new_tag = p.findall(tag)
            if new_tag == []:
                cleantags.append(tag)
            else:
                cleantags.append(new_tag[0][:-1] + ">")
    return cleantags
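For example (illustrative input), attribute-laden opening tags collapse to their bare names, while closing tags pass through unchanged:

tags = strip_text('<div class="row"><a href="/x">link</a></div>')
print(strip_meta(tags))   # ['<div>', '<a>', '</a>', '</div>']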
In [7]:
def sanitise(raw_tags, codex):
    # map each tag to an integer alphabet via the codex csv; codex is e.g. "tags3.csv"
    reader = csv.reader(open(codex))
    tag_dict = dict((x[0], int(x[1])) for x in reader)
    sanitised_list = []
    for item in raw_tags:
        try:
            sanitised = tag_dict[item]
            sanitised_list.append(sanitised)
        except KeyError:
            # tags missing from the codex are silently skipped
            pass
    return sanitised_list
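The codex files (tags2.csv, tags3.csv, tags4.csv) never appear in the notebook; assuming a two-column tag,code layout, sanitise would behave like this:

# hypothetical codex contents -- the real tags*.csv files are not shown here:
#   <div>,1
#   <a>,2
#   </a>,3
#   </div>,4
sanitise(['<div>', '<a>', '</a>', '</div>'], 'tags3.csv')   # -> [1, 2, 3, 4]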
In [22]:
def layer_search(r):
    # descend one level: return the child of r with the most descendants
    members = r.xpath("*")
    big_layer_size = -1   # start below zero so the first child always qualifies
    big_layer_ind = 1
    layer_ind = 0
    next_member = None    # stays None only if r has no children
    for member in members:
        layer_ind += 1
        layer_size = len(member.xpath("descendant::*"))
        print("descendants of node " + str(layer_ind) + ": " + str(layer_size))
        if layer_size > big_layer_size:
            next_member = member
            big_layer_ind = layer_ind
            big_layer_size = layer_size
    print("returning node " + str(big_layer_ind))
    return next_member
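A minimal, self-contained check on a made-up document; layer_search should pick the <div>, since it is the only child of <body> with descendants:

from scrapy.selector import Selector
sel = Selector(text="<body><div><p>a</p><p>b</p></div><span>c</span></body>")
densest = layer_search(sel.xpath("//body"))
print(densest.extract())   # '<div><p>a</p><p>b</p></div>'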
In [143]:
def recurse(func, thing, times):
    # apply func to thing `times` times, printing a layer_report at each level
    if times == 0:
        layer_report(thing)
        return thing
    else:
        layer_report(thing)
        return recurse(func, func(thing), times - 1)
In [200]:
from collections import Counter

def Most_Common(lst):
    # return the single most frequent element of lst
    data = Counter(lst)
    return data.most_common(1)[0][0]
In [179]:
#strike a match algorithm!
#http://www.catalysoft.com/articles/StrikeAMatch.html
#http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
def get_bigrams(s):
    '''
    Takes a string and returns a list of bigrams
    '''
    return [s[i:i + 2] for i in range(len(s) - 1)]

def string_similarity(str1, str2):
    '''
    Perform bigram comparison between two strings
    and return a percentage match in decimal form
    '''
    if str1 == "" or str2 == "":
        return 0.0
    pairs1 = get_bigrams(str1)
    pairs2 = get_bigrams(str2)
    union = len(pairs1) + len(pairs2)
    if union == 0:
        # both strings are single characters, so there are no bigrams to compare
        return 0.0
    hit_count = 0
    for x in pairs1:
        for y in pairs2:
            if x == y:
                hit_count += 1
                pairs2.remove(y)   # each bigram in pairs2 may only be matched once
                break
    return (2.0 * hit_count) / union
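A quick worked check: "night" and "nacht" share only the bigram "ht", so the score is 2*1/(4+4) = 0.25.

print(get_bigrams("night"))                 # ['ni', 'ig', 'gh', 'ht']
print(string_similarity("night", "nacht"))  # 0.25
print(string_similarity("night", "night"))  # 1.0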
In [83]:
def sanitise2(raw_tags, codex):
    # like sanitise, but the codex maps tags to single characters,
    # so each tag list becomes a string the bigram similarity can score
    reader = csv.reader(open(codex))
    tag_dict = dict((x[0], x[1]) for x in reader)
    sanitised_list = []
    for item in raw_tags:
        try:
            sanitised = tag_dict[item]
            sanitised_list.append(sanitised)
        except KeyError:
            pass
    return "".join(sanitised_list)
In [191]:
def layer_report(r):
    # encode each child of r as a structure string, then measure how
    # similar the children are to the most common structure.
    # relies on the module-level similarity_threshold set in a later cell.
    struct_list = []
    for mem in r.xpath("*"):
        raw_tags = strip_meta(strip_text(mem.extract()))
        san = sanitise2(raw_tags, 'tags4.csv')
        struct_list.append(san)
    bench = Most_Common(struct_list)
    sim_list = [string_similarity(s, bench) for s in struct_list]
    average_similarity = sum(sim_list) / len(sim_list)
    number = len(struct_list)
    qualifying_records = sum(similarity_threshold <= x <= 1 for x in sim_list)
    proportion = qualifying_records / float(number)
    print("average similarity is: " + str(average_similarity))
    print("number of nodes is: " + str(number))
    print("number of qualifying records: " + str(qualifying_records))
    print("proportion of similar records: " + str(proportion))
In [27]:
# make a function that counts descendants. This will simplify greatly.
# for each member in a member "layer" list, get their counts. Don't immediately take the largest.
# Look at them and see if most of them are about the same size. If they are, we have likely found the repeating unit;
# if not, we need to go down into the most "descendant heavy" member and continue.
# if the number of nodes is "large",
# and the descendants of the nodes are "similar",
# and the STRUCTURES OF THE NODES have common repeating substrings...
# these three should guarantee the repeat unit has been found.
# plan shaping up:
# 1: get the most commonly occurring structure
# 2: encode members into letter strings by structure
# 3: rank their similarity to the most commonly occurring one
# 4: those above a certain threshold are considered records
# 5: find the shortest and longest records, then extract the records (sketched below)
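Step 5 isn't implemented anywhere in this notebook; a minimal sketch, assuming a layer r that has already passed the thresholds (extract_records is a name introduced here, not the author's):

def extract_records(r, codex='tags4.csv'):
    # hypothetical step 5: keep the children of r whose encoded structure
    # is close enough to the most common structure to count as a record
    members = r.xpath("*")
    structs = [sanitise2(strip_meta(strip_text(m.extract())), codex) for m in members]
    bench = Most_Common(structs)
    return [m for m, s in zip(members, structs)
            if string_similarity(s, bench) >= similarity_threshold]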
In [131]:
r = requests.get('http://ora.ox.ac.uk/search/detailed?q=%2A%3A%2A&truncate=450&rows=50&sort=timestamp%20desc')
response = TextResponse(r.url, body=r.text, encoding='utf-8')
body = response.xpath("//body").extract()
raw_tags = strip_meta(strip_text(body[0]))
san_list = sanitise(raw_tags, 'tags2.csv')
In [201]:
similarity_threshold = 0.65
number_threshold = 50
proportion_threshold = 0.70
average_similarity_threshold = 0.80
b = recurse(layer_search, response.xpath("//body"), 5)
print(strip_tags(b.extract()))
In [ ]:
def layer_report_dev(r):
    # like layer_report, but also returns True when the layer passes all
    # three thresholds, i.e. it looks like a layer of repeating records
    struct_list = []
    for mem in r.xpath("*"):
        raw_tags = strip_meta(strip_text(mem.extract()))
        san = sanitise2(raw_tags, 'tags4.csv')
        struct_list.append(san)
    bench = Most_Common(struct_list)
    sim_list = [string_similarity(s, bench) for s in struct_list]
    average_similarity = sum(sim_list) / len(sim_list)
    number = len(struct_list)
    qualifying_records = sum(similarity_threshold <= x <= 1 for x in sim_list)
    proportion = qualifying_records / float(number)
    print("average similarity is: " + str(average_similarity))
    print("number of nodes is: " + str(number))
    print("number of qualifying records: " + str(qualifying_records))
    print("proportion of similar records: " + str(proportion))
    return ((number >= number_threshold) and
            (proportion >= proportion_threshold) and
            (average_similarity >= average_similarity_threshold))
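With the boolean return value in place, the fixed recursion depth used above could be replaced by a loop that descends until a qualifying layer is found. A minimal sketch (find_record_layer and max_depth are hypothetical names, not from the notebook):

def find_record_layer(node, max_depth=10):
    # descend into the densest child until a layer passes all three
    # thresholds in layer_report_dev, or give up after max_depth levels
    for _ in range(max_depth):
        if layer_report_dev(node):
            return node
        node = layer_search(node)
    return None

records_layer = find_record_layer(response.xpath("//body"))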