In [18]:
import requests
from scrapy.http import TextResponse
import re
import csv
import collections
import sys
In [34]:
def strip_text(data):
    """Return the list of HTML tags found in `data`.

    e.g. '<p>hi</p>' -> ['<p>', '</p>'].  Non-greedy, so nested/adjacent
    tags are returned individually.
    """
    return re.findall('<.*?>', data)
def strip_tags(data):
    """Return `data` with every HTML tag removed, leaving only the text."""
    return re.sub('<.*?>', "", data)
def strip_meta(tags):
    """Strip metadata (classes, attributes, etc.) from a list of HTML tags.

    '<div class="x">' -> '<div>'.  Comment/doctype tags ('<!...') are
    dropped entirely; tags without attributes (no space after the tag
    name) are kept unchanged.
    """
    # Raw string fixes the invalid '\<' and '\ ' escape sequences of the
    # original pattern, which are deprecated (future SyntaxError) in
    # Python 3.  The character class is preserved exactly as written
    # (letters plus ' ' and '|').
    name_with_space = re.compile(r"\A<[a-z | A-Z]* ")
    cleantags = []
    for tag in tags:
        if tag[1] == "!":
            # HTML comment or doctype: discard.
            continue
        match = name_with_space.findall(tag)
        if not match:
            # No attributes present: keep the tag as-is.
            cleantags.append(tag)
        else:
            # Drop the trailing space and re-close the tag.
            cleantags.append(match[0][:-1] + ">")
    return cleantags
def sanitise(raw_tags, codex):
    """Map each HTML tag in `raw_tags` to a short code string.

    `codex` is the path of a two-column CSV file (tag, code) -- the
    notebook uses 'tags4.csv'.  Tags not present in the codex are
    silently skipped, preserving the original best-effort behaviour.

    Returns the concatenated code string.
    """
    # Text mode, not 'rb': Python 3's csv module requires str input.
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(codex, 'r') as codex_file:
        reader = csv.reader(codex_file)
        tag_dict = dict((row[0], row[1]) for row in reader)
    sanitised_list = []
    for item in raw_tags:
        try:
            sanitised_list.append(tag_dict[item])
        except KeyError:
            # Unknown tag: deliberately skipped.  The original bare
            # 'except:' is narrowed so real errors are not swallowed.
            pass
    return "".join(sanitised_list)
In [20]:
def most_common(lst):
    """Return the single most frequent element of `lst`.

    Ties follow Counter.most_common ordering; an empty list raises
    IndexError/ValueError, as in the original.
    """
    (winner, _count), = collections.Counter(lst).most_common(1)
    return winner
In [21]:
#strike a match algorithm!
#http://www.catalysoft.com/articles/StrikeAMatch.html
#http://stackoverflow.com/questions/653157/a-better-similarity-ranking-algorithm-for-variable-length-strings
def get_bigrams(s):
    """Return the list of adjacent-character bigrams of `s`.

    'abc' -> ['ab', 'bc']; strings shorter than 2 characters yield [].
    """
    # range(), not xrange(): xrange is Python-2-only and raises
    # NameError under Python 3; range behaves identically here.
    return [s[i:i + 2] for i in range(len(s) - 1)]
def similarity(str1, str2):
    """Strike-a-match bigram similarity between two strings.

    Returns 2*matches / (total bigrams) in [0.0, 1.0]; each bigram of
    `str2` can be matched at most once.  Empty input scores 0.0.

    See http://www.catalysoft.com/articles/StrikeAMatch.html
    """
    if str1 == "" or str2 == "":
        return 0.0

    bigrams_a = get_bigrams(str1)
    bigrams_b = get_bigrams(str2)
    total = len(bigrams_a) + len(bigrams_b)
    if total == 0:
        # Both strings are single characters: no bigrams to compare.
        return 0.0

    matches = 0
    unmatched = list(bigrams_b)
    for pair in bigrams_a:
        if pair in unmatched:
            matches += 1
            unmatched.remove(pair)  # consume: one-to-one matching
    return (2.0 * matches) / total
In [37]:
def layer_report(r):
    """Report how structurally similar the direct children of `r` are.

    Each child element's tag structure is reduced to a code string
    (strip_text -> strip_meta -> sanitise with 'tags4.csv'), compared
    against the most common structure with the bigram `similarity`
    metric.

    Reads module-level thresholds: similarity_threshold,
    node_threshold, proportion_threshold, ave_similarity_threshold.

    Returns (done, sims): `done` is True when the layer looks like a
    set of records; `sims` is the per-child similarity list.
    """
    structures = []
    for mem in r.xpath("*"):
        raw_tags = strip_meta(strip_text(mem.extract()))
        structures.append(sanitise(raw_tags, 'tags4.csv'))
    # One structure per direct child, so this replaces the original's
    # redundant second r.xpath("*") call.
    node_count = len(structures)
    if node_count == 0:
        # Leaf node with no children: the original divided by zero here.
        print("no child nodes: nothing to compare")
        return (False, [])
    mc = most_common(structures)
    sims = [similarity(s, mc) for s in structures]
    ave_sim = sum(sims) / len(sims)
    qual_nodes = sum([similarity_threshold <= x <= 1 for x in sims])
    proportion = qual_nodes / float(node_count)
    done = (
        (qual_nodes >= node_threshold) and
        (proportion >= proportion_threshold) and
        (ave_sim >= ave_similarity_threshold)
    )
    print("average similarity: " + str(ave_sim))
    print("node count: " + str(node_count))
    print("qualifying nodes: " + str(qual_nodes))
    print("proportion of records similar: " + str(proportion))
    print("Am I done? : " + str(done))
    return (done, sims)
In [23]:
def select_cube(r):
    """Return the direct child of `r` with the most descendant elements.

    Ties are broken in favour of the first such child (same as the
    original sizes.index(max(sizes)) behaviour).
    """
    children = r.xpath("*")
    descendant_counts = [len(child.xpath("descendant::*")) for child in children]
    best = max(range(len(descendant_counts)), key=descendant_counts.__getitem__)
    print(
        "returning node:" + str(best) +
        " with descendents: " + str(descendant_counts[best])
    )
    return children[best]
In [30]:
def crush_ice(r):
    """Descend into the densest subtree until a record-like layer is found.

    Repeatedly runs layer_report(); when the layer qualifies, hands off
    to serve_drink().  Iterative form of the original tail recursion --
    identical behaviour, but no recursion-depth limit.
    """
    node = r
    while True:
        done, sims = layer_report(node)
        if done:
            return serve_drink(node, sims)
        node = select_cube(node)
In [42]:
def serve_drink(r, sims):
    """Print the text of each child of `r` whose similarity qualifies.

    `sims` is the per-child similarity list from layer_report(); reads
    the module-level similarity_threshold.  Returns "finished".
    """
    for index, cube in enumerate(r.xpath("*")):
        if sims[index] >= similarity_threshold:
            print("*" * 10 + "record number: " + str(index) + "*" * 10)
            sys.stdout.write(strip_tags(cube.extract()))
    return "finished"
In [40]:
def export(cube):
    """Make a record from the selected cube.

    TODO: unimplemented placeholder -- currently a no-op.
    """
    pass
In [43]:
# Configuration: these thresholds are read as module-level globals by
# layer_report() and serve_drink(), so the names must stay unchanged.
similarity_threshold = 0.60
node_threshold = 50
proportion_threshold = 0.70
ave_similarity_threshold = 0.70

# Fetch a sample ORA search-results page and wrap it for XPath querying.
http_response = requests.get('http://ora.ox.ac.uk/search/detailed?q=%2A%3A%2A&truncate=450&rows=50&sort=timestamp%20desc')
response = TextResponse(http_response.url, body=http_response.text, encoding='utf-8')

crush_ice(response)
In [ ]:
In [ ]: