In [ ]:
import os

in_dir = "./data/constitutions/"

# we create a dictionary with our constitutions:
sources = {}
for file in sorted(os.listdir(in_dir)):
    key = os.path.basename(file).split(os.extsep)[0]
    with open(os.path.join(in_dir, file), encoding="utf-8") as f:
        sources[key] = f.read()

# and a list of available constitutions for quick lookup:
constitutions = list(sources.keys())

print("{} files read:".format(len(constitutions)))
print(constitutions)
In [ ]:
from nltk import tokenize
# nltk.download('punkt')

sentences = {}
nos = {}
for c in constitutions:
    t = tokenize.sent_tokenize(sources[c])
    nos[c] = len(t)
    for i, s in enumerate(t):
        sentences[c + '_' + str(i)] = s

# index of the first sentence of the target text; this assumes that the
# Burma constitution is the last file read, so that its sentences form
# the final block of the sentences dict:
burma = '1948_-_BU_-_Burma_-_constitution_of_burma'
boundary = len(sentences) - nos[burma]

print("Corpus has {} sentences.".format(len(sentences)))
print("{} has {}.\n".format(burma, nos[burma]))
print("Its first 3 sentences are:\n{}".format([sentences[burma + '_0'],
                                               sentences[burma + '_1'],
                                               sentences[burma + '_2']]))
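Since Python 3.7, dictionaries preserve insertion order, so a parallel list of the sentence IDs can later be used to map rows of the document-feature matrix built below back to sentences. The names sent_ids, source_ids and target_ids are introduced here only for convenience; they are not part of the original workflow:
In [ ]:
# ordered list of sentence IDs: row i of the document-feature matrix built
# below corresponds to sent_ids[i] (dicts keep insertion order in Python 3.7+)
sent_ids = list(sentences.keys())
source_ids = sent_ids[:boundary]   # sentences of the source constitutions
target_ids = sent_ids[boundary:]   # sentences of the Burma constitution
print(len(source_ids), len(target_ids))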
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer='word', strip_accents='unicode', stop_words=["the", "of", "and"])
dfm = vectorizer.fit_transform(sentences.values())
print(dfm.shape)
print(type(dfm))
print(dfm.toarray())
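To see which token each column of the document-feature matrix stands for, the fitted vectorizer's vocabulary can be inspected. A minimal sketch, assuming scikit-learn ≥ 1.0 (earlier versions call the method get_feature_names() instead):
In [ ]:
import numpy as np
# map column indices of dfm back to tokens (scikit-learn >= 1.0)
vocab = vectorizer.get_feature_names_out()
print("{} distinct tokens".format(len(vocab)))
# the ten most frequent tokens in the whole corpus:
counts = np.asarray(dfm.sum(axis=0)).ravel()
for idx in counts.argsort()[::-1][:10]:
    print(vocab[idx], counts[idx])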
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
# rows of the target text (Burma) vs. rows of the source constitutions
target = dfm[boundary:]
sources = dfm[:boundary]
print(target.shape)
print(sources.shape)
# one similarity score for every (target sentence, source sentence) pair
simils = cosine_similarity(target, sources)
print(simils.shape)
In [ ]:
import numpy as np
np.argmax(simils, axis=1)
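np.argmax only returns column indices. To make the result readable, these can be mapped back to sentence IDs and paired with the similarity scores, e.g. via the sent_ids lists sketched above. A minimal sketch:
In [ ]:
# for each sentence of the target text, report the most similar source
# sentence and its cosine similarity (relies on the ID lists sketched above)
best = np.argmax(simils, axis=1)
for row, col in enumerate(best[:5]):      # first five target sentences only
    print("{}  ->  {}  (cosine similarity {:.2f})".format(
        target_ids[row], source_ids[col], simils[row, col]))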
In [ ]:
import os

in_dir = "./data/manual/"

# we create a dictionary with our manuales:
sources = {}
for file in sorted(os.listdir(in_dir)):
    key = os.path.basename(file).split(os.extsep)[0]
    with open(os.path.join(in_dir, file), encoding="utf-8") as f:
        sources[key] = f.read()

# and a list of available manuales for quick lookup:
manuales = list(sources.keys())

print("{} files read:".format(len(manuales)))
print(manuales)
In [ ]:
from nltk import tokenize
# nltk.download('punkt')

sentences = {}
nos = {}
for c in manuales:
    t = tokenize.sent_tokenize(sources[c])
    nos[c] = len(t)
    for i, s in enumerate(t):
        sentences[c + '_' + str(i)] = s

print("Corpus has {} sentences.".format(len(sentences)))
print("azp1552_ch17 has {}.\n".format(nos['azp1552_ch17']))
print("Its sentences 2 to 4 are:\n{}".format([sentences['azp1552_ch17_2'],
                                              sentences['azp1552_ch17_3'],
                                              sentences['azp1552_ch17_4']]))
In [2]:
import lxml
from lxml import etree
import glob
import re
import os

in_dir = "./data/manual/"

sources = glob.glob(in_dir + '*.xml')

parsed = {os.path.basename(file).split(os.extsep)[0]: etree.parse(file)
          for file in sorted(sources)}
manuales = list(parsed.keys())
print(manuales)

nsmap = {"tei": "http://www.tei-c.org/ns/1.0"}

def flatten(element):
    t = ""
    # Dagger milestones
    if element.get("rendition") == "#dagger":
        t += "†"
        if element.tail:
            t += str.replace(element.tail, "\n", " ")
    # Asterisk milestones (additions in the 1556 ed.) - create temporary marker
    elif element.get("rendition") == "#asterisk":
        t += "*"
        if element.tail:
            t += str.replace(element.tail, "\n", " ")
    # Unanchored milestones - create temporary marker
    elif element.get("rendition") == "#unanchored":
        t += "‡"
        if element.tail:
            t += str.replace(element.tail, "\n", " ")
    else:
        for c in element.iter("expan"):
            flatten(c)
            if element.tail:
                t += str.replace(element.tail, "\n", " ")
        for c in element.iter("corr"):
            flatten(c)
            if element.tail:
                t += str.replace(element.tail, "\n", " ")
        if element.text:
            t += str.replace(element.text, "\n", " ")
        if element.getchildren():
            t += " ".join((flatten(child)) for child in element.getchildren())
        if element.tail:
            t += str.replace(element.tail, "\n", " ")
    return t

xp_divs = etree.XPath("(//tei:body/tei:div[@type = 'chapter'][not(@n = '0')])", namespaces=nsmap)

divs = {}
text = {}
for ed in manuales:
    divs[ed] = xp_divs(parsed[ed])
    t1 = "".join("++div--" + re.sub(r'\s+', ' ', '<p>' + flatten(div)) for div in divs[ed])
    t2 = re.sub(r'¶', '++break--¶', t1)        # break where pilcrow signs are
    t3 = re.sub(r'([:\.\?\]])\s+([A-Z])(?!([CIJLVX]+|.)?\.)(?![^†‡*]{0,80}[:\.\?\]][^a-z]*[A-Z])(?=.{0,80}[†‡*])',
                r'\1 ++break-- \2', t2)
    # ... break before sentences beginning with punctuation, whitespace, and a
    # capital letter (not immediately followed by an abbreviation period),
    # where a milestone follows within 80 characters
    # (that do not contain a punctuation character)
    t4 = re.sub(r'\b([A-Z]{2}\s*[a-z])', r'++break-- \1', t3)   # break before words beginning with two capital letters
    t5 = t4[::-1]                                               # reverse the string
    t6 = re.sub(r'([†‡*])(?!.{0,100}--kaerb)', r'\1--kaerb++', t5)  # daggers without sentence boundaries,
                                                                    # i.e. not covered above
    t7 = t6[::-1]                                               # reverse the string again
    t8 = re.sub(r'‡', '', t7)   # eliminate temporary markers: unanchored milestones
    # Concat everything and do a final removal of redundant breaks.
    t9 = re.sub(r'\+\+break--\s*\+\+break--', '++break--', " ".join(t8.strip().split()))
    t10 = re.sub(r'\+\+break--', r'<milestone type="lera-segment"/>', t9)
    t11 = re.sub(r'\+\+div--', r'</div><div type="chapter">', t10)
    # escape raw ampersands, drop the leading "</div>" and wrap everything in a root element
    text[ed] = '<root>' + re.sub(r'&', '&amp;', t11)[6:] + '</div></root>'

print("text['azp1552_ch17'] is:\n{}...".format(text['azp1552_ch17'][:400]))
In [8]:
sentences = {}
nos = {}
for ed in manuales:
    sentences[ed] = {}
    segments = text[ed].split('<milestone type="lera-segment"/>')
    nos[ed] = len(segments)
    for i, s in enumerate(segments):
        sentences[ed][ed + '_' + str(i)] = s.strip()

print("Corpus has {} sentences.".format(sum(nos.values())))
print("azp1552_ch17 has {}.\n".format(nos['azp1552_ch17']))
print("Its first 5 sentences are:\n{}".format([sentences['azp1552_ch17']['azp1552_ch17_0'],
                                               sentences['azp1552_ch17']['azp1552_ch17_1'],
                                               sentences['azp1552_ch17']['azp1552_ch17_2'],
                                               sentences['azp1552_ch17']['azp1552_ch17_3'],
                                               sentences['azp1552_ch17']['azp1552_ch17_4']]))
Save the sentences as plain-text CSV files, one per edition.
In [9]:
import csv

for ed in manuales:
    with open('./data/manual/' + ed + '_seg.csv', 'w', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file, lineterminator="\n")
        for key, value in sentences[ed].items():
            writer.writerow([key, value])
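As a quick check of the export, one of the CSV files can be read back in; the number of rows should match the segment count computed above. A minimal sketch:
In [ ]:
import csv
# sanity check: read one of the exported files back into a dictionary
restored = {}
with open('./data/manual/azp1552_ch17_seg.csv', encoding='utf-8') as csv_file:
    for key, value in csv.reader(csv_file):
        restored[key] = value
print(len(restored) == nos['azp1552_ch17'])
print(restored['azp1552_ch17_0'][:80])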