In [4]:
import nltk
from lxml import etree
import pandas as pd
% matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [12, 6]
In [5]:
class Text():
    """Locate marked-up elements in an XML document and chart their distribution.

    Element positions are expressed as source line numbers. The span from the
    line of the first ``<text>`` element (``textBegin``) to the number of
    lines in the file (``textEnd``) is cut into equal-width buckets, and the
    number of matching elements per bucket is plotted with pandas.
    """

    def __init__(self, filename):
        """Parse ``filename`` and record the line range covered by the text.

        ``filename`` is handed both to ``etree.parse`` and to ``open``.
        """
        self.tree = etree.parse(filename)
        # Count the lines with a context manager so the handle is closed
        # immediately instead of being left for the garbage collector.
        with open(filename) as handle:
            self.textEnd = sum(1 for _ in handle)
        self.textBegin = self.tree.find('.//text').sourceline

    def getLang(self, lang):
        """Return the source line of every ``<seg>`` whose ``xml:lang`` is ``lang``."""
        # map XML built-in namespace so we can search for xml:lang
        nsmap = {"xml": "http://www.w3.org/XML/1998/namespace"}
        xpath = './/seg[@xml:lang="%s"]' % lang
        matches = self.tree.findall(xpath, namespaces=nsmap)
        return [match.sourceline for match in matches]

    def showLang(self, lang):
        """Draw a bar chart of per-bucket counts for a single language code."""
        lines = self.getLang(lang)
        pd.Series(self.makeBuckets(lines)).plot(kind="bar")

    def showLangs(self, langs):
        """Draw one line per language code in ``langs`` on a shared plot."""
        self._plotCounts(langs, self.getLang)

    def getFeature(self, feature):
        """Return the source lines of ``<seg>`` and ``<lg>`` elements typed ``feature``.

        All ``<seg>`` matches come first, followed by all ``<lg>`` matches.
        """
        xpath = './/seg[@type="%s"]' % feature
        xpath2 = './/lg[@type="%s"]' % feature
        matches = self.tree.findall(xpath) + self.tree.findall(xpath2)
        return [match.sourceline for match in matches]

    def showFeatures(self, features):
        """Draw one line per ``type`` value in ``features`` on a shared plot."""
        self._plotCounts(features, self.getFeature)

    def _plotCounts(self, keys, getLines):
        """Bucket the lines ``getLines`` returns for each key and plot them together."""
        counts = {}
        for key in keys:
            counts[key] = self.makeBuckets(getLines(key))
        pd.DataFrame(counts).plot()

    def makeBuckets(self, lines, numBuckets=20):
        """Count how many of ``lines`` fall into each equal-width bucket.

        The range ``textBegin`` .. ``textEnd`` is divided into ``numBuckets``
        slices. Each slice includes its lower edge and excludes its upper
        edge, except the last slice, which also includes ``textEnd``. Every
        line inside the range is therefore counted exactly once; lines
        outside the range are ignored.

        Returns a dict mapping bucket index (``0 .. numBuckets - 1``) to count.
        """
        buckets = {bucket: 0 for bucket in range(numBuckets)}
        span = self.textEnd - self.textBegin
        if not buckets or span <= 0:
            # Nothing to distribute lines over.
            return buckets
        for line in lines:
            offset = line - self.textBegin
            if offset < 0 or offset > span:
                continue
            # Integer arithmetic keeps boundary lines exact (no float rounding);
            # min() folds a line equal to textEnd into the final bucket.
            index = min((offset * numBuckets) // span, numBuckets - 1)
            buckets[index] += 1
        return buckets
In [7]:
# Parse the XML document once, then chart where the marked-up passages occur.
t = Text('../portrait.xml')
# showLangs only draws the plot and returns None, so its result is not bound
# to a name.
t.showLangs(['la', 'fr', 'ita'])
# Distribution of <seg>/<lg> elements whose type attribute is "song".
t.showFeatures(['song'])
In [ ]: