In [8]:
# Sanity checks for a TMX file: count the tus and tuvs, count the tuvs per tu, and
# inspect the filenames stored in the filesource attribute of the tuvs within a tu.
# Sept 24, 2017: this is a working version.
import sys, codecs
from xml.dom import minidom
In [9]:
# Path of the TMX file to inspect; it is passed to minidom.parse() when the document is loaded.
# File names noted by the author: finalDavid.tmx, fin_filtered_rltc.tmx
# NOTE(review): absolute, machine-specific path -- edit before running on another machine.
arg1 = '/home/masha/birmingham/data/finalDavid.tmx' #finalDavid.tmx fin_filtered_rltc.tmx
In [10]:
# Parse the TMX file named by arg1 and collect its translation units (<tu> elements).
doc = minidom.parse(arg1)

# Locate <body> by tag name instead of by child position: the positional form
# doc.childNodes[1].childNodes[3] only works when the DOCTYPE node and the
# whitespace text nodes between elements are laid out exactly as expected.
bodies = doc.getElementsByTagName("body")
if not bodies:
    raise ValueError("no <body> element found in %s" % arg1)
body = bodies[0]

# getElementsByTagName searches all descendants, so this is every <tu> under <body>.
tus = body.getElementsByTagName("tu")
In [11]:
# Report how many <tu> and <tuv> elements the <body> subtree contains.
# Single-argument print(...) with %-formatting produces the same text as the
# original print statements on Python 2 and also runs on Python 3.
tu_num = len(tus)
print("I have %d translation units in this tmx" % tu_num)

# getElementsByTagName searches all descendants, so this is every <tuv> under <body>.
tuvs = body.getElementsByTagName("tuv")
tuv_num = len(tuvs)
print("I have %d translation unit variants in this tmx" % tuv_num)
In [12]:
# See how many tuvs there are per tu.
# nums[i] is the number of <tuv> elements found under the i-th <tu>.
# The per-TU lookup is deliberately NOT stored in `tuvs`, so the notebook-level
# `tuvs` (all tuvs under <body>) is not overwritten by this cell.
nums = [len(tu.getElementsByTagName("tuv")) for tu in tus]

# Classify every TU by its tuv count; exactly two is the case tracked as "looksgood".
under2 = sum(1 for count in nums if count < 2)
looksgood = sum(1 for count in nums if count == 2)
over2 = sum(1 for count in nums if count > 2)

# Output text is kept identical to the original print statements
# (including the double space after the number).
print("%d  TUs have > 2 tuvs" % over2)
print("%d  TUs have < 2 tuvs" % under2)
print("%d  TUs have exactly 2 tuvs" % looksgood)
#print nums
# Cross-check: the TU total and the sum of per-TU tuv counts.
print("TUs: %d tuvs: %d" % (len(nums), sum(nums)))
In [6]:
# Check that each tuv carries a filename in its filesource attribute.
## Disabled (commented out below): when enabled, it prints a long list pairing the filesource
## of each tuv of type 'Source' with the filesource of each tuv of type 'Translation'.
# for tu in tus:
# tuvs = tu.getElementsByTagName("tuv")
# for tuv in tuvs:
# tuv_type = tuv.getAttribute('type')
# if tuv_type == 'Source':
# tuv_source = tuv # tuv_source is a new name for tuv
# filesource = tuv_source.getAttribute('filesource')
# print 'Source: ', filesource
# print 'Translations: '
# continue
# if tuv_type == 'Translation':
# tuv_target = tuv
# filetarget = tuv_target.getAttribute('filesource')
# print '\t', filetarget, '\n'
In [ ]:
In [ ]: