In [8]:
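# Sanity checks for a TMX file: count the translation units (tus) and translation unit variants (tuvs),
# count the tuvs per tu, and list the file names stored in the filesource attribute of the tuvs within each tu.
# Sept 24, 2017: this is a working version.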

import sys, codecs
from xml.dom import minidom

In [9]:
# Path to the TMX file that the cells below parse and inspect.
# The trailing note names two files (finalDavid.tmx, fin_filtered_rltc.tmx),
# presumably alternative inputs in the same directory -- TODO confirm.
arg1 = '/home/masha/birmingham/data/finalDavid.tmx' #finalDavid.tmx fin_filtered_rltc.tmx

In [10]:
# Parse the TMX file and produce the list of translation units (<tu> elements).
doc = minidom.parse(arg1)

# Look <body> up by tag name instead of by child position
# (doc.childNodes[1].childNodes[3]): positional indexing silently depends on
# the exact layout of doctype, comment and whitespace nodes in the file.
body_nodes = doc.getElementsByTagName("body")
if not body_nodes:
    # Fail with a readable message rather than an opaque IndexError.
    raise ValueError("no <body> element found in " + arg1)
body = body_nodes[0]

# getElementsByTagName searches all descendants of <body>, not only direct children.
tus = body.getElementsByTagName("tu")

In [11]:
# Report how many <tu> and <tuv> elements the body tree contains.
# Each print takes one pre-formatted string, so the cell runs unchanged on
# Python 2 (print statement) and Python 3 (print function) with identical output.
tu_num = len(tus)
print("I have %d translation units in this tmx" % tu_num)

# All <tuv> descendants of <body>, regardless of which <tu> they belong to.
tuvs = body.getElementsByTagName("tuv")
tuv_num = len(tuvs)
print("I have %d translation unit variants in this tmx" % tuv_num)


I have 9127 translation units in this tmx
I have 18254 translation unit variants in this tmx

In [12]:
# Tally how many <tuv> elements each <tu> contains.
#   under2    -- tus with fewer than 2 tuvs
#   looksgood -- tus with exactly 2 tuvs
#   over2     -- tus with more than 2 tuvs
#   nums      -- the tuv count of every tu, in document order
under2 = 0
over2 = 0
looksgood = 0
nums = []

for tu in tus:
    # Use a dedicated name so the notebook-level `tuvs` (all tuvs in the body)
    # is not overwritten with the children of the last tu.
    tuv_count = len(tu.getElementsByTagName("tuv"))
    nums.append(tuv_count)

    # The three cases are mutually exclusive, so exactly one counter moves.
    if tuv_count < 2:
        under2 += 1
    elif tuv_count == 2:
        looksgood += 1
    else:
        over2 += 1

# Single-argument print calls behave the same on Python 2 and Python 3;
# the double space after the number reproduces the original output exactly.
print('%d  TUs have > 2 tuvs' % over2)
print('%d  TUs have < 2 tuvs' % under2)
print('%d  TUs have exactly 2 tuvs' % looksgood)
print('TUs: %d tuvs: %d' % (len(nums), sum(nums)))


0  TUs have > 2 tuvs
0  TUs have < 2 tuvs
9127  TUs have exactly 2 tuvs
TUs: 9127 tuvs: 18254

In [6]:
# Check that every tuv has a file name in its filesource attribute.
## The (currently commented-out) code below prints a long list of paired heads taken from the filesource
## attributes of tuvs whose type attribute is 'Source' or 'Translation'.
# for tu in tus:
#     tuvs = tu.getElementsByTagName("tuv")
#     for tuv in tuvs:
#         tuv_type = tuv.getAttribute('type')
#         if tuv_type == 'Source':
#             tuv_source = tuv # tuv_source is a new name for tuv
#             filesource = tuv_source.getAttribute('filesource')
#             print 'Source: ', filesource
#             print 'Translations: '
#             continue
#         if tuv_type == 'Translation':
#             tuv_target = tuv
#         filetarget = tuv_target.getAttribute('filesource')
         
#         print '\t', filetarget, '\n'

In [ ]:


In [ ]: