Aggregate Parallel Texts in a Directory

This assumes we have a collection of .csv files named in the format ${source}-${lang}.csv, where each file has the header ts,txt and gives the text fragment found at each numeric timestamp.
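
For example (hypothetical filenames and contents), a pair of files drama1-en.csv / drama1-ko.csv might each begin:

    ts,txt
    12.5,Where are you going?
    15.2,I'm going home

with the Korean file containing the same ts values but Korean txt fragments.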


In [ ]:
import os
import csv
import random
import re

In [ ]:
lang_from, lang_to = 'en', 'ko'

data_path = './data'

Go through all the files in the directory and find the source prefixes that have CSVs available for both lang_from and lang_to.


In [ ]:
stub_from, stub_to = set(), set()
stub_matcher = re.compile(r"(.*)-(\w+)\.csv")
for fname in os.listdir(data_path):
    #print(fname)
    m = stub_matcher.match(fname)
    if m:
        stub, lang = m.group(1), m.group(2)
        if lang == lang_from: stub_from.add(stub)
        if lang == lang_to:   stub_to.add(stub)

# Keep only the source prefixes present in both languages
stub_both = stub_from.intersection(stub_to)

Now go through stub_both and, for each stub, read in the CSVs for both languages, collect the txt entries that occur at the same timestamps, and build the correspondence lists.


In [ ]:
correspondence_loc, txt_from, txt_to = [], [], []

def read_dict_from_csv(fname):
    # Map each numeric timestamp to its text fragment
    d = dict()
    with open(fname, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            d[float(row['ts'])] = row['txt']
    return d

for stub in stub_both:
    #print("Reading stub %s" % (stub,))
    data_from = read_dict_from_csv( os.path.join(data_path, stub+'-'+lang_from+'.csv') )
    data_to   = read_dict_from_csv( os.path.join(data_path, stub+'-'+lang_to+'.csv') )

    valid, skipped = 0, 0
    for ts, txt in data_from.items():
        if ts in data_to:
            correspondence_loc.append( (stub, ts) )
            txt_from.append( txt )
            txt_to.append( data_to[ts] )
            valid += 1
        else:
            skipped += 1
    print("%3d valid of %3d fragments from '%s'" % (valid, valid+skipped, stub))
print("  Total data : %d text fragments" % (len(correspondence_loc),))

In [ ]:
for _ in range(10):   # spot-check some random aligned pairs
    i = random.randrange(len(correspondence_loc))
    print( txt_from[i], txt_to[i] )

Tokenize the correspondences

NB: Japanese requires word-splitting too
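
As a quick illustration of what that means in practice (sample sentence is hypothetical), the tinysegmenter package used below splits Japanese text like this:


In [ ]:
# Sanity-check the Japanese word-splitter
import tinysegmenter
print( tinysegmenter.TinySegmenter().tokenize(u'私の名前は中野です') )
# -> something like: ['私', 'の', '名前', 'は', '中野', 'です']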


In [ ]:
sub_punctuation = re.compile(r'[,.:;?!\-—\s"0-9()]+')
sub_apostrophes = re.compile(r"'(\w+)")
sub_multispaces = re.compile(r'\s\s+')

if lang_from=='ja' or lang_to=='ja':
    import tinysegmenter
    ja_segmenter = tinysegmenter.TinySegmenter()
    sub_punc_ja  = re.compile(r'[」「?。、・()―]+')

def tokenize_txt(arr, lang):
    tok = []
    for txt in arr:
        t = txt.lower()
        t = sub_punctuation.sub(u' ', t)         # strip punctuation and digits
        if "'" in t:
            t = sub_apostrophes.sub(r" '\1", t)  # split apostrophe suffixes ("it's" -> "it 's")
        if lang=='ja':
            t = ' '.join( ja_segmenter.tokenize(t) )
            t = sub_punc_ja.sub(u' ', t)         # strip Japanese punctuation
        t = sub_multispaces.sub(' ', t)          # collapse runs of whitespace
        tok.append(t.strip())
    return tok

In [ ]:
tok_from = tokenize_txt(txt_from, lang_from)
tok_to   = tokenize_txt(txt_to, lang_to)

In [ ]:
tok_from[220:250]

In [ ]:
tok_to[220:250]

Build frequency dictionaries


In [ ]:
def build_freq(tok_arr):
    f=dict()
    for tok in tok_arr:
        for w in tok.split():
            if w not in f: f[w]=0
            f[w]+=1
    return f

In [ ]:
freq_from=build_freq(tok_from)
freq_to  =build_freq(tok_to)

In [ ]:
len(freq_from),len(freq_to),

In [ ]:
def most_frequent(freq, n=50, start=0):
    return ', '.join( sorted(freq,key=lambda w:freq[w], reverse=True)[start:n+start] )

print(most_frequent(freq_from))
print(most_frequent(freq_to, n=100))

In [ ]:
print(most_frequent(freq_from, n=20, start=9000))

In [ ]:
print( len( [w for w,f in freq_from.items() if f>=10] ) )  # vocabulary with frequency >= 10
print( len( [w for w,f in freq_to.items()   if f>=10] ) )

In [ ]:
def build_rank(freq):
    # rank 0 = most frequent word; larger rank = rarer word
    return { w:i for i,w in enumerate( sorted(freq, key=lambda w:freq[w], reverse=True) ) }

In [ ]:
rank_from = build_rank(freq_from)
rank_to   = build_rank(freq_to)

In [ ]:
print(rank_from['robot'])

In [ ]:
def max_rank(tok, rank):  # Rank of the rarest word in this tokenized sentence
    r = -1
    for w in tok.split():
        if rank[w]>r: r=rank[w]
    return r
tok_max_rank_from = [ max_rank(tok, rank_from) for tok in tok_from ]
tok_max_rank_to   = [ max_rank(tok, rank_to)   for tok in tok_to ]

In [ ]:
start = 0
print(tok_max_rank_from[start:start+15], '\n', tok_max_rank_to[start:start+15])
i = 0
tok_max_rank_from[i], tok_from[i], tok_to[i], tok_max_rank_to[i]

Build a fragment coincidence matrix

This might allow us to do single-word translations...
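
Here is a minimal sketch of one way to do this (not from the original notebook, and the min_freq threshold is an assumption): for every aligned fragment pair, count how often each from-language word occurs in the same fragment as each to-language word. Genuine translation pairs should co-occur far more often than chance.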


In [ ]:
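# Sketch: co-occurrence counts between from-words and to-words in aligned fragments.
# Assumes tok_from/tok_to and freq_from/freq_to from the cells above;
# min_freq=10 is a guessed threshold to keep the matrix manageable.
from collections import defaultdict

min_freq = 10

coincidence = defaultdict(lambda: defaultdict(int))
for t_from, t_to in zip(tok_from, tok_to):
    words_from = set( w for w in t_from.split() if freq_from[w] >= min_freq )
    words_to   = set( w for w in t_to.split()   if freq_to[w]   >= min_freq )
    for w_f in words_from:
        for w_t in words_to:
            coincidence[w_f][w_t] += 1

def best_matches(word, n=5):
    # Candidate translations of `word`, ranked by raw co-occurrence count
    cands = coincidence[word]
    return sorted(cands, key=lambda w: cands[w], reverse=True)[:n]

print( best_matches('robot') )

Raw counts will favour very common target words, so dividing each count by the candidate's freq_to (a crude PMI-style normalization) would be a natural refinement.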