In [1]:
import random
import pandas as pd
data = pd.read_table('kc_gold.tsv')
data.head()


Out[1]:
Number Token Language Named Entity POS Universal Dependency Sentence
0 1000 . Spn O . PUNCT s0
1 1001 Mil Spn O NUM NUM s1
2 1002 gracias Spn O NOUN NOUN s1
3 1003 . Spn O . PUNCT s1
4 1004 I Eng O PRON PRON s2

In [2]:
def peek(d): # https://stackoverflow.com/a/28704691/
    return dict(list(d.items())[:5])

data['Universal Dependency'].value_counts()


Out[2]:
PUNCT    1397
NOUN      820
ADP       748
ADJ       685
VERB      638
DET       637
ADV       508
PROPN     439
PRON      378
CCONJ     244
X         166
PART      136
AUX       102
SCONJ      63
NUM        41
Name: Universal Dependency, dtype: int64

In [60]:
feat = 'Universal Dependency'
n = 2
n_gram = ()

word4feat = {}
lang_switch_count_all = {} # probability of switching language
next_count_all = {} # counts of next feature
last_lang = None
langs = set()

for i, row in data.iterrows():
    if len(n_gram) < n:
        n_gram += (row[feat],)
        continue # to get at least n things
    
    lang = row['Language']
    langs.add(lang)
    if lang not in lang_switch_count_all:
        lang_switch_count_all[lang] = {}
    lang_switch_count = lang_switch_count_all[lang]
    if lang not in next_count_all:
        next_count_all[lang] = {}
    next_count = next_count_all[lang]
    
    lang_cat = 'switch' if row['Language'] != last_lang else 'stay'
    last_lang = row['Language']
    if n_gram not in lang_switch_count:
        lang_switch_count[n_gram] = {}
    lang_switch_count[n_gram][lang_cat] = lang_switch_count[n_gram].get(lang_cat, 0) + 1
    
    if n_gram not in next_count:
        next_count[n_gram] = {}
    next_count[n_gram][row[feat]] = next_count.get(row[feat], 0) + 1

    n_gram = list(n_gram)
    n_gram.append(row[feat]) # aaaaaaaaaaaaaaaa
    if len(n_gram) > n:
        n_gram.pop(0)
    n_gram = tuple(n_gram)
    
    key = (lang, row[feat])
    if key not in word4feat:
        word4feat[key] = set()
    word4feat[key].add(row['Token'])
    lang_switch_count_all[lang] = lang_switch_count

peek(lang_switch_count_all['Spn'])


Out[60]:
{('PUNCT', 'NUM'): {'switch': 1},
 ('NUM', 'NOUN'): {'stay': 7, 'switch': 1},
 ('NOUN', 'PUNCT'): {'switch': 90, 'stay': 88},
 ('PUNCT', 'PROPN'): {'stay': 28, 'switch': 8},
 ('NOUN', 'ADP'): {'switch': 11, 'stay': 111}}

In [61]:
lang_switch_prob_all = {} # probability of switching language
next_pos_prob_all = {} # probs of next POS

for lang in langs:
    lang_switch_count = lang_switch_count_all[lang]
    next_count = next_count_all[lang]
    
    lang_switch_prob = {}
    next_pos_prob = {}

    for n_gram in lang_switch_count:
        counts = lang_switch_count[n_gram]
        lang_switch_prob[n_gram] = counts.get('switch', 0) / (counts.get('switch', 0) + counts.get('stay', 0))

    for n_gram in next_count:
        following = next_count[n_gram]
        total = 0
        for f in following:
            total += following[f]
        next_pos_prob[n_gram] = {}
        for f in following:
            next_pos_prob[n_gram][f] = following[f] / total
    
    lang_switch_prob_all[lang] = lang_switch_prob
    next_pos_prob_all[lang] = next_pos_prob

peek(lang_switch_prob_all['Spn'])


Out[61]:
{('PUNCT', 'NUM'): 1.0,
 ('NUM', 'NOUN'): 0.125,
 ('NOUN', 'PUNCT'): 0.5056179775280899,
 ('PUNCT', 'PROPN'): 0.2222222222222222,
 ('NOUN', 'ADP'): 0.09016393442622951}

In [62]:
langs = lang_switch_prob_all.keys()

def pick_word(lang, feat):
    return random.sample(word4feat[(lang, feat)], 1)

def switch_lang(lang, n_gram):
    switch_prob = lang_switch_prob_all[lang].get(n_gram, 0)
    return random.random() > switch_prob

lang = random.sample(langs, 1)[0]
sent = random.sample(word4feat.keys(), 1)

for i in range(30):
#     print(sent)
    n_gram = tuple([x[1] for x in sent[-n:]])
    if switch_lang(lang, n_gram):
        if lang == 'Spn':
            lang = 'Eng'
        else:
            lang = 'Spn'
#     print(n_gram)
    candidates = next_pos_prob_all[lang].get(n_gram)
    if candidates is None:
        print('Unable to find something to follow', n_gram, 'in', lang)
        sent.append(random.sample(word4feat.keys(), 1)[0])
    else:
        population = list(candidates.keys())
        weights = [candidates[f] for f in candidates.keys()]
        chosen_feat = random.choices(population, weights)[0]
        sent.append((lang, chosen_feat,))
        if chosen_feat == "PUNCT":
            break
#         print("yay")

print(sent)
print(" ".join([pick_word(x[0], x[1])[0] for x in sent]))


Unable to find something to follow ('NUM',) in Spn
[('Num', 'NUM'), ('Eng', 'NOUN'), ('Eng', 'PUNCT')]
23 way .