In [1]:
import random
import pandas as pd
# Load the tab-separated annotation file into a DataFrame and preview
# its first rows as the cell output.
data = pd.read_csv('kc_gold.tsv', sep='\t')
data.head()
Out[1]:
In [2]:
def peek(d, limit=5):  # https://stackoverflow.com/a/28704691/
    """Return a new dict with at most the first ``limit`` items of ``d``.

    Items are taken in the dict's own iteration order. Only the items that
    are kept get touched, so this stays cheap on very large dicts.

    Args:
        d: Mapping to preview.
        limit: Maximum number of items to keep (default 5). Negative values
            are treated as 0.

    Returns:
        A new ``dict``; ``d`` itself is not modified.
    """
    from itertools import islice

    return dict(islice(d.items(), max(limit, 0)))
# Cell output: number of rows for each distinct value in the
# 'Universal Dependency' column.
data['Universal Dependency'].value_counts()
Out[2]:
In [60]:
# Walk the rows of `data` in order and, keyed by the n-gram of the `n`
# preceding feature values, tally per language:
#   * lang_switch_count_all[lang][n_gram] -> {'switch': k, 'stay': m}
#       whether the row's language differs from the previous row's language
#   * next_count_all[lang][n_gram]        -> {feature: count}
#       which feature value followed that n-gram
#   * word4feat[(lang, feature)]          -> set of tokens seen with that pair
feat = 'Universal Dependency'
n = 2  # number of preceding features that form the context
n_gram = ()
word4feat = {}
lang_switch_count_all = {}  # switch/stay tallies, per language and n-gram
next_count_all = {}  # counts of next feature, per language and n-gram
last_lang = None
langs = set()
for tag, lang, token in zip(data[feat], data['Language'], data['Token']):
    if len(n_gram) < n:
        # Warm-up: collect the first n features before counting anything.
        n_gram += (tag,)
        # Remember the language so the first counted row is compared against
        # its real predecessor instead of None (which always read as 'switch').
        last_lang = lang
        continue

    langs.add(lang)
    lang_switch_count = lang_switch_count_all.setdefault(lang, {})
    next_count = next_count_all.setdefault(lang, {})

    lang_cat = 'switch' if lang != last_lang else 'stay'
    last_lang = lang

    switch_tally = lang_switch_count.setdefault(n_gram, {})
    switch_tally[lang_cat] = switch_tally.get(lang_cat, 0) + 1

    # The running count lives in the per-n-gram dict. Looking it up in
    # `next_count` (keyed by n-gram tuples) always missed and pinned every
    # count at 1.
    follow_tally = next_count.setdefault(n_gram, {})
    follow_tally[tag] = follow_tally.get(tag, 0) + 1

    # Slide the context window forward by one feature.
    n_gram += (tag,)
    if len(n_gram) > n:
        n_gram = n_gram[1:]

    word4feat.setdefault((lang, tag), set()).add(token)

peek(lang_switch_count_all['Spn'])
Out[60]:
In [61]:
# Normalise the raw tallies into probability tables, one per language.
lang_switch_prob_all = {}  # language -> {n-gram: share of 'switch' observations}
next_pos_prob_all = {}  # language -> {n-gram: {next feature: relative frequency}}
for lang in langs:
    lang_switch_prob = {}
    for n_gram, counts in lang_switch_count_all[lang].items():
        switched = counts.get('switch', 0)
        stayed = counts.get('stay', 0)
        lang_switch_prob[n_gram] = switched / (switched + stayed)

    next_pos_prob = {}
    for n_gram, following in next_count_all[lang].items():
        total = sum(following.values())
        next_pos_prob[n_gram] = {f: seen / total for f, seen in following.items()}

    lang_switch_prob_all[lang] = lang_switch_prob
    next_pos_prob_all[lang] = next_pos_prob

peek(lang_switch_prob_all['Spn'])
Out[61]:
In [62]:
# Keep the language codes as a real list: a dict_keys view is not a sequence,
# and random.sample() rejects non-sequence populations on Python 3.11+.
langs = list(lang_switch_prob_all)
def pick_word(lang, feat):
    """Pick one random token recorded for the ``(lang, feat)`` pair.

    Args:
        lang: Language code used as the first half of the ``word4feat`` key.
        feat: Feature value used as the second half of the ``word4feat`` key.

    Returns:
        A one-element list holding the chosen token (callers index ``[0]``).

    Raises:
        KeyError: if ``(lang, feat)`` is not a key of ``word4feat``.
    """
    # word4feat values are sets; random.sample() needs a sequence (sampling
    # straight from a set is a TypeError on Python 3.11+), so copy to a list.
    return random.sample(list(word4feat[(lang, feat)]), 1)
def switch_lang(lang, n_gram):
    """Decide at random whether to switch language after ``n_gram``.

    The switch probability is read from ``lang_switch_prob_all[lang][n_gram]``.
    An unknown language or n-gram counts as probability 0, i.e. never switch.

    Args:
        lang: Current language code.
        n_gram: Tuple of the most recent feature values.

    Returns:
        True with the looked-up probability, otherwise False.
    """
    switch_prob = lang_switch_prob_all.get(lang, {}).get(n_gram, 0)
    # random.random() is uniform on [0, 1), so `< p` is True with probability p.
    # The previous `> p` test switched with probability 1 - p instead.
    return random.random() < switch_prob
# Generate one sequence of (language, feature) pairs by sampling from the
# probability tables, then print it both raw and as randomly picked tokens.
# random.choice() on an explicit list replaces random.sample() on dict views,
# which is a TypeError on Python 3.11+.
seed_pool = list(word4feat)  # every (language, feature) pair that has tokens
lang = random.choice(list(langs))
sent = [random.choice(seed_pool)]
for i in range(30):  # hard cap on the number of generated items
    n_gram = tuple(x[1] for x in sent[-n:])
    if switch_lang(lang, n_gram):
        lang = 'Eng' if lang == 'Spn' else 'Spn'
    # .get(lang, {}) sends a language with no table to the fallback below
    # instead of raising KeyError.
    candidates = next_pos_prob_all.get(lang, {}).get(n_gram)
    if candidates is None:
        print('Unable to find something to follow', n_gram, 'in', lang)
        sent.append(random.choice(seed_pool))
    else:
        population = list(candidates)
        weights = [candidates[f] for f in population]
        chosen_feat = random.choices(population, weights)[0]
        sent.append((lang, chosen_feat))
        if chosen_feat == "PUNCT":
            # Stop once a PUNCT feature has been generated.
            break
print(sent)
print(" ".join(pick_word(x[0], x[1])[0] for x in sent))