In [1]:
import csv
import requests

langs = {
    "Spn": "es",
    "Eng": "en"
}

cache = {}

def lookup(lang, title):
    key = "{}-{}".format(lang, title)
    if key in cache:
        return cache[key]

    url = "https://{}.wiktionary.org/w/api.php".format(langs[lang])
    params = {
        "action": "query",
        "titles": title.lower(),
        "prop": "categories",
        "cllimit": 500,
        "format": "json"
    }
    data = requests.get(url, params=params).json()
    pages = data["query"]["pages"]
    for pid in pages:
        page = pages[pid]
        if "missing" in page:
            cache[key] = False
            return False
        for cat in page["categories"]:
            if lang == "Spn":
                if "Categoría:ES" in cat["title"]:
                    cache[key] = True
                    return True
            if lang == "Eng":
                if "Category:English" in cat["title"]:
                    cache[key] = True
                    return True
        cache[key] = False
        return False

with open("Stavans El Little Principe Parts 1-1V_annotated - Stavans_annotated.tsv", "r", encoding="utf8") as tsvin:
    lines = csv.reader(tsvin, delimiter="\t")
    next(lines) # skip header
    for line in lines:
        word, lang, *_ = line
        if lang in langs:
            exists = lookup(lang, word)
            if not exists:
                print("{} (tagged as {})".format(word, lang))


IV (tagged as Eng)
SLD (tagged as Eng)
CN (tagged as Eng)
8N (tagged as Eng)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
LD (tagged as Eng)
CNM (tagged as Eng)
masticarlas (tagged as Spn)
crayola (tagged as Spn)
drawin (tagged as Eng)
digesteando (tagged as Spn)
concentrarme (tagged as Spn)
anythin (tagged as Eng)
Arizona (tagged as Spn)
Neckarsteinach (tagged as Eng)
Germany (tagged as Eng)
Tintenfas (tagged as Eng)
January (tagged as Eng)
8N (tagged as Spn)
NL (tagged as Eng)
ALI (tagged as Spn)
9L (tagged as Spn)
NDI (tagged as Eng)
DN (tagged as Eng)
NDI (tagged as Eng)
CNN (tagged as Spn)
IIE (tagged as Spn)
NL (tagged as Spn)
LI (tagged as Spn)
8N (tagged as Spn)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
NDI (tagged as Eng)
MN (tagged as Spn)
IIE (tagged as Spn)
SPANGLISH (tagged as Eng)
talkin (tagged as Eng)
II (tagged as Eng)
SLD (tagged as Eng)
CN (tagged as Eng)
8N (tagged as Eng)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
LD (tagged as Eng)
CNM (tagged as Eng)
Sahara (tagged as Spn)
adrifteando (tagged as Spn)
imaginarte (tagged as Spn)
awekeado (tagged as Spn)
dibujarme (tagged as Spn)
Dibújame (tagged as Spn)
mis (tagged as Spn)
lightin (tagged as Eng)
ví (tagged as Spn)
estudiándome (tagged as Spn)
faulta (tagged as Spn)
8N (tagged as Spn)
NL (tagged as Eng)
ALI (tagged as Spn)
9L (tagged as Spn)
NDI (tagged as Eng)
DN (tagged as Eng)
NDI (tagged as Eng)
CNN (tagged as Spn)
IIE (tagged as Spn)
NL (tagged as Spn)
LI (tagged as Spn)
8N (tagged as Spn)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
NDI (tagged as Eng)
MN (tagged as Spn)
IIE (tagged as Spn)
IV (tagged as Eng)
8N (tagged as Eng)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
LD (tagged as Eng)
CNM (tagged as Eng)
exhaustión (tagged as Spn)
frekeado (tagged as Spn)
quietmente (tagged as Spn)
dibujarme (tagged as Spn)
bafeado (tagged as Spn)
Dibújame (tagged as Spn)
pictura (tagged as Spn)
astoundeado (tagged as Spn)
tinito (tagged as Spn)
Dibújame (tagged as Spn)
escrutinó (tagged as Spn)
Dibújame (tagged as Spn)
undulgentemente (tagged as Spn)
empezé (tagged as Spn)
estripear (tagged as Spn)
crate (tagged as Spn)
astonisheado (tagged as Spn)
SLD (tagged as Eng)
CN (tagged as Eng)
8N (tagged as Eng)
NL (tagged as Eng)
ALI (tagged as Spn)
9L (tagged as Spn)
NDI (tagged as Eng)
DN (tagged as Eng)
NDI (tagged as Eng)
CNN (tagged as Spn)
IIE (tagged as Spn)
NL (tagged as Spn)
LI (tagged as Spn)
8N (tagged as Spn)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
NDI (tagged as Eng)
MN (tagged as Spn)
IIE (tagged as Spn)
SPANGLISH (tagged as Eng)
tinito (tagged as Spn)
closemente (tagged as Spn)
SLD (tagged as Eng)
CN (tagged as Eng)
8N (tagged as Eng)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
LD (tagged as Eng)
CNM (tagged as Eng)
escucharme (tagged as Spn)
gradualmente (tagged as Spn)
instancia (tagged as Spn)
dibujarlo (tagged as Spn)
askeó (tagged as Spn)
modestamente (tagged as Spn)
tinkleada (tagged as Spn)
annoyin (tagged as Eng)
mis (tagged as Spn)
Suddenmente (tagged as Spn)
dí (tagged as Spn)
dándome (tagged as Spn)
mysteriosa (tagged as Spn)
replayó (tagged as Spn)
Gentlemente (tagged as Spn)
persamientos (tagged as Spn)
imaginarte (tagged as Spn)
cuán (tagged as Spn)
pondereó (tagged as Spn)
replayó (tagged as Spn)
crate (tagged as Spn)
shockeado (tagged as Eng)
Amarrarlo (tagged as Spn)
8N (tagged as Spn)
NL (tagged as Eng)
ALI (tagged as Spn)
9L (tagged as Spn)
NDI (tagged as Eng)
DN (tagged as Eng)
NDI (tagged as Eng)
CNN (tagged as Spn)
IIE (tagged as Spn)
NL (tagged as Spn)
LI (tagged as Spn)
8N (tagged as Spn)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
NDI (tagged as Eng)
MN (tagged as Spn)
IIE (tagged as Spn)
IV (tagged as Eng)
wanderear (tagged as Eng)
laugheando (tagged as Spn)
solemnemente (tagged as Spn)
wouldn (tagged as Eng)
tinico (tagged as Spn)
wistfulmente (tagged as Spn)
IV (tagged as Eng)
SLD (tagged as Eng)
CN (tagged as Eng)
8N (tagged as Eng)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
LD (tagged as Eng)
CNM (tagged as Eng)
somethin (tagged as Eng)
Júpiter (tagged as Spn)
Marte (tagged as Spn)
B612 (tagged as Spn)
glimpseado (tagged as Spn)
dessed (tagged as Eng)
Luckymente (tagged as Spn)
B612 (tagged as Spn)
European (tagged as Eng)
presentatión (tagged as Spn)
B612 (tagged as Spn)
8N (tagged as Spn)
NL (tagged as Eng)
ALI (tagged as Spn)
9L (tagged as Spn)
NDI (tagged as Eng)
DN (tagged as Eng)
NDI (tagged as Eng)
CNN (tagged as Spn)
IIE (tagged as Spn)
NL (tagged as Spn)
LI (tagged as Spn)
8N (tagged as Spn)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
NDI (tagged as Eng)
MN (tagged as Spn)
IIE (tagged as Spn)
8N (tagged as Spn)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
LD (tagged as Eng)
CNM (tagged as Eng)
ví (tagged as Spn)
visualizarla (tagged as Spn)
ví (tagged as Spn)
B612 (tagged as Spn)
tenerlo (tagged as Spn)
lightemente (tagged as Spn)
stirreando (tagged as Spn)
disapareció (tagged as Spn)
describírtelo (tagged as Spn)
forgetée (tagged as Eng)
convertirme (tagged as Spn)
crayolas (tagged as Spn)
digesteando (tagged as Spn)
mis (tagged as Spn)
lograrlo (tagged as Spn)
disculparme (tagged as Spn)
crates (tagged as Spn)
SLD (tagged as Eng)
CN (tagged as Eng)
8N (tagged as Eng)
SPANGLISH (tagged as Eng)
NL (tagged as Eng)
ALI (tagged as Spn)
9L (tagged as Spn)
NDI (tagged as Eng)
DN (tagged as Eng)
NDI (tagged as Eng)
CNN (tagged as Spn)
IIE (tagged as Spn)
NL (tagged as Spn)
LI (tagged as Spn)
8N (tagged as Spn)
DP (tagged as Eng)
LMDNS (tagged as Eng)
IA (tagged as Spn)
NDI (tagged as Eng)
MN (tagged as Spn)
IIE (tagged as Spn)

In [10]:
candidates = []
for item in cache:
    if not cache[item]: # not found
        candidates.append(", ".join(item.split("-", 1)))

print(len(candidates))
with open("potential mixed matches.csv", "w") as f:
    for c in candidates:
        f.write(c)
        f.write("\n")


105