In [1]:
# Paths to the ndjson datasets; each record is expected to carry at least
# a "text" and a "label" field (see the cells below that read them).
EATINGMEAT_BECAUSE_TRAIN = "../data/interim/eatingmeat_because_large_train_withprompt.ndjson"
EATINGMEAT_BECAUSE_TEST = "../data/interim/eatingmeat_because_large_test_withprompt.ndjson"
EATINGMEAT_BUT_TRAIN = "../data/interim/eatingmeat_but_large_train_withprompt.ndjson"
EATINGMEAT_BUT_TEST = "../data/interim/eatingmeat_but_large_test_withprompt.ndjson"
# NOTE(review): only train splits exist for the junkfood data here — confirm
# that no test-split analysis is intended for those prompts.
JUNKFOOD_BECAUSE_TRAIN = "../data/interim/junkfood_because_train_withprompt.ndjson"
JUNKFOOD_BUT_TRAIN = "../data/interim/junkfood_but_train_withprompt.ndjson"
In [3]:
from collections import Counter
import spacy
import ndjson

# spaCy pipeline used for tokenization throughout this notebook.
# NOTE(review): the "en" shortcut link was removed in spaCy 3 — newer installs
# need spacy.load("en_core_web_sm"); confirm the spaCy version pinned for this repo.
nlp = spacy.load("en")
def count_tokens(f):
    """Return a Counter of spaCy token frequencies over all texts in an ndjson file.

    Each record in the file must have a "text" field; tokens are the surface
    forms (``orth_``) produced by the notebook-global ``nlp`` pipeline.
    """
    with open(f) as infile:
        records = ndjson.load(infile)
    counts = Counter()
    for record in records:
        counts.update(token.orth_ for token in nlp(record["text"]))
    return counts
# Count token frequencies for the train/test splits of each connective.
# Fix: the original referenced TRAIN_FILE_BECAUSE / TEST_FILE_BECAUSE /
# TRAIN_FILE_BUT / TEST_FILE_BUT, which are never defined anywhere in this
# notebook (a NameError on Restart & Run All); the config cell defines the
# EATINGMEAT_* constants instead.
tokens_because_train = count_tokens(EATINGMEAT_BECAUSE_TRAIN)
tokens_because_test = count_tokens(EATINGMEAT_BECAUSE_TEST)
tokens_but_train = count_tokens(EATINGMEAT_BUT_TRAIN)
tokens_but_test = count_tokens(EATINGMEAT_BUT_TEST)
In [3]:
def compute_overlap(train, test):
    """Print vocabulary sizes and the fraction of test tokens unseen in training.

    ``train`` and ``test`` are mappings (Counters) from token to frequency;
    only their key sets are used.
    """
    print("Number of tokens in train:", len(train))
    print("Number of tokens in test:", len(test))
    unseen = set(test) - set(train)
    print("Unseen tokens in test:", len(unseen))
    print("% of unseen tokens in test:", len(unseen)/len(test))
# Train/test vocabulary overlap for each connective, using the token
# Counters computed in the cell above.
print("BECAUSE")
compute_overlap(tokens_because_train, tokens_because_test)
print("")
print("BUT")
compute_overlap(tokens_but_train, tokens_but_test)
In [19]:
from collections import defaultdict
def compute_label_overlap(data):
    """Mean Jaccard overlap between the token sets of every pair of distinct labels.

    Builds one token set per label from the "text" fields (tokenized with the
    notebook-global ``nlp``), then averages |A & B| / |A | B| over all
    unordered pairs of *different* labels.

    Fix: the original averaged over all ordered label pairs, including each
    label paired with itself; those self-pairs always score 1.0 and inflated
    the reported mean (and symmetric pairs were counted twice).

    Returns 1.0 when fewer than two labels are present (nothing to compare),
    instead of dividing by zero.
    """
    tokens_by_label = defaultdict(set)
    for item in data:
        tokens_by_label[item["label"]].update(t.orth_ for t in nlp(item["text"]))
    labels = list(tokens_by_label)
    overlaps = []
    for i, label1 in enumerate(labels):
        for label2 in labels[i + 1:]:  # distinct, unordered pairs only
            tokenset1 = tokens_by_label[label1]
            tokenset2 = tokens_by_label[label2]
            overlaps.append(len(tokenset1 & tokenset2) / len(tokenset1 | tokenset2))
    if not overlaps:  # fewer than two labels: no cross-label pairs exist
        return 1.0
    return sum(overlaps) / len(overlaps)
def compute_label_overlap_from_file(f):
    """Load an ndjson file and return its mean cross-label token overlap."""
    with open(f) as infile:
        records = ndjson.load(infile)
    return compute_label_overlap(records)
# Average token overlap between labels, per dataset: higher values mean the
# labels share more vocabulary and are harder to separate lexically.
print("Eating meat - but:", compute_label_overlap_from_file(EATINGMEAT_BUT_TRAIN))
print("Eating meat - because:", compute_label_overlap_from_file(EATINGMEAT_BECAUSE_TRAIN))
print("Junk food - but:", compute_label_overlap_from_file(JUNKFOOD_BUT_TRAIN))
print("Junk food - because:", compute_label_overlap_from_file(JUNKFOOD_BECAUSE_TRAIN))
In [18]:
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
import re
import pandas as pd
def compute_label_similarity(data):
    """Mean pairwise cosine *distance* between per-label tf-idf documents.

    For each item, the clause after ", but " / ", because " is extracted from
    the "text" field; all clauses sharing a label are joined into one document,
    the documents are tf-idf vectorized, and the cosine distance
    (0 = identical, 1 = orthogonal) is averaged over all unordered pairs of
    *distinct* labels. Despite the function name, this returns a distance,
    not a similarity — kept for interface compatibility with the calls below.

    Fix: the original iterated over all ordered label pairs, including each
    label with itself; those self-pairs always have distance 0.0 and deflated
    the reported mean (and symmetric pairs were counted twice).

    Returns 0.0 when fewer than two labels are present.
    """
    texts_by_label = defaultdict(list)
    for item in data:
        # Keep only the clause after the connective, e.g. "..., because <clause>".
        clause = re.split(", (but|because) ", item["text"])[-1]
        texts_by_label[item["label"]].append(clause)
    labels = list(texts_by_label)
    vectorizer = TfidfVectorizer()
    label_matrix = vectorizer.fit_transform(
        [" ".join(texts_by_label[label]) for label in labels])
    cosines = []
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):  # distinct, unordered pairs only
            cosine = spatial.distance.cosine(
                label_matrix[i, :].todense(), label_matrix[j, :].todense())
            cosines.append(cosine)
    if not cosines:  # fewer than two labels: nothing to compare
        return 0.0
    df = pd.DataFrame(cosines)
    print(df.describe())
    return sum(cosines) / len(cosines)
def compute_label_similarity_from_file(f):
    """Load an ndjson file and return its mean cross-label tf-idf cosine distance."""
    with open(f) as infile:
        records = ndjson.load(infile)
    return compute_label_similarity(records)
# Average cosine distance between per-label tf-idf documents, per dataset:
# higher values mean the labels use more distinct vocabulary.
print("Eating Meat - because", compute_label_similarity_from_file(EATINGMEAT_BECAUSE_TRAIN))
print("Eating Meat - but", compute_label_similarity_from_file(EATINGMEAT_BUT_TRAIN))
print("Junk food - because", compute_label_similarity_from_file(JUNKFOOD_BECAUSE_TRAIN))
print("Junk food - but", compute_label_similarity_from_file(JUNKFOOD_BUT_TRAIN))
In [ ]: