In [1]:
EATINGMEAT_BECAUSE_TRAIN = "../data/interim/eatingmeat_because_large_train_withprompt.ndjson"
EATINGMEAT_BECAUSE_TEST = "../data/interim/eatingmeat_because_large_test_withprompt.ndjson"

EATINGMEAT_BUT_TRAIN = "../data/interim/eatingmeat_but_large_train_withprompt.ndjson"
EATINGMEAT_BUT_TEST = "../data/interim/eatingmeat_but_large_test_withprompt.ndjson"

JUNKFOOD_BECAUSE_TRAIN = "../data/interim/junkfood_because_train_withprompt.ndjson"
JUNKFOOD_BUT_TRAIN = "../data/interim/junkfood_but_train_withprompt.ndjson"

In [3]:
from collections import Counter
import spacy
import ndjson

nlp = spacy.load("en")

def count_tokens(f):
    with open(f) as i:
        data = ndjson.load(i)
        
    tokens = Counter()
    for item in data:
        tokens.update([t.orth_ for t in nlp(item["text"])])
        
    return tokens
        

tokens_because_train = count_tokens(TRAIN_FILE_BECAUSE)
tokens_because_test = count_tokens(TEST_FILE_BECAUSE)

tokens_but_train = count_tokens(TRAIN_FILE_BUT)
tokens_but_test = count_tokens(TEST_FILE_BUT)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-a1fb19352d45> in <module>
     16 
     17 
---> 18 tokens_because_train = count_tokens(TRAIN_FILE_BECAUSE)
     19 tokens_because_test = count_tokens(TEST_FILE_BECAUSE)
     20 

NameError: name 'TRAIN_FILE_BECAUSE' is not defined

In [3]:
def compute_overlap(train, test):

    print("Number of tokens in train:", len(train))
    print("Number of tokens in test:", len(test))
    
    unseen = [t for t in test if not t in train]
    print("Unseen tokens in test:", len(unseen))
    print("% of unseen tokens in test:", len(unseen)/len(test))

    
print("BECAUSE")
compute_overlap(tokens_because_train, tokens_because_test)
print("")
print("BUT")
compute_overlap(tokens_but_train, tokens_but_test)


BECAUSE
Number of tokens in train: 569
Number of tokens in test: 294
Unseen tokens in test: 63
% of unseen tokens in test: 0.21428571428571427

BUT
Number of tokens in train: 936
Number of tokens in test: 452
Unseen tokens in test: 119
% of unseen tokens in test: 0.26327433628318586

In [19]:
from collections import defaultdict

def compute_label_overlap(data): 
    
    tokens_by_label = defaultdict(lambda: set())
    for item in data:
        tokens = set([t.orth_ for t in nlp(item["text"])])
        tokens_by_label[item["label"]].update(tokens)
        
    overlaps = []
    for label1 in tokens_by_label:
        for label2 in tokens_by_label:
            tokenset1 = tokens_by_label[label1]
            tokenset2 = tokens_by_label[label2]
            overlap = len(tokenset1 & tokenset2)/len(tokenset1 | tokenset2)
            overlaps.append(overlap)
    
    return sum(overlaps) / len(overlaps)

def compute_label_overlap_from_file(f):
    with open(f) as i:
        data = ndjson.load(i)
    overlap = compute_label_overlap(data)
    return overlap

print("Eating meat - but:", compute_label_overlap_from_file(EATINGMEAT_BUT_TRAIN))
print("Eating meat - because:", compute_label_overlap_from_file(EATINGMEAT_BECAUSE_TRAIN))
print("Junk food - but:", compute_label_overlap_from_file(JUNKFOOD_BUT_TRAIN))
print("Junk food - because:", compute_label_overlap_from_file(JUNKFOOD_BECAUSE_TRAIN))


Eating meat - but: 0.2536198693730758
Eating meat - because: 0.2974968609449983
Junk food - but: 0.3291589014804912
Junk food - because: 0.4016385873271023

In [18]:
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import defaultdict
import re
import pandas as pd

def compute_label_similarity(data):
    texts_by_label = defaultdict(list)
    for item in data:
        text = item["text"]
        text = re.split(", (but|because) ", text)[-1]
        texts_by_label[item["label"]].append(text)
        
    vectorizer = TfidfVectorizer()
    labels = list(texts_by_label.keys())
    label_matrix = vectorizer.fit_transform([" ".join(texts_by_label[label]) for label in labels])
    
    cosines = []
    for i, label1 in enumerate(labels):
        for j, label2 in enumerate(labels):
            cosine = spatial.distance.cosine(label_matrix[i,:].todense(), label_matrix[j,:].todense())
            cosines.append(cosine)

    df = pd.DataFrame(cosines)
    print(df.describe())
            
    return sum(cosines)/len(cosines)
    
def compute_label_similarity_from_file(f):
    with open(f) as i:
        data = ndjson.load(i)
    overlap = compute_label_similarity(data)
    return overlap


print("Eating Meat - because", compute_label_similarity_from_file(EATINGMEAT_BECAUSE_TRAIN))
print("Eating Meat - but", compute_label_similarity_from_file(EATINGMEAT_BUT_TRAIN))
print("Junk food - because", compute_label_similarity_from_file(JUNKFOOD_BECAUSE_TRAIN))
print("Junk food - but", compute_label_similarity_from_file(JUNKFOOD_BUT_TRAIN))


               0
count  49.000000
mean    0.576889
std     0.338906
min     0.000000
25%     0.176325
50%     0.702399
75%     0.810982
max     0.945170
Eating Meat - because 0.5768892926126091
                0
count  121.000000
mean     0.622426
std      0.230877
min      0.000000
25%      0.596339
50%      0.669411
75%      0.749399
max      0.886755
Eating Meat - but 0.622426429537375
               0
count  16.000000
mean    0.143293
std     0.097128
min     0.000000
25%     0.086045
50%     0.162129
75%     0.209390
max     0.263503
Junk food - because 0.14329315186519204
               0
count  49.000000
mean    0.129789
std     0.076973
min     0.000000
25%     0.079385
50%     0.125557
75%     0.161029
max     0.278618
Junk food - but 0.12978943185195838

In [ ]: