In [1]:
#The common functions for evaluation, splitting corpus
from collections import Counter
import math, random
#--------------------------------------------------------------------------------
#
# data splitting
#

def split_data(data, prob):
    """split data into fractions [prob, 1 - prob]"""
    results = [], []
    for row in data:
        results[0 if random.random() < prob else 1].append(row)
    return results
#--------------------------------------------------------------------------------
def train_test_split(x, y, test_pct):
    data = zip(x, y)                              # pair corresponding values  
    train, test = split_data(data, 1 - test_pct)  # split the dataset of pairs
    x_train, y_train = zip(*train)                # magical un-zip trick
    x_test, y_test = zip(*test)
    return x_train, x_test, y_train, y_test
#--------------------------------------------------------------------------------
#
# correctness
#

def accuracy(tp, fp, fn, tn):
    correct = tp + tn
    total = tp + fp + fn + tn
    return correct / total
#--------------------------------------------------------------------------------
def precision(tp, fp, fn, tn):
    return tp / (tp + fp)
#--------------------------------------------------------------------------------
def recall(tp, fp, fn, tn):
    return tp / (tp + fn)
#--------------------------------------------------------------------------------
def f1_score(tp, fp, fn, tn):
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)

    return 2 * p * r / (p + r)

In [2]:
if __name__ == "__main__":

    print("accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 981070))
    print("precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 981070))
    print("recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070))
    print("f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 981070))


accuracy(70, 4930, 13930, 981070) 0.98114
precision(70, 4930, 13930, 981070) 0.014
recall(70, 4930, 13930, 981070) 0.005
f1_score(70, 4930, 13930, 981070) 0.00736842105263158