In [1]:
from random import random


def split_data(data, prob):
    """Randomly partition ``data`` into two lists.

    Every row is assigned independently: it goes to the first list when a
    fresh ``random()`` draw is below ``prob`` and to the second list
    otherwise, so the expected proportions are [prob, 1 - prob].

    Returns a tuple ``(first, second)`` of lists.
    """
    selected, remainder = [], []
    for row in data:
        if random() < prob:
            selected.append(row)
        else:
            remainder.append(row)
    return selected, remainder


def train_test_split(x, y, test_pct):
    """Randomly split paired inputs and labels into train and test sets.

    Args:
        x: iterable of inputs.
        y: iterable of labels, paired positionally with ``x`` via ``zip``
            (so pairing stops at the shorter of the two).
        test_pct: probability that a given (x, y) pair lands in the test
            set. ``split_data`` assigns each pair independently, so the
            realised test share is only approximately ``test_pct``.

    Returns:
        ``(x_train, x_test, y_train, y_test)`` as tuples. A side that
        received no pairs is returned as empty tuples.
    """
    def _unzip(pairs):
        # zip(*[]) yields nothing, so unpacking it into two names would
        # raise ValueError; an empty partition is handled explicitly.
        return tuple(zip(*pairs)) if pairs else ((), ())

    data = zip(x, y)
    # split_data puts rows in its first list with the given probability,
    # hence 1 - test_pct for the training side.
    train, test = split_data(data, 1 - test_pct)
    x_train, y_train = _unzip(train)
    x_test, y_test = _unzip(test)
    return x_train, x_test, y_train, y_test

In [11]:
# Demo cell: what the asterisk (*) does in front of a collection in a call.

train = [1,2,3,4,5]
pairs = [(1,2), (3,4), (5,6), (7,8), (9,0)]

for sample in (train, pairs):
    print(sample)        # the list object itself
    print(*sample)       # every element handed to print() as its own argument
    print(type(sample))
    # print(type(*sample)) fails for both lists:
    #   TypeError: type() takes 1 or 3 arguments

# print(zip(*train)) fails because the unpacked elements are plain ints:
#   TypeError: zip argument #1 must support iteration
print(zip(*pairs))


# The asterisk operator unpacks a collection into positional arguments for a function call:
# http://stackoverflow.com/questions/2921847/what-does-the-star-operator-mean-in-python


[1, 2, 3, 4, 5]
1 2 3 4 5
<class 'list'>
[(1, 2), (3, 4), (5, 6), (7, 8), (9, 0)]
(1, 2) (3, 4) (5, 6) (7, 8) (9, 0)
<class 'list'>
<zip object at 0x7fa21815ee48>

In [12]:
# CORRECTNESS


def accuracy(tp, fp, fn, tn):
    """Share of all predictions that were correct.

    The arguments are the four confusion-matrix counts: true positives,
    false positives, false negatives and true negatives.
    """
    return (tp + tn) / (tp + fp + fn + tn)


def precision(tp, fp, fn, tn):
    """Share of positive predictions that were actually positive.

    ``fn`` and ``tn`` are accepted but unused; the signature matches the
    other metric functions in this notebook.
    """
    predicted_positive = tp + fp
    return tp / predicted_positive


def recall(tp, fp, fn, tn):
    """Share of actual positives that were predicted as positive.

    ``fp`` and ``tn`` are accepted but unused; the signature matches the
    other metric functions in this notebook.
    """
    actual_positive = tp + fn
    return tp / actual_positive


def f1_score(tp, fp, fn, tn):
    """Harmonic mean of precision and recall (the F1 score).

    Args:
        tp, fp, fn, tn: confusion-matrix counts (true positives, false
            positives, false negatives, true negatives).

    Returns:
        ``2 * p * r / (p + r)`` where ``p`` is the precision and ``r`` the
        recall, or ``0.0`` when both are zero (no true positives).

    Raises:
        ZeroDivisionError: propagated from ``precision`` or ``recall``
            when their own denominators (``tp + fp`` / ``tp + fn``) are 0.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    if p + r == 0:
        # With no true positives the harmonic mean is 0/0; report 0.0
        # (the usual convention) instead of raising ZeroDivisionError.
        return 0.0
    return 2 * p * r / (p + r)

In [15]:
# The ridiculous hypothesis: being named "Luke" causes leukemia.

#        leukemia  no_leukemia total
data = [[70,       4930,       5000],     # named "Luke"
        [13930,    981070,     995000],   # not named "Luke"
        [14000,    986000,     1000000]]  # total

# The top-left 2x2 corner of the table supplies the (tp, fp, fn, tn)
# arguments, with "named Luke" as the positive prediction.
confusion = (data[0][0], data[0][1], data[1][0], data[1][1])

for question, metric in (
    ('\nWhat is the accuracy of my hypothesis?', accuracy),
    ('\nWhat is the precision of my hypothesis?', precision),
    ('\nWhat is the recall of my hypothesis?', recall),
    ('\nWhat is the F1 Score of my hypothesis?', f1_score),
):
    print(question)
    print(metric(*confusion))


What is the accuracy of my hypothesis?
0.98114

What is the precision of my hypothesis?
0.014

What is the recall of my hypothesis?
0.005

What is the F1 Score of my hypothesis?
0.00736842105263158

In [ ]: