In [1]:
from random import random
def split_data(data, prob):
    """Randomly partition ``data`` into two lists.

    Every row triggers one fresh ``random()`` draw: when the draw is below
    ``prob`` the row goes into the first list, otherwise into the second,
    so the expected fractions are [prob, 1 - prob].

    Returns a tuple ``(first, second)`` of lists.
    """
    selected, remainder = [], []
    for row in data:
        if random() < prob:
            selected.append(row)
        else:
            remainder.append(row)
    return selected, remainder
def train_test_split(x, y, test_pct):
    """Randomly split paired inputs ``x`` and outputs ``y`` into train/test sets.

    ``x`` and ``y`` are zipped into (x, y) rows, and each row is sent to the
    training partition with probability ``1 - test_pct`` (otherwise to the
    test partition) by ``split_data``.

    Returns ``(x_train, x_test, y_train, y_test)``, each a tuple.  A partition
    that ends up empty is returned as a pair of empty tuples.
    """
    data = zip(x, y)
    train, test = split_data(data, 1 - test_pct)
    # zip(*[]) yields nothing, so unpacking it into two names would raise
    # ValueError; fall back to two empty tuples for an empty partition.
    x_train, y_train = zip(*train) if train else ((), ())
    x_test, y_test = zip(*test) if test else ((), ())
    return x_train, x_test, y_train, y_test
In [11]:
# This is just a test of the asterisk (*) notation.
train = [1, 2, 3, 4, 5]
print(train)         # the list as a single argument
print(*train)        # unpacked: same as print(1, 2, 3, 4, 5)
print(type(train))
# print(type(*train))  # TypeError: type() takes 1 or 3 arguments

pairs = [(1, 2), (3, 4), (5, 6), (7, 8), (9, 0)]
print(pairs)
print(*pairs)        # unpacked: each tuple becomes its own argument
print(type(pairs))
# print(type(*pairs))  # TypeError: type() takes 1 or 3 arguments
# print(zip(*train))   # TypeError: zip argument #1 must support iteration

# zip() returns a lazy iterator in Python 3, so materialize it with list()
# to show the transposed pairs rather than an opaque "<zip object at 0x...>".
print(list(zip(*pairs)))

# The asterisk operator unpacks a collection into positional arguments for a function call:
# http://stackoverflow.com/questions/2921847/what-does-the-star-operator-mean-in-python
In [12]:
# CORRECTNESS
def accuracy(tp, fp, fn, tn):
    """Return the fraction of correct predictions.

    Computed as ``(tp + tn) / (tp + fp + fn + tn)`` from the four
    confusion-matrix counts.  When all four counts are zero there is nothing
    to score, and 0.0 is returned instead of raising ZeroDivisionError.
    """
    correct = tp + tn
    total = tp + fp + fn + tn
    if total == 0:
        # No predictions at all: the ratio is undefined, report 0.0.
        return 0.0
    return correct / total
def precision(tp, fp, fn, tn):
    """Return the fraction of positive predictions that were correct.

    Computed as ``tp / (tp + fp)``; ``fn`` and ``tn`` are accepted only so
    that all metric functions share one signature.  When there are no
    positive predictions (``tp + fp == 0``) the ratio is undefined, and 0.0
    is returned instead of raising ZeroDivisionError.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive
def recall(tp, fp, fn, tn):
    """Return the fraction of actual positives that were predicted positive.

    Computed as ``tp / (tp + fn)``; ``fp`` and ``tn`` are accepted only so
    that all metric functions share one signature.  When there are no actual
    positives (``tp + fn == 0``) the ratio is undefined, and 0.0 is returned
    instead of raising ZeroDivisionError.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive
def f1_score(tp, fp, fn, tn):
    """Return the F1 score: the harmonic mean of precision and recall.

    Computed as ``2 * p * r / (p + r)`` where ``p`` and ``r`` come from
    ``precision`` and ``recall`` on the same counts.  When both are zero
    (e.g. ``tp == 0``) the harmonic mean is undefined, and 0.0 is returned
    instead of raising ZeroDivisionError.
    """
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    if p + r == 0:
        # No true positives: precision and recall are both zero.
        return 0.0
    return 2 * p * r / (p + r)
In [15]:
# The ridiculous hypothesis: being named "Luke" causes leukemia.
#        leukemia  no_leukemia  total
data = [[70, 4930, 5000],          # named "Luke"
        [13930, 981070, 995000],   # not named "Luke"
        [14000, 986000, 1000000]]  # total

# Confusion-matrix counts in the (tp, fp, fn, tn) order the metric functions
# take: the "Luke" row supplies tp and fp, the "not Luke" row fn and tn.
luke_counts = (data[0][0], data[0][1], data[1][0], data[1][1])

print('\nWhat is the accuracy of my hypothesis?')
print(accuracy(*luke_counts))

print('\nWhat is the precision of my hypothesis?')
print(precision(*luke_counts))

print('\nWhat is the recall of my hypothesis?')
print(recall(*luke_counts))

print('\nWhat is the F1 Score of my hypothesis?')
print(f1_score(*luke_counts))
In [ ]: