In [1]:
# an easy way to get the data necessary for following along with part 1 of the python intro

import numpy, pandas as pd
# The file has no header row, so pandas must not treat line one as column names.
df = pd.read_csv('inflammation-01.csv', header=None)
# numpy array built from the same table
data = numpy.array(df)

1.0.1. Draw diagrams showing what variables refer to what values after each statement in the following program:

mass = 47.5
age = 122
mass = mass * 2.0
age = age - 20

1.0.2. What does the following program print out?

first, second = 'Grace', 'Hopper'
third, fourth = second, first
print third, fourth

In [2]:
# Tuple unpacking binds both names on the left in one statement.
first, second = 'Grace', 'Hopper'
# The right-hand side is evaluated first, so the values arrive in swapped order.
third, fourth = second, first
# A single pre-formatted argument prints identically under the Python 2
# print statement and the Python 3 print function.
print('%s %s' % (third, fourth))


Hopper Grace

1.0.3. "Adding" two strings produces their concatenation: 'a' + 'b' is 'ab'. Write a function called fence that takes two parameters called original and wrapper and returns a new string that has the wrapper character at the beginning and end of the original:

print fence('name', '*')
*name*

In [3]:
def fence(orig_str, fence_char):
    """Wrap a string in a single fence character.

    Parameters
    ----------
    orig_str : str
        The string to be wrapped.
    fence_char : str
        The wrapper; must be a string of exactly one character.

    Returns
    -------
    str
        ``fence_char + orig_str + fence_char``.

    Raises
    ------
    AssertionError
        If ``fence_char`` is not a string of length one.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of str.
    assert isinstance(fence_char, str) and len(fence_char) == 1, \
        'fence char must be str of length one'

    return fence_char + orig_str + fence_char

fence('name', '*')


Out[3]:
'*name*'

1.0.4. If the variable s refers to a string, then s[0] is the string's first character and s[-1] is its last. Write a function called outer that returns a string made up of just the first and last characters of its input:

print outer('helium')
hm

In [4]:
def outer(full_str):
    """Return the first and last characters of a string, joined together.

    Parameters
    ----------
    full_str : str
        The input string.

    Returns
    -------
    str
        ``full_str[0] + full_str[-1]``; a one-character input therefore
        yields that character twice.  An empty input is returned unchanged.
    """
    # Indexing an empty string raises IndexError; it has no outer
    # characters, so hand back the (empty) input instead.
    if len(full_str) == 0:
        return full_str
    return full_str[0] + full_str[-1]

outer('helium')


Out[4]:
'hm'

1.0.5. We previously wrote functions called fence and outer. Draw a diagram showing how the call stack changes when we run the following:

print outer(fence('carbon', '+'))

Exercise 1.1: Predicting Weather


In [5]:
import pandas as pd
# NOTE: this rebinds df, which the first cell bound to the inflammation table;
# the cells below read the weather data through this name.
df = pd.read_csv('weather-numeric.csv')

In [6]:
# preview the first few rows to see the columns and the kind of values they hold
df.head()


Out[6]:
outlook temperature humidity windy play
0 sunny 85 85 False no
1 sunny 80 90 True no
2 overcast 83 86 False yes
3 rainy 70 96 False yes
4 rainy 68 80 False yes

In [7]:
# use TAB to explore commands

Abie's dumb predictor


In [8]:
def predict(s):
    """Guess the 'play' label for one row from its outlook alone.

    Returns 'no' when the row's 'outlook' entry equals 'sunny' and
    'yes' for every other outlook.
    """
    is_sunny = s['outlook'] == 'sunny'
    return 'no' if is_sunny else 'yes'

In [9]:
predict(df.loc[1]) # .loc[1] selects the row whose index label is 1


Out[9]:
'no'

How good is this dumb predictor?


In [10]:
# compare the prediction for one example with its recorded answer
i = 0
predict(df.loc[i]) == df.loc[i, 'play']


Out[10]:
True

In [11]:
# every row is scored exactly once, so the total is just the row count
total = len(df.index)
correct = 0
for i in df.index:
    # tally the rows where the rule agrees with the recorded answer
    if df.play[i] == predict(df.loc[i]):
        correct += 1

In [12]:
# One pre-formatted argument prints the same text under the Python 2 print
# statement and the Python 3 print function.
print('%d out of %d correct = %.2f%%' % (correct, total, 100. * correct / total))


10 out of 14 correct = 71.43%

How much better can you do with a single rule?


In [13]:
# first refactor accuracy measurement into a function
# very cool note: in Python functions can take other functions as arguments!

def accuracy(predict, data=None, target='play'):
    """Measure the ("in-sample") accuracy of a prediction rule.

    Parameters
    ----------
    predict : function
        Takes one row of data and returns a predicted label.
    data : pandas.DataFrame, optional
        Examples to score against.  When omitted, the module-level ``df``
        is looked up at call time.
    target : str, optional
        Name of the column holding the true labels (default ``'play'``).

    Returns
    -------
    float
        Percent of rows for which the rule's prediction equals the
        target label.

    Raises
    ------
    ZeroDivisionError
        If ``data`` has no rows.
    """
    if data is None:
        # fall back to the notebook-wide table, as the one-argument form always did
        data = df

    correct = 0
    total = 0
    for i in data.index:
        # count how many predictions are correct
        if predict(data.loc[i]) == data.loc[i, target]:
            correct += 1
        total += 1

    return 100. * correct / total

# score the outlook-only rule defined earlier
accuracy(predict)


Out[13]:
71.42857142857143

In [16]:
# now search over all single-condition rules
# this uses a sneaky approach to help you understand
# the function stack: the rule is defined inside the loops and reads
# col and val from the enclosing scope each time it is called

rule_acc = {} # accuracy of each rule, keyed by (column, value)

for col in ['outlook', 'temperature', 'humidity', 'windy']:
    for val in df[col].unique():
        def predict_col_val(s):
            """Predict 'no' when the row's col entry equals val, else 'yes'."""
            if s[col] == val:
                return 'no'
            else:
                return 'yes'

        # score right away, while col and val still hold this rule's values
        rule_acc[col,val] = accuracy(predict_col_val)

acc = pd.Series(rule_acc)
# idxmax returns the index label of the best rule; on current pandas
# releases argmax returns an integer position instead
acc.max(), acc.idxmax()


Out[16]:
(71.428571428571431, ('humidity', 85))

In [18]:
# sort_values replaces Series.order, which has been removed from pandas;
# both sort ascending by default
acc.sort_values()


Out[18]:
windy        False       35.714286
outlook      overcast    35.714286
humidity     80          50.000000
temperature  75          50.000000
humidity     65          57.142857
temperature  83          57.142857
             81          57.142857
             70          57.142857
             69          57.142857
             68          57.142857
outlook      rainy       57.142857
temperature  64          57.142857
humidity     86          57.142857
             75          57.142857
             70          57.142857
             96          57.142857
             90          64.285714
temperature  72          64.285714
windy        True        64.285714
outlook      sunny       71.428571
humidity     95          71.428571
temperature  65          71.428571
humidity     91          71.428571
temperature  71          71.428571
humidity     85          71.428571
temperature  80          71.428571
             85          71.428571
dtype: float64

Homework:

  • Find the best "length-two decision list" for weather

  • Think about machine learning projects you might do for this course (related to your IHME research?), and about elevator pitches

  • Read


In [15]:
# searching over all length-two decision lists
# with the same sneaky approach as the single-rule search, but more nesting

feature_cols = ['outlook', 'temperature', 'humidity', 'windy']
list_acc = {} # accuracy of each decision list, keyed by (col1, val1, col2, val2)

for col1 in feature_cols:
    for val1 in df[col1].unique():
        for col2 in feature_cols:
            for val2 in df[col2].unique():

                # more complicated prediction function now...
                def predict_col_val(s):
                    """Length-two decision list: the first matching test decides."""
                    if s[col1] == val1:
                        return 'yes'
                    elif s[col2] == val2:
                        return 'no'
                    else:
                        return 'yes'

                # score right away, while the loop variables still describe this list
                list_acc[col1,val1,col2,val2] = accuracy(predict_col_val)

acc = pd.Series(list_acc)
# idxmax returns the index label of the best list; on current pandas
# releases argmax returns an integer position instead
acc.max(), acc.idxmax()


Out[15]:
(85.714285714285708, ('humidity', 70, 'outlook', 'sunny'))