In [1]:
# an easy way to get the data necessary for following along with part 1 of the python intro
import numpy
import pandas as pd

df = pd.read_csv('inflammation-01.csv', header=None)
data = numpy.array(df)
1.0.1. Draw diagrams showing what variables refer to what values after each statement in the following program:
mass = 47.5
age = 122
mass = mass * 2.0
age = age - 20
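One way to check your diagram is to trace the value each name refers to after every statement; the comments below are just such a trace:
mass = 47.5        # mass -> 47.5
age = 122          # age -> 122
mass = mass * 2.0  # mass -> 95.0 (age is still 122)
age = age - 20     # age -> 102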
1.0.2. What does the following program print out?
first, second = 'Grace', 'Hopper'
third, fourth = second, first
print third, fourth
In [2]:
first, second = 'Grace', 'Hopper'
third, fourth = second, first
print third, fourth
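The right-hand side is evaluated before any names are rebound, so the unpacking is equivalent to the two plain assignments below, and the program prints Hopper Grace:
third = second       # third -> 'Hopper'
fourth = first       # fourth -> 'Grace'
print third, fourth  # prints: Hopper Grace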
1.0.3. "Adding" two strings produces their concatention: 'a' + 'b' is 'ab'. Write a function called fence that takes two parameters called original and wrapper and returns a new string that has the wrapper character at the beginning and end of the original:
print fence('name', '*')
*name*
In [3]:
def fence(orig_str, fence_char):
    """fence returns a new string that has the wrapper character
    at the beginning and end of the original string"""
    assert type(fence_char) == str and len(fence_char) == 1, \
        'fence char must be str of length one'
    return fence_char + orig_str + fence_char

fence('name', '*')
Out[3]:
'*name*'
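Two quick checks of the function (illustrative calls, not part of the exercise):
fence('carbon', '+')   # returns '+carbon+'
fence('name', '**')    # the assert fails: 'fence char must be str of length one'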
1.0.4. If the variable s refers to a string, then s[0] is the string's first character and s[-1] is its last. Write a function called outer that returns a string made up of just the first and last characters of its input:
print outer('helium')
hm
In [4]:
def outer(full_str):
    """outer returns the first and last characters of its input string"""
    return full_str[0] + full_str[-1]

outer('helium')
Out[4]:
'hm'
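Two edge cases worth noting (illustrative calls, not part of the exercise):
outer('A')   # returns 'AA': positions 0 and -1 are the same character
outer('')    # raises IndexError: an empty string has no characters to index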
1.0.5. We previously wrote functions called fence and outer. Draw a diagram showing how the call stack changes when we run the following:
print outer(fence('carbon', '+'))
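A text sketch of the evaluation order can help with the diagram:
# 1. fence('carbon', '+') runs first and returns '+carbon+'
# 2. outer('+carbon+') runs next and returns '+' + '+' = '++'
# 3. print then displays: ++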
In [5]:
import pandas as pd
df = pd.read_csv('weather-numeric.csv')
In [6]:
df.head()
Out[6]:
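The rows shown depend on weather-numeric.csv, which is not reproduced here. A few other ways to inspect the freshly loaded DataFrame:
df.shape     # (number of rows, number of columns)
df.columns   # the column labels (the rules below use outlook, temperature, humidity, windy, and play)
df.dtypes    # the type pandas inferred for each column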
In [7]:
# use TAB to explore commands
In [8]:
def predict(s):
    if s['outlook'] == 'sunny':
        return 'no'
    else:
        return 'yes'
In [9]:
predict(df.loc[1])  # .loc[1] selects the row whose index label is 1 (here, the second row)
Out[9]:
In [10]:
i = 0
predict(df.loc[i]) == df.play[i]
Out[10]:
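df.play is just attribute-style shorthand; the lines below all refer to the same value:
df.play[i]         # attribute access to the 'play' column, then element i
df['play'][i]      # the same value with explicit column indexing
df.loc[i, 'play']  # the same value, selecting row i and column 'play' in one step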
In [11]:
correct = 0
total = 0
for i in df.index:
    # count how many predictions are correct
    if predict(df.loc[i]) == df.play[i]:
        correct += 1
    total += 1
In [12]:
print correct, 'out of', total, 'correct = %.2f%%' % (100. * correct / total)
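For comparison, the same count can be written without an explicit loop; a minimal sketch using pandas' apply (predictions is an illustrative name):
predictions = df.apply(predict, axis=1)    # call predict on every row
correct = (predictions == df.play).sum()   # count the matches
total = len(df)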
In [13]:
# first refactor accuracy measurement into a function
# very cool note: in Python functions can take other functions as arguments!
def accuracy(predict):
    """measure the ("in-sample") accuracy of function predict

    Parameters
    ----------
    predict : function that takes a row of data and produces a prediction

    Returns
    -------
    percent of examples where the rule is correct"""
    correct = 0
    total = 0
    for i in df.index:
        # count how many predictions are correct
        if predict(df.loc[i]) == df.play[i]:
            correct += 1
        total += 1
    return 100. * correct / total

accuracy(predict)
Out[13]:
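Because accuracy takes the prediction function as an argument, any rule with the same signature can be scored. A hypothetical baseline that ignores the features entirely:
def always_yes(s):
    return 'yes'

accuracy(always_yes)   # how well does always predicting 'yes' do?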
In [16]:
# now search over all rules
# this uses a sneaky approach to help you understand
# the function stack
acc = {}  # dictionary to hold results of search
for col in ['outlook', 'temperature', 'humidity', 'windy']:
    for val in df[col].unique():
        def predict_col_val(s):
            if s[col] == val:
                return 'no'
            else:
                return 'yes'
        acc[col, val] = accuracy(predict_col_val)
acc = pd.Series(acc)
acc.max(), acc.argmax()
Out[16]:
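predict_col_val above is a closure: it looks up col and val in the enclosing scope when it is called, which works here only because accuracy runs inside the loop, before the loop variables move on. A more defensive sketch uses a factory function that captures each pair explicitly (make_predictor is an illustrative name):
def make_predictor(col, val):
    def predict_col_val(s):
        if s[col] == val:
            return 'no'
        else:
            return 'yes'
    return predict_col_val

acc = {}
for col in ['outlook', 'temperature', 'humidity', 'windy']:
    for val in df[col].unique():
        acc[col, val] = accuracy(make_predictor(col, val))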
In [18]:
acc.order()
Out[18]:
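Series.order() is the older pandas spelling; with a later pandas install (an assumption about your environment) the equivalent calls would be:
acc.sort_values()   # replaces acc.order()
acc.idxmax()        # the label of the best rule, like the old argmax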
In [15]:
# searching over all length two decision lists
# with a similar approach, but more nesting:
# now search over all rules
# this uses a sneaky approach to help you understand
# the function stack
acc = {}  # dictionary to hold results of search
for col1 in ['outlook', 'temperature', 'humidity', 'windy']:
    for val1 in df[col1].unique():
        for col2 in ['outlook', 'temperature', 'humidity', 'windy']:
            for val2 in df[col2].unique():
                # more complicated prediction function now...
                def predict_col_val(s):
                    if s[col1] == val1:
                        return 'yes'
                    elif s[col2] == val2:
                        return 'no'
                    else:
                        return 'yes'
                acc[col1, val1, col2, val2] = accuracy(predict_col_val)
acc = pd.Series(acc)
acc.max(), acc.argmax()
Out[15]:
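The four nested loops can also be flattened with itertools.product; a sketch of the same search over all (column, value) pairs (columns and pairs are illustrative names):
import itertools

columns = ['outlook', 'temperature', 'humidity', 'windy']
pairs = [(col, val) for col in columns for val in df[col].unique()]

acc = {}
for (col1, val1), (col2, val2) in itertools.product(pairs, pairs):
    def predict_col_val(s):
        if s[col1] == val1:
            return 'yes'
        elif s[col2] == val2:
            return 'no'
        else:
            return 'yes'
    acc[col1, val1, col2, val2] = accuracy(predict_col_val)

acc = pd.Series(acc)
acc.max(), acc.argmax()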