In [1]:
    
import pandas as pd
    
In [2]:
    
# "Feature engineering" -- mapping the raw data loaded in the previous
# notebook into a representation that is *good* -- is a major part of
# the _art_ of applied machine learning.  It is not covered here.
# Instead, read a cleaned table to see what the result might look like,
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../3-data/phmrc_cleaned.csv')
df.head(n=5)
    
    Out[2]:
In [3]:
    
# remember the fundamental package for
# scientific computing with Python?
import numpy as np
    
In [4]:
    
# Build the two arrays sklearn needs: X holds one feature vector per row
# of df, and y holds the corresponding label from the gs_text34 column.
#
# Feature columns are those whose names start with s<digits>, "age" or
# "sex".  The pattern is anchored only at the start of the name, so a
# longer name with one of those prefixes is selected as well.
# Missing values are replaced by 0 before the conversion to numpy.
# (An earlier variant selected columns with df.filter(like='s1').)
X = np.array(
    df.filter(regex='^(s[0-9]+|age|sex)')
      .fillna(0)
)
y = np.array(df['gs_text34'])
    
In [5]:
    
# How much data is there?  Displays (number of rows, number of feature columns).
np.shape(X)
    
    Out[5]:
In [6]:
    
# Here is how to train a charmingly self-deprecating
# ML method, Naive Bayes, to predict the underlying
# cause of death (CoD) with sklearn
import sklearn.naive_bayes
    
In [7]:
    
# Bernoulli Naive Bayes models every feature as a 0/1 indicator.
# binarize=0.0 is sklearn's default, spelled out here so the thresholding
# is visible: each feature value is mapped to (value > 0) before fitting
# and predicting.
# NOTE(review): any non-binary column selected into X (e.g. age, if it is
# numeric) is therefore reduced to a "> 0" indicator -- confirm that is intended.
clf = sklearn.naive_bayes.BernoulliNB(binarize=0.0)
    
In [8]:
    
# Fit the classifier on every row of X with the matching labels in y.
clf.fit(X=X, y=y)
    
    Out[8]:
In [9]:
    
# Try it on a single feature vector.  predict expects a 2-D array, so
# slice out row 0 as a one-row matrix rather than a 1-D vector.
clf.predict(X[0:1])
    
    Out[9]:
In [10]:
    
# And what was the true label for this example
# (row 0, the same row that was just predicted)?
y[0]
    
    Out[10]:
In [11]:
    
# Now predict a label for every row, to see how well it does overall.
# These are the same rows the classifier was fit on, so the agreement
# computed in the next cell is in-sample (training) accuracy.
y_pred = clf.predict(X=X)
    
In [12]:
    
# Fraction of rows whose predicted label equals the true label.
(y == y_pred).mean()
    
    Out[12]:
In [13]:
    
import sklearn.neighbors
    
In [14]:
    
# k-nearest-neighbours classifier; n_neighbors=5 is sklearn's default,
# written out so the neighbourhood size is visible.
# This rebinds clf, replacing the Naive Bayes model from the cells above.
clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
    
In [15]:
    
# Fit the classifier on every row of X with the matching labels in y.
clf.fit(X=X, y=y)
    
    Out[15]:
In [16]:
    
# Predict the label of row 2, passed as a one-row 2-D array.
clf.predict(X[2:3])
    
    Out[16]:
In [17]:
    
# The true label for row 2, the example predicted in the previous cell.
y[2]
    
    Out[17]:
In [18]:
    
%time y_pred = clf.predict(X)
    
    
In [19]:
    
# Fraction of rows whose predicted label equals the true label.
np.mean(y_pred == y)
    
    Out[19]:
In [20]:
    
import sklearn.ensemble
    
In [21]:
    
# Gradient-boosted decision trees with sklearn's default hyperparameters.
# This rebinds clf, replacing the nearest-neighbours model from the cells above.
# random_state is pinned because the fit is stochastic: it seeds each tree
# and the random permutation of features at every split.  With a fixed seed,
# Restart Kernel -> Run All reproduces the same model and the same numbers
# in the cells below.
clf = sklearn.ensemble.GradientBoostingClassifier(random_state=12345)
    
In [22]:
    
%time clf.fit(X, y)
    
    
    Out[22]:
In [23]:
    
# Predict the label of row 1, passed as a one-row 2-D array.
clf.predict(X[1:2])
    
    Out[23]:
In [24]:
    
# The true label for row 1, the example predicted in the previous cell.
y[1]
    
    Out[24]:
In [25]:
    
# Predict a label for every row of X.  These are the same rows the
# classifier was fit on, so the agreement computed next is in-sample.
y_pred = clf.predict(X=X)
    
In [26]:
    
# Fraction of rows whose predicted label equals the true label.
np.average(y == y_pred)
    
    Out[26]: