In [1]:
import pandas as pd
In [2]:
# a major part of the _art_ of applied machine learning
# is "feature engineering", which means mapping the
# raw data we loaded in the previous notebook into a
# representation that is *good* for learning from.
# I'm not going to get into that now. Instead, here is
# what the result might look like:
df = pd.read_csv('../3-data/phmrc_cleaned.csv')
df.head()
Out[2]:
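To make "feature engineering" a little more concrete, here is a small, purely illustrative sketch of one such step: the raw values and the idea of a yes/no/don't-know survey answer are made up for this example and do not come from the PHMRC file itself.

# hypothetical raw survey answers (illustrative values only)
raw = pd.Series(['Yes', 'No', "Don't Know", 'Yes'])
# one simple engineering choice: Yes -> 1, everything else -> 0
engineered = raw.map({'Yes': 1, 'No': 0}).fillna(0).astype(int)
engineered.tolist()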
In [3]:
# remember numpy, the fundamental package for
# scientific computing with Python?
import numpy as np
In [4]:
# we will use it to create an array of feature vectors
# and an array of the corresponding labels;
# the regex keeps the symptom columns (names starting with
# 's' followed by digits) together with age and sex, and
# missing values are filled with 0
#X = np.array(df.filter(like='s1'))  # a smaller alternative: just the s1* columns
X = np.array(df.filter(regex='^(s[0-9]+|age|sex)').fillna(0))
y = np.array(df.gs_text34)
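As a quick sanity check, here is a sketch of peeking at which columns that regex actually keeps; it only assumes the column names in phmrc_cleaned.csv follow the s<digits>/age/sex pattern the regex targets.

# which columns does that regex actually select?
selected = df.filter(regex='^(s[0-9]+|age|sex)')
print(selected.shape)                  # rows x number of selected columns
print(selected.columns[:10].tolist())  # the first few selected column names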
In [5]:
# how much data are we dealing with here?
X.shape
Out[5]:
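It can also help to know how many distinct labels we are predicting and how they are distributed; a minimal sketch using numpy:

# how many distinct causes of death, and which are most common?
labels, counts = np.unique(y, return_counts=True)
print(len(labels))
print(sorted(zip(counts, labels), reverse=True)[:5])  # five most common causes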
In [6]:
# Here is how to train a charmingly self-deprecating
# ML method, Naive Bayes, to predict the underlying
# cause of death (CoD) with sklearn
import sklearn.naive_bayes
In [7]:
clf = sklearn.naive_bayes.BernoulliNB()
In [8]:
clf.fit(X, y)
Out[8]:
In [9]:
# Let's see how it works for a single feature vector:
clf.predict(X[[0], :])
Out[9]:
In [10]:
# And what was the true label for this example?
y[0]
Out[10]:
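Beyond a hard prediction, BernoulliNB also exposes predict_proba; here is a small sketch of looking at the few most probable causes for that same first example:

# probability the model assigns to each possible cause for example 0
probs = clf.predict_proba(X[[0], :])[0]
top = np.argsort(probs)[::-1][:3]
list(zip(clf.classes_[top], probs[top]))  # three most probable causes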
In [11]:
# So let's see how well it is making predictions overall
# (on the same data we trained on):
y_pred = clf.predict(X)
In [12]:
np.mean(y == y_pred)
Out[12]:
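Because these predictions are made on the very examples the model was fit to, the accuracy above is optimistic. A rough sketch of a fairer, cross-validated estimate; sklearn.model_selection is not used elsewhere in this notebook, so treat this as an aside:

# 5-fold cross-validation: fit on 4/5 of the data, score on the held-out 1/5
import sklearn.model_selection
scores = sklearn.model_selection.cross_val_score(
    sklearn.naive_bayes.BernoulliNB(), X, y, cv=5)
print(scores.mean(), scores.std())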
In [13]:
# let's try a second method, k-nearest neighbors
import sklearn.neighbors
In [14]:
# the default is n_neighbors=5
clf = sklearn.neighbors.KNeighborsClassifier()
In [15]:
clf.fit(X, y)
Out[15]:
In [16]:
clf.predict(X[[2], :])
Out[16]:
In [17]:
y[2]
Out[17]:
In [18]:
# k-NN does its work at prediction time (it has to search
# for neighbors), so this step is noticeably slower
%time y_pred = clf.predict(X)
In [19]:
np.mean(y == y_pred)
Out[19]:
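The main knob on k-NN is n_neighbors (5 by default). Here is a sketch of how in-sample accuracy shifts as it changes; note that this re-predicts the whole training set for each value of k, so it is slow:

# vary the number of neighbors and recompute training-set accuracy
for k in [1, 5, 15]:
    clf_k = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k)
    clf_k.fit(X, y)
    print(k, np.mean(y == clf_k.predict(X)))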
In [20]:
# and a third method, gradient boosted decision trees
import sklearn.ensemble
In [21]:
clf = sklearn.ensemble.GradientBoostingClassifier()
In [22]:
# unlike k-NN, boosting does its work at training time
%time clf.fit(X, y)
Out[22]:
In [23]:
clf.predict(X[[1], :])
Out[23]:
In [24]:
y[1]
Out[24]:
In [25]:
y_pred = clf.predict(X)
In [26]:
np.mean(y == y_pred)
Out[26]:
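One nice side effect of tree-based boosting: after fitting, the classifier exposes feature_importances_, which we can line up with the column names that went into X. A sketch, assuming the same regex as above:

# which features does the boosted model rely on most?
cols = df.filter(regex='^(s[0-9]+|age|sex)').columns
top = np.argsort(clf.feature_importances_)[::-1][:10]
list(zip(cols[top], clf.feature_importances_[top]))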