notebook.community

Edit and run



In [1]:

    
%matplotlib inline



In [2]:

    
# This will be regarding automatic feature selection. This is
# the feature equivalent of parameter tuning. In the same way that
# we use cross-validate to find parameter, we can find the best
# subset of features.
# The simplest idea is univariate selection. The other methods
# involve working with a combination of features.
# An added benefit to feature selection is that it can ease the
# burden on the data collection.



In [3]:

    
# With univariate selection, scoring functions will come to the
# forefront again. This time, they will define the comparable
# measure by which we can eliminate features.



In [4]:

    
from sklearn import datasets
X, y = datasets.make_regression(1000, 10000)



In [5]:

    
# We can now compare the features that are included with the
# various methods. (This is good for text analysis)



In [6]:

    
from sklearn import feature_selection



In [7]:

    
f, p = feature_selection.f_regression(X, y)



In [8]:

    
# Here, f is the f-score associated with each linear model fit
# with just one of the features. We can then compare these
# features and based on this comparison, we can cull features.
# p is also the p-value associated with that f-score.



In [9]:

    
# In statistics, the p-value is the probability of a value more
# extreme than the current value of the test statistic.
# Here, the f-score is the test statistic.
f[:5]









    Out[9]:





array([ 2.15568056,  0.0344714 ,  0.19768934,  0.25618762,  0.02081596])



In [10]:

    
p[:5]









    Out[10]:





array([ 0.14235859,  0.85274539,  0.65668885,  0.61286328,  0.88531057])



In [11]:

    
# many of the p-values are quite large. We would rather have that
# the p-values be small. We can use numpy to choose all the
# p-values less than .05. These will be the features we will use
# for analysis.



In [12]:

    
import numpy as np



In [13]:

    
idx = np.arange(0, X.shape[1])
features_to_keep = idx[p < .05]
len(features_to_keep)









    Out[13]:





499



In [14]:

    
# Another option is using the VarianceThreshold object. A nice
# feature of this is that because it does not use the outcome
# variable, it can be used for unsupervised cases.



In [16]:

    
# Set the threshold for which we eliminate features. Just take the
# median of the feature variances and supply that:
var_threshold = feature_selection.VarianceThreshold(np.median(np.var(X, axis=1)))



In [17]:

    
var_threshold.fit_transform(X).shape









    Out[17]:





(1000, 4905)



In [18]:

    
# this elimiated almost half the features.



In [19]:

    
# How it works...
# All these methods work by fitting a basic model with a single
# feature. Depending on whether we have a classification problem
# or a regression problem, we can use the appropriate scoring
# function.



In [20]:

    
# Smaller problem:
X, y = datasets.make_regression(10000, 20)



In [21]:

    
f, p = feature_selection.f_regression(X, y)



In [22]:

    
import matplotlib.pyplot as plt



In [23]:

    
fig, ax = plt.subplots(figsize=(7,5))
ax.bar(np.arange(20), p, color='k')
ax.set_title('Feature p values')









    Out[23]:





<matplotlib.text.Text at 0x123c44610>



In [ ]: