Which features do you use? All of them? Some of them?
Remember: Our goal is to find the smallest set of the available features such that the fitted model will reach its maximal predictive value.
In [ ]:
import os
import zipfile
import requests
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
In [ ]:
# (download URL, local filename) pairs for each dataset used in this notebook.
OCCUPANCY = ('http://bit.ly/ddl-occupancy-dataset', 'occupancy.zip')
CREDIT = ('http://bit.ly/ddl-credit-dataset', 'credit.xls')
CONCRETE = ('http://bit.ly/ddl-concrete-data', 'concrete.xls')
def download_data(url, name, path='data'):
    """Download the file at *url* and save it as *name* inside *path*.

    Creates *path* if it does not already exist.
    """
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists followed by os.mkdir.
    os.makedirs(path, exist_ok=True)
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently writing an error
    # page to disk as if it were the dataset.
    response.raise_for_status()
    with open(os.path.join(path, name), 'wb') as f:
        f.write(response.content)
def download_all(path='data'):
    """Download all three datasets into *path* and unpack the occupancy zip."""
    for href, name in (OCCUPANCY, CREDIT, CONCRETE):
        download_data(href, name, path)
    # Extract the occupancy zip data; a context manager guarantees the
    # zip file handle is closed even if extraction fails (the original
    # leaked the open ZipFile).
    with zipfile.ZipFile(os.path.join(path, 'occupancy.zip')) as z:
        z.extractall(os.path.join(path, 'occupancy'))
# Fetch all three datasets into the local 'data' directory (one-time setup).
path='data'
download_all(path)
In [ ]:
# Read the room occupancy training data and give the columns short names.
occupancy_path = os.path.join('data', 'occupancy', 'datatraining.txt')
occupancy = pd.read_csv(occupancy_path, sep=',')
occupancy.columns = [
    'date', 'temp', 'humid', 'light', 'co2', 'hratio', 'occupied'
]
In [ ]:
# Split the dataframe into predictor columns and the binary target column.
feature_names = ['temp', 'humid', 'light', 'co2', 'hratio']
features = occupancy[feature_names]
labels = occupancy['occupied']
In [ ]:
list(features)
In [ ]:
# L1 regularization (Lasso) drives uninformative coefficients toward zero,
# so near-zero entries below hint at features that could be dropped.
model = Lasso()
model.fit(features, labels)
coef_by_feature = list(zip(features, model.coef_.tolist()))
print(coef_by_feature)
In [ ]:
# L2 regularization (Ridge) shrinks coefficients but rarely zeroes them;
# compare these magnitudes with the Lasso run above.
model = Ridge()
model.fit(features, labels)
coef_by_feature = list(zip(features, model.coef_.tolist()))
print(coef_by_feature)
In [ ]:
# ElasticNet blends L1 and L2 penalties; l1_ratio=0.10 weights it mostly
# toward the Ridge (L2) side.
model = ElasticNet(l1_ratio=0.10)
model.fit(features, labels)
coef_by_feature = list(zip(features, model.coef_.tolist()))
print(coef_by_feature)
In [ ]:
# Use the Lasso coefficients to select features automatically.
model = Lasso()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
# get_support(indices=True) returns *positional* indices. Indexing the
# DataFrame directly with them (features[indices]) is a column-label
# lookup and raises KeyError on string-named columns, so map the
# positions through features.columns instead.
print(list(features.columns[sfm.get_support(indices=True)]))
In [ ]:
# Use the Ridge coefficients to select features automatically.
model = Ridge()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
# get_support(indices=True) returns *positional* indices; translate them
# to column names via features.columns — indexing the DataFrame with raw
# ints would be a label lookup and raise KeyError.
print(list(features.columns[sfm.get_support(indices=True)]))
In [ ]:
# Use the ElasticNet coefficients to select features automatically.
model = ElasticNet()
sfm = SelectFromModel(model)
sfm.fit(features, labels)
# get_support(indices=True) returns *positional* indices; translate them
# to column names via features.columns — indexing the DataFrame with raw
# ints would be a label lookup and raise KeyError.
print(list(features.columns[sfm.get_support(indices=True)]))
Linear dimensionality reduction using Singular Value Decomposition (SVD) of the data and keeping only the most significant singular vectors to project the data into a lower dimensional space.
In [ ]:
# Project the features onto the two leading principal components.
pca = PCA(n_components=2)
pca.fit(features)
new_features = pca.transform(features)
print(new_features)
A classifier with a linear decision boundary, generated by fitting class conditional densities to the data and using Bayes’ rule. The model fits a Gaussian density to each class, assuming that all classes share the same covariance matrix. Can be used to reduce the dimensionality of the input by projecting it to the most discriminative directions.
In [ ]:
# LDA can produce at most n_classes - 1 discriminative directions, and
# 'occupied' is binary, so only 1 component is possible — n_components=2
# raises a ValueError in scikit-learn.
lda = LDA(n_components=1)
new_features = lda.fit(features, labels).transform(features)
print(new_features)
To learn more about feature selection tools within Scikit-Learn, check out http://scikit-learn.org/stable/modules/feature_selection.html.
Try out the above techniques yourself with the Credit Card Default and Concrete Strength datasets.
In [ ]:
# Load the credit card default dataset into a dataframe
credit = pd.read_excel(os.path.join('data', 'credit.xls'), header=1)
credit.columns = [
    'id', 'limit', 'sex', 'edu', 'married', 'age', 'apr_delay', 'may_delay',
    'jun_delay', 'jul_delay', 'aug_delay', 'sep_delay', 'apr_bill', 'may_bill',
    'jun_bill', 'jul_bill', 'aug_bill', 'sep_bill', 'apr_pay', 'may_pay', 'jun_pay',
    'jul_pay', 'aug_pay', 'sep_pay', 'default'
]
# Every column except the row id and the target is a predictor; drop()
# keeps the remaining columns in their original order.
cred_features = credit.drop(columns=['id', 'default'])
cred_labels = credit['default']

# Load the concrete compression dataset into a dataframe
concrete = pd.read_excel(os.path.join('data', 'concrete.xls'))
concrete.columns = [
    'cement', 'slag', 'ash', 'water', 'splast',
    'coarse', 'fine', 'age', 'strength'
]
# All columns other than the target ('strength') are predictors.
conc_features = concrete.drop(columns=['strength'])
conc_labels = concrete['strength']