Explaining Feature Engineering

Basic questions


In [5]:
# Why is it called feature engineering? Because we engineer the data into features a machine can understand.

In [6]:
# How do we do that? We encode the attributes as numbers (often binary dummies) and turn them into features.

In [7]:
# What is a feature? Features are based on attributes of the data and help the model make its predictions.

An Example


In [40]:
import pandas as pd
%matplotlib inline
from sklearn import preprocessing

In [12]:
df = pd.DataFrame({'key':['cat','cat','dog','donkey','dog','cat'],'data1':range(6)})
df


Out[12]:
data1 key
0 0 cat
1 1 cat
2 2 dog
3 3 donkey
4 4 dog
5 5 cat

Getting the dummies


In [13]:
# 'data1' and 'key' are attributes, and they can become the features for our prediction model.

In [14]:
#Definition
#Dummy variables assign the numbers ‘0’ and ‘1’ to indicate membership in any mutually exclusive and exhaustive category.
#https://www.moresteam.com/whitepapers/download/dummy-variables.pdf

In [16]:
pd.get_dummies(df['key'], prefix='key')  # prefix: string prepended to the new column names


Out[16]:
key_cat key_dog key_donkey
0 1.0 0.0 0.0
1 1.0 0.0 0.0
2 0.0 1.0 0.0
3 0.0 0.0 1.0
4 0.0 1.0 0.0
5 1.0 0.0 0.0
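To actually use these columns as features, you would typically join them back onto the original DataFrame. A minimal sketch (the name df_features is ours, not from the notebook):

# Attach the dummy columns to the original DataFrame
dummies = pd.get_dummies(df['key'], prefix='key')
df_features = pd.concat([df, dummies], axis=1)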

In [17]:
billionaires = pd.read_excel('richpeople.xlsx')

In [18]:
billionaires.head(2)


Out[18]:
year name rank citizenship countrycode networthusbillion selfmade typeofwealth gender age ... relationshiptocompany foundingdate gdpcurrentus sourceofwealth notes notes2 source source_2 source_3 source_4
0 2001 A Jerrold Perenchio 151 United States USA 3.0 self-made executive male 70.0 ... former chairman and CEO 1955.0 1.062180e+13 NaN represented Marlon Brando and Elizabeth Taylor NaN http://en.wikipedia.org/wiki/Jerry_Perenchio http://www.forbes.com/profile/a-jerrold-perenc... COLUMN ONE; A Hollywood Player Who Owns the Ga... NaN
1 2014 A. Jerrold Perenchio 663 United States USA 2.6 self-made executive male 83.0 ... former chairman and CEO 1955.0 NaN television, Univision represented Marlon Brando and Elizabeth Taylor NaN http://en.wikipedia.org/wiki/Jerry_Perenchio http://www.forbes.com/profile/a-jerrold-perenc... COLUMN ONE; A Hollywood Player Who Owns the Ga... NaN

2 rows × 30 columns


In [19]:
#Doing the same with the billionaires: a binary feature that is 1 if older than 60, else 0
billionaires['Old Guys'] = billionaires['age'].apply(lambda x: 1 if x > 60 else 0)
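An equivalent, more idiomatic vectorized version (a sketch; note that a missing age compares as False and therefore also becomes 0):

# Vectorized alternative: boolean comparison cast to 0/1
billionaires['Old Guys'] = (billionaires['age'] > 60).astype(int)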

In [25]:
#Selecting name, age, and the new feature; 'Old Guys' is 1 for billionaires above 60
Oldguys = billionaires[['name', 'age', 'Old Guys']]

In [27]:
Oldguys.head(2)


Out[27]:
name age Old Guys
0 A Jerrold Perenchio 70.0 1
1 A. Jerrold Perenchio 83.0 1

In [29]:
#Dummying a continuous variable gives one column per distinct value. Row 4 claims an age of 0.0, which is probably a data error; there are even negative ages.
pd.get_dummies(billionaires['age'],prefix='age')


Out[29]:
age_-42.0 age_-7.0 age_0.0 age_12.0 age_21.0 age_24.0 age_28.0 age_29.0 age_30.0 age_31.0 ... age_88.0 age_89.0 age_90.0 age_91.0 age_92.0 age_93.0 age_94.0 age_95.0 age_96.0 age_98.0
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2613 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

2614 rows × 76 columns
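A more sensible way to turn a continuous variable like age into dummies is to bin it first and dummy the bins. A minimal sketch; the bin edges and labels are illustrative choices, not from the original notebook:

# Bin ages into coarse ranges, then dummy the bins instead of every distinct value
# (values outside the bins, e.g. the negative ages, become NaN)
age_bins = pd.cut(billionaires['age'], bins=[0, 40, 60, 80, 120],
                  labels=['under40', '40-60', '60-80', 'over80'])
pd.get_dummies(age_bins, prefix='age')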

Multicollinearity


In [32]:
#What does that mean? Two variables that are strongly dependent: based on one you can predict the other. They are highly correlated.

In [33]:
pd.get_dummies(billionaires['gender'],prefix='gender').corr()


Out[33]:
gender_female gender_male gender_married couple
gender_female 1.000000 -0.925748 -0.010999
gender_male -0.925748 1.000000 -0.096709
gender_married couple -0.010999 -0.096709 1.000000
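A common way to sidestep this "dummy variable trap" is to drop one dummy column, since its value is implied by the others. pandas supports this directly (a sketch, assuming a pandas version with the drop_first argument):

# Drop the first category; a row with 0 in every remaining column is female
pd.get_dummies(billionaires['gender'], prefix='gender', drop_first=True)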

In [36]:
#Why isn't this correlation exactly -1? With two mutually exclusive, exhaustive categories the dummies should be perfect opposites; rows where 'selfmade' is missing get 0 in both columns, which weakens the correlation.
pd.get_dummies(billionaires['selfmade'],prefix='selfmade').corr()


Out[36]:
selfmade_inherited selfmade_self-made
selfmade_inherited 1.000000 -0.982092
selfmade_self-made -0.982092 1.000000
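A quick way to confirm that missing values are the culprit (a sketch): rows where 'selfmade' is NaN get 0.0 in both dummy columns, so the columns are no longer perfect opposites.

# Count the rows that break the perfect -1 correlation
billionaires['selfmade'].isnull().sum()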

In [41]:
x = billionaires[['age','networthusbillion']].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_normalized = pd.DataFrame(x_scaled)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-41-77a8c3f4cbad> in <module>()
      1 x = billionaires[['age','networthusbillion']].values
      2 min_max_scaler = preprocessing.MinMaxScaler()
----> 3 x_scaled = min_max_scaler.fit_transform(x)
      4 df_normalized = pd.DataFrame(x_scaled)

...

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [38]:
x


Out[38]:
array([[ 70. ,   3. ],
       [ 83. ,   2.6],
       [  nan,   1.5],
       ..., 
       [ 45. ,   1.2],
       [ 68. ,  11.6],
       [ 57. ,   3.5]])

In [42]:
#Drop the rows where 'age' is missing before scaling
billionaires_nonulls = billionaires[pd.notnull(billionaires['age'])]
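An equivalent approach (a sketch) is dropna with a column subset, which would also guard against missing net-worth values:

# Alternative: drop rows missing either of the columns we are about to scale
billionaires_nonulls = billionaires.dropna(subset=['age', 'networthusbillion'])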

In [43]:
x = billionaires_nonulls[['age','networthusbillion']].values
min_max_scaler = preprocessing.MinMaxScaler() #Transforms features by scaling each feature to a given range.

x_scaled = min_max_scaler.fit_transform(x) #Fit to data, then transform it.

df_normalized = pd.DataFrame(x_scaled)

In [44]:
df_normalized


Out[44]:
0 1
0 0.800000 0.026667
1 0.892857 0.021333
2 0.635714 0.012000
3 0.300000 0.000000
4 0.685714 0.002667
... ... ...
2427 0.785714 0.141333
2428 0.707143 0.033333

2429 rows × 2 columns
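As a sanity check, MinMaxScaler applies x_scaled = (x - min) / (max - min) per column, which reproduces row 0 by hand (ages run from -42 to 98, as describe() shows below):

# Spot-check row 0: age 70 -> (70 - (-42)) / (98 - (-42)) = 0.8
age_min, age_max = billionaires_nonulls['age'].min(), billionaires_nonulls['age'].max()
(70 - age_min) / (age_max - age_min)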


In [45]:
billionaires_nonulls.describe()


/usr/local/lib/python3.5/site-packages/numpy/lib/function_base.py:3823: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[45]:
year rank networthusbillion age north politicalconnection founder foundingdate gdpcurrentus Old Guys
count 2429.000000 2429.000000 2429.000000 2429.000000 2429.000000 71.0 2429.000000 2391.000000 8.290000e+02 2429.000000
mean 2008.638946 607.571017 3.603417 57.403870 0.636888 1.0 0.482915 1955.220410 4.925504e+12 0.503499
std 7.526569 472.008031 5.218999 21.386215 0.480996 0.0 0.499811 42.872414 4.385036e+12 0.500091
min 1996.000000 1.000000 1.000000 -42.000000 0.000000 1.0 0.000000 1610.000000 2.491801e+09 0.000000
25% 2001.000000 212.000000 1.400000 50.000000 0.000000 NaN 0.000000 NaN NaN 0.000000
50% 2014.000000 446.000000 2.100000 61.000000 1.000000 NaN 0.000000 NaN NaN 1.000000
75% 2014.000000 988.000000 3.500000 71.000000 1.000000 NaN 1.000000 NaN NaN 1.000000
max 2014.000000 1565.000000 76.000000 98.000000 1.000000 1.0 1.000000 2012.000000 1.062180e+13 1.000000

Why are we doing this?


In [ ]:
#So we don't have any misrepresentations: dummies let the model use categorical attributes, and scaling keeps features on a comparable range so none dominates just because of its units.