In [5]:
# Why is it called feature engineering? Because we engineer the data into features a machine can understand
In [6]:
# How do we do that? We make the values binary and create features.
In [7]:
# What is a feature? Features are based on attributes of the data and help the model make predictions.
In [40]:
import pandas as pd
%matplotlib inline
from sklearn import preprocessing
In [12]:
df = pd.DataFrame({'key':['cat','cat','dog','donkey','dog','cat'],'data1':range(6)})
df
Out[12]:
   data1     key
0      0     cat
1      1     cat
2      2     dog
3      3  donkey
4      4     dog
5      5     cat
In [13]:
#data1 and key are attributes, and they can become the features for our prediction model.
In [14]:
#Definition
#Dummy variables assign the numbers ‘0’ and ‘1’ to indicate membership in any mutually exclusive and exhaustive category.
#https://www.moresteam.com/whitepapers/download/dummy-variables.pdf
In [16]:
pd.get_dummies(df['key'], prefix='key') #prefix: string prepended to the new dummy column names
Out[16]:
   key_cat  key_dog  key_donkey
0      1.0      0.0         0.0
1      1.0      0.0         0.0
2      0.0      1.0         0.0
3      0.0      0.0         1.0
4      0.0      1.0         0.0
5      1.0      0.0         0.0
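In [ ]:
#A minimal sketch (not part of the original run): the dummy columns are usually
#joined back onto the source DataFrame so they can be used as features alongside
#the other columns.
pd.concat([df, pd.get_dummies(df['key'], prefix='key')], axis=1)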
In [17]:
billionaires = pd.read_excel('richpeople.xlsx')
In [18]:
billionaires.head(2)
Out[18]:
   year                  name  rank    citizenship countrycode  networthusbillion   selfmade typeofwealth gender   age  ...
0  2001   A Jerrold Perenchio   151  United States         USA                3.0  self-made    executive   male  70.0  ...
1  2014  A. Jerrold Perenchio   663  United States         USA                2.6  self-made    executive   male  83.0  ...

     relationshiptocompany  foundingdate  gdpcurrentus         sourceofwealth                                           notes notes2                                        source
0  former chairman and CEO        1955.0  1.062180e+13                    NaN  represented Marlon Brando and Elizabeth Taylor    NaN  http://en.wikipedia.org/wiki/Jerry_Perenchio
1  former chairman and CEO        1955.0           NaN  television, Univision  represented Marlon Brando and Elizabeth Taylor    NaN  http://en.wikipedia.org/wiki/Jerry_Perenchio

                                            source_2                                          source_3 source_4
0  http://www.forbes.com/profile/a-jerrold-perenc...  COLUMN ONE; A Hollywood Player Who Owns the Ga...      NaN
1  http://www.forbes.com/profile/a-jerrold-perenc...  COLUMN ONE; A Hollywood Player Who Owns the Ga...      NaN

2 rows × 30 columns
In [19]:
#Doing the same with the billionaires: a binary feature flagging everyone over 60
#(rows with a missing age also get 0 here, since NaN > 60 is False)
billionaires['Old Guys'] = billionaires['age'].apply(lambda x: 1 if x > 60 else 0)
In [25]:
#Name, age, and the over-60 flag for each billionaire (the flag above uses age > 60, not 70)
Oldguys = billionaires[['name', 'age', 'Old Guys']]
In [27]:
Oldguys.head(2)
Out[27]:
                   name   age  Old Guys
0   A Jerrold Perenchio  70.0         1
1  A. Jerrold Perenchio  83.0         1
In [29]:
#Dummying raw ages tells the machine that the person in row 4 has an age of exactly 0,
#which is probably wrong. Note the age_-42.0 and age_-7.0 columns, too.
pd.get_dummies(billionaires['age'],prefix='age')
Out[29]:
      age_-42.0  age_-7.0  age_0.0  age_12.0  age_21.0  ...  age_94.0  age_95.0  age_96.0  age_98.0
0           0.0       0.0      0.0       0.0       0.0  ...       0.0       0.0       0.0       0.0
1           0.0       0.0      0.0       0.0       0.0  ...       0.0       0.0       0.0       0.0
2           0.0       0.0      0.0       0.0       0.0  ...       0.0       0.0       0.0       0.0
3           0.0       0.0      0.0       0.0       0.0  ...       0.0       0.0       0.0       0.0
4           0.0       0.0      1.0       0.0       0.0  ...       0.0       0.0       0.0       0.0
...         ...       ...      ...       ...       ...  ...       ...       ...       ...       ...
2612        0.0       0.0      0.0       0.0       0.0  ...       0.0       0.0       0.0       0.0
2613        0.0       0.0      0.0       0.0       0.0  ...       0.0       0.0       0.0       0.0

2614 rows × 76 columns
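In [ ]:
#A hedged sketch of one fix: bin ages into ranges with pd.cut before dummying,
#so each person falls into one of a handful of columns instead of 76, and bad
#values like 0 or -42 fall outside the bins. The bin edges are illustrative.
age_bins = pd.cut(billionaires['age'], bins=[0, 40, 60, 80, 120],
                  labels=['under40', '40-60', '60-80', 'over80'])
pd.get_dummies(age_bins, prefix='age').head()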
In [32]:
#What does that mean? Two values that are dependent: based on one you can predict the other. Highly correlated.
In [33]:
pd.get_dummies(billionaires['gender'],prefix='gender').corr()
Out[33]:
                       gender_female  gender_male  gender_married couple
gender_female               1.000000    -0.925748              -0.010999
gender_male                -0.925748     1.000000              -0.096709
gender_married couple      -0.010999    -0.096709               1.000000
In [36]:
#Why isn't this exactly -1? With two mutually exclusive, exhaustive categories the dummies
#should be perfectly negatively correlated; rows where selfmade is missing get 0 in both
#columns, which pulls the correlation away from -1.
pd.get_dummies(billionaires['selfmade'],prefix='selfmade').corr()
Out[36]:
                    selfmade_inherited  selfmade_self-made
selfmade_inherited            1.000000           -0.982092
selfmade_self-made           -0.982092            1.000000
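In [ ]:
#One way to avoid perfectly redundant dummy columns (a sketch): since the last
#category is always predictable from the others, get_dummies can drop the first
#column (the so-called dummy variable trap).
pd.get_dummies(billionaires['selfmade'], prefix='selfmade', drop_first=True).head()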
In [41]:
x = billionaires[['age','networthusbillion']].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_normalized = pd.DataFrame(x_scaled)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-41-77a8c3f4cbad> in <module>()
1 x = billionaires[['age','networthusbillion']].values
2 min_max_scaler = preprocessing.MinMaxScaler()
----> 3 x_scaled = min_max_scaler.fit_transform(x)
4 df_normalized = pd.DataFrame(x_scaled)
/usr/local/lib/python3.5/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
453 if y is None:
454 # fit method of arity 1 (unsupervised transformation)
--> 455 return self.fit(X, **fit_params).transform(X)
456 else:
457 # fit method of arity 2 (supervised transformation)
/usr/local/lib/python3.5/site-packages/sklearn/preprocessing/data.py in fit(self, X, y)
293 # Reset internal state before fitting
294 self._reset()
--> 295 return self.partial_fit(X, y)
296
297 def partial_fit(self, X, y=None):
/usr/local/lib/python3.5/site-packages/sklearn/preprocessing/data.py in partial_fit(self, X, y)
319
320 X = check_array(X, copy=self.copy, ensure_2d=False, warn_on_dtype=True,
--> 321 estimator=self, dtype=FLOAT_DTYPES)
322
323 if X.ndim == 1:
/usr/local/lib/python3.5/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
396 % (array.ndim, estimator_name))
397 if force_all_finite:
--> 398 _assert_all_finite(array)
399
400 shape_repr = _shape_repr(array.shape)
/usr/local/lib/python3.5/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
52 and not np.isfinite(X).all()):
53 raise ValueError("Input contains NaN, infinity"
---> 54 " or a value too large for %r." % X.dtype)
55
56
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
In [38]:
x
Out[38]:
array([[ 70. , 3. ],
[ 83. , 2.6],
[ nan, 1.5],
...,
[ 45. , 1.2],
[ 68. , 11.6],
[ 57. , 3.5]])
In [42]:
billionaires_nonulls = billionaires[pd.notnull(billionaires['age'])]
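In [ ]:
#An equivalent spelling (sketch): dropna with subset= keeps only the rows where
#'age' is present.
billionaires_nonulls = billionaires.dropna(subset=['age'])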
In [43]:
x = billionaires_nonulls[['age','networthusbillion']].values
min_max_scaler = preprocessing.MinMaxScaler() #Transforms features by scaling each feature to a given range.
x_scaled = min_max_scaler.fit_transform(x) #Fit to data, then transform it.
df_normalized = pd.DataFrame(x_scaled)
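In [ ]:
#A quick sanity check (sketch): MinMaxScaler maps each column with
#(x - min) / (max - min). The first age is 70 and the ages run from -42 to 98,
#so row 0 of the scaled ages should be (70 + 42) / (98 + 42) = 0.8.
ages = billionaires_nonulls['age']
(70 - ages.min()) / (ages.max() - ages.min())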
In [44]:
df_normalized
Out[44]:
             0         1
0     0.800000  0.026667
1     0.892857  0.021333
2     0.635714  0.012000
3     0.300000  0.000000
4     0.685714  0.002667
...        ...       ...
2426  0.621429  0.002667
2427  0.785714  0.141333
2428  0.707143  0.033333

2429 rows × 2 columns
In [45]:
billionaires_nonulls.describe()
/usr/local/lib/python3.5/site-packages/numpy/lib/function_base.py:3823: RuntimeWarning: Invalid value encountered in percentile
RuntimeWarning)
Out[45]:
              year         rank  networthusbillion          age        north  politicalconnection      founder  foundingdate  gdpcurrentus     Old Guys
count  2429.000000  2429.000000        2429.000000  2429.000000  2429.000000                 71.0  2429.000000   2391.000000  8.290000e+02  2429.000000
mean   2008.638946   607.571017           3.603417    57.403870     0.636888                  1.0     0.482915   1955.220410  4.925504e+12     0.503499
std       7.526569   472.008031           5.218999    21.386215     0.480996                  0.0     0.499811     42.872414  4.385036e+12     0.500091
min    1996.000000     1.000000           1.000000   -42.000000     0.000000                  1.0     0.000000   1610.000000  2.491801e+09     0.000000
25%    2001.000000   212.000000           1.400000    50.000000     0.000000                  NaN     0.000000           NaN           NaN     0.000000
50%    2014.000000   446.000000           2.100000    61.000000     1.000000                  NaN     0.000000           NaN           NaN     1.000000
75%    2014.000000   988.000000           3.500000    71.000000     1.000000                  NaN     1.000000           NaN           NaN     1.000000
max    2014.000000  1565.000000          76.000000    98.000000     1.000000                  1.0     1.000000   2012.000000  1.062180e+13     1.000000
In [ ]:
#So we don't have any misrepresentations in the scaled features.
#(Though describe() still shows a minimum age of -42, so there is more cleaning to do.)
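In [ ]:
#A possible next cleaning pass (sketch), prompted by the negative minimum age in
#describe(): keep only plausible ages before building features.
billionaires_clean = billionaires_nonulls[billionaires_nonulls['age'] > 0]
billionaires_clean['age'].describe()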