Explaining Feature Engineering

Basic questions


In [5]:
# Why is it called feature engineering? Because we engineer the data into features a machine can understand.

In [6]:
# How do we do that? We encode the attributes as numbers (often binary dummies) and turn them into features.

In [7]:
# What is a feature? Features are based on attributes of the data and help the model make its predictions.

An Example


In [40]:
import pandas as pd
%matplotlib inline
from sklearn import preprocessing

In [12]:
df = pd.DataFrame({'key':['cat','cat','dog','donkey','dog','cat'],'data1':range(6)})
df


Out[12]:
data1 key
0 0 cat
1 1 cat
2 2 dog
3 3 donkey
4 4 dog
5 5 cat

Getting the dummies


In [13]:
# 'data1' and 'key' are attributes, and they can become the features for our prediction model.

In [14]:
#Definition
#Dummy variables assign the numbers ‘0’ and ‘1’ to indicate membership in any mutually exclusive and exhaustive category.
#https://www.moresteam.com/whitepapers/download/dummy-variables.pdf

In [16]:
pd.get_dummies(df['key'], prefix='key')  # prefix: string prepended to the new column names


Out[16]:
key_cat key_dog key_donkey
0 1.0 0.0 0.0
1 1.0 0.0 0.0
2 0.0 1.0 0.0
3 0.0 0.0 1.0
4 0.0 1.0 0.0
5 1.0 0.0 0.0
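To actually use these columns as features, you would typically join them back onto the original DataFrame. A minimal sketch (the name df_features is ours, not from the notebook):

# Attach the dummy columns to the original DataFrame
dummies = pd.get_dummies(df['key'], prefix='key')
df_features = pd.concat([df, dummies], axis=1)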

In [17]:
billionaires = pd.read_excel('richpeople.xlsx')

In [18]:
billionaires.head(2)


Out[18]:
year name rank citizenship countrycode networthusbillion selfmade typeofwealth gender age ... relationshiptocompany foundingdate gdpcurrentus sourceofwealth notes notes2 source source_2 source_3 source_4
0 2001 A Jerrold Perenchio 151 United States USA 3.0 self-made executive male 70.0 ... former chairman and CEO 1955.0 1.062180e+13 NaN represented Marlon Brando and Elizabeth Taylor NaN http://en.wikipedia.org/wiki/Jerry_Perenchio http://www.forbes.com/profile/a-jerrold-perenc... COLUMN ONE; A Hollywood Player Who Owns the Ga... NaN
1 2014 A. Jerrold Perenchio 663 United States USA 2.6 self-made executive male 83.0 ... former chairman and CEO 1955.0 NaN television, Univision represented Marlon Brando and Elizabeth Taylor NaN http://en.wikipedia.org/wiki/Jerry_Perenchio http://www.forbes.com/profile/a-jerrold-perenc... COLUMN ONE; A Hollywood Player Who Owns the Ga... NaN

2 rows × 30 columns


In [19]:
#Doing the same with the billionaires: a binary feature that is 1 if older than 60, else 0
billionaires['Old Guys'] = billionaires['age'].apply(lambda x: 1 if x > 60 else 0)
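An equivalent, more idiomatic vectorized version (a sketch; note that a missing age compares as False and therefore also becomes 0):

# Vectorized alternative: boolean comparison cast to 0/1
billionaires['Old Guys'] = (billionaires['age'] > 60).astype(int)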

In [25]:
#Selecting name, age, and the new feature; 'Old Guys' is 1 for billionaires above 60
Oldguys = billionaires[['name', 'age', 'Old Guys']]

In [27]:
Oldguys.head(2)


Out[27]:
name age Old Guys
0 A Jerrold Perenchio 70.0 1
1 A. Jerrold Perenchio 83.0 1

In [29]:
#Dummying a continuous variable gives one column per distinct value. Row 4 claims an age of 0.0, which is probably a data error; there are even negative ages.
pd.get_dummies(billionaires['age'],prefix='age')


Out[29]:
age_-42.0 age_-7.0 age_0.0 age_12.0 age_21.0 age_24.0 age_28.0 age_29.0 age_30.0 age_31.0 ... age_88.0 age_89.0 age_90.0 age_91.0 age_92.0 age_93.0 age_94.0 age_95.0 age_96.0 age_98.0
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2613 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

2614 rows × 76 columns
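A more sensible way to turn a continuous variable like age into dummies is to bin it first and dummy the bins. A minimal sketch; the bin edges and labels are illustrative choices, not from the original notebook:

# Bin ages into coarse ranges, then dummy the bins instead of every distinct value
# (values outside the bins, e.g. the negative ages, become NaN)
age_bins = pd.cut(billionaires['age'], bins=[0, 40, 60, 80, 120],
                  labels=['under40', '40-60', '60-80', 'over80'])
pd.get_dummies(age_bins, prefix='age')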

Multicollinearity


In [32]:
#What does that mean? Two variables that are strongly dependent: based on one you can predict the other. They are highly correlated.

In [33]:
pd.get_dummies(billionaires['gender'],prefix='gender').corr()


Out[33]:
gender_female gender_male gender_married couple
gender_female 1.000000 -0.925748 -0.010999
gender_male -0.925748 1.000000 -0.096709
gender_married couple -0.010999 -0.096709 1.000000
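A common way to sidestep this "dummy variable trap" is to drop one dummy column, since its value is implied by the others. pandas supports this directly (a sketch, assuming a pandas version with the drop_first argument):

# Drop the first category; a row with 0 in every remaining column is female
pd.get_dummies(billionaires['gender'], prefix='gender', drop_first=True)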

In [36]:
#Why isn't this correlation exactly -1? With two mutually exclusive, exhaustive categories the dummies should be perfect opposites; rows where 'selfmade' is missing get 0 in both columns, which weakens the correlation.
pd.get_dummies(billionaires['selfmade'],prefix='selfmade').corr()


Out[36]:
selfmade_inherited selfmade_self-made
selfmade_inherited 1.000000 -0.982092
selfmade_self-made -0.982092 1.000000
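A quick way to confirm that missing values are the culprit (a sketch): rows where 'selfmade' is NaN get 0.0 in both dummy columns, so the columns are no longer perfect opposites.

# Count the rows that break the perfect -1 correlation
billionaires['selfmade'].isnull().sum()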

In [41]:
x = billionaires[['age','networthusbillion']].values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_normalized = pd.DataFrame(x_scaled)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-41-77a8c3f4cbad> in <module>()
      1 x = billionaires[['age','networthusbillion']].values
      2 min_max_scaler = preprocessing.MinMaxScaler()
----> 3 x_scaled = min_max_scaler.fit_transform(x)
      4 df_normalized = pd.DataFrame(x_scaled)

...

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [38]:
x


Out[38]:
array([[ 70. ,   3. ],
       [ 83. ,   2.6],
       [  nan,   1.5],
       ..., 
       [ 45. ,   1.2],
       [ 68. ,  11.6],
       [ 57. ,   3.5]])

In [42]:
#Drop the rows where 'age' is missing before scaling
billionaires_nonulls = billionaires[pd.notnull(billionaires['age'])]
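An equivalent approach (a sketch) is dropna with a column subset, which would also guard against missing net-worth values:

# Alternative: drop rows missing either of the columns we are about to scale
billionaires_nonulls = billionaires.dropna(subset=['age', 'networthusbillion'])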

In [43]:
x = billionaires_nonulls[['age','networthusbillion']].values
min_max_scaler = preprocessing.MinMaxScaler() #Transforms features by scaling each feature to a given range.

x_scaled = min_max_scaler.fit_transform(x) #Fit to data, then transform it.

df_normalized = pd.DataFrame(x_scaled)

In [44]:
df_normalized


Out[44]:
0 1
0 0.800000 0.026667
1 0.892857 0.021333
2 0.635714 0.012000
3 0.300000 0.000000
4 0.685714 0.002667
... ... ...
2427 0.785714 0.141333
2428 0.707143 0.033333

2429 rows × 2 columns
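As a sanity check, MinMaxScaler applies x_scaled = (x - min) / (max - min) per column, which reproduces row 0 by hand (ages run from -42 to 98, as describe() shows below):

# Spot-check row 0: age 70 -> (70 - (-42)) / (98 - (-42)) = 0.8
age_min, age_max = billionaires_nonulls['age'].min(), billionaires_nonulls['age'].max()
(70 - age_min) / (age_max - age_min)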


In [45]:
billionaires_nonulls.describe()


/usr/local/lib/python3.5/site-packages/numpy/lib/function_base.py:3823: RuntimeWarning: Invalid value encountered in percentile
  RuntimeWarning)
Out[45]:
year rank networthusbillion age north politicalconnection founder foundingdate gdpcurrentus Old Guys
count 2429.000000 2429.000000 2429.000000 2429.000000 2429.000000 71.0 2429.000000 2391.000000 8.290000e+02 2429.000000
mean 2008.638946 607.571017 3.603417 57.403870 0.636888 1.0 0.482915 1955.220410 4.925504e+12 0.503499
std 7.526569 472.008031 5.218999 21.386215 0.480996 0.0 0.499811 42.872414 4.385036e+12 0.500091
min 1996.000000 1.000000 1.000000 -42.000000 0.000000 1.0 0.000000 1610.000000 2.491801e+09 0.000000
25% 2001.000000 212.000000 1.400000 50.000000 0.000000 NaN 0.000000 NaN NaN 0.000000
50% 2014.000000 446.000000 2.100000 61.000000 1.000000 NaN 0.000000 NaN NaN 1.000000
75% 2014.000000 988.000000 3.500000 71.000000 1.000000 NaN 1.000000 NaN NaN 1.000000
max 2014.000000 1565.000000 76.000000 98.000000 1.000000 1.0 1.000000 2012.000000 1.062180e+13 1.000000

Why are we doing this?


In [ ]:
#So we don't have any misrepresentations: dummies let the model use categorical attributes, and scaling keeps features on a comparable range so none dominates just because of its units.