notebook.community

Edit and run



In [1]:

    
import numpy as np
import pandas as pd



In [2]:

    
#load the data
train  = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")



In [3]:

    
#check data set
train.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB



In [4]:

    
print ("The train data has",train.shape)
print ("The test data has",test.shape)









    



The train data has (32561, 15)
The test data has (16281, 15)



In [5]:

    
#Let have a glimpse of the data set
train.head()









    Out[5]:






  
    
      
      age
      workclass
      fnlwgt
      education
      education.num
      marital.status
      occupation
      relationship
      race
      sex
      capital.gain
      capital.loss
      hours.per.week
      native.country
      target
    
  
  
    
      0
      39
      State-gov
      77516
      Bachelors
      13
      Never-married
      Adm-clerical
      Not-in-family
      White
      Male
      2174
      0
      40
      United-States
      <=50K
    
    
      1
      50
      Self-emp-not-inc
      83311
      Bachelors
      13
      Married-civ-spouse
      Exec-managerial
      Husband
      White
      Male
      0
      0
      13
      United-States
      <=50K
    
    
      2
      38
      Private
      215646
      HS-grad
      9
      Divorced
      Handlers-cleaners
      Not-in-family
      White
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      3
      53
      Private
      234721
      11th
      7
      Married-civ-spouse
      Handlers-cleaners
      Husband
      Black
      Male
      0
      0
      40
      United-States
      <=50K
    
    
      4
      28
      Private
      338409
      Bachelors
      13
      Married-civ-spouse
      Prof-specialty
      Wife
      Black
      Female
      0
      0
      40
      Cuba
      <=50K



In [6]:

    
nans = train.shape[0] - train.dropna().shape[0]
print ("%d rows have missing values in the train data" %nans)
nand = test.shape[0] - test.dropna().shape[0]
print ("%d rows have missing values in the test data" %nand)









    



2399 rows have missing values in the train data
1221 rows have missing values in the test data



In [7]:

    
#only 3 columns have missing values
train.isnull().sum()









    Out[7]:





age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64



In [8]:

    
cat = train.select_dtypes(include=['O'])
cat.apply(pd.Series.nunique)









    Out[8]:





workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64



In [9]:

    
#Since missing values are found in all 3 character variables
#impute these missing values with their respective modes.

#Education
train.workclass.value_counts(sort=True)
train.workclass.fillna('Private',inplace=True)


#Occupation
train.occupation.value_counts(sort=True)
train.occupation.fillna('Prof-specialty',inplace=True)


#Native Country
train['native.country'].value_counts(sort=True)
train['native.country'].fillna('United-States',inplace=True)



In [10]:

    
#check missing values
train.isnull().sum()









    Out[10]:





age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64



In [11]:

    
#check proportion of target variable
train.target.value_counts()/train.shape[0]









    Out[11]:





 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64



In [12]:

    
pd.crosstab(train.education, train.target,margins=True)/train.shape[0]
#create a cross tab of the target variable with education. 
#to understand the influence of education on the target variable









    Out[12]:






  
    
      target
      <=50K
      >50K
      All
    
    
      education
      
      
      
    
  
  
    
      10th
      0.026750
      0.001904
      0.028654
    
    
      11th
      0.034243
      0.001843
      0.036086
    
    
      12th
      0.012285
      0.001013
      0.013298
    
    
      1st-4th
      0.004975
      0.000184
      0.005160
    
    
      5th-6th
      0.009736
      0.000491
      0.010227
    
    
      7th-8th
      0.018611
      0.001228
      0.019840
    
    
      9th
      0.014957
      0.000829
      0.015786
    
    
      Assoc-acdm
      0.024631
      0.008139
      0.032769
    
    
      Assoc-voc
      0.031357
      0.011087
      0.042443
    
    
      Bachelors
      0.096250
      0.068210
      0.164461
    
    
      Doctorate
      0.003286
      0.009398
      0.012684
    
    
      HS-grad
      0.271060
      0.051442
      0.322502
    
    
      Masters
      0.023464
      0.029452
      0.052916
    
    
      Preschool
      0.001566
      0.000000
      0.001566
    
    
      Prof-school
      0.004699
      0.012991
      0.017690
    
    
      Some-college
      0.181321
      0.042597
      0.223918
    
    
      All
      0.759190
      0.240810
      1.000000



In [13]:

    
#load sklearn and encode all object type variables
from sklearn import preprocessing

for x in train.columns:
    if train[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[x].values))
        train[x] = lbl.transform(list(train[x].values))



In [14]:

    
train.head()









    Out[14]:






  
    
      
      age
      workclass
      fnlwgt
      education
      education.num
      marital.status
      occupation
      relationship
      race
      sex
      capital.gain
      capital.loss
      hours.per.week
      native.country
      target
    
  
  
    
      0
      39
      6
      77516
      9
      13
      4
      0
      1
      4
      1
      2174
      0
      40
      38
      0
    
    
      1
      50
      5
      83311
      9
      13
      2
      3
      0
      4
      1
      0
      0
      13
      38
      0
    
    
      2
      38
      3
      215646
      11
      9
      0
      5
      1
      4
      1
      0
      0
      40
      38
      0
    
    
      3
      53
      3
      234721
      1
      7
      2
      5
      0
      2
      1
      0
      0
      40
      38
      0
    
    
      4
      28
      3
      338409
      9
      13
      2
      9
      5
      2
      0
      0
      0
      40
      4
      0



In [15]:

    
train.target.value_counts()









    Out[15]:





0    24720
1     7841
Name: target, dtype: int64



In [16]:

    
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score

y = train['target']
del train['target']

X = train
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

#train the RF classifier
clf = RandomForestClassifier(n_estimators = 500, max_depth = 6)
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

clf.predict(X_test)









    



/Users/shams/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)






    Out[16]:





array([0, 1, 0, ..., 0, 0, 0])



In [17]:

    
#make prediction and check model's accuracy
prediction = clf.predict(X_test)
acc =  accuracy_score(np.array(y_test),prediction)
print ('The accuracy of Random Forest is {}'.format(acc))









    



The accuracy of Random Forest is 0.8522878493192753

	age	workclass	fnlwgt	education	education.num	marital.status	occupation	relationship	race	sex	capital.gain	hours.per.week	native.country	target
0	39	State-gov	77516	Bachelors	13	Never-married	Adm-clerical	Not-in-family	White	Male	2174	40	United-States	<=50K
1	50	Self-emp-not-inc	83311	Bachelors	13	Married-civ-spouse	Exec-managerial	Husband	White	Male	0	13	United-States	<=50K
2	38	Private	215646	HS-grad	9	Divorced	Handlers-cleaners	Not-in-family	White	Male	0	40	United-States	<=50K
3	53	Private	234721	11th	7	Married-civ-spouse	Handlers-cleaners	Husband	Black	Male	0	40	United-States	<=50K
4	28	Private	338409	Bachelors	13	Married-civ-spouse	Prof-specialty	Wife	Black	Female	0	40	Cuba	<=50K

target	<=50K	>50K	All
education
10th	0.026750	0.001904	0.028654
11th	0.034243	0.001843	0.036086
12th	0.012285	0.001013	0.013298
1st-4th	0.004975	0.000184	0.005160
5th-6th	0.009736	0.000491	0.010227
7th-8th	0.018611	0.001228	0.019840
9th	0.014957	0.000829	0.015786
Assoc-acdm	0.024631	0.008139	0.032769
Assoc-voc	0.031357	0.011087	0.042443
Bachelors	0.096250	0.068210	0.164461
Doctorate	0.003286	0.009398	0.012684
HS-grad	0.271060	0.051442	0.322502
Masters	0.023464	0.029452	0.052916
Preschool	0.001566	0.000000	0.001566
Prof-school	0.004699	0.012991	0.017690
Some-college	0.181321	0.042597	0.223918
All	0.759190	0.240810	1.000000