In [1]:
import numpy as np
import pandas as pd

In [2]:
#load the data
train  = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [3]:
#check data set
train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         30725 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education.num     32561 non-null int64
marital.status    32561 non-null object
occupation        30718 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital.gain      32561 non-null int64
capital.loss      32561 non-null int64
hours.per.week    32561 non-null int64
native.country    31978 non-null object
target            32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB

In [4]:
print ("The train data has",train.shape)
print ("The test data has",test.shape)


The train data has (32561, 15)
The test data has (16281, 15)

In [5]:
#Let have a glimpse of the data set
train.head()


Out[5]:
age workclass fnlwgt education education.num marital.status occupation relationship race sex capital.gain capital.loss hours.per.week native.country target
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

In [6]:
nans = train.shape[0] - train.dropna().shape[0]
print ("%d rows have missing values in the train data" %nans)
nand = test.shape[0] - test.dropna().shape[0]
print ("%d rows have missing values in the test data" %nand)


2399 rows have missing values in the train data
1221 rows have missing values in the test data

In [7]:
#only 3 columns have missing values
train.isnull().sum()


Out[7]:
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

In [8]:
cat = train.select_dtypes(include=['O'])
cat.apply(pd.Series.nunique)


Out[8]:
workclass          8
education         16
marital.status     7
occupation        14
relationship       6
race               5
sex                2
native.country    41
target             2
dtype: int64

In [9]:
#Since missing values are found in all 3 character variables
#impute these missing values with their respective modes.

#Education
train.workclass.value_counts(sort=True)
train.workclass.fillna('Private',inplace=True)


#Occupation
train.occupation.value_counts(sort=True)
train.occupation.fillna('Prof-specialty',inplace=True)


#Native Country
train['native.country'].value_counts(sort=True)
train['native.country'].fillna('United-States',inplace=True)

In [10]:
#check missing values
train.isnull().sum()


Out[10]:
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

In [11]:
#check proportion of target variable
train.target.value_counts()/train.shape[0]


Out[11]:
 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64

In [12]:
pd.crosstab(train.education, train.target,margins=True)/train.shape[0]
#create a cross tab of the target variable with education. 
#to understand the influence of education on the target variable


Out[12]:
target <=50K >50K All
education
10th 0.026750 0.001904 0.028654
11th 0.034243 0.001843 0.036086
12th 0.012285 0.001013 0.013298
1st-4th 0.004975 0.000184 0.005160
5th-6th 0.009736 0.000491 0.010227
7th-8th 0.018611 0.001228 0.019840
9th 0.014957 0.000829 0.015786
Assoc-acdm 0.024631 0.008139 0.032769
Assoc-voc 0.031357 0.011087 0.042443
Bachelors 0.096250 0.068210 0.164461
Doctorate 0.003286 0.009398 0.012684
HS-grad 0.271060 0.051442 0.322502
Masters 0.023464 0.029452 0.052916
Preschool 0.001566 0.000000 0.001566
Prof-school 0.004699 0.012991 0.017690
Some-college 0.181321 0.042597 0.223918
All 0.759190 0.240810 1.000000

In [13]:
#load sklearn and encode all object type variables
from sklearn import preprocessing

for x in train.columns:
    if train[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[x].values))
        train[x] = lbl.transform(list(train[x].values))

In [14]:
train.head()


Out[14]:
age workclass fnlwgt education education.num marital.status occupation relationship race sex capital.gain capital.loss hours.per.week native.country target
0 39 6 77516 9 13 4 0 1 4 1 2174 0 40 38 0
1 50 5 83311 9 13 2 3 0 4 1 0 0 13 38 0
2 38 3 215646 11 9 0 5 1 4 1 0 0 40 38 0
3 53 3 234721 1 7 2 5 0 2 1 0 0 40 38 0
4 28 3 338409 9 13 2 9 5 2 0 0 0 40 4 0

In [15]:
train.target.value_counts()


Out[15]:
0    24720
1     7841
Name: target, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score

y = train['target']
del train['target']

X = train
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

#train the RF classifier
clf = RandomForestClassifier(n_estimators = 500, max_depth = 6)
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

clf.predict(X_test)


/Users/shams/anaconda/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Out[16]:
array([0, 1, 0, ..., 0, 0, 0])

In [17]:
#make prediction and check model's accuracy
prediction = clf.predict(X_test)
acc =  accuracy_score(np.array(y_test),prediction)
print ('The accuracy of Random Forest is {}'.format(acc))


The accuracy of Random Forest is 0.8522878493192753