In [1]:
import numpy as np
import pandas as pd
In [2]:
#read the train/test splits from the local data directory
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
In [3]:
#column dtypes, non-null counts and memory footprint of the train set
train.info()
In [4]:
#report (rows, columns) for both splits
print(f"The train data has {train.shape}")
print(f"The test data has {test.shape}")
In [5]:
#Let's have a glimpse of the first few rows of the data set
train.head()
Out[5]:
In [6]:
def rows_with_missing(df):
    """Return the number of rows in `df` that contain at least one NaN."""
    return df.shape[0] - df.dropna().shape[0]

#same computation was duplicated for train and test; use one helper
nans = rows_with_missing(train)
print("%d rows have missing values in the train data" % nans)
nand = rows_with_missing(test)
print("%d rows have missing values in the test data" % nand)
In [7]:
#per-column missing-value counts — only 3 columns have missing values
train.isnull().sum()
Out[7]:
In [8]:
#cardinality (distinct-value count) of each object-typed column
cat = train.select_dtypes(include=['O'])
cat.apply(pd.Series.nunique)
Out[8]:
In [9]:
#Since missing values are found in all 3 character variables,
#impute each with its mode (most frequent category). For this dataset the
#modes are 'Private', 'Prof-specialty' and 'United-States' respectively.
for col in ['workclass', 'occupation', 'native.country']:
    # most frequent level; generalizes the previously hard-coded labels
    mode_value = train[col].value_counts(sort=True).index[0]
    # assign back instead of `train.workclass.fillna(..., inplace=True)`:
    # in-place fillna on an attribute-accessed Series is a chained-assignment
    # hazard and raises/warns in modern pandas
    train[col] = train[col].fillna(mode_value)
In [10]:
#confirm the imputation left no missing values behind
train.isnull().sum()
Out[10]:
In [11]:
#class balance of the target variable as a fraction of all rows
train['target'].value_counts() / len(train)
Out[11]:
In [12]:
#cross-tabulate education against the target (with margins), normalized by
#total row count, to see how education influences the target variable
pd.crosstab(train['education'], train['target'], margins=True) / len(train)
Out[12]:
In [13]:
#load sklearn and encode all object type variables
from sklearn import preprocessing
#LabelEncoder maps each distinct string to an integer code;
#fit_transform replaces the redundant fit-then-transform pair.
#NOTE(review): encoding train alone means test must be encoded with the same
#fitted encoders (or jointly) for codes to line up — confirm before scoring test.
for col in train.columns:
    if train[col].dtype == 'object':
        train[col] = preprocessing.LabelEncoder().fit_transform(train[col].values)
In [14]:
train.head(n=5)  # preview the frame after label encoding (n=5 is the default)
Out[14]:
In [15]:
train['target'].value_counts()  # absolute class counts after encoding
Out[15]:
In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
y = train['target']
del train['target']
X = train
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)
#train the RF classifier
clf = RandomForestClassifier(n_estimators = 500, max_depth = 6)
clf.fit(X_train,y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=6, max_features='auto', max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
verbose=0, warm_start=False)
clf.predict(X_test)
Out[16]:
In [17]:
#make prediction and check model's accuracy
prediction = clf.predict(X_test)
acc = accuracy_score(np.array(y_test),prediction)
print ('The accuracy of Random Forest is {}'.format(acc))