In [26]:
import pandas as pd
import numpy as np
train = pd.read_csv("~/Downloads/datafiles19cdaf8/train.csv")
test = pd.read_csv("~/Downloads/datafiles19cdaf8/test.csv")
#Workclass
train.workclass.value_counts(sort=True)
train.workclass.fillna('Private',inplace=True)
#Occupation
train.occupation.value_counts(sort=True)
train.occupation.fillna('Prof-specialty',inplace=True)
#Native Country
train['native.country'].value_counts(sort=True)
train['native.country'].fillna('United-States',inplace=True)
train.isnull().sum()
Out[26]:
age 0
workclass 0
fnlwgt 0
education 0
education.num 0
marital.status 0
occupation 0
relationship 0
race 0
sex 0
capital.gain 0
capital.loss 0
hours.per.week 0
native.country 0
target 0
dtype: int64
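The fills above hard-code the most frequent category of each column. A minimal sketch of a more general pattern (an assumption, not what the notebook ran) derives the fill value from the data itself; the same cleaning would also need to be applied to test:
# Sketch: fill each column's NaNs with that column's own mode instead of a hard-coded value.
for col in ['workclass', 'occupation', 'native.country']:
    train[col] = train[col].fillna(train[col].mode()[0])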
In [16]:
train.target.value_counts()/train.shape[0]
Out[16]:
<=50K 0.75919
>50K 0.24081
Name: target, dtype: float64
In [17]:
pd.crosstab(train.education, train.target,margins=True)/train.shape[0]
Out[17]:
target           <=50K      >50K       All
education
10th          0.026750  0.001904  0.028654
11th          0.034243  0.001843  0.036086
12th          0.012285  0.001013  0.013298
1st-4th       0.004975  0.000184  0.005160
5th-6th       0.009736  0.000491  0.010227
7th-8th       0.018611  0.001228  0.019840
9th           0.014957  0.000829  0.015786
Assoc-acdm    0.024631  0.008139  0.032769
Assoc-voc     0.031357  0.011087  0.042443
Bachelors     0.096250  0.068210  0.164461
Doctorate     0.003286  0.009398  0.012684
HS-grad       0.271060  0.051442  0.322502
Masters       0.023464  0.029452  0.052916
Preschool     0.001566  0.000000  0.001566
Prof-school   0.004699  0.012991  0.017690
Some-college  0.181321  0.042597  0.223918
All           0.759190  0.240810  1.000000
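The crosstab above is divided by the total row count, so each cell is a share of the whole dataset. A hedged alternative (assuming the same train DataFrame, before encoding) normalizes per row to compare income rates within each education level:
# Sketch: normalize within each education level, giving the >50K rate per group.
pd.crosstab(train.education, train.target, normalize='index')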
In [18]:
#load sklearn and encode all object type variables
from sklearn import preprocessing
for x in train.columns:
    if train[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[x].values))
        train[x] = lbl.transform(list(train[x].values))
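Each encoder above is fit on train only and then discarded. One sketch of a variant (an assumption, not the notebook's code) keeps the fitted encoders so test.csv can be transformed with the identical mappings, assuming test has been cleaned the same way and contains no unseen categories:
# Sketch: retain each fitted LabelEncoder for reuse on the test set.
encoders = {}
for x in train.columns:
    if train[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        train[x] = lbl.fit_transform(train[x].values)
        encoders[x] = lbl
# later: test[x] = encoders[x].transform(test[x].values)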
In [19]:
train.head()
Out[19]:
   age  workclass  fnlwgt  education  education.num  marital.status  \
0   39          6   77516          9             13               4
1   50          5   83311          9             13               2
2   38          3  215646         11              9               0
3   53          3  234721          1              7               2
4   28          3  338409          9             13               2

   occupation  relationship  race  sex  capital.gain  capital.loss  \
0           0             1     4    1          2174             0
1           3             0     4    1             0             0
2           5             1     4    1             0             0
3           5             0     2    1             0             0
4           9             5     2    0             0             0

   hours.per.week  native.country  target
0              40              38       0
1              13              38       0
2              40              38       0
3              40              38       0
4              40               4       0
In [20]:
train.target.value_counts()
Out[20]:
0 24720
1 7841
Name: target, dtype: int64
In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
y = train['target']
del train['target']
X = train
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)
print(X_train)
age workclass fnlwgt education education.num marital.status \
11662 47 0 329205 9 13 0
18348 34 3 169605 0 6 5
5962 38 6 107164 15 10 5
30669 41 4 177905 9 13 2
6244 18 3 84253 1 7 4
4240 22 8 87867 2 8 4
7023 50 3 400630 14 15 2
15286 70 8 230816 8 11 4
15500 35 0 39207 11 9 4
3344 20 3 195770 15 10 4
13479 32 3 136204 12 14 5
673 33 3 182556 9 13 2
19895 53 4 55139 12 14 2
20508 45 3 116163 11 9 5
23873 40 3 130760 10 16 2
4428 58 3 218281 9 13 2
30574 45 6 32186 9 13 5
29574 66 1 179285 11 9 2
31840 55 3 125000 11 9 2
1286 47 5 370119 9 13 2
19929 23 3 99543 2 8 4
28318 38 5 194534 12 14 2
1164 50 3 133963 9 13 2
16987 40 3 175935 9 13 0
15057 23 3 242375 11 9 4
29567 45 3 285858 9 13 2
6315 54 6 44172 11 9 5
20161 20 8 304076 1 7 4
19264 60 1 227332 12 14 2
9266 53 1 20676 11 9 2
... ... ... ... ... ... ...
14640 29 6 147256 7 12 4
19349 37 3 34180 11 9 2
25184 24 3 65743 9 13 4
26975 63 3 30813 12 14 2
21425 55 3 238216 11 9 2
16697 61 5 392694 15 10 0
13861 29 5 183151 11 9 2
133 37 3 254202 9 13 2
26679 41 3 30759 5 4 2
20696 33 3 37232 15 10 4
8926 73 3 333676 11 9 2
4731 43 3 212894 11 9 0
28775 32 3 426467 3 2 4
10741 26 3 36936 8 11 2
25933 38 3 102945 9 13 2
17276 21 3 203003 0 6 2
24529 48 3 176732 6 5 0
13810 41 3 112181 9 13 2
23921 29 3 423024 4 3 2
8631 37 3 69481 11 9 0
24652 27 3 50132 15 10 0
15109 53 3 167065 1 7 2
12802 17 3 117549 0 6 4
14410 31 5 145162 9 13 2
5601 34 1 177675 9 13 2
5913 18 3 98667 1 7 4
7103 32 3 152940 15 10 2
13327 49 3 75673 11 9 4
12279 29 1 219906 9 13 2
25073 39 3 126494 11 9 0
occupation relationship race sex capital.gain capital.loss \
11662 9 4 4 1 0 0
18348 7 4 4 0 0 0
5962 3 3 4 1 0 0
30669 3 0 4 1 7688 0
6244 7 3 4 0 0 0
4240 14 1 4 1 0 0
7023 9 0 4 1 0 0
15286 14 1 4 1 0 0
15500 0 1 4 1 0 0
3344 0 3 4 1 0 0
13479 3 1 4 1 0 2824
673 3 0 4 1 0 0
19895 3 0 4 1 0 0
20508 3 1 4 0 0 0
23873 9 0 4 1 0 0
4428 0 0 4 1 0 0
30574 9 1 4 0 0 0
29574 3 0 4 1 3432 0
31840 6 0 4 1 0 0
1286 11 0 4 1 15024 0
19929 13 1 4 1 0 0
28318 9 0 2 1 99999 0
1164 9 5 4 0 0 1977
16987 11 1 4 1 14084 0
15057 7 1 4 0 0 0
29567 13 0 4 1 0 0
6315 3 4 4 0 0 0
20161 14 3 2 0 0 0
19264 3 0 4 1 0 0
9266 3 0 0 1 0 0
... ... ... ... ... ... ...
14640 9 1 2 1 0 0
19349 2 0 4 1 0 0
25184 0 1 4 0 0 0
26975 9 0 4 1 0 0
21425 2 0 4 1 0 0
16697 3 1 4 1 0 0
13861 13 0 4 1 0 0
133 11 0 4 1 0 0
26679 3 0 4 1 0 0
20696 2 1 4 1 0 0
8926 4 0 4 1 0 0
4731 2 2 4 1 0 0
28775 2 1 4 1 3674 0
10741 2 0 4 1 0 2002
25933 11 0 4 1 0 0
17276 2 0 4 1 0 0
24529 11 1 4 0 0 0
13810 11 5 4 0 0 0
23921 4 0 4 1 0 0
8631 6 1 4 1 0 0
24652 11 1 4 1 0 0
15109 6 0 4 1 0 0
12802 11 2 2 0 0 0
14410 3 0 4 1 0 0
5601 3 0 4 1 0 0
5913 7 3 4 0 0 0
7103 11 0 4 1 0 0
13327 0 3 1 0 0 0
12279 9 5 4 0 0 0
25073 9 4 4 0 0 0
hours.per.week native.country
11662 40 38
18348 36 38
5962 40 38
30669 70 38
6244 24 38
4240 30 38
7023 36 38
15286 30 38
15500 40 38
3344 26 38
13479 55 38
673 40 38
19895 40 38
20508 40 38
23873 45 38
4428 40 25
30574 40 38
29574 20 38
31840 40 38
1286 50 38
19929 46 38
28318 60 38
1164 40 38
16987 40 38
15057 30 38
29567 40 38
6315 38 38
20161 20 38
19264 40 38
9266 48 38
... ... ...
14640 40 38
19349 40 38
25184 30 38
26975 50 38
21425 40 38
16697 50 38
13861 48 38
133 50 38
26679 60 38
20696 80 38
8926 50 38
4731 40 38
28775 40 12
10741 40 38
25933 52 38
17276 40 38
24529 40 38
13810 12 38
23921 40 25
8631 40 38
24652 50 38
15109 40 38
12802 12 38
14410 60 41
5601 55 38
5913 16 38
7103 40 38
13327 40 38
12279 25 38
25073 40 38
[22792 rows x 14 columns]
In [23]:
clf = RandomForestClassifier(n_estimators=500, max_depth=6)
clf.fit(X_train,y_train)
Out[23]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=6, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
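cross_val_score is imported above but never used; a minimal sketch of how it could back up the single hold-out estimate (an assumption, not part of the original run):
# Sketch: 5-fold cross-validated accuracy on the training split.
scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
print(scores.mean(), scores.std())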
In [24]:
clf.predict(X_test)
Out[24]:
array([0, 1, 0, ..., 0, 0, 0])
In [27]:
#make prediction and check model's accuracy
prediction = clf.predict(X_test)
acc = accuracy_score(np.array(y_test),prediction)
print ('The accuracy of Random Forest is {}'.format(acc))
The accuracy of Random Forest is 0.851878390828
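About 75.9% of rows are <=50K, so always predicting the majority class already scores roughly 0.759; the 0.852 above should be read against that baseline. A short sketch of class-sensitive metrics (assuming the fitted clf and the same split):
# Sketch: confusion matrix and ROC AUC give a fuller picture than accuracy alone.
from sklearn.metrics import confusion_matrix, roc_auc_score
print(confusion_matrix(y_test, prediction))
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))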
In [ ]: