notebook.community

Edit and run



In [26]:

    
import pandas as pd
import numpy as np

train  = pd.read_csv("~/Downloads/datafiles19cdaf8/train.csv")
test = pd.read_csv("~/Downloads/datafiles19cdaf8/test.csv")


#Education
train.workclass.value_counts(sort=True)
train.workclass.fillna('Private',inplace=True)


#Occupation
train.occupation.value_counts(sort=True)
train.occupation.fillna('Prof-specialty',inplace=True)


#Native Country
train['native.country'].value_counts(sort=True)
train['native.country'].fillna('United-States',inplace=True)

train.isnull().sum()









    Out[26]:





age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64



In [16]:

    
train.target.value_counts()/train.shape[0]









    Out[16]:





 <=50K    0.75919
 >50K     0.24081
Name: target, dtype: float64



In [17]:

    
pd.crosstab(train.education, train.target,margins=True)/train.shape[0]









    Out[17]:







  
    
      target
      <=50K
      >50K
      All
    
    
      education
      
      
      
    
  
  
    
      10th
      0.026750
      0.001904
      0.028654
    
    
      11th
      0.034243
      0.001843
      0.036086
    
    
      12th
      0.012285
      0.001013
      0.013298
    
    
      1st-4th
      0.004975
      0.000184
      0.005160
    
    
      5th-6th
      0.009736
      0.000491
      0.010227
    
    
      7th-8th
      0.018611
      0.001228
      0.019840
    
    
      9th
      0.014957
      0.000829
      0.015786
    
    
      Assoc-acdm
      0.024631
      0.008139
      0.032769
    
    
      Assoc-voc
      0.031357
      0.011087
      0.042443
    
    
      Bachelors
      0.096250
      0.068210
      0.164461
    
    
      Doctorate
      0.003286
      0.009398
      0.012684
    
    
      HS-grad
      0.271060
      0.051442
      0.322502
    
    
      Masters
      0.023464
      0.029452
      0.052916
    
    
      Preschool
      0.001566
      0.000000
      0.001566
    
    
      Prof-school
      0.004699
      0.012991
      0.017690
    
    
      Some-college
      0.181321
      0.042597
      0.223918
    
    
      All
      0.759190
      0.240810
      1.000000



In [18]:

    
#load sklearn and encode all object type variables
from sklearn import preprocessing

for x in train.columns:
    if train[x].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[x].values))
        train[x] = lbl.transform(list(train[x].values))



In [19]:

    
train.head()









    Out[19]:







  
    
      
      age
      workclass
      fnlwgt
      education
      education.num
      marital.status
      occupation
      relationship
      race
      sex
      capital.gain
      capital.loss
      hours.per.week
      native.country
      target
    
  
  
    
      0
      39
      6
      77516
      9
      13
      4
      0
      1
      4
      1
      2174
      0
      40
      38
      0
    
    
      1
      50
      5
      83311
      9
      13
      2
      3
      0
      4
      1
      0
      0
      13
      38
      0
    
    
      2
      38
      3
      215646
      11
      9
      0
      5
      1
      4
      1
      0
      0
      40
      38
      0
    
    
      3
      53
      3
      234721
      1
      7
      2
      5
      0
      2
      1
      0
      0
      40
      38
      0
    
    
      4
      28
      3
      338409
      9
      13
      2
      9
      5
      2
      0
      0
      0
      40
      4
      0



In [20]:

    
train.target.value_counts()









    Out[20]:





0    24720
1     7841
Name: target, dtype: int64



In [21]:

    
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score


y = train['target']
del train['target']

X = train
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

print X_train









    



       age  workclass  fnlwgt  education  education.num  marital.status  \
11662   47          0  329205          9             13               0   
18348   34          3  169605          0              6               5   
5962    38          6  107164         15             10               5   
30669   41          4  177905          9             13               2   
6244    18          3   84253          1              7               4   
4240    22          8   87867          2              8               4   
7023    50          3  400630         14             15               2   
15286   70          8  230816          8             11               4   
15500   35          0   39207         11              9               4   
3344    20          3  195770         15             10               4   
13479   32          3  136204         12             14               5   
673     33          3  182556          9             13               2   
19895   53          4   55139         12             14               2   
20508   45          3  116163         11              9               5   
23873   40          3  130760         10             16               2   
4428    58          3  218281          9             13               2   
30574   45          6   32186          9             13               5   
29574   66          1  179285         11              9               2   
31840   55          3  125000         11              9               2   
1286    47          5  370119          9             13               2   
19929   23          3   99543          2              8               4   
28318   38          5  194534         12             14               2   
1164    50          3  133963          9             13               2   
16987   40          3  175935          9             13               0   
15057   23          3  242375         11              9               4   
29567   45          3  285858          9             13               2   
6315    54          6   44172         11              9               5   
20161   20          8  304076          1              7               4   
19264   60          1  227332         12             14               2   
9266    53          1   20676         11              9               2   
...    ...        ...     ...        ...            ...             ...   
14640   29          6  147256          7             12               4   
19349   37          3   34180         11              9               2   
25184   24          3   65743          9             13               4   
26975   63          3   30813         12             14               2   
21425   55          3  238216         11              9               2   
16697   61          5  392694         15             10               0   
13861   29          5  183151         11              9               2   
133     37          3  254202          9             13               2   
26679   41          3   30759          5              4               2   
20696   33          3   37232         15             10               4   
8926    73          3  333676         11              9               2   
4731    43          3  212894         11              9               0   
28775   32          3  426467          3              2               4   
10741   26          3   36936          8             11               2   
25933   38          3  102945          9             13               2   
17276   21          3  203003          0              6               2   
24529   48          3  176732          6              5               0   
13810   41          3  112181          9             13               2   
23921   29          3  423024          4              3               2   
8631    37          3   69481         11              9               0   
24652   27          3   50132         15             10               0   
15109   53          3  167065          1              7               2   
12802   17          3  117549          0              6               4   
14410   31          5  145162          9             13               2   
5601    34          1  177675          9             13               2   
5913    18          3   98667          1              7               4   
7103    32          3  152940         15             10               2   
13327   49          3   75673         11              9               4   
12279   29          1  219906          9             13               2   
25073   39          3  126494         11              9               0   

       occupation  relationship  race  sex  capital.gain  capital.loss  \
11662           9             4     4    1             0             0   
18348           7             4     4    0             0             0   
5962            3             3     4    1             0             0   
30669           3             0     4    1          7688             0   
6244            7             3     4    0             0             0   
4240           14             1     4    1             0             0   
7023            9             0     4    1             0             0   
15286          14             1     4    1             0             0   
15500           0             1     4    1             0             0   
3344            0             3     4    1             0             0   
13479           3             1     4    1             0          2824   
673             3             0     4    1             0             0   
19895           3             0     4    1             0             0   
20508           3             1     4    0             0             0   
23873           9             0     4    1             0             0   
4428            0             0     4    1             0             0   
30574           9             1     4    0             0             0   
29574           3             0     4    1          3432             0   
31840           6             0     4    1             0             0   
1286           11             0     4    1         15024             0   
19929          13             1     4    1             0             0   
28318           9             0     2    1         99999             0   
1164            9             5     4    0             0          1977   
16987          11             1     4    1         14084             0   
15057           7             1     4    0             0             0   
29567          13             0     4    1             0             0   
6315            3             4     4    0             0             0   
20161          14             3     2    0             0             0   
19264           3             0     4    1             0             0   
9266            3             0     0    1             0             0   
...           ...           ...   ...  ...           ...           ...   
14640           9             1     2    1             0             0   
19349           2             0     4    1             0             0   
25184           0             1     4    0             0             0   
26975           9             0     4    1             0             0   
21425           2             0     4    1             0             0   
16697           3             1     4    1             0             0   
13861          13             0     4    1             0             0   
133            11             0     4    1             0             0   
26679           3             0     4    1             0             0   
20696           2             1     4    1             0             0   
8926            4             0     4    1             0             0   
4731            2             2     4    1             0             0   
28775           2             1     4    1          3674             0   
10741           2             0     4    1             0          2002   
25933          11             0     4    1             0             0   
17276           2             0     4    1             0             0   
24529          11             1     4    0             0             0   
13810          11             5     4    0             0             0   
23921           4             0     4    1             0             0   
8631            6             1     4    1             0             0   
24652          11             1     4    1             0             0   
15109           6             0     4    1             0             0   
12802          11             2     2    0             0             0   
14410           3             0     4    1             0             0   
5601            3             0     4    1             0             0   
5913            7             3     4    0             0             0   
7103           11             0     4    1             0             0   
13327           0             3     1    0             0             0   
12279           9             5     4    0             0             0   
25073           9             4     4    0             0             0   

       hours.per.week  native.country  
11662              40              38  
18348              36              38  
5962               40              38  
30669              70              38  
6244               24              38  
4240               30              38  
7023               36              38  
15286              30              38  
15500              40              38  
3344               26              38  
13479              55              38  
673                40              38  
19895              40              38  
20508              40              38  
23873              45              38  
4428               40              25  
30574              40              38  
29574              20              38  
31840              40              38  
1286               50              38  
19929              46              38  
28318              60              38  
1164               40              38  
16987              40              38  
15057              30              38  
29567              40              38  
6315               38              38  
20161              20              38  
19264              40              38  
9266               48              38  
...               ...             ...  
14640              40              38  
19349              40              38  
25184              30              38  
26975              50              38  
21425              40              38  
16697              50              38  
13861              48              38  
133                50              38  
26679              60              38  
20696              80              38  
8926               50              38  
4731               40              38  
28775              40              12  
10741              40              38  
25933              52              38  
17276              40              38  
24529              40              38  
13810              12              38  
23921              40              25  
8631               40              38  
24652              50              38  
15109              40              38  
12802              12              38  
14410              60              41  
5601               55              38  
5913               16              38  
7103               40              38  
13327              40              38  
12279              25              38  
25073              40              38  

[22792 rows x 14 columns]



In [23]:

    
clf = RandomForestClassifier(n_estimators = 500, max_depth = 6)
clf.fit(X_train,y_train)









    Out[23]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [24]:

    
clf.predict(X_test)









    Out[24]:





array([0, 1, 0, ..., 0, 0, 0])



In [27]:

    
#make prediction and check model's accuracy
prediction = clf.predict(X_test)
acc =  accuracy_score(np.array(y_test),prediction)
print ('The accuracy of Random Forest is {}'.format(acc))









    



The accuracy of Random Forest is 0.851878390828



In [ ]:

target	<=50K	>50K	All
education
10th	0.026750	0.001904	0.028654
11th	0.034243	0.001843	0.036086
12th	0.012285	0.001013	0.013298
1st-4th	0.004975	0.000184	0.005160
5th-6th	0.009736	0.000491	0.010227
7th-8th	0.018611	0.001228	0.019840
9th	0.014957	0.000829	0.015786
Assoc-acdm	0.024631	0.008139	0.032769
Assoc-voc	0.031357	0.011087	0.042443
Bachelors	0.096250	0.068210	0.164461
Doctorate	0.003286	0.009398	0.012684
HS-grad	0.271060	0.051442	0.322502
Masters	0.023464	0.029452	0.052916
Preschool	0.001566	0.000000	0.001566
Prof-school	0.004699	0.012991	0.017690
Some-college	0.181321	0.042597	0.223918
All	0.759190	0.240810	1.000000

	age	workclass	fnlwgt	education	education.num	marital.status	occupation	relationship	race	sex	capital.gain	hours.per.week	native.country
0	39	6	77516	9	13	4	0	1	4	1	2174	40	38
1	50	5	83311	9	13	2	3	0	4	1	0	13	38
2	38	3	215646	11	9	0	5	1	4	1	0	40	38
3	53	3	234721	1	7	2	5	0	2	1	0	40	38
4	28	3	338409	9	13	2	9	5	2	0	0	40	4