In [1]:
#Decision Trees are excellent tools for data inspection

In [3]:
#Data preparation

In [6]:
# Import the UCI "Adult" census-income data straight from the repository.
# The file carries no header row, hence header = FALSE; the columns are
# named in a later step.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned. stringsAsFactors is set explicitly because read.csv() defaults
# to character columns since R 4.0.0, while the rest of this notebook (see
# the str() output) works with factor columns.
Data <- read.csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
  header = FALSE,
  stringsAsFactors = TRUE
)

In [7]:
# Give the 15 columns descriptive names (the file was read without a header row).
colnames(Data) <- c(
  "age", "workclass", "fnlwgt", "education", "educationnum",
  "maritalstatus", "occupation", "relationship", "race", "sex",
  "capitalgain", "capitalloss", "hoursperweek", "nativecountry",
  "response"
)

In [8]:
# Compact overview of the data set: dimensions, plus the type and the first
# few values of every column.
utils::str(Data)


'data.frame':	32561 obs. of  15 variables:
 $ age          : int  39 50 38 53 28 37 49 52 31 42 ...
 $ workclass    : Factor w/ 9 levels " ?"," Federal-gov",..: 8 7 5 5 5 5 5 7 5 5 ...
 $ fnlwgt       : int  77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
 $ education    : Factor w/ 16 levels " 10th"," 11th",..: 10 10 12 2 10 13 7 12 13 10 ...
 $ educationnum : int  13 13 9 7 13 14 5 9 14 13 ...
 $ maritalstatus: Factor w/ 7 levels " Divorced"," Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
 $ occupation   : Factor w/ 15 levels " ?"," Adm-clerical",..: 2 5 7 7 11 5 9 5 11 5 ...
 $ relationship : Factor w/ 6 levels " Husband"," Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
 $ race         : Factor w/ 5 levels " Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
 $ sex          : Factor w/ 2 levels " Female"," Male": 2 2 2 2 1 1 1 2 1 2 ...
 $ capitalgain  : int  2174 0 0 0 0 0 0 0 14084 5178 ...
 $ capitalloss  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ hoursperweek : int  40 13 40 40 40 40 16 45 50 40 ...
 $ nativecountry: Factor w/ 42 levels " ?"," Cambodia",..: 40 40 40 40 6 40 24 40 40 40 ...
 $ response     : Factor w/ 2 levels " <=50K"," >50K": 1 1 1 1 1 1 1 2 2 2 ...

In [9]:
# Drop nativecountry, the 42-level factor (see the str() output above).
# NOTE(review): presumably removed because the tree-fitting routine used later
# cannot cope with a factor that has this many levels -- confirm.
# The column is selected by name rather than by position c(1:13, 15) so the
# cell can be re-run safely: a second run is a no-op instead of failing with
# an "undefined columns selected" error on the already-reduced data frame.
Data <- Data[, setdiff(names(Data), "nativecountry"), drop = FALSE]

In [10]:
# Fix the random number generator state so the random train/test split drawn
# next is reproducible.
set.seed(seed = 100)

In [11]:
# Row indices of the training set: a random 80% of the rows, drawn without
# replacement.
# sample.int(n, ...) draws from 1..n directly and avoids the 1:nrow(Data)
# pitfall (1:0 evaluates to c(1, 0) on an empty data frame). floor() makes
# the rounding of the fractional size 0.8 * nrow(Data) explicit.
train <- sample.int(nrow(Data), size = floor(0.8 * nrow(Data)))

In [12]:
# Training partition: the rows of Data whose indices were sampled into `train`.
traindata <- Data[train, ]

In [13]:
# Test partition: every row of Data that was NOT sampled into `train`.
# A positive index built with setdiff() is used instead of Data[-train, ]
# because a negative index made from an empty `train` selects no rows at all,
# whereas this form then correctly keeps every row. For a non-empty `train`
# the selected rows and their order are the same as with Data[-train, ].
testdata <- Data[setdiff(seq_len(nrow(Data)), train), ]

In [14]:
# Peek at the first six rows of the training partition.
utils::head(traindata, n = 6L)


Out[14]:
      | age | workclass   | fnlwgt | education | educationnum | maritalstatus      | occupation     | relationship  | race  | sex    | capitalgain | capitalloss | hoursperweek | response
10022 | 45  | Private     | 161819 | 11th      | 7            | Separated          | Adm-clerical   | Unmarried     | Black | Female | 0           | 0           | 25           | <=50K
8390  | 24  | Private     | 153082 | HS-grad   | 9            | Never-married      | Sales          | Not-in-family | White | Male   | 0           | 0           | 30           | <=50K
17984 | 33  | Private     | 181091 | 10th      | 6            | Divorced           | Craft-repair   | Not-in-family | White | Male   | 0           | 0           | 35           | <=50K
1836  | 23  | Private     | 64292  | HS-grad   | 9            | Never-married      | Adm-clerical   | Not-in-family | White | Female | 0           | 0           | 40           | <=50K
15255 | 21  | Private     | 347292 | HS-grad   | 9            | Never-married      | Craft-repair   | Not-in-family | White | Male   | 0           | 0           | 40           | <=50K
15750 | 49  | Federal-gov | 128990 | Bachelors | 13           | Married-civ-spouse | Prof-specialty | Husband       | White | Male   | 0           | 0           | 40           | >50K

In [ ]: