In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
In [2]:
# Define the paths to the learning and test data files
Learn_Data_Path = './US_Census_Data/census_income_learn.csv'
Test_Data_Path = './US_Census_Data/census_income_test.csv'
In [3]:
Train_Data_Set = pd.read_csv(Learn_Data_Path, sep=',', header=None, index_col=False)
Test_Data_Set = pd.read_csv(Test_Data_Path, sep=',', header=None, index_col=False)
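As a quick sanity check (not part of the original run), the shapes and one raw label value can be inspected; note that string fields keep their leading space because `skipinitialspace` is not set, which matters when comparing values such as ' - 50000.' later on.
In [ ]:
# Optional sanity check: row/column counts and the raw label of the first row.
# repr() makes the leading space in the string values visible.
print(Train_Data_Set.shape, Test_Data_Set.shape)
print(repr(Train_Data_Set.iloc[0, -1]))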
In [4]:
Train_Data_Set.head(2)
Out[4]:
In [5]:
Test_Data_Set.head(2)
Out[5]:
In [6]:
# Name the 42 columns:
columns = ['AAGE', 'ACLSWKR', 'ADTIND', 'ADTOCC', 'AHGA', 'AHRSPAY',
'AHSCOL', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN',
'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'CAPGAIN', 'CAPLOSS',
'DIVVAL', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL',
'MARSUPWT', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN',
'NOEMP', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP',
'SEOTR', 'VETQVA', 'VETYN', 'WKSWORK', 'YEAR', 'WAGE']
In [7]:
Train_Data_Set.columns = columns
Test_Data_Set.columns = columns
In [8]:
Test_Data_Set.tail(2)
Out[8]:
In [9]:
# Drop the columns that will not be used for classification
Train_Data_Set.drop(['MARSUPWT', 'MIGMTR1','GRINREG', 'GRINST','AUNMEM','HHDFMX', 'HHDREL','FILESTAT',
'PEFNTVTY', 'PEMNTVTY', 'PENATVTY','AREORGN','ADTIND', 'ADTOCC','SEOTR','YEAR',
'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN','VETQVA', 'VETYN','AUNTYPE'], axis=1, inplace=True)
Test_Data_Set.drop(['MARSUPWT', 'MIGMTR1', 'MIGMTR3', 'GRINREG', 'GRINST','AUNMEM','HHDFMX', 'HHDREL','FILESTAT',
'PEFNTVTY', 'PEMNTVTY', 'PENATVTY','AREORGN','ADTIND', 'ADTOCC','SEOTR','YEAR',
'MIGMTR4', 'MIGSAME', 'MIGSUN','VETQVA', 'VETYN','AUNTYPE'], axis=1, inplace=True)
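The two drop lists above are written in a different order, so a small assertion (added here as a sketch) can confirm that both frames end up with the same remaining columns.
In [ ]:
# Sketch: verify the train and test frames keep an identical column set
assert set(Train_Data_Set.columns) == set(Test_Data_Set.columns)
Train_Data_Set.shape, Test_Data_Set.shape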
In [10]:
list(Test_Data_Set)
Out[10]:
In [17]:
# Inspect the remaining column dtypes and persist the trimmed data sets
Train_Data_Set.dtypes
Train_Data_Set.to_csv('./US_Census_Data/Train_Data_Set.csv')
Test_Data_Set.to_csv('./US_Census_Data/Test_Data_Set.csv')
In [18]:
def annual_wage(df):
    # Binarise the income label: 0 for ' - 50000.' (the leading space is kept
    # by read_csv), 1 for anything else.
    wage = []
    for w in df['WAGE']:
        if w == ' - 50000.':
            wage.append(0)
        else:
            wage.append(1)
    return wage
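For reference, the same binary label can be built without an explicit loop; this vectorised version is a sketch and is not used in the rest of the notebook.
In [ ]:
# Equivalent vectorised form of annual_wage (sketch, not used below)
def annual_wage_vectorised(df):
    return (df['WAGE'] != ' - 50000.').astype(int).tolist()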
In [19]:
# Reserve column 0 for the binary target (placeholder value, filled in next)
Test_Data_Set.insert(0, 'WAGE_ANN', 0)
Train_Data_Set.insert(0, 'WAGE_ANN', 0)
In [20]:
Train_Data_Set['WAGE_ANN'] = annual_wage(Train_Data_Set)
Test_Data_Set['WAGE_ANN'] = annual_wage(Test_Data_Set)
In [21]:
Train_Data_Set.drop(['WAGE'], axis=1, inplace=True)
Test_Data_Set.drop(['WAGE'], axis=1, inplace=True)
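Before modelling it is worth checking how the two classes are distributed; this census income target is typically heavily skewed towards the under-$50k class, so plain accuracy can look optimistic. The check below is an addition, not part of the original run.
In [ ]:
# Class balance of the binary target (sketch); a strong skew towards class 0
# means accuracy alone can be misleading.
Train_Data_Set['WAGE_ANN'].value_counts(normalize=True)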
In [22]:
def dummy_variables(df):
    # One-hot encode every object (string) column, prefix the new columns
    # with the original column name, and drop the original column.
    df_type = df.dtypes
    for col in df_type.keys():
        if df_type[col] == 'object':
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col + '_' + str(x))], axis=1)
            df.drop([col], axis=1, inplace=True)
    return df

trainset_ = dummy_variables(Train_Data_Set)
testset_ = dummy_variables(Test_Data_Set)
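Because get_dummies is applied to the train and test frames separately, a category that appears in only one of them produces mismatched columns. A hedged sketch of one way to align them (the name testset_aligned is illustrative; this is only needed if the test set is scored later):
In [ ]:
# Sketch: align the one-hot encoded test columns to the training columns,
# filling categories unseen in the test set with 0.
testset_aligned = testset_.reindex(columns=trainset_.columns, fill_value=0)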
In [23]:
# WAGE_ANN sits in column 0, so the remaining columns form the feature matrix
X = trainset_.iloc[:, 1:]
Y = trainset_['WAGE_ANN']
In [24]:
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=100)
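With an imbalanced target, a stratified split keeps the class proportions identical in the training and validation folds; this is an optional variant of the split above, not the one used in the following cells.
In [ ]:
# Optional: stratified variant of the split above (sketch)
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=100, stratify=Y)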
In [25]:
logregCV = linear_model.LogisticRegressionCV()
logregCV.fit(X_train, Y_train)
Out[25]:
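Columns such as CAPGAIN, CAPLOSS and DIVVAL are on much larger scales than the 0/1 dummies, and unscaled features can slow or prevent convergence of the solver. A hedged sketch of the same cross-validated model wrapped in a scaling pipeline (an addition, not part of the original run):
In [ ]:
# Sketch: standardise the features before fitting LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_logreg = make_pipeline(StandardScaler(),
                              linear_model.LogisticRegressionCV(max_iter=1000))
scaled_logreg.fit(X_train, Y_train)
print('Scaled accuracy: %.3f' % scaled_logreg.score(X_val, Y_val))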
In [26]:
Y_pred = logregCV.predict(X_val)
In [27]:
print('Accuracy score: %.3f' % logregCV.score(X_val, Y_val))
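Since the imported metrics module has not been used yet and the classes are imbalanced, a confusion matrix and per-class report give a fuller picture than accuracy alone; this extra evaluation is an addition to the original cell.
In [ ]:
# Additional evaluation on the validation split (sketch)
print(metrics.confusion_matrix(Y_val, Y_pred))
print(metrics.classification_report(Y_val, Y_pred))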