In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

In [2]:
# Define the paths to the training and test data files
Learn_Data_Path = './US_Census_Data/census_income_learn.csv'
Test_Data_Path  = './US_Census_Data/census_income_test.csv'

In [3]:
# The raw files have no header row, so read them without one
Train_Data_Set = pd.read_csv(Learn_Data_Path, sep=',', header=None, index_col=False)
Test_Data_Set = pd.read_csv(Test_Data_Path, sep=',', header=None, index_col=False)

In [4]:
Train_Data_Set.head(2)


Out[4]:
0 1 2 3 4 5 6 7 8 9 ... 32 33 34 35 36 37 38 39 40 41
0 73 Not in universe 0 0 High school graduate 0 Not in universe Widowed Not in universe or children Not in universe ... United-States United-States United-States Native- Born in the United States 0 Not in universe 2 0 95 - 50000.
1 58 Self-employed-not incorporated 4 34 Some college but no degree 0 Not in universe Divorced Construction Precision production craft & repair ... United-States United-States United-States Native- Born in the United States 0 Not in universe 2 52 94 - 50000.

2 rows × 42 columns


In [5]:
Test_Data_Set.head(2)


Out[5]:
0 1 2 3 4 5 6 7 8 9 ... 32 33 34 35 36 37 38 39 40 41
0 38 Private 6 36 1st 2nd 3rd or 4th grade 0 Not in universe Married-civilian spouse present Manufacturing-durable goods Machine operators assmblrs & inspctrs ... Mexico Mexico Mexico Foreign born- Not a citizen of U S 0 Not in universe 2 12 95 - 50000.
1 44 Self-employed-not incorporated 37 12 Associates degree-occup /vocational 0 Not in universe Married-civilian spouse present Business and repair services Professional specialty ... United-States United-States United-States Native- Born in the United States 0 Not in universe 2 26 95 - 50000.

2 rows × 42 columns


In [6]:
# Name the 42 columns:
columns = ['AAGE', 'ACLSWKR', 'ADTIND', 'ADTOCC', 'AHGA', 'AHRSPAY',         
           'AHSCOL', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN',      
           'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'CAPGAIN', 'CAPLOSS',     
           'DIVVAL', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL',    
           'MARSUPWT', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN', 
           'NOEMP', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP',
           'SEOTR', 'VETQVA', 'VETYN', 'WKSWORK', 'YEAR', 'WAGE']
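
A quick sanity check that the name list matches the 42 columns seen in the previews above (a one-line assertion using the frames already loaded):

# Both data frames were read with 42 columns, so the name list must have 42 entries
assert len(columns) == Train_Data_Set.shape[1] == 42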

In [7]:
Train_Data_Set.columns = columns
Test_Data_Set.columns = columns

In [8]:
Test_Data_Set.tail(2)


Out[8]:
AAGE ACLSWKR ADTIND ADTOCC AHGA AHRSPAY AHSCOL AMARITL AMJIND AMJOCC ... PEFNTVTY PEMNTVTY PENATVTY PRCITSHP SEOTR VETQVA VETYN WKSWORK YEAR WAGE
99760 30 Private 45 2 Bachelors degree(BA AB BS) 0 Not in universe Married-civilian spouse present Other professional services Executive admin and managerial ... United-States United-States United-States Native- Born in the United States 0 Not in universe 2 52 95 - 50000.
99761 67 Not in universe 0 0 9th grade 0 Not in universe Married-civilian spouse present Not in universe or children Not in universe ... United-States United-States United-States Native- Born in the United States 0 Not in universe 2 0 94 - 50000.

2 rows × 42 columns


In [9]:
# Drop the columns that will not be used as classification features
drop_cols = ['MARSUPWT', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN',
             'GRINREG', 'GRINST', 'AUNMEM', 'AUNTYPE', 'HHDFMX', 'HHDREL',
             'FILESTAT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'AREORGN',
             'ADTIND', 'ADTOCC', 'SEOTR', 'YEAR', 'VETQVA', 'VETYN']

Train_Data_Set.drop(drop_cols, axis=1, inplace=True)
Test_Data_Set.drop(drop_cols, axis=1, inplace=True)

In [10]:
list(Test_Data_Set)


Out[10]:
['AAGE',
 'ACLSWKR',
 'AHGA',
 'AHRSPAY',
 'AHSCOL',
 'AMARITL',
 'AMJIND',
 'AMJOCC',
 'ARACE',
 'ASEX',
 'AWKSTAT',
 'CAPGAIN',
 'CAPLOSS',
 'DIVVAL',
 'NOEMP',
 'PARENT',
 'PRCITSHP',
 'WKSWORK',
 'WAGE']

In [17]:
# Save the reduced data sets (index=False keeps the row index out of the CSV files)
Train_Data_Set.to_csv('./US_Census_Data/Train_Data_Set.csv', index=False)
Test_Data_Set.to_csv('./US_Census_Data/Test_Data_Set.csv', index=False)

In [18]:
def annual_wage(df):
    # Map the raw WAGE label to a binary target: 0 for the ' - 50000.' label, 1 otherwise
    wage = []
    for w in df['WAGE']:
        if w == ' - 50000.':
            wage.append(0)
        else:
            wage.append(1)
    return wage
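
A note on the comparison above: the raw file puts a space after each comma, so the low-income label arrives as ' - 50000.' with a leading space. An equivalent vectorized version (a sketch, not used below) could look like this:

# Vectorized equivalent of annual_wage (sketch): strip surrounding whitespace,
# then mark everything other than the '- 50000.' label as 1
def annual_wage_vec(df):
    return (df['WAGE'].str.strip() != '- 50000.').astype(int)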

In [19]:
# Add the binary target column at position 0 (placeholder zeros, filled in below)
Test_Data_Set.insert(0, 'WAGE_ANN', 0)
Train_Data_Set.insert(0, 'WAGE_ANN', 0)

In [20]:
Train_Data_Set['WAGE_ANN'] = annual_wage(Train_Data_Set)
Test_Data_Set['WAGE_ANN'] = annual_wage(Test_Data_Set)

In [21]:
Train_Data_Set.drop(['WAGE'], axis=1, inplace=True)
Test_Data_Set.drop(['WAGE'], axis=1, inplace=True)

In [22]:
def dummy_variables(df):
    # One-hot encode every object (categorical) column and drop the original column
    df_type = df.dtypes
    for col in df_type.keys():
        if df_type[col] == 'object':
            dummies = pd.get_dummies(df[col]).rename(columns=lambda x: col + '_' + str(x))
            df = pd.concat([df, dummies], axis=1)
            df.drop([col], axis=1, inplace=True)
    return df

trainset_ = dummy_variables(Train_Data_Set)
testset_ = dummy_variables(Test_Data_Set)
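
One caveat with encoding the training and test sets independently: a category that appears in only one of them yields mismatched dummy columns. Before predicting on testset_, it is safer to align it to the training columns (a sketch using pandas reindex):

# Align the encoded test frame to the training columns:
# dummies unseen in training are dropped, missing ones are filled with 0
testset_ = testset_.reindex(columns=trainset_.columns, fill_value=0)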

In [23]:
# WAGE_ANN sits in column 0, so everything after it is a feature
X = trainset_.iloc[:, 1:]
Y = trainset_['WAGE_ANN']

In [24]:
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=100)
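
Because the two income classes in this data set are heavily imbalanced, a stratified split keeps the 0/1 ratio identical in the training and validation partitions (an optional variant of the call above):

# Stratified variant of the split above (preserves the class ratio in both partitions)
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=100, stratify=Y)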

In [25]:
logregCV = linear_model.LogisticRegressionCV()
logregCV.fit(X_train, Y_train)


Out[25]:
LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [26]:
Y_pred = logregCV.predict(X_val)

In [27]:
print('Accuracy score : %.3f' % logregCV.score(X_val, Y_val))


Accuracy score : 0.951
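
Since the classes are imbalanced (the motivation for the stratified split sketched earlier), accuracy alone can be misleading; per-class precision and recall give a fuller picture (a sketch using the metrics module imported above):

# Per-class precision/recall and the confusion matrix complement the raw accuracy
print(metrics.classification_report(Y_val, Y_pred))
print(metrics.confusion_matrix(Y_val, Y_pred))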