In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
In [2]:
# Define the paths to the learning and test data files
Learn_Data_Path = './US_Census_Data/census_income_learn.csv'
Test_Data_Path = './US_Census_Data/census_income_test.csv'
In [3]:
Train_Data_Set = pd.read_csv(Learn_Data_Path, sep=',', header=None, index_col=False)
Test_Data_Set = pd.read_csv(Test_Data_Path, sep=',', header=None, index_col=False)
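As a quick sanity check (not part of the original run), the shapes and one raw label value can be inspected; note that string fields keep their leading space because `skipinitialspace` is not set, which matters when comparing values such as ' - 50000.' later on.
In [ ]:
# Optional sanity check: row/column counts and the raw label of the first row.
# repr() makes the leading space in the string values visible.
print(Train_Data_Set.shape, Test_Data_Set.shape)
print(repr(Train_Data_Set.iloc[0, -1]))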
In [4]:
Train_Data_Set.head(2)
Out[4]:
In [5]:
Test_Data_Set.head(2)
Out[5]:
In [6]:
# Name the 42 columns:
columns = ['AAGE', 'ACLSWKR', 'ADTIND', 'ADTOCC', 'AHGA', 'AHRSPAY',
'AHSCOL', 'AMARITL', 'AMJIND', 'AMJOCC', 'ARACE', 'AREORGN',
'ASEX', 'AUNMEM', 'AUNTYPE', 'AWKSTAT', 'CAPGAIN', 'CAPLOSS',
'DIVVAL', 'FILESTAT', 'GRINREG', 'GRINST', 'HHDFMX', 'HHDREL',
'MARSUPWT', 'MIGMTR1', 'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN',
'NOEMP', 'PARENT', 'PEFNTVTY', 'PEMNTVTY', 'PENATVTY', 'PRCITSHP',
'SEOTR', 'VETQVA', 'VETYN', 'WKSWORK', 'YEAR', 'WAGE']
In [7]:
Train_Data_Set.columns = columns
Test_Data_Set.columns = columns
In [8]:
Test_Data_Set.tail(2)
Out[8]:
In [9]:
# Drop the columns that will not be used for classification
Train_Data_Set.drop(['MARSUPWT', 'MIGMTR1','GRINREG', 'GRINST','AUNMEM','HHDFMX', 'HHDREL','FILESTAT',
'PEFNTVTY', 'PEMNTVTY', 'PENATVTY','AREORGN','ADTIND', 'ADTOCC','SEOTR','YEAR',
'MIGMTR3', 'MIGMTR4', 'MIGSAME', 'MIGSUN','VETQVA', 'VETYN','AUNTYPE'], axis=1, inplace=True)
Test_Data_Set.drop(['MARSUPWT', 'MIGMTR1', 'MIGMTR3', 'GRINREG', 'GRINST','AUNMEM','HHDFMX', 'HHDREL','FILESTAT',
'PEFNTVTY', 'PEMNTVTY', 'PENATVTY','AREORGN','ADTIND', 'ADTOCC','SEOTR','YEAR',
'MIGMTR4', 'MIGSAME', 'MIGSUN','VETQVA', 'VETYN','AUNTYPE'], axis=1, inplace=True)
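The two drop lists above are written in a different order, so a small assertion (added here as a sketch) can confirm that both frames end up with the same remaining columns.
In [ ]:
# Sketch: verify the train and test frames keep an identical column set
assert set(Train_Data_Set.columns) == set(Test_Data_Set.columns)
Train_Data_Set.shape, Test_Data_Set.shape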
In [10]:
list(Test_Data_Set)
Out[10]:
In [17]:
# Inspect the remaining column dtypes and persist the trimmed data sets
Train_Data_Set.dtypes
Train_Data_Set.to_csv('./US_Census_Data/Train_Data_Set.csv')
Test_Data_Set.to_csv('./US_Census_Data/Test_Data_Set.csv')
In [18]:
def annual_wage(df):
    # Binarise the income label: 0 for ' - 50000.' (the leading space is kept
    # by read_csv), 1 for anything else.
    wage = []
    for w in df['WAGE']:
        if w == ' - 50000.':
            wage.append(0)
        else:
            wage.append(1)
    return wage
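For reference, the same binary label can be built without an explicit loop; this vectorised version is a sketch and is not used in the rest of the notebook.
In [ ]:
# Equivalent vectorised form of annual_wage (sketch, not used below)
def annual_wage_vectorised(df):
    return (df['WAGE'] != ' - 50000.').astype(int).tolist()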
In [19]:
# Reserve column 0 for the binary target (placeholder value, filled in next)
Test_Data_Set.insert(0, 'WAGE_ANN', 0)
Train_Data_Set.insert(0, 'WAGE_ANN', 0)
In [20]:
Train_Data_Set['WAGE_ANN'] = annual_wage(Train_Data_Set)
Test_Data_Set['WAGE_ANN'] = annual_wage(Test_Data_Set)
In [21]:
Train_Data_Set.drop(['WAGE'], axis=1, inplace=True)
Test_Data_Set.drop(['WAGE'], axis=1, inplace=True)
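Before modelling it is worth checking how the two classes are distributed; this census income target is typically heavily skewed towards the under-$50k class, so plain accuracy can look optimistic. The check below is an addition, not part of the original run.
In [ ]:
# Class balance of the binary target (sketch); a strong skew towards class 0
# means accuracy alone can be misleading.
Train_Data_Set['WAGE_ANN'].value_counts(normalize=True)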
In [22]:
def dummy_variables(df):
    # One-hot encode every object (string) column, prefix the new columns
    # with the original column name, and drop the original column.
    df_type = df.dtypes
    for col in df_type.keys():
        if df_type[col] == 'object':
            df = pd.concat([df, pd.get_dummies(df[col]).rename(columns=lambda x: col + '_' + str(x))], axis=1)
            df.drop([col], axis=1, inplace=True)
    return df

trainset_ = dummy_variables(Train_Data_Set)
testset_ = dummy_variables(Test_Data_Set)
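Because get_dummies is applied to the train and test frames separately, a category that appears in only one of them produces mismatched columns. A hedged sketch of one way to align them (the name testset_aligned is illustrative; this is only needed if the test set is scored later):
In [ ]:
# Sketch: align the one-hot encoded test columns to the training columns,
# filling categories unseen in the test set with 0.
testset_aligned = testset_.reindex(columns=trainset_.columns, fill_value=0)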
In [23]:
# WAGE_ANN sits in column 0, so the remaining columns form the feature matrix
X = trainset_.iloc[:, 1:]
Y = trainset_['WAGE_ANN']
In [24]:
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=100)
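With an imbalanced target, a stratified split keeps the class proportions identical in the training and validation folds; this is an optional variant of the split above, not the one used in the following cells.
In [ ]:
# Optional: stratified variant of the split above (sketch)
X_train, X_val, Y_train, Y_val = model_selection.train_test_split(
    X, Y, test_size=0.20, random_state=100, stratify=Y)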
In [25]:
logregCV = linear_model.LogisticRegressionCV()
logregCV.fit(X_train, Y_train)
Out[25]:
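Columns such as CAPGAIN, CAPLOSS and DIVVAL are on much larger scales than the 0/1 dummies, and unscaled features can slow or prevent convergence of the solver. A hedged sketch of the same cross-validated model wrapped in a scaling pipeline (an addition, not part of the original run):
In [ ]:
# Sketch: standardise the features before fitting LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_logreg = make_pipeline(StandardScaler(),
                              linear_model.LogisticRegressionCV(max_iter=1000))
scaled_logreg.fit(X_train, Y_train)
print('Scaled accuracy: %.3f' % scaled_logreg.score(X_val, Y_val))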
In [26]:
Y_pred = logregCV.predict(X_val)
In [27]:
print('Accuracy score: %.3f' % logregCV.score(X_val, Y_val))
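Since the imported metrics module has not been used yet and the classes are imbalanced, a confusion matrix and per-class report give a fuller picture than accuracy alone; this extra evaluation is an addition to the original cell.
In [ ]:
# Additional evaluation on the validation split (sketch)
print(metrics.confusion_matrix(Y_val, Y_pred))
print(metrics.classification_report(Y_val, Y_pred))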