In [1]:
import pandas as pd
X_train = pd.read_csv('[your file path]/X_train.csv')
Y_train = pd.read_csv('[your file path]/Y_train.csv')
X_test = pd.read_csv('[your file path]/X_test.csv')
Y_test = pd.read_csv('[your file path]/Y_test.csv')
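Before going further, it can help to confirm the four frames loaded as expected. A minimal sanity-check sketch (not part of the original notebook; it only assumes the frames defined above):
In [ ]:
# Quick look at shapes and missing values in the loaded frames
for name, df in [('X_train', X_train), ('Y_train', Y_train),
                 ('X_test', X_test), ('Y_test', Y_test)]:
    print(name, df.shape)
print(X_train.isnull().sum())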
In [2]:
print(X_train.head())
print()
print(X_test.head())
In [16]:
print(Y_train.head())
print()
print(Y_test.head())
In [3]:
# check data types
X_train.dtypes
In [4]:
%matplotlib inline
In [5]:
# Plot histograms of the continuous variables to see how differently they are scaled
import matplotlib.pyplot as plt
num_cols = X_train.dtypes[(X_train.dtypes == "float64") | (X_train.dtypes == "int64")].index.values
p = X_train[num_cols].hist(figsize=[11, 11])
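An equivalent, arguably more idiomatic way to pick the numeric columns is `select_dtypes`; a minimal sketch:
In [ ]:
# Same column selection via select_dtypes (floats and integers)
num_cols = X_train.select_dtypes(include=["float64", "int64"]).columns
p = X_train[num_cols].hist(figsize=[11, 11])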
In [13]:
# Use KNN to check the performance before & after feature scaling,
# because KNN accuracy is easily influenced when variables are on different scales
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
## Before feature scaling
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                 'Loan_Amount_Term', 'Credit_History']], Y_train['Target'])
print(accuracy_score(Y_test['Target'], knn.predict(X_test[['ApplicantIncome', 'CoapplicantIncome',
                                                           'LoanAmount', 'Loan_Amount_Term', 'Credit_History']])))
In [14]:
# Check the class balance: the percentage of each target value,
# for comparison with the accuracies before and after scaling
print(Y_train.Target.value_counts(normalize=True))
print()
print(Y_test.Target.value_counts(normalize=True))
In [17]:
## After feature scaling
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
X_train_minmax = min_max.fit_transform(X_train[['ApplicantIncome', 'CoapplicantIncome',
                                                'LoanAmount', 'Loan_Amount_Term', 'Credit_History']])
# Transform the test data with the scaler fitted on train, so both sets are
# scaled consistently and no test-set information leaks into the fit
X_test_minmax = min_max.transform(X_test[['ApplicantIncome', 'CoapplicantIncome',
                                          'LoanAmount', 'Loan_Amount_Term', 'Credit_History']])
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_minmax, Y_train['Target'])
print(accuracy_score(Y_test['Target'], knn.predict(X_test_minmax)))
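The scale-then-fit pattern can also be wrapped in a scikit-learn Pipeline, which keeps the scaler and classifier together and re-applies the fitted scaling automatically at predict time. A minimal sketch (same column list as above):
In [ ]:
# MinMaxScaler + KNN fitted as a single estimator
from sklearn.pipeline import make_pipeline
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History']
pipe = make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_neighbors=5))
pipe.fit(X_train[cols], Y_train['Target'])
print(accuracy_score(Y_test['Target'], pipe.predict(X_test[cols])))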
In [20]:
# Feature Standardization
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
# Before standardization
log = LogisticRegression(penalty='l2', C=.01)
log.fit(X_train[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                 'Loan_Amount_Term', 'Credit_History']], Y_train['Target'])
print(accuracy_score(Y_test['Target'], log.predict(X_test[['ApplicantIncome', 'CoapplicantIncome',
                                                           'LoanAmount', 'Loan_Amount_Term', 'Credit_History']])))
print()
# After standardization
X_train_scale = scale(X_train[['ApplicantIncome', 'CoapplicantIncome',
                               'LoanAmount', 'Loan_Amount_Term', 'Credit_History']])
X_test_scale = scale(X_test[['ApplicantIncome', 'CoapplicantIncome',
                             'LoanAmount', 'Loan_Amount_Term', 'Credit_History']])
log = LogisticRegression(penalty='l2', C=.01)
log.fit(X_train_scale, Y_train['Target'])
print(accuracy_score(Y_test['Target'], log.predict(X_test_scale)))
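Note that `scale` above standardizes train and test independently, each with its own mean and variance. A StandardScaler fitted on the training data only keeps the two sets on the same footing and avoids using test statistics; a minimal sketch:
In [ ]:
# Standardize using statistics learned from the training set only
from sklearn.preprocessing import StandardScaler
cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term', 'Credit_History']
std = StandardScaler()
X_train_scale = std.fit_transform(X_train[cols])
X_test_scale = std.transform(X_test[cols])  # reuse training mean/std
log = LogisticRegression(penalty='l2', C=.01)
log.fit(X_train_scale, Y_train['Target'])
print(accuracy_score(Y_test['Target'], log.predict(X_test_scale)))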
In [23]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in X_train.columns.values:
    # Encode only the categorical (object-typed) variables
    if X_train[col].dtypes == 'object':
        le.fit(X_train[col])
        X_train[col] = le.transform(X_train[col])
        # Reuse the encoder fitted on train so identical labels get identical codes
        # (assumes every category in test also appears in train)
        X_test[col] = le.transform(X_test[col])
print(X_train.head())
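If the test set can contain categories absent from the training set, `le.transform` will raise. One workaround is to fit each encoder on the union of the two columns; this sketch is a drop-in replacement for the loop above (run it instead of, not after, that loop, since the columns are no longer object-typed once encoded):
In [ ]:
# Fit each LabelEncoder on the union of train and test categories,
# so unseen test labels cannot raise at transform time
for col in X_train.columns.values:
    if X_train[col].dtypes == 'object':
        le = LabelEncoder()
        le.fit(pd.concat([X_train[col], X_test[col]]))
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])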
In [24]:
X_train_scale = scale(X_train)
X_test_scale = scale(X_test)
log = LogisticRegression(penalty='l2', C=.01)
log.fit(X_train_scale, Y_train['Target'])
print(accuracy_score(Y_test['Target'], log.predict(X_test_scale)))
In [27]:
# One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)  # use sparse=False on scikit-learn < 1.2
X_train_scale = scale(X_train)
X_test_scale = scale(X_test)
# Before one-hot encoding
log = LogisticRegression(penalty='l2', C=1)
log.fit(X_train_scale, Y_train['Target'])
print(accuracy_score(Y_test['Target'], log.predict(X_test_scale)))
print()
X_train_one_hot = X_train
X_test_one_hot = X_test
columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
           'Credit_History', 'Property_Area']
for col in columns:
    # Fit the one-hot encoding on the train data
    temp = ohe.fit_transform(X_train[[col]])
    # Turn the encoded features into a data frame, naming the new columns
    # from the encoder's learned categories so names match the output order
    temp = pd.DataFrame(temp, columns=[col + "_" + str(i) for i in ohe.categories_[0]])
    # For side-by-side concatenation the index values must match,
    # so set the index to the X_train index
    temp = temp.set_index(X_train.index.values)
    # Add the new one-hot encoded variables to the train data frame
    X_train_one_hot = pd.concat([X_train_one_hot, temp], axis=1)
    # Transform the test data with the encoder fitted on train
    # (assumes test categories also appear in train)
    temp = ohe.transform(X_test[[col]])
    temp = pd.DataFrame(temp, columns=[col + "_" + str(i) for i in ohe.categories_[0]])
    temp = temp.set_index(X_test.index.values)
    X_test_one_hot = pd.concat([X_test_one_hot, temp], axis=1)
# After one-hot encoding (the original label-encoded columns are kept alongside the dummies)
X_train_scale = scale(X_train_one_hot)
X_test_scale = scale(X_test_one_hot)
log = LogisticRegression(penalty='l2', C=1)
log.fit(X_train_scale, Y_train['Target'])
print(accuracy_score(Y_test['Target'], log.predict(X_test_scale)))
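The same result can be reached with less bookkeeping via `pd.get_dummies`; this sketch is an alternative to the OneHotEncoder loop above, not part of the original notebook, and reindexes the test dummies against the train columns in case the category sets differ:
In [ ]:
# pd.get_dummies as an alternative to OneHotEncoder; since the frames are
# already label-encoded, pass columns= so the listed columns are expanded
cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
            'Credit_History', 'Property_Area']
train_dummies = pd.get_dummies(X_train, columns=cat_cols)
test_dummies = pd.get_dummies(X_test, columns=cat_cols)
# Align test columns to train, filling categories missing from test with 0
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)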