In [501]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
import matplotlib.pyplot as plt
from sklearn import datasets
import seaborn as sns
# This enables inline Plots
%matplotlib inline
#Limit rows
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 2)
In [505]:
#Read the file into a dataframe
credit_df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',header=None,names=["A1","A2","A3","A4","A5","A6","A7","A8","A9","A10","A11","A12","A13","A14","A15","A16"])
In [506]:
#Understanding the columns and the data
credit_df.info()
In [507]:
# Print the unique values of each column to identify the missing-value marker: it is "?"
for col in range(credit_df.shape[1]):
    print(credit_df.iloc[:, col].unique())
In [508]:
#Replace '?' with Null(Nan) for all columns
credit_df.replace('?',np.nan,inplace=True)
In [509]:
credit_df.info()
In [510]:
#Selecting rows with null values
credit_df[credit_df.isnull().any(axis=1)]
#From the output, we see there are 6 rows where A4, A5, A6, and A7 are null, and 2 rows where A6 and A7 are null.
#They can be removed from the data set.
Out[510]:
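Before dropping, a per-column count of missing values makes the pattern explicit; a minimal check, assuming it runs after the '?'-to-NaN replacement above:
# Number of NaN entries in each column
credit_df.isnull().sum()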
In [511]:
# drop rows where A4 is null
credit_df.dropna(how='any',subset = ['A4'],inplace=True)
In [512]:
# drop rows where A6 is null
credit_df.dropna(how='any',subset = ['A6'],inplace=True)
In [513]:
credit_df.info()
In [514]:
#A2 and A14 are numerical values; impute values for their null rows.
#Cast them to float first so we can check their distributions.
credit_df['A2'] = credit_df.A2.astype(float)
credit_df['A14'] = credit_df.A14.astype(float)
In [515]:
#Plot A2 on a histogram
credit_df.A2.hist(bins=20)
Out[515]:
In [516]:
# Get the Mean and STD of ALL Data
print('Mean Age:', credit_df.A2.mean())
print('Std Age:', credit_df.A2.std())
In [517]:
# Create a Normal Distribution centered on Mean of 31.5 and Standard Dev of 11.8
# Get 12 Entries since that's how many missing entries we have for A2
#function is defined so that we can use the same steps on both the training and test data
def get_a2_impute_values(n):
    return np.random.normal(31.5, 11.8, n)
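A variant of the same idea, sketched as an alternative: deriving the parameters from the column itself and seeding the generator keeps the imputation reproducible. The hardcoded 31.5 and 11.8 above come from the printed mean and std; the function name and seed below are assumptions, not part of the original analysis:
def get_a2_impute_values_seeded(series, n, seed=0):
    # Draw n values from a normal centered on the observed mean/std;
    # pandas mean()/std() skip NaN, and the fixed seed makes reruns repeatable
    rng = np.random.RandomState(seed)
    return rng.normal(series.mean(), series.std(), n)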
In [518]:
# When we look for entries that are null we can create a mask or boolean filter
a2_missing_mask = credit_df.A2.isnull()
In [519]:
# Notice that we have 12 rows with a missing A2 field
credit_df[a2_missing_mask]
Out[519]:
In [520]:
# Now we can set these values to the values we picked from Random Normal Distribution
credit_df.loc[credit_df.A2.isnull(), 'A2'] = get_a2_impute_values(n=12)
In [521]:
#A2 does not have missing values anymore
credit_df.info()
In [522]:
#Imputing Values for A14
credit_df.A14.hist(bins=100)
#See the histogram below
Out[522]:
In [523]:
#From the graph above, many values of A14 are 0. Hence we can impute the null values to 0.
credit_df.A14[credit_df.A14==0].count()
Out[523]:
In [524]:
#Imputing the null A14 values to 0
credit_df.loc[credit_df.A14.isnull(), 'A14'] = 0
In [525]:
#Check that the null values are now 0
credit_df.A14[credit_df.A14==0].count()
Out[525]:
In [526]:
#Only A1 still has null values (10 of them)
#Printing the distribution of a's and b's in the dataframe
credit_df.groupby(["A1"]).count()["A2"]
Out[526]:
In [527]:
np.random.choice(['a','b'],10,p=[0.3,0.7])
Out[527]:
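The p=[0.3, 0.7] above is eyeballed from the groupby counts; a sketch that derives the probabilities from the observed distribution instead, assuming we want the imputed values to match the empirical 'a'/'b' split:
# value_counts(normalize=True) drops NaN and returns proportions summing to 1
a1_probs = credit_df.A1.value_counts(normalize=True)
np.random.choice(a1_probs.index.values, 10, p=a1_probs.values)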
In [528]:
# Create a function to return a random value for A1 as it is a categorical variable and not numerical
# Get 10 Entries since that's how many missing entries we have for A1
#function is defined so that we can use the same steps on both the training and test data
def get_a1_impute_values(n):
    return np.random.choice(['a','b'], n, p=[0.3, 0.7])
In [529]:
# When we look for entries that are null we can create a mask or boolean filter
a1_missing_mask = credit_df.A1.isnull()
In [530]:
# Now we can set these values to draws from the observed categorical distribution
credit_df.loc[credit_df.A1.isnull(), 'A1'] = get_a1_impute_values(n=10)
In [531]:
credit_df.info()
In [293]:
#Finalize the variables required to predict, and check each for significance.
#Before that, change the response y, which is A16 in our case, to 0's and 1's.
#Hence add a new column that acts as the new A16, holding 0's and 1's.
In [532]:
credit_df["cardapproval"] = credit_df.A16.replace(['+','-'],[1,0])
In [533]:
#Significance of A1
approved_by_a1 = credit_df.groupby('A1').cardapproval.agg(['sum', 'count'])
approved_by_a1['approved_rate'] = approved_by_a1['sum'] / approved_by_a1['count']
approved_by_a1
#The rates are not very different. A1 is not a significant regressor. We can remove it from our model.
Out[533]:
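"Not very different" can be checked formally with a chi-square test of independence; a minimal sketch, assuming scipy is available (a large p-value supports dropping A1):
from scipy.stats import chi2_contingency
observed = pd.crosstab(credit_df.A1, credit_df.cardapproval)
chi2, p_value, dof, expected = chi2_contingency(observed)
print('chi2 = %.2f, p = %.3f' % (chi2, p_value))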
In [534]:
#Significance of A4
approved_by_a4 = credit_df.groupby('A4').cardapproval.agg(['sum', 'count'])
approved_by_a4['approved_rate'] = approved_by_a4['sum'] / approved_by_a4['count']
approved_by_a4
#A4 looks significant: the approval rate differs across its values
Out[534]:
In [535]:
#Significance of A5
approved_by_a5 = credit_df.groupby('A5').cardapproval.agg(['sum', 'count'])
approved_by_a5['approved_rate'] = approved_by_a5['sum'] / approved_by_a5['count']
approved_by_a5
Out[535]:
In [536]:
#The two tables above suggest that A4 and A5 are correlated.
credit_df.groupby(['A4'])['A5'].count()
Out[536]:
In [537]:
#Concatenate A4 and A5 and group by the combined value: if each A4 value pairs
#with exactly one A5 value, the two columns are redundant.
credit_df["A4A5"] = credit_df.A4 + credit_df.A5
credit_df.groupby(['A4A5'])['A5'].count()
#They are perfectly paired, so we can drop A5 from our model.
Out[537]:
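A crosstab shows the same pairing without the concatenation trick; a sketch over the same data:
# Each A4 value should map to exactly one A5 value if the two are redundant
pd.crosstab(credit_df.A4, credit_df.A5)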
In [538]:
#Significance of A6
approved_by_a6 = credit_df.groupby('A6').cardapproval.agg(['sum', 'count'])
approved_by_a6['approved_rate'] = approved_by_a6['sum'] / approved_by_a6['count']
approved_by_a6
#A6 looks significant: the approval rate differs across its values
Out[538]:
In [539]:
#Significance of A7
approved_by_a7 = credit_df.groupby('A7').cardapproval.agg(['sum', 'count'])
approved_by_a7['approved_rate'] = approved_by_a7['sum'] / approved_by_a7['count']
approved_by_a7
#A7 looks significant: the approval rate differs across its values
Out[539]:
In [540]:
#Significance of A9
approved_by_a9 = credit_df.groupby('A9').cardapproval.agg(['sum', 'count'])
approved_by_a9['approved_rate'] = approved_by_a9['sum'] / approved_by_a9['count']
approved_by_a9
#A9 looks significant: the approval rate differs across its values
Out[540]:
In [541]:
#Significance of A10
approved_by_a10 = credit_df.groupby('A10').cardapproval.agg(['sum', 'count'])
approved_by_a10['approved_rate'] = approved_by_a10['sum'] / approved_by_a10['count']
approved_by_a10
#A10 looks significant: the approval rate differs across its values
Out[541]:
In [542]:
#Significance of A12
approved_by_a12 = credit_df.groupby('A12').cardapproval.agg(['sum', 'count'])
approved_by_a12['approved_rate'] = approved_by_a12['sum'] / approved_by_a12['count']
approved_by_a12
#The rates are not very different. A12 is not a significant regressor. We can remove it from our model.
Out[542]:
In [543]:
#Significance of A13
approved_by_a13 = credit_df.groupby('A13').cardapproval.agg(['sum', 'count'])
approved_by_a13['approved_rate'] = approved_by_a13['sum'] / approved_by_a13['count']
approved_by_a13
#A13 looks significant as well
Out[543]:
In [544]:
#Is there any correlation between the continuous-valued attributes?
credit_df.corr()
#From the table below, there does not seem to be much correlation among the X's
Out[544]:
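Since seaborn is already imported, a heatmap gives a quick visual check of the same correlation matrix; a minimal sketch (numeric_only=True is needed on newer pandas, which no longer silently drops the object columns):
# annot prints the correlation value inside each cell
sns.heatmap(credit_df.corr(numeric_only=True), annot=True, cmap='coolwarm')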
In [545]:
#Box plot of A15 to check the spread of its values
credit_df.boxplot(['A15'])
Out[545]:
In [546]:
#A8, A11, and A15 seem to have a lot of 0 values. Check the counts.
print(credit_df.A8[credit_df.A8 == 0].count())
#A8 is significant, as only 63 rows have 0 values
print(credit_df.A11[credit_df.A11 == 0].count())
#We could exclude A11, but we will keep it for now.
print(credit_df.A15[credit_df.A15 == 0].count())
#We could exclude A15, but we will keep it for now.
In [547]:
X_data = credit_df.drop(["A5","A16","cardapproval","A4A5"],axis=1)
#Experimented with dropping A1, A11, A12, and A15; removing them does not change the score much
In [548]:
X_data.info()
In [549]:
X_data = pd.get_dummies(X_data)
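One caveat with get_dummies: if the train and test sets were encoded separately, their columns could disagree whenever a category is missing from one side. Here the split happens after encoding, so it is safe; a sketch of the alignment step you would need otherwise (X_train_raw and X_test_raw are hypothetical frames, not variables from this notebook):
train_enc = pd.get_dummies(X_train_raw)
test_enc = pd.get_dummies(X_test_raw)
# reindex forces the test columns to match training, filling unseen categories with 0
test_enc = test_enc.reindex(columns=train_enc.columns, fill_value=0)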
In [550]:
y_data = credit_df["cardapproval"]
In [551]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=12, test_size=0.2)
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)
Out[551]:
In [552]:
#Confusion Matrix for the logistic regression model
from sklearn.metrics import confusion_matrix, classification_report
y_pred = clf.predict(X_test)
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)
Out[552]:
In [555]:
#Interpreting the confusion matrix
def confusion_matrix_df(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    cm = pd.DataFrame(data=cm, columns=[0, 1], index=[0, 1])
    cm.columns.name = 'Predicted label'
    cm.index.name = 'True label'
    error_rate = (y_pred != y_test).mean()
    print('error rate: %.2f' % error_rate)
    return cm
confusion_matrix_df(y_test, y_pred)
Out[555]:
In [556]:
# Examine Precision and Recall
print(classification_report(y_test, y_pred))
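The report's precision and recall can be recomputed by hand from the confusion matrix, which makes the definitions concrete; a sketch for the positive class (sklearn's 2x2 matrix unravels as tn, fp, fn, tp):
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
precision = tp / float(tp + fp)  # of predicted approvals, how many were real
recall = tp / float(tp + fn)     # of real approvals, how many we caught
print('precision = %.2f, recall = %.2f' % (precision, recall))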
In [449]:
# Let's take a look at Predicted Probabilities
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test.values  # .values avoids index misalignment with the 0..n-1 index of y_pred_df
y_pred_df.tail()
Out[449]:
In [450]:
# Take a look at Predicted vs True Values
sns.regplot(x=y_pred, y=y_test, x_jitter=0.1, y_jitter=0.1)
Out[450]:
Next, fit a linear SVM: sklearn.svm.LinearSVC
In [558]:
#Initialize a linear SVM with the default parameters (C=1.0)
est = LinearSVC()
In [559]:
est.fit(X_train,y_train)
Out[559]:
In [560]:
est.score(X_test,y_test)
Out[560]:
Tune the regularization parameter C using np.logspace(-3., 3., 10).
Read the example for grid search: sklearn.model_selection.GridSearchCV.
In [562]:
gs = GridSearchCV(LinearSVC(),{'C':np.logspace(-3,3,10)})
In [563]:
gs.fit(X_train,y_train)
Out[563]:
In [564]:
gs.best_params_,gs.best_score_
Out[564]:
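Rather than copying the best C by hand into the next cell, gs.best_estimator_ already holds a model refit on the full training set (GridSearchCV refits by default); a one-line alternative:
gs.best_estimator_.score(X_test, y_test)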
In [565]:
#Refit with the best C found by the grid search above
est = LinearSVC(C=0.021544346900318832)
In [566]:
est.fit(X_train,y_train)
est.score(X_test,y_test)
Out[566]:
In [567]:
#Confusion Matrix for the linear SVM model
from sklearn.metrics import confusion_matrix, classification_report
y_pred = est.predict(X_test)
# Confusion Matrix for Type 1 and Type 2 Error
confusion_matrix(y_test, y_pred)
Out[567]:
In [568]:
#Interpreting the confusion matrix, reusing the helper defined above
confusion_matrix_df(y_test, y_pred)
Out[568]:
For the non-linear kernel you will use sklearn.svm.SVC.
First, implement an SVM classifier using the defaults and fit it to our data:
In [569]:
svc1 = SVC()
In [570]:
svc1.fit(X_train,y_train)
Out[570]:
In [571]:
gs = GridSearchCV(SVC(),{'C':np.logspace(-3,3,10)})
In [572]:
gs.fit(X_train,y_train)
Out[572]:
In [573]:
gs.best_params_,gs.best_score_
Out[573]:
In [574]:
svc1 = SVC(C=10)
svc1.fit(X_train,y_train)
svc1.score(X_test,y_test)
Out[574]:
In [579]:
#The scores from the 3 models are as follows
print("Logistic Regression:")
print(clf.score(X_test, y_test))
print("Linear SVM:")
print(est.score(X_test, y_test))
print("Non-linear kernel SVM:")
print(svc1.score(X_test, y_test))
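A single held-out split can be noisy; a hedged sketch comparing the three models with 5-fold cross-validation on the training data instead (cross_val_score clones each estimator, so the fitted models can be passed directly):
from sklearn.model_selection import cross_val_score
for name, model in [('Logistic Regression', clf), ('Linear SVM', est), ('Kernel SVM', svc1)]:
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print('%s: %.3f +/- %.3f' % (name, scores.mean(), scores.std()))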
In [580]:
#The highest score is for logistic regression, and its confusion matrix (run above) looks good.
#Hence the best model for this data set is logistic regression.
In [ ]: