In [175]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC
from sklearn.lda import LDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV
# This enables inline Plots
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 2)
In [176]:
# Shell command: show the working directory (sanity check for the relative data path below).
!pwd
In [189]:
#I pull in the data twice. One set I filled using normal distributions for continuous values and histograms/random integers
# for discrete values which I replaced with strings to replicate the distribution. One set I just removed all ? data.
# A1-A15 are anonymized features; A16 is the approval label ('+'/'-').
# NOTE(review): this looks like the UCI Credit Approval (crx) dataset — confirm provenance.
col_names=[ "A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A14", "A15","A16"]
# `credit`: raw load — '?' kept as literal strings, to be filled manually below.
credit = pd.read_csv('.//data/crx.data', names=col_names)
# `credit1`: '?' parsed as NaN so the rows can simply be dropped.
credit1 = pd.read_csv('.//data/crx.data', names=col_names, na_values='?')
In [192]:
# Preview the NaN-parsed frame.
credit1.head()
Out[192]:
In [191]:
# Listwise deletion: drop every row containing any missing value.
# NOTE(review): the pre-drop credit1 is overwritten and unrecoverable after this cell.
credit1=credit1.dropna()
In [85]:
# Preview the manually-filled frame.
credit.head()
Out[85]:
In [9]:
# Column dtypes and non-null counts.
credit.info()
In [10]:
# Inspect the distinct A1 values (includes the '?' placeholder).
credit['A1'].unique()
Out[10]:
In [11]:
credit['A1'].value_counts().plot(kind='bar');
In [12]:
# NOTE(review): np.random.randint(0,2) is evaluated ONCE before replace() runs,
# so every '?' in A1 receives the same 0-or-1 value — per-row draws were likely
# intended; confirm.
credit['A1'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [13]:
# Map the numeric fill values back onto the column's category labels.
credit['A1'].replace(to_replace=0, value='a',inplace=True)
In [14]:
credit['A1'].replace(to_replace=1, value='b',inplace=True)
# randint(0,2) only yields 0 or 1, so this mapping of 2 can never match.
credit['A1'].replace(to_replace=2, value='b',inplace=True)
In [15]:
# Verify no '?' remains in A1.
credit['A1'].unique()
Out[15]:
In [16]:
# Temporarily mark missing A2 values with 0 so the column can be cast to float.
credit['A2'].replace(to_replace='?', value= 0,inplace=True)
In [17]:
credit['A2'].unique()
credit['A2']=list(map(float, credit['A2']))
#credit['A2'].replace(to_replace='?', value=credit['A2'].mean(),inplace=True)
In [18]:
listremove=credit['A2']
In [19]:
data=[]
for i in listremove:
if i != '0':
data.append(i)
mean = sum(data)/len(data)
std=np.std(data)
In [20]:
# Replace the 0 placeholders in A2 with a draw from N(mean, std).
# NOTE(review): np.random.normal(...) is evaluated once, so every placeholder
# receives the SAME draw — per-row draws were likely intended; confirm.
credit['A2'].replace(to_replace=0, value=np.random.normal(mean,std),inplace=True)
In [21]:
# Inspect distinct A4 values (includes '?').
credit['A4'].unique()
Out[21]:
In [21]:
In [22]:
credit['A4'].value_counts().plot(kind='bar');
In [23]:
# NOTE(review): randint(0,2) is evaluated once and yields only 0 or 1, and both
# map to 'u' below — so every '?' in A4 effectively becomes 'u'.
credit['A4'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [24]:
credit['A4'].replace(to_replace=0, value='u' ,inplace=True)
credit['A4'].replace(to_replace=1, value='u' ,inplace=True)
# Dead mapping: randint(0,2) never produces 2.
credit['A4'].replace(to_replace=2, value='y' ,inplace=True)
# Collapse the rare 'l' category into 'y'.
credit['A4'].replace(to_replace='l', value='y' ,inplace=True)
In [25]:
credit['A5'].unique()
Out[25]:
In [26]:
credit['A5'].value_counts().plot(kind='bar');
In [27]:
# Same single-draw fill pattern for A5 ('?' -> 0 or 1 -> 'g').
credit['A5'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [28]:
credit['A5'].replace(to_replace=0, value='g' ,inplace=True)
credit['A5'].replace(to_replace=1, value='g' ,inplace=True)
# Dead mapping: randint(0,2) never produces 2.
credit['A5'].replace(to_replace=2, value='p' ,inplace=True)
In [29]:
credit['A6'].unique()
Out[29]:
In [30]:
credit['A6'].value_counts().plot(kind='bar');
In [31]:
# Fill '?' in A6 with a single 0/1 draw, mapped to 'c' or 'q' below.
credit['A6'].replace(to_replace='?', value=np.random.randint(0,2),inplace=True)
In [32]:
credit['A6'].replace(to_replace=0, value='c' ,inplace=True)
credit['A6'].replace(to_replace=1, value='q' ,inplace=True)
# Dead mapping: randint(0,2) never produces 2.
credit['A6'].replace(to_replace=2, value='w' ,inplace=True)
In [33]:
# Re-plot A6 to confirm the fill did not distort the distribution.
credit['A6'].value_counts().plot(kind='bar');
In [34]:
credit['A7'].unique()
Out[34]:
In [35]:
credit['A7'].value_counts().plot(kind='bar');
In [36]:
# Fill '?' in A7 with a single draw from 0..6 (upper bound 7 is exclusive).
credit['A7'].replace(to_replace='?', value=np.random.randint(0,7),inplace=True)
In [37]:
credit['A7'].replace(to_replace=0, value='v' ,inplace=True)
credit['A7'].replace(to_replace=1, value='v' ,inplace=True)
credit['A7'].replace(to_replace=2, value='v' ,inplace=True)
credit['A7'].replace(to_replace=3, value='h' ,inplace=True)
credit['A7'].replace(to_replace=4, value='h' ,inplace=True)
credit['A7'].replace(to_replace=5, value='bb' ,inplace=True)
credit['A7'].replace(to_replace=6, value='ff' ,inplace=True)
# Dead mapping: randint(0,7) never produces 7, so 'z' is never assigned here.
credit['A7'].replace(to_replace=7, value='z' ,inplace=True)
In [38]:
# Spot-check the remaining columns for '?' placeholders.
credit['A8'].unique()
#no missing values
Out[38]:
In [39]:
credit['A9'].unique()
Out[39]:
In [40]:
credit['A10'].unique()
Out[40]:
In [41]:
credit['A11'].unique()
Out[41]:
In [42]:
credit['A12'].unique()
Out[42]:
In [43]:
credit['A13'].unique()
Out[43]:
In [44]:
credit['A14'].unique()
Out[44]:
In [45]:
credit['A15'].unique()
Out[45]:
In [46]:
# Check the element type of A15.
type(credit['A15'][1])
Out[46]:
In [47]:
# A16 is the label column: '+' / '-'.
credit['A16'].unique()
Out[47]:
In [48]:
# Copy the raw label so A16 is preserved alongside the numeric target.
credit['Approval']=credit['A16']
In [49]:
# Binary target: '+' (approved) -> 1, '-' (denied) -> 0.
credit['Approval']=credit['Approval'].replace('+',1)
credit['Approval']=credit['Approval'].replace('-',0)
credit['Approval']
Out[49]:
In [194]:
# Same target encoding for the dropna frame.
credit1['Approval']=credit1['A16']
credit1['Approval']=credit1['Approval'].replace('+',1)
credit1['Approval']=credit1['Approval'].replace('-',0)
credit1['Approval']
Out[194]:
In [50]:
#Col A1 significant? Slightly
# Per-category approval rate: with Approval in {0,1}, sum/count = fraction approved.
approved_by_a1 = credit.groupby('A1').Approval.agg(['sum', 'count'])
approved_by_a1['creditrating'] = approved_by_a1['sum'] / approved_by_a1['count']
approved_by_a1
Out[50]:
In [51]:
counta=0
for i in credit['A16']:
if i=='+':
counta = counta+1
counta
Out[51]:
In [52]:
countb=0
for i in credit['A16']:
if i=='-':
countb = countb+1
countb
Out[52]:
In [53]:
#Col A4 significant? Yes
# Approval rate per A4 category (sum of 1s / count = fraction approved).
approved_by_a4 = credit.groupby('A4').Approval.agg(['sum', 'count'])
approved_by_a4['creditrating'] = approved_by_a4['sum'] / approved_by_a4['count']
approved_by_a4
Out[53]:
In [54]:
# Approval rate per A5 category.
approved_by_a5 = credit.groupby('A5').Approval.agg(['sum', 'count'])
approved_by_a5['creditrating'] = approved_by_a5['sum'] / approved_by_a5['count']
approved_by_a5
Out[54]:
In [55]:
# Approval rate per A6 category.
approved_by_a6 = credit.groupby('A6').Approval.agg(['sum', 'count'])
approved_by_a6['creditrating'] = approved_by_a6['sum'] / approved_by_a6['count']
approved_by_a6
Out[55]:
In [ ]:
# I start fitting and scoring my data to several models below including logistic regression, LDA, nearest neighbor, SVC,
#and Linear SVC. I found logistic regression to return the best score with a grid search on the parameter C.
#My confusion matrix shows I predicted the disapproval very well and approval moderately well.
#When I recheck my model using a dataset that threw away all of the ? data rather than filling it, my model does even better.
#This makes me question whether the idea to fill data is a good idea or not.
In [56]:
from sklearn.cross_validation import train_test_split
In [57]:
# Feature matrix: every column except the A16 label and A14.
# NOTE(review): A14 is excluded here — confirm whether that was intentional.
X_data = credit[["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A15"]]
In [58]:
# One-hot encode the categorical columns.
X_data = pd.get_dummies(X_data)
In [119]:
y_data=credit["Approval"]
In [120]:
# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=12, test_size=0.2)
In [195]:
# Same feature set and encoding for the dropna version of the data.
X_data1 = credit1[["A1", "A2", "A3", "A4", "A5", "A6", "A7", "A8", "A9", "A10", "A11", "A12", "A13", "A15"]]
X_data1 = pd.get_dummies(X_data1)
In [197]:
y_data1=credit1["Approval"]
In [198]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_data1, y_data1, random_state=12, test_size=0.2)
In [121]:
from sklearn.linear_model import LogisticRegression
In [122]:
# Baseline logistic regression with a hand-picked regularization strength.
clf = LogisticRegression(C=1.5)
In [123]:
clf.fit(X_train, y_train)
Out[123]:
In [124]:
#looks like logistic regression is the best
# Accuracy on the held-out 20%.
clf.score(X_test, y_test)
Out[124]:
In [155]:
# Grid-search C over 100 log-spaced values between 1e-3 and 1e3.
param={'C':np.logspace(-3,3,100)}
lin_grid = GridSearchCV(LogisticRegression(),param)
lin_grid.fit(X_train,y_train)
Out[155]:
In [156]:
lin_grid.score(X_test, y_test)
Out[156]:
In [200]:
# Same grid search on the dropna dataset for comparison.
param={'C':np.logspace(-3,3,100)}
lin_grid1 = GridSearchCV(LogisticRegression(),param)
lin_grid1.fit(X_train1,y_train1)
Out[200]:
In [201]:
lin_grid1.score(X_test1, y_test1)
Out[201]:
In [125]:
from sklearn.metrics import confusion_matrix, classification_report
In [126]:
# Hard class predictions from the baseline logistic regression.
y_pred = clf.predict(X_test)
In [157]:
confusion_matrix(y_test, y_pred)
Out[157]:
In [166]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [128]:
print classification_report(y_test, y_pred)
In [207]:
# Predictions from the grid-searched logistic regression (filled dataset).
y_pred2 = lin_grid.predict(X_test)
In [208]:
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred2)
print(cm)
# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [202]:
# Predictions from the grid-searched model trained on the dropna dataset.
y_pred1 = lin_grid1.predict(X_test1)
In [204]:
# Compute confusion matrix
cm = confusion_matrix(y_test1, y_pred1)
print(cm)
# Show confusion matrix in a separate window
plt.matshow(cm)
plt.title('Confusion matrix')
plt.colorbar()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
In [129]:
pd.DataFrame(zip(X_data.columns, np.transpose(clf.coef_)))
Out[129]:
In [158]:
y_pred_df = pd.DataFrame(clf.predict_proba(X_test))
y_pred_df.rename(columns={0: 'No', 1: 'Yes'}, inplace=True)
y_pred_df['y_pred'] = y_pred
y_pred_df['y_true'] = y_test
y_pred_df.head()
Out[158]:
In [130]:
In [131]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train,y_train)
Out[131]:
In [132]:
svc.score(X_test,y_test)
#not as good as logistic regression
Out[132]:
In [133]:
linest=LinearSVC(C=1e-3)
linest.fit(X_train, y_train)
Out[133]:
In [134]:
#not as good as logistic regression
linest.score(X_train, y_train)
Out[134]:
In [135]:
lda = LDA()
lda.fit(X_train, y_train)
Out[135]:
In [136]:
lda.score(X_test, y_test)
#linear discriminant analysis not quite as good as logistic regression
Out[136]:
In [137]:
# 1-nearest-neighbor classifier.
neighbor = KNeighborsClassifier(n_neighbors=1)
neighbor.fit(X_train, y_train)
Out[137]:
In [138]:
neighbor.score(X_test, y_test)
#nearest neighbor not as good as logistic regression
Out[138]:
In [139]:
# Sanity-check the split shapes.
X_train.shape
Out[139]:
In [140]:
y_train.shape
Out[140]:
In [141]:
# Grid-search the RBF SVM over C and gamma (10x10 log-spaced grid).
param={'C':np.logspace(-3,3,10), 'gamma':np.logspace(-3,3,10)}
svc_grid = GridSearchCV(SVC(),param)
svc_grid.fit(X_train,y_train)
Out[141]:
In [142]:
svc_grid.score(X_test,y_test)
Out[142]:
In [162]:
# Grid-search LinearSVC over 500 log-spaced C values.
param={'C':np.logspace(-3,3,500)}
linsvc_grid = GridSearchCV(LinearSVC(),param)
linsvc_grid.fit(X_train,y_train)
Out[162]:
In [160]:
linsvc_grid.score(X_test,y_test)
Out[160]:
In [82]:
In [ ]: