In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
num is the target variable. Goal: 77% accuracy for classification. Predictors:

- age: in years
- sex: 0 = female, 1 = male
- cp: chest pain type: 1 = typical angina, 2 = atypical angina, 3 = non-anginal pain, 4 = asymptomatic. Angina is a temporary lack of oxygen-rich blood to the heart.
- trestbps: resting blood pressure (mmHg upon admission to hospital)
- chol: serum cholesterol (mg/dl)
- fbs: fasting blood sugar > 120 mg/dl (1 = true, 0 = false)
- restecg: resting electrocardiographic results: 0 = normal, 1 = ST-T wave abnormality, 2 = probable left ventricular hypertrophy by Estes' criteria
- thalach: maximum heart rate achieved
- exang: exercise-induced angina (1 = yes, 0 = no)
- oldpeak: ST depression induced by exercise relative to rest
- slope: slope of the peak exercise ST segment (1 = upsloping, 2 = flat, 3 = downsloping)
- ca: number of major vessels (0-3)
- thal: 3 = normal, 6 = fixed defect, 7 = reversible defect
- num: presence of heart disease (angiographic disease status). 0 = <50% diameter narrowing (no heart disease); 1, 2, 3, 4 = >50% diameter narrowing (disease present)
In [153]:
data = pd.read_csv('/home/kevin/Downloads/data/heart_disease_processed.cleveland.txt', header=None, na_values='?')
# the source file marks unknown values with '?'; read them in as NaN and drop them later
columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']
data.columns = columns
In [154]:
data.head()
Out[154]:
In [182]:
data['num'] = data['num'].replace([2, 3, 4], 1)  # collapse classes 1-4 into a single "disease present" label
In [183]:
len(data)
Out[183]:
In [184]:
data.dtypes
Out[184]:
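Before dropping rows, it helps to see where the '?' values landed. This quick check was not part of the original run; in the standard Cleveland file only ca and thal should show missing entries.

In [ ]:
data.isna().sum()  # per-column count of the '?' entries read in as NaN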
In [185]:
data = data.dropna()
In [186]:
data.shape
Out[186]:
In [187]:
data.hist()  # quick look at each variable's distribution
plt.show()
In [188]:
from sklearn.model_selection import train_test_split
In [189]:
X = np.asmatrix(data[columns[:-1]])  # feature matrix: all columns except the target
y = list(data[columns[-1]])  # binary target labels
In [190]:
X.shape, len(y)
Out[190]:
In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)
In [192]:
from sklearn.linear_model import LogisticRegression
In [193]:
clf = LogisticRegression()
clf.fit(X_train, y_train)
Out[193]:
In [194]:
pred1 = clf.predict(X_test)
In [195]:
score = sum(pred1 == y_test)/len(y_test)  # fraction of correct predictions, i.e. accuracy
In [196]:
score
Out[196]:
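The same number can be cross-checked with scikit-learn's accuracy_score; this cell is a sketch and was not part of the original run.

In [ ]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred1)  # identical to the manual fraction above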
In [197]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm
In [231]:
parameters = {'kernel': ('linear', 'rbf'), 'C': [.005,.01,.05,.1,1]}
svr = svm.SVC()
clf = GridSearchCV(svr, parameters)
In [232]:
clf.fit(X, y)
Out[232]:
In [233]:
clf.best_params_, clf.best_score_
Out[233]:
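SVMs are sensitive to feature scale, and these columns range from 0/1 flags to cholesterol values in the hundreds. Below is a minimal sketch of a standardized pipeline; pipe, param_grid, and clf_scaled are new names introduced here, and the svc__ prefix routes the grid keys to the SVC step.

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), svm.SVC())  # standardize features, then fit the SVM
param_grid = {'svc__kernel': ('linear', 'rbf'), 'svc__C': [.005, .01, .05, .1, 1]}
clf_scaled = GridSearchCV(pipe, param_grid)
clf_scaled.fit(X, y)
clf_scaled.best_params_, clf_scaled.best_score_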
In [234]:
parameters = {'C': [.1,.25,.5,.75,1,5,10]}
clf = LogisticRegression()
clf = GridSearchCV(clf, parameters)
clf.fit(X,y)
Out[234]:
In [235]:
clf.best_score_, clf.best_params_
Out[235]:
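The grid search's best_score_ is a cross-validated accuracy, so it is a fairer estimate than the single train/test split above. For the per-fold numbers behind that average, cross_val_score can be used; a sketch, not part of the original run:

In [ ]:
from sklearn.model_selection import cross_val_score
cross_val_score(LogisticRegression(C=clf.best_params_['C']), X, y)  # one accuracy per fold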
In [252]:
import xgboost
parameters2 = {'colsample_bytree': 0.5,
'learning_rate': 0.01,
'max_depth': 2,
'n_estimators': 750,
'reg_alpha': 0.001,
'reg_lambda': 2}
clf = xgboost.XGBClassifier(**parameters2)
#parameters = {'max_depth':[2,3,4], 'learning_rate':[.001,.005, .01,.05,.1], 'n_estimators':[500,750, 1000], 'reg_lambda': [1,2,5,10], 'reg_alpha': [0, .001], 'colsample_bytree': [.5, 1]}
#clf = GridSearchCV(clf, parameters)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)  # predictions on the held-out test set
In [253]:
score = sum(pred == y_test)/len(y_test)  # score the XGBoost predictions, not the earlier logistic-regression ones
print(score)
In [254]:
xgboost.plot_importance(clf)
Out[254]:
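Because X was built as a bare numpy matrix, the importance plot labels the features f0 through f12. Refitting on the DataFrame itself keeps the real column names in the plot (a sketch; clf_named is a new name, and this relies on xgboost picking feature names up from DataFrame columns):

In [ ]:
clf_named = xgboost.XGBClassifier(**parameters2)
clf_named.fit(data[columns[:-1]], data['num'])  # DataFrame input preserves feature names
xgboost.plot_importance(clf_named)
plt.show()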