In [2]:
import pandas as pd
import numpy as np
In [3]:
df = pd.read_csv('C:/Amit/data.csv')
In [4]:
df = df.dropna()
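A quick check that no missing values remain after the drop (and how many rows survive) can be added here; a minimal sketch:
In [ ]:
# Every column should now report zero missing values
print(df.isnull().sum())
print(df.shape)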
In [5]:
eatingMapping = {'Bad':0,'Poor':1,'Normal':2,'Good':3,'Excellent':4}
In [6]:
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()
df['EatingHabit'] = df['EatingHabit'].map(eatingMapping)
df['BrainHemorrhage'] = number.fit_transform(df['BrainHemorrhage'].astype('str'))
df['BrainTumorPresent'] = number.fit_transform(df['BrainTumorPresent'].astype('str'))
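Because the same LabelEncoder instance is refit for each column, its classes_ attribute only reflects the most recent fit; inspecting it shows which original label received which numeric code. A minimal sketch:
In [ ]:
# classes_ lists the original labels in the order of their encoded values (0, 1, ...)
# after the last fit_transform call, i.e. for BrainTumorPresent here
print(number.classes_)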
In [7]:
# 80% - train set, 20% - test set
dfTrain, dfTest = np.split(df.sample(frac=1), [int(.8*len(df)),])
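An equivalent split can be done with scikit-learn's train_test_split, which can also stratify on the target so both sets keep the same class balance; a hedged sketch (the names dfTrainAlt/dfTestAlt are illustrative, not part of the notebook):
In [ ]:
from sklearn.model_selection import train_test_split
dfTrainAlt, dfTestAlt = train_test_split(df, test_size=0.2, random_state=42,
                                         stratify=df['BrainHemorrhage'])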
In [8]:
xTrain = dfTrain[['BloodPressure','CholesterolLevel','EatingHabit','BrainBloodVesselStrength','BrainTumorPresent']]
yTrain = dfTrain[['BrainHemorrhage']]
xTest = dfTest[['BloodPressure','CholesterolLevel','EatingHabit','BrainBloodVesselStrength','BrainTumorPresent']]
yTest = dfTest[['BrainHemorrhage']]
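Looking at the class balance of the target helps when reading the confusion matrices further down; a minimal check:
In [ ]:
# Counts of each encoded BrainHemorrhage label in the training set
print(yTrain['BrainHemorrhage'].value_counts())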
In [9]:
# Checked the prediction with mean imputation as well; accuracy was almost the same
# Random forest can't handle missing values, hence the dropna() above
# xTrainRF = xTrain.apply(lambda x: x.fillna(x.mean()), axis=0)
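If imputation were preferred over dropping rows, scikit-learn's SimpleImputer (0.20+) does the same mean fill while letting the training-set means be reused on the test set; a sketch under that assumption:
In [ ]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
# Fit the column means on the training features only, then apply them to the test features
xTrainImputed = imputer.fit_transform(xTrain)
xTestImputed = imputer.transform(xTest)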
In [15]:
# n_estimators=20 gave the best results of the values tried
# min_samples_leaf=1 worked best
from sklearn.ensemble import RandomForestClassifier
modelRandomForest = RandomForestClassifier(max_features=xTrain.columns.size, n_jobs=4, min_samples_leaf=1,
                                           n_estimators=20, oob_score=True, random_state=42)
# ravel() passes y as a 1-d array and avoids the column-vector conversion warning
modelRandomForest.fit(xTrain, yTrain.values.ravel())
Out[15]:
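With oob_score=True the fitted model carries an out-of-bag estimate, and a small grid search is one way to confirm that n_estimators=20 and min_samples_leaf=1 were the best of the values tried; a hedged sketch, with the parameter grid chosen purely for illustration:
In [ ]:
from sklearn.model_selection import GridSearchCV
print('OOB score:', modelRandomForest.oob_score_)
grid = GridSearchCV(RandomForestClassifier(random_state=42),
                    param_grid={'n_estimators': [5, 10, 20, 50],
                                'min_samples_leaf': [1, 2, 5]},
                    cv=5)
grid.fit(xTrain, yTrain.values.ravel())
print(grid.best_params_, grid.best_score_)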
In [16]:
pd.crosstab(dfTrain['BrainHemorrhage'], modelRandomForest.predict(xTrain), rownames=['actual'], colnames=['preds'])
Out[16]:
In [ ]:
# Training accuracy: ~87%
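The quoted training accuracy can be reproduced directly with accuracy_score (or modelRandomForest.score); a minimal sketch:
In [ ]:
from sklearn.metrics import accuracy_score
print('Train accuracy:', accuracy_score(yTrain, modelRandomForest.predict(xTrain)))
print('Test accuracy:', accuracy_score(yTest, modelRandomForest.predict(xTest)))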
In [17]:
pd.crosstab(dfTest['BrainHemorrhage'], modelRandomForest.predict(xTest), rownames=['actual'], colnames=['preds'])
Out[17]:
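Since a hemorrhage model is rarely judged on accuracy alone, a classification report adds per-class precision and recall on the test set; a minimal sketch:
In [ ]:
from sklearn.metrics import classification_report
print(classification_report(yTest, modelRandomForest.predict(xTest)))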
In [18]:
names = xTrain.columns
print(sorted(zip(map(lambda x: round(x, 4), modelRandomForest.feature_importances_), names),
             reverse=True))
In [19]:
import matplotlib.pyplot as plt
importances = modelRandomForest.feature_importances_
# Standard deviation of each feature's importance across the individual trees
std = np.std([tree.feature_importances_ for tree in modelRandomForest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
plt.figure()
plt.title("Feature importances")
plt.bar(range(xTrain.shape[1]), importances[indices],
color="r", yerr=std[indices], align="center")
plt.xticks(range(xTrain.shape[1]), xTrain.columns[indices], rotation=45, ha='right')
plt.xlim([-1, xTrain.shape[1]])
plt.show()
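Impurity-based importances can be biased toward features with many distinct values; permutation importance on the held-out test set (available in scikit-learn 0.22+) is a common cross-check; a sketch under that assumption:
In [ ]:
from sklearn.inspection import permutation_importance
result = permutation_importance(modelRandomForest, xTest, yTest.values.ravel(),
                                n_repeats=10, random_state=42)
# Rank features by the mean drop in score when each one is shuffled
for name, imp in sorted(zip(xTest.columns, result.importances_mean), key=lambda t: -t[1]):
    print(name, round(imp, 4))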