In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import plotly.offline as py
py.init_notebook_mode()
# special matplotlib argument for improved plots
from matplotlib import rcParams
from time import time
import warnings
warnings.filterwarnings("ignore")
import itertools
In [3]:
import scipy.stats as stats
import sklearn
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
In [4]:
# Load the liver-patient records from a CSV file expected in the working directory.
liver_dataset = pd.read_csv("indian_liver_patient.csv")
# Print the frame's structural summary as a first sanity check of the load.
liver_dataset.info()
In [5]:
# Summary statistics for every column; include='all' asks for all columns
# rather than only the default subset.
liver_dataset.describe(include='all')
Out[5]:
In [6]:
def get_pct_missing(series):
    '''
    Returns the share of null entries in a pandas Series, as a percentage
    rounded to two decimals.

    Returns -1 as a sentinel when the series is None or has no elements.
    '''
    if series is None:
        return -1
    total = series.size
    if total == 0:
        return -1
    null_count = series.isnull().sum()
    return round(100 * (float(null_count) / float(total)), 2)
def get_pct_zero(series):
    '''
    Gets the percentage of zeros in a numeric series (for non-numeric series, returns 0)

    Any numeric dtype counts as numeric (int32, float32, unsigned and nullable
    integer types included), not only float64/int64. Boolean series are treated
    as non-numeric so that False values are not reported as zeros.

    Returns the percentage rounded to two decimals; 0.0 for an empty series
    (there is nothing to count, and dividing by the size would fail).
    '''
    total = series.size
    if total == 0:
        return 0.0
    dtype = series.dtype
    if not pd.api.types.is_numeric_dtype(dtype) or pd.api.types.is_bool_dtype(dtype):
        return 0.0
    # NaN / NA never compare equal to zero, so missing entries are not counted.
    zero_count = (series == 0).sum()
    return round(100 * (float(zero_count) / float(total)), 2)
def missing_value_summary(dataset):
    '''
    Builds a per-column report for the input dataframe with three percentages:
    'missing' (null entries), 'zeros' (zero entries) and 'non_missing'
    (100 minus the other two).

    Only columns that have a non-zero 'missing' or 'zeros' percentage are kept
    in the returned dataframe, which is indexed by column name.
    '''
    def _column_profile(column):
        # One row of the report: both percentages for a single column.
        return pd.Series({'missing': get_pct_missing(column), 'zeros': get_pct_zero(column)})

    # apply() yields one report column per input column; transpose so that
    # each input column becomes a row.
    summary = dataset.apply(_column_profile, axis=0).T
    summary['non_missing'] = 100 - (summary['missing'] + summary['zeros'])
    has_gaps = (summary.missing != 0) | (summary.zeros != 0)
    return summary.loc[has_gaps]
In [10]:
# Show which columns contain missing or zero values, and in what percentage.
missing_value_summary(liver_dataset)
Out[10]:
In [9]:
# Fill the gaps in the Albumin/Globulin ratio with the mean of that column.
liver_dataset["Albumin_and_Globulin_Ratio"] = liver_dataset["Albumin_and_Globulin_Ratio"].fillna(
    liver_dataset["Albumin_and_Globulin_Ratio"].mean()
)
In [11]:
# Two side-by-side scatter plots on one 16x8 figure with a shared y axis:
# Albumin against Total_Protiens (left) and Direct_Bilirubin against Age (right).
fig, axs = plt.subplots(nrows=1, ncols=2, sharey=True, figsize=(16, 8))
liver_dataset.plot.scatter(x='Total_Protiens', y='Albumin', ax=axs[0])
liver_dataset.plot.scatter(x='Age', y='Direct_Bilirubin', ax=axs[1])
Out[11]:
In [12]:
# Per-gender range of Total_Bilirubin and the number of Disease labels recorded.
liver_dataset.groupby(by="Gender").aggregate(
    {
        'Total_Bilirubin': ['min', 'max'],
        'Disease': 'count',
    }
)
Out[12]:
In [13]:
# Range of Alkaline_Phosphotase within each Disease label.
liver_dataset.groupby(by="Disease").aggregate(
    {
        'Alkaline_Phosphotase': ['min', 'max'],
    }
)
Out[13]:
In [15]:
# Box plot of Alkaline_Phosphotase restricted to the rows whose Disease label is 2.
ax = sns.boxplot(
    x=liver_dataset.loc[liver_dataset['Disease'] == 2, "Alkaline_Phosphotase"]
)
In [17]:
# NOTE(review): the wildcard import hides where names come from and can shadow
# names already defined in this notebook. drawPlots is presumably provided by
# the local `plots` module - confirm, and consider importing it by name.
from plots import *
# Plot Alkaline_Phosphotase split by the Disease label (cast to str for use as
# the grouping key); the return value is assigned to `_` so it is not echoed.
_ = drawPlots(liver_dataset.Alkaline_Phosphotase, by=liver_dataset.Disease.astype(str))
In [18]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
# Non-numeric columns are excluded explicitly: Gender is only one-hot encoded
# further down, and recent pandas versions raise on non-numeric columns in
# corr() instead of silently dropping them.
corr = liver_dataset.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)
Out[18]:
In [26]:
# Model inputs: every column except the Disease label.
features = liver_dataset.drop(['Disease'], axis = 1)
# Gender is one-hot encoded on liver_dataset only in the cells below this one,
# so on a top-to-bottom run the raw Gender column would reach the classifier
# and fitting would fail. Encode it here when it is still present; if the
# encoding cells have already been run, the guard leaves the frame untouched.
if 'Gender' in features.columns:
    features = pd.get_dummies(features, columns=['Gender'], prefix='Gender')
# Model output: the Disease label.
target = liver_dataset['Disease']
In [20]:
# Append one indicator column per Gender value (named with the 'Gender' prefix)
# to the right of the existing columns.
liver_dataset = pd.concat(
    objs=[
        liver_dataset,
        pd.get_dummies(liver_dataset['Gender'], prefix='Gender'),
    ],
    axis='columns',
)
In [23]:
# Remove the original Gender column from the frame.
liver_dataset = liver_dataset.drop('Gender', axis='columns')
In [27]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=5,
)
In [29]:
# Check the size (rows, columns) of the held-out test inputs.
X_test.shape
Out[29]:
In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix

# Random forest of 100 entropy-criterion trees. max_features=0.25 is a
# fraction: each split considers a quarter of the feature columns.
# oob_score=True additionally computes the out-of-bag score during fitting.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest and the metrics reported below are reproducible between runs.
random_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    verbose=0,
    #class_weight='balanced',
    oob_score=True,
    n_jobs=4,
    max_features=0.25,
    random_state=5,
)
In [32]:
# Train the forest on the training split; the value returned by fit() is bound
# back to the same name.
random_forest = random_forest.fit(X_train, y_train)
In [33]:
# The forest is already fitted by the preceding cell, so it is not refitted
# here: a second fit only repeats the training work and replaces the model
# that cell produced.

# Predict Output
rf_predicted = random_forest.predict(X_test)

# Accuracy on the training and test splits, as percentages rounded to two decimals.
random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)

# Test-set report: accuracy, confusion matrix and the per-class
# classification report.
print('Accuracy: \n', accuracy_score(y_test,rf_predicted))
print(confusion_matrix(y_test,rf_predicted))
print(classification_report(y_test,rf_predicted))