In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

import plotly.offline as py
py.init_notebook_mode()

# special matplotlib argument for improved plots
from matplotlib import rcParams
from time import time
import warnings
warnings.filterwarnings("ignore")

import itertools



In [3]:
import scipy.stats as stats
import sklearn
import statsmodels.api as sm

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # replaces sklearn.preprocessing.Imputer, removed in scikit-learn 0.22
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

In [4]:
liver_dataset = pd.read_csv("indian_liver_patient.csv")
liver_dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
Age                           583 non-null int64
Gender                        583 non-null object
Total_Bilirubin               583 non-null float64
Direct_Bilirubin              583 non-null float64
Alkaline_Phosphotase          583 non-null int64
Alamine_Aminotransferase      583 non-null int64
Aspartate_Aminotransferase    583 non-null int64
Total_Protiens                583 non-null float64
Albumin                       583 non-null float64
Albumin_and_Globulin_Ratio    579 non-null float64
Disease                       583 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB

In [5]:
liver_dataset.describe(include='all')


Out[5]:
Age Gender Total_Bilirubin Direct_Bilirubin Alkaline_Phosphotase Alamine_Aminotransferase Aspartate_Aminotransferase Total_Protiens Albumin Albumin_and_Globulin_Ratio Disease
count 583.000000 583 583.000000 583.000000 583.000000 583.000000 583.000000 583.000000 583.000000 579.000000 583.000000
unique NaN 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN
top NaN Male NaN NaN NaN NaN NaN NaN NaN NaN NaN
freq NaN 441 NaN NaN NaN NaN NaN NaN NaN NaN NaN
mean 44.746141 NaN 3.298799 1.486106 290.576329 80.713551 109.910806 6.483190 3.141852 0.947064 1.286449
std 16.189833 NaN 6.209522 2.808498 242.937989 182.620356 288.918529 1.085451 0.795519 0.319592 0.452490
min 4.000000 NaN 0.400000 0.100000 63.000000 10.000000 10.000000 2.700000 0.900000 0.300000 1.000000
25% 33.000000 NaN 0.800000 0.200000 175.500000 23.000000 25.000000 5.800000 2.600000 0.700000 1.000000
50% 45.000000 NaN 1.000000 0.300000 208.000000 35.000000 42.000000 6.600000 3.100000 0.930000 1.000000
75% 58.000000 NaN 2.600000 1.300000 298.000000 60.500000 87.000000 7.200000 3.800000 1.100000 2.000000
max 90.000000 NaN 75.000000 19.700000 2110.000000 2000.000000 4929.000000 9.600000 5.500000 2.800000 2.000000

In [6]:
def get_pct_missing(series):
    '''
    Gets the percentage of missing values in a pandas Series
    (returns -1 for an empty or absent series)
    '''
    if series is None or series.size == 0:
        return -1
    num = series.isnull().sum()
    den = series.size
    return round(100 * (float(num) / float(den)), 2)

def get_pct_zero(series):
    '''
    Gets the percentage of zeros in a numeric Series
    (returns 0.0 for non-numeric series)
    '''
    if series.dtype.name in ('float64', 'int64'):
        num = series[series == 0.0].size
        den = series.size
        return round(100 * (float(num) / float(den)), 2)
    else:
        return 0.0

def missing_value_summary(dataset):
    '''
    Summarizes the percentage of missing, zero and non-missing values for each
    column of the input dataframe; columns with neither missing values nor
    zeros are dropped from the returned summary
    '''
    missingSummary = dataset.apply(
        lambda x: pd.Series({'missing': get_pct_missing(x), 'zeros': get_pct_zero(x)}),
        axis=0).transpose()
    missingSummary['non_missing'] = 100 - (missingSummary['missing'] + missingSummary['zeros'])
    missingSummary = missingSummary.loc[(missingSummary.missing != 0) | (missingSummary.zeros != 0)]
    return missingSummary
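
Since the real summary below comes back empty once the ratio column is filled in, here is what the helpers report on a tiny hypothetical frame that actually contains a missing value and a zero:

# Hypothetical 4-row frame: column 'a' has one NaN (25%) and one zero (25%);
# column 'b' is clean and is therefore filtered out of the summary
demo = pd.DataFrame({'a': [1.0, np.nan, 0.0, 2.0], 'b': ['x', 'y', 'z', 'w']})
missing_value_summary(demo)
#    missing  zeros  non_missing
# a     25.0   25.0         50.0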

In [9]:
# Fill the 4 missing ratio values with the column mean
liver_dataset["Albumin_and_Globulin_Ratio"] = liver_dataset["Albumin_and_Globulin_Ratio"].fillna(liver_dataset["Albumin_and_Globulin_Ratio"].mean())

In [10]:
missing_value_summary(liver_dataset)


Out[10]:
missing zeros non_missing

The summary is empty after the fill: no column has missing values left, and none of the original columns contains zeros.

In [11]:
# figsize must go on plt.subplots: pandas ignores it when an existing ax is passed
fig, axs = plt.subplots(1, 2, sharey=True, figsize=(16, 8))
liver_dataset.plot(kind='scatter', x='Total_Protiens', y='Albumin', ax=axs[0])
liver_dataset.plot(kind='scatter', x='Age', y='Direct_Bilirubin', ax=axs[1])


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x115e4f510>

In [12]:
liver_dataset.groupby("Gender").agg({
    'Total_Bilirubin': ['min', 'max'], 
    'Disease': 'count'})


Out[12]:
Total_Bilirubin Disease
min max count
Gender
Female 0.5 27.7 142
Male 0.4 75.0 441
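
The male/female split above is quite lopsided (441 vs 142), which raises the question of how the target distributes within each gender; pd.crosstab answers that in one line (normalize='index' assumes pandas >= 0.18.1, and the cell must run before Gender is dummy-encoded further down):

# Share of each Disease label within each gender (sketch)
pd.crosstab(liver_dataset['Gender'], liver_dataset['Disease'], normalize='index')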

In [13]:
liver_dataset.groupby("Disease").agg(
    {'Alkaline_Phosphotase': ['min', 'max']})


Out[13]:
Alkaline_Phosphotase
min max
Disease
1 63 2110
2 90 1580

In [15]:
# Alkaline_Phosphotase distribution among records labelled Disease == 2
ax = sns.boxplot(x=liver_dataset[liver_dataset['Disease'] == 2]["Alkaline_Phosphotase"])



In [17]:
from plots import drawPlots  # local helper module alongside the notebook
_ = drawPlots(liver_dataset.Alkaline_Phosphotase, by=liver_dataset.Disease.astype(str))



In [18]:
corr = liver_dataset.corr()
sns.heatmap(corr, annot=True)


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x108e4d890>
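
To read the heatmap numerically, the feature-target correlations can be pulled straight out of corr:

# Correlation of each feature with the target, strongest first (sketch)
corr['Disease'].drop('Disease').abs().sort_values(ascending=False)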

In [26]:
features = liver_dataset.drop(['Disease'], axis = 1)
target = liver_dataset['Disease']

In [20]:
liver_dataset = pd.concat([liver_dataset,
                           pd.get_dummies(liver_dataset['Gender'], 
                                          prefix = 'Gender')], axis=1)

In [23]:
liver_dataset = liver_dataset.drop(['Gender'], axis=1)
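
One caveat on the encoding above: Gender_Female and Gender_Male are perfectly collinear (each is 1 minus the other). A tree-based model tolerates that, but for the linear models imported earlier a single indicator is cleaner, and drop_first does it. Shown on a toy series, since the Gender column has already been dropped at this point:

# Sketch: drop_first keeps one indicator instead of two redundant dummies
gender = pd.Series(['Male', 'Female', 'Male'])
pd.get_dummies(gender, prefix='Gender', drop_first=True)
#    Gender_Male
# 0            1
# 1            0
# 2            1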

In [27]:
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size = 0.25, random_state = 5)

In [29]:
X_test.shape


Out[29]:
(146, 11)
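
The classes are imbalanced (roughly 71% label 1), so a stratified split would keep that ratio stable across train and test; a sketch with the same parameters:

# Sketch: stratified variant of the split above
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=5, stratify=target)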

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

random_forest = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       verbose=0,
                                       # class_weight='balanced',
                                       oob_score=True,  # enables the out-of-bag accuracy estimate
                                       n_jobs=4,
                                       max_features=0.25)

In [33]:
random_forest.fit(X_train, y_train)
# Predict on the held-out test set
rf_predicted = random_forest.predict(X_test)

random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)
print('Accuracy:', accuracy_score(y_test, rf_predicted))
print(confusion_matrix(y_test, rf_predicted))
print(classification_report(y_test, rf_predicted))


Accuracy: 0.71232876712328763
[[87 15]
 [27 17]]
             precision    recall  f1-score   support

          1       0.76      0.85      0.81       102
          2       0.53      0.39      0.45        44

avg / total       0.69      0.71      0.70       146
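
Two cheap cross-checks on that 71% test accuracy: the forest was built with oob_score=True, so an out-of-bag estimate comes for free, and cross_val_score (imported at the top) gives a read that depends less on one particular 75/25 split. A sketch:

# Out-of-bag accuracy, available because the forest was built with oob_score=True
print('OOB accuracy:', random_forest.oob_score_)

# 5-fold cross-validated accuracy over the full dataset
scores = cross_val_score(random_forest, features, target, cv=5, scoring='accuracy')
print('CV accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))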