Analysis of UCI Cardiotocography Dataset
In [1]:
%pylab inline
pylab.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
In [2]:
data_df = pd.read_csv('cgt.csv')
In [3]:
data_df.head()
Out[3]:
In [4]:
data_df.info()
In [5]:
data_df.describe().T
Out[5]:
In [6]:
nsp_codes = {1: 'normal', 2: 'suspect', 3: 'pathologic'}
data_df = data_df.assign(NSP=data_df.NSP.map(lambda v: nsp_codes[v]))
In [7]:
data_df.head()
Out[7]:
In [8]:
# CLASS is a code, not a number. Remove this feature for now.
data_df = data_df.drop('CLASS', axis=1)
In [9]:
data_df.NSP.value_counts().plot(kind='bar')
Out[9]:
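The bar chart shows that the classes are heavily imbalanced (in the UCI CTG data the normal class accounts for the large majority of records), which is why the macro-averaged F1 score is used for evaluation below instead of plain accuracy. A quick way to quantify this with the column already in data_df:

# class proportions rather than raw counts; a reminder that accuracy alone
# would be dominated by the 'normal' class
data_df.NSP.value_counts(normalize=True)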
In [10]:
from sklearn.feature_selection import mutual_info_classif
In [11]:
f_info = mutual_info_classif(data_df.drop('NSP', axis=1), data_df.NSP)
f_info = pd.Series(f_info, index=data_df.columns.drop('NSP').copy())
_, ax = pylab.subplots(1, 1, figsize=(6, 6))
f_info.sort_values(ascending=True).plot(kind='barh', ax=ax)
Out[11]:
Now let's select the four most informative features for our model and look at how each one is distributed across the NSP classes.
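The same four features can also be pulled out of the ranking programmatically rather than by eye; a minimal sketch (note that mutual_info_classif relies on a randomized nearest-neighbour estimator, so re-running it with a fixed random_state keeps this ranking stable):

# four features with the highest estimated mutual information with NSP
top4_by_mi = f_info.nlargest(4).index.tolist()
top4_by_mi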
In [12]:
fg = sns.FacetGrid(col='NSP', data=data_df)
fg = fg.map(pylab.hist, 'SUSP')
In [13]:
fg = sns.FacetGrid(col='NSP', data=data_df)
fg = fg.map(pylab.hist, 'MSTV')
In [14]:
fg = sns.FacetGrid(col='NSP', data=data_df)
fg = fg.map(pylab.hist, 'ASTV')
In [15]:
fg = sns.FacetGrid(col='NSP', data=data_df)
fg = fg.map(pylab.hist, 'ALTV')
In [16]:
top4 = ['SUSP', 'MSTV', 'ASTV', 'ALTV']
smaller_df = data_df.loc[:, top4].assign(NSP=data_df.NSP)
In [17]:
sns.pairplot(smaller_df, hue='NSP')
Out[17]:
In [18]:
f_corrs = smaller_df.drop('NSP', axis=1).corr()
sns.heatmap(f_corrs, annot=True)
Out[18]:
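Gaussian Naive Bayes assumes the features are conditionally independent given the class, so strongly correlated predictors are worth flagging before relying on the model. A small sketch that lists feature pairs above a threshold (the 0.5 cut-off here is an arbitrary assumption):

# keep only the upper triangle so each pair appears once, then flag
# pairs whose absolute correlation exceeds the (assumed) 0.5 threshold
upper = f_corrs.abs().where(np.triu(np.ones(f_corrs.shape, dtype=bool), k=1))
upper.stack().loc[lambda s: s > 0.5]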
In [19]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
In [20]:
# with no explicit priors, GaussianNB estimates the class priors from the
# training-fold class frequencies (hence "proportional" priors)
model = GaussianNB()
X = smaller_df.drop('NSP', axis=1)
y = data_df.NSP
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)
scores_prop_prior = cross_val_score(model, X=X, y=y, cv=cv, scoring='f1_macro')
scores_prop_prior = pd.Series(scores_prop_prior)
scores_prop_prior.plot(kind='bar')
Out[20]:
In [21]:
model = GaussianNB(priors=[1/3, 1/3, 1/3])
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)
scores_eq_prior = cross_val_score(model, X=X, y=y, cv=cv, scoring='f1_macro')
scores_eq_prior = pd.Series(scores_eq_prior)
scores_eq_prior.plot(kind='bar')
Out[21]:
In [22]:
scores = pd.DataFrame({'proportional_priors': scores_prop_prior, 'equal_priors': scores_eq_prior})
scores.plot(kind='bar')
Out[22]:
In [23]:
scores.mean()
Out[23]:
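With only ten folds the per-fold scores bounce around quite a bit, so it helps to look at the spread as well as the mean; a quick sketch:

# mean and standard deviation of the macro-F1 scores for both prior settings
scores.agg(['mean', 'std'])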
In [24]:
# keep every feature whose estimated mutual information with NSP exceeds 0.10
features_to_use = f_info[f_info > 0.10].index
In [25]:
features_to_use
Out[25]:
In [26]:
model = GaussianNB()
X = data_df.loc[:, features_to_use]
y = data_df.NSP
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)
scores_with_more_features = cross_val_score(model, X=X, y=y, cv=cv, scoring='f1_macro')
scores_with_more_features = pd.Series(scores_with_more_features)
scores_with_more_features.plot(kind='bar')
Out[26]:
In [27]:
len(features_to_use)
Out[27]:
In [28]:
scores_cmp = pd.DataFrame({'f_4': scores_prop_prior, 'f_13': scores_with_more_features})
In [29]:
scores_cmp.plot(kind='bar')
Out[29]:
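Since both runs used the same StratifiedKFold splits (same random_state on the same target), the scores can be compared fold by fold; a small sketch of the per-fold difference:

# per-fold gain from using the 13 selected features instead of the top 4
(scores_cmp['f_13'] - scores_cmp['f_4']).describe()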
In [30]:
from sklearn.ensemble import RandomForestClassifier
In [31]:
model = RandomForestClassifier(
    n_estimators=10,
    max_depth=4,
    max_features=5)
X = data_df.loc[:, features_to_use]
y = data_df.NSP
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)
scores_with_rf_13 = cross_val_score(model, X=X, y=y, cv=cv, scoring='f1_macro')
scores_with_rf_13 = pd.Series(scores_with_rf_13)
scores_with_rf_13.plot(kind='bar')
Out[31]:
In [32]:
scores_with_rf_13.mean()
Out[32]:
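To put the random forest next to the Gaussian NB run on the same 13 features (the CV splitter uses the same seed, so the folds match), a quick side-by-side sketch:

# Gaussian NB vs. random forest on the same features and the same folds
pd.DataFrame({'gnb_13': scores_with_more_features,
              'rf_13': scores_with_rf_13}).agg(['mean', 'std'])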
In [33]:
from sklearn.model_selection import GridSearchCV
In [34]:
model = RandomForestClassifier()
param_grid = {
    'n_estimators': [10, 20],
    'max_depth': [4, 8],
    'max_features': [4, 5, 13],
}
X = data_df.loc[:, features_to_use]
y = data_df.NSP
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=12345)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_macro',
    verbose=1,
    cv=cv,
    # needed for the split*_train_score entries used below; recent
    # scikit-learn versions default return_train_score to False
    return_train_score=True)
grid_search = grid_search.fit(X, y)
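The full cross-validation table lives in cv_results_; viewing it as a DataFrame sorted by rank is often more convenient than poking at individual keys. A minimal sketch:

# one row per parameter combination, best-ranked first
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results.sort_values('rank_test_score')[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head()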
In [35]:
grid_search.best_params_
Out[35]:
In [36]:
grid_search.best_score_
Out[36]:
In [37]:
grid_search.cv_results_['rank_test_score']
Out[37]:
In [38]:
# split9_* holds, for every parameter combination, the train and test scores
# on the tenth CV fold (these keys require return_train_score=True above)
split9_test_score = grid_search.cv_results_['split9_test_score']
split9_train_score = grid_search.cv_results_['split9_train_score']
split9_scores = pd.DataFrame({'train': split9_train_score, 'test': split9_test_score})
split9_scores.plot(kind='bar')
Out[38]:
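Because GridSearchCV refits the winning parameter combination on the full dataset by default (refit=True), the fitted forest is available for further inspection; for example, its impurity-based feature importances can be compared with the mutual-information ranking from earlier. A small sketch:

# impurity-based importances of the refitted best forest
best_rf = grid_search.best_estimator_
rf_importances = pd.Series(best_rf.feature_importances_, index=features_to_use)
rf_importances.sort_values().plot(kind='barh')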