Breast cancer data set


In [1]:
import pandas as pd
import numpy as np

# Wisconsin breast cancer data: tab-separated, gzip-compressed, with a
# 'class' label column (0/1 -- class counts are shown in the next cell).
# Path is relative to the notebook's working directory.
breast_cancer_data = pd.read_csv('data/breast-cancer-wisconsin.tsv.gz',
                                 sep='\t',
                                 compression='gzip')

Class frequencies


In [2]:
from collections import Counter

# Tally how many samples fall in each class label (iterating the Series
# yields the same values as .values, so the Counter is identical).
class_counts = Counter(breast_cancer_data['class'])
class_counts


Out[2]:
Counter({0: 357, 1: 212})

Compute the cross-validation scores

Here, the scores are accuracy on the data set.


In [3]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# 5-fold stratified CV accuracy of a 100-tree random forest.
# random_state on both the forest and the fold shuffler makes these
# scores reproducible under Restart Kernel -> Run All (the original
# shuffled without a seed, so every run gave different numbers).
cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
                breast_cancer_data.drop('class', axis=1).values,
                breast_cancer_data.loc[:, 'class'].values,
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))


Out[3]:
array([ 0.96521739,  0.96521739,  0.94690265,  0.97345133,  0.98230088])

Visualize the predictions vs. actual status

Each dot corresponds to one prediction.

Training data


In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data.drop('class', axis=1).values,
                                                    breast_cancer_data['class'].values,
                                                    stratify=breast_cancer_data['class'].values,
                                                    train_size=0.75, test_size=0.25)

clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)

plt.figure(figsize=(12, 7))
sb.swarmplot(y_train, clf.predict(X_train))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;


/Users/randal_olson/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py:878: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))
Out[4]:
''

Testing data


In [5]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data.drop('class', axis=1).values,
                                                    breast_cancer_data['class'].values,
                                                    stratify=breast_cancer_data['class'].values,
                                                    train_size=0.75, test_size=0.25)

clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)

plt.figure(figsize=(12, 7))
sb.swarmplot(y_test, clf.predict(X_test))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;


Out[5]:
''

Crowd machine

  1. Run random forest with 15 or 20 different terminal node sizes, on the same training data, in each case getting the probability for each subject or instance;
  2. Use the output from each as a new synthetic feature, which is then input to another (single) random forest, also run in regression mode; in this case the values of each synthetic feature will be roughly continuous, since they are probability estimates rather than just 0/1 labels;
  3. Generate some simple plots for the crowd;
  4. Compare the crowd results to some individual random forest runs, using some two or three terminal node settings.

In [6]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Re-load the data so this cell is self-contained.
breast_cancer_data = pd.read_csv('data/breast-cancer-wisconsin.tsv.gz',
                                 sep='\t',
                                 compression='gzip')

all_features = breast_cancer_data.drop('class', axis=1).values
all_classes = breast_cancer_data['class'].values

# First member of the feature union: a pass-through of the original
# features (k='all' keeps every column).
union_ops = [SelectKBest(k='all')]

# Sweep min_weight_fraction_leaf (mwfl) from 0.00 to 0.20 in steps of
# 0.01 -- larger values force larger terminal nodes, i.e. smoother forests.
for i, mwfl in enumerate(np.arange(0., 0.21, 0.01)):
    # NOTE(review): wrapping a RandomForestRegressor in a VotingClassifier
    # is unusual -- presumably this relies on VotingClassifier acting as a
    # transformer inside make_union so the regressor's continuous
    # predictions become a synthetic feature; confirm this works on the
    # sklearn version in use.
    union_ops.append(VotingClassifier(estimators=[('rf-mwfl={}'.format(mwfl),
                                                   RandomForestRegressor(n_estimators=100,
                                                                         n_jobs=-1,
                                                                         min_weight_fraction_leaf=mwfl))]))
    
    # Baseline for comparison: a single classifier forest at this mwfl,
    # scored by 5-fold stratified CV.
    # NOTE(review): no random_state anywhere in this cell, so the printed
    # scores vary from run to run.
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_weight_fraction_leaf=mwfl)
    print('RF w/ mwfl={:0.2f} CV score: {:0.3f}'.format(
            mwfl,
            np.mean(cross_val_score(clf, all_features, all_classes, cv=StratifiedKFold(n_splits=5, shuffle=True)))))

# The "crowd machine": original features plus the 21 synthetic forest
# outputs feed one final 100-tree classifier forest.
clf = make_pipeline(make_union(*union_ops), RandomForestClassifier(n_estimators=100, n_jobs=-1))
print('Crowd machine CV score: {:0.3f}'.format(np.mean(cross_val_score(clf, all_features, all_classes, cv=StratifiedKFold(n_splits=5, shuffle=True)))))


RF w/ mwfl=0.00 CV score: 0.965
RF w/ mwfl=0.01 CV score: 0.951
RF w/ mwfl=0.02 CV score: 0.954
RF w/ mwfl=0.03 CV score: 0.953
RF w/ mwfl=0.04 CV score: 0.949
RF w/ mwfl=0.05 CV score: 0.947
RF w/ mwfl=0.06 CV score: 0.951
RF w/ mwfl=0.07 CV score: 0.940
RF w/ mwfl=0.08 CV score: 0.946
RF w/ mwfl=0.09 CV score: 0.940
RF w/ mwfl=0.10 CV score: 0.933
RF w/ mwfl=0.11 CV score: 0.933
RF w/ mwfl=0.12 CV score: 0.924
RF w/ mwfl=0.13 CV score: 0.935
RF w/ mwfl=0.14 CV score: 0.933
RF w/ mwfl=0.15 CV score: 0.930
RF w/ mwfl=0.16 CV score: 0.935
RF w/ mwfl=0.17 CV score: 0.928
RF w/ mwfl=0.18 CV score: 0.926
RF w/ mwfl=0.19 CV score: 0.937
RF w/ mwfl=0.20 CV score: 0.930
Crowd machine CV score: 0.958

Spambase data set


In [7]:
import pandas as pd

# Spambase data: tab-separated, gzip-compressed, with a 'class' label
# column (0/1 -- class counts are shown in the next cell).
# Path is relative to the notebook's working directory.
spambase_data = pd.read_csv('data/spambase.tsv.gz',
                            sep='\t',
                            compression='gzip')

Class frequencies


In [8]:
from collections import Counter

# How many examples of each class label are in the data set?
label_counts = Counter(spambase_data['class'].tolist())
label_counts


Out[8]:
Counter({0: 2788, 1: 1813})

Compute the cross-validation scores

Here, the scores are accuracy on the data set.


In [9]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# 5-fold stratified CV accuracy of a 100-tree random forest.
# random_state on both the forest and the fold shuffler makes these
# scores reproducible under Restart Kernel -> Run All (the original
# shuffled without a seed, so every run gave different numbers).
cross_val_score(RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
                spambase_data.drop('class', axis=1).values,
                spambase_data.loc[:, 'class'].values,
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))


Out[9]:
array([ 0.95114007,  0.96199783,  0.95222584,  0.95429815,  0.96082699])

Visualize the predictions vs. actual status

Each dot corresponds to one prediction.

Training data


In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spambase_data.drop('class', axis=1).values,
                                                    spambase_data['class'].values,
                                                    stratify=spambase_data['class'].values,
                                                    train_size=0.75, test_size=0.25)

clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)

plt.figure(figsize=(12, 7))
sb.boxplot(y_train, clf.predict(X_train))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;


Out[10]:
''

Testing data


In [11]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(spambase_data.drop('class', axis=1).values,
                                                    spambase_data['class'].values,
                                                    stratify=spambase_data['class'].values,
                                                    train_size=0.75, test_size=0.25)

clf = RandomForestRegressor(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)

plt.figure(figsize=(12, 7))
sb.boxplot(y_test, clf.predict(X_test))
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel('Actual status', fontsize=14)
plt.ylabel('Predicted probability', fontsize=14)
plt.ylim(-0.01, 1.01)
;


Out[11]:
''

Crowd machine

  1. Run random forest with 15 or 20 different terminal node sizes, on the same training data, in each case getting the probability for each subject or instance;
  2. Use the output from each as a new synthetic feature, which is then input to another (single) random forest, also run in regression mode; in this case the values of each synthetic feature will be roughly continuous, since they are probability estimates rather than just 0/1 labels;
  3. Generate some simple plots for the crowd;
  4. Compare the crowd results to some individual random forest runs, using some two or three terminal node settings.

In [12]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline, make_union
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Re-load the data so this cell is self-contained.
spambase_data = pd.read_csv('data/spambase.tsv.gz',
                            sep='\t',
                            compression='gzip')

all_features = spambase_data.drop('class', axis=1).values
all_classes = spambase_data['class'].values

# First member of the feature union: a pass-through of the original
# features (k='all' keeps every column).
union_ops = [SelectKBest(k='all')]

# Sweep min_weight_fraction_leaf (mwfl) from 0.00 to 0.20 in steps of
# 0.01 -- larger values force larger terminal nodes, i.e. smoother forests.
for i, mwfl in enumerate(np.arange(0., 0.21, 0.01)):
    # NOTE(review): wrapping a RandomForestRegressor in a VotingClassifier
    # is unusual -- presumably this relies on VotingClassifier acting as a
    # transformer inside make_union so the regressor's continuous
    # predictions become a synthetic feature; confirm this works on the
    # sklearn version in use.
    union_ops.append(VotingClassifier(estimators=[('rf-mwfl={}'.format(mwfl),
                                                   RandomForestRegressor(n_estimators=100,
                                                                         n_jobs=-1,
                                                                         min_weight_fraction_leaf=mwfl))]))
    
    # Baseline for comparison: a single classifier forest at this mwfl,
    # scored by 5-fold stratified CV.
    # NOTE(review): no random_state anywhere in this cell, so the printed
    # scores vary from run to run.
    clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, min_weight_fraction_leaf=mwfl)
    print('RF w/ mwfl={:0.2f} CV score: {:0.3f}'.format(
            mwfl,
            np.mean(cross_val_score(clf, all_features, all_classes, cv=StratifiedKFold(n_splits=5, shuffle=True)))))

# The "crowd machine": original features plus the 21 synthetic forest
# outputs feed one final 100-tree classifier forest.
clf = make_pipeline(make_union(*union_ops), RandomForestClassifier(n_estimators=100, n_jobs=-1))
print('Crowd machine CV score: {:0.3f}'.format(np.mean(cross_val_score(clf, all_features, all_classes,
                                                                       cv=StratifiedKFold(n_splits=5, shuffle=True)))))


RF w/ mwfl=0.00 CV score: 0.954
RF w/ mwfl=0.01 CV score: 0.932
RF w/ mwfl=0.02 CV score: 0.923
RF w/ mwfl=0.03 CV score: 0.918
RF w/ mwfl=0.04 CV score: 0.912
RF w/ mwfl=0.05 CV score: 0.911
RF w/ mwfl=0.06 CV score: 0.909
RF w/ mwfl=0.07 CV score: 0.905
RF w/ mwfl=0.08 CV score: 0.900
RF w/ mwfl=0.09 CV score: 0.899
RF w/ mwfl=0.10 CV score: 0.897
RF w/ mwfl=0.11 CV score: 0.897
RF w/ mwfl=0.12 CV score: 0.895
RF w/ mwfl=0.13 CV score: 0.890
RF w/ mwfl=0.14 CV score: 0.887
RF w/ mwfl=0.15 CV score: 0.889
RF w/ mwfl=0.16 CV score: 0.881
RF w/ mwfl=0.17 CV score: 0.882
RF w/ mwfl=0.18 CV score: 0.874
RF w/ mwfl=0.19 CV score: 0.862
RF w/ mwfl=0.20 CV score: 0.859
Crowd machine CV score: 0.946

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: