In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import fbeta_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
%matplotlib inline
In [2]:
LABELS = {'primary': 0,
          'clear': 1,
          'agriculture': 2,
          'road': 3,
          'water': 4,
          'partly_cloudy': 5,
          'cultivation': 6,
          'habitation': 7,
          'haze': 8,
          'cloudy': 9,
          'bare_ground': 10,
          'selective_logging': 11,
          'artisinal_mine': 12,
          'blooming': 13,
          'slash_burn': 14,
          'blow_down': 15,
          'conventional_mine': 16}
EVALUATION_PATH = os.path.join(r'../reports/planet_validation')
In [3]:
hashtable = {v: k for k, v in LABELS.items()}  # index -> tag name
label_names = list(hashtable.values())
In [4]:
total_size = 40483
train_size = int(np.floor(total_size * 0.8)) + 1
validation_size = total_size - train_size
In [5]:
labels = np.loadtxt(os.path.join(EVALUATION_PATH, 'out_labels_train.txt'))[0 : train_size, :]
preds = np.loadtxt(os.path.join(EVALUATION_PATH, 'out_predictions_train.txt'))[0 : train_size, :]
In [6]:
plt.figure(figsize=(10,10))
s = pd.Series(labels.sum(axis=0), index=label_names)
s.plot.pie(fontsize=15)
s
Out[6]:
In [7]:
# long format: one row per (image, tag) pair, so predictions and labels can be compared per tag
df = pd.DataFrame({'pred': preds.ravel(), 'label': labels.ravel()})
df['tag'] = label_names * train_size
In [8]:
g = sns.FacetGrid(df, col='tag', col_wrap=4)
g = g.map(plt.hist, 'pred', bins=np.arange(0,1.05,0.05), color='r', alpha=0.5)
g = g.map(plt.hist, 'label', bins=np.arange(0,1.05,0.05), color='c', alpha=0.5)
Conclusion: Because the data are imbalanced, the model tends to be conservative on the minority labels. It may help to oversample the minority labels or to add a penalty for false negatives.
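One way to make the false-negative penalty concrete is to weight each label's positive class by its negative-to-positive ratio. The sketch below only computes those ratios from the `labels` matrix loaded above; whether they are fed into a weighted binary cross-entropy loss (e.g. as a `pos_weight` argument) depends on the training code, which is outside this notebook.

# Sketch (assumption: the training loss accepts per-label positive-class weights).
# Rare labels get a large weight, i.e. a heavier penalty on their false negatives.
pos_counts = labels.sum(axis=0)
neg_counts = labels.shape[0] - pos_counts
pos_weight = neg_counts / np.maximum(pos_counts, 1)
for name, w in zip(label_names, pos_weight):
    print('{:20s} pos_weight = {:8.1f}'.format(name, w))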
In [9]:
thresh_list = np.arange(0, 1.01, 0.01)
fbeta_mat = np.empty((len(thresh_list), len(label_names)))
# sweep decision thresholds and record the per-label F2 score at each one
for idx, thresh in enumerate(thresh_list):
    score = fbeta_score(labels, preds > thresh, beta=2, average=None)
    fbeta_mat[idx, :] = score
In [10]:
plt.figure(figsize=(20,10))
cm = plt.get_cmap('rainbow')
cm = [cm(i) for i in np.linspace(0,1,17)]
for i in range(len(label_names)):
    plt.plot(thresh_list, fbeta_mat[:, i], color=cm[i])
plt.legend(label_names, fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.xlabel('threshold', fontsize=18)
plt.ylabel('fbeta score', fontsize=18)
Out[10]:
Conclusion: The F2 scores for the minority labels are very poor. Also, the optimal threshold varies considerably from label to label.
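Since `fbeta_mat` is already computed, the per-label thresholds that maximise F2 on this set can be read off directly. This is only a rough sketch; the resulting thresholds would still need to be validated on held-out data before being used.

# per-label threshold with the highest F2 in the sweep above (sketch, not validated)
best_idx = fbeta_mat.argmax(axis=0)
best_thresh = thresh_list[best_idx]
best_f2 = fbeta_mat[best_idx, np.arange(len(label_names))]
pd.DataFrame({'threshold': best_thresh, 'f2': best_f2}, index=label_names)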
In [11]:
weather_labels = ['clear', 'partly_cloudy', 'haze', 'cloudy']
common_labels = ['primary', 'agriculture', 'road', 'water', 'habitation', 'cultivation', 'bare_ground']
rare_labels = ['selective_logging', 'artisinal_mine', 'blooming', 'slash_burn', 'blow_down', 'conventional_mine']
In [12]:
fpr = dict()
tpr = dict()
roc_auc = dict()
for idx, label in enumerate(weather_labels):
    fpr[label], tpr[label], _ = roc_curve(labels[:, LABELS[label]], preds[:, LABELS[label]])
    roc_auc[label] = auc(fpr[label], tpr[label])
plt.figure(figsize=(10,5))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
for label in weather_labels:
    plt.plot(fpr[label], tpr[label], lw=2, label='{} (area = {:.3f})'.format(label, roc_auc[label]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc='lower right', fontsize=16)
Out[12]:
In [13]:
fpr = dict()
tpr = dict()
roc_auc = dict()
for idx, label in enumerate(common_labels):
    fpr[label], tpr[label], _ = roc_curve(labels[:, LABELS[label]], preds[:, LABELS[label]])
    roc_auc[label] = auc(fpr[label], tpr[label])
plt.figure(figsize=(10,5))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
for label in common_labels:
    plt.plot(fpr[label], tpr[label], lw=2, label='{} (area = {:.3f})'.format(label, roc_auc[label]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc='lower right', fontsize=16)
Out[13]:
In [14]:
fpr = dict()
tpr = dict()
roc_auc = dict()
for idx, label in enumerate(rare_labels):
    fpr[label], tpr[label], _ = roc_curve(labels[:, LABELS[label]], preds[:, LABELS[label]])
    roc_auc[label] = auc(fpr[label], tpr[label])
plt.figure(figsize=(10,5))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
for label in rare_labels:
    plt.plot(fpr[label], tpr[label], lw=2, label='{} (area = {:.3f})'.format(label, roc_auc[label]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc='lower right', fontsize=16)
Out[14]:
Conclusion: The ROC curves are unexpectedly good (every AUC is above 0.85), which is inconsistent with the F2 plot in Section 1.2. With labels this imbalanced, I think ROC AUC is not a good indicator of the network's performance.
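To make the mismatch concrete, the sketch below tabulates each label's positive rate, ROC AUC, and average precision side by side, reusing the metrics already imported at the top. For rare labels the ROC AUC can stay high while the average precision collapses.

# sketch: positive rate vs. ROC AUC vs. average precision for every label
rows = []
for name in label_names:
    y = labels[:, LABELS[name]]
    p = preds[:, LABELS[name]]
    fpr_, tpr_, _ = roc_curve(y, p)
    rows.append([y.mean(), auc(fpr_, tpr_), average_precision_score(y, p)])
pd.DataFrame(rows, index=label_names, columns=['pos_rate', 'roc_auc', 'avg_precision'])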
In [15]:
precision = dict()
recall = dict()
average_precision = dict()
for idx, label in enumerate(weather_labels):
    precision[label], recall[label], _ = precision_recall_curve(labels[:, LABELS[label]], preds[:, LABELS[label]])
    average_precision[label] = average_precision_score(labels[:, LABELS[label]], preds[:, LABELS[label]])
plt.figure(figsize=(10,5))
for label in weather_labels:
    plt.plot(recall[label], precision[label], lw=2, label='{} (area = {:.3f})'.format(label, average_precision[label]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall', fontsize=16)
plt.ylabel('Precision', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc='lower left', fontsize=16)
Out[15]:
In [16]:
precision = dict()
recall = dict()
average_precision = dict()
for idx, label in enumerate(common_labels):
    precision[label], recall[label], _ = precision_recall_curve(labels[:, LABELS[label]], preds[:, LABELS[label]])
    average_precision[label] = average_precision_score(labels[:, LABELS[label]], preds[:, LABELS[label]])
plt.figure(figsize=(10,5))
for label in common_labels:
    plt.plot(recall[label], precision[label], lw=2, label='{} (area = {:.3f})'.format(label, average_precision[label]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall', fontsize=16)
plt.ylabel('Precision', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc='lower left', fontsize=16)
Out[16]:
In [17]:
precision = dict()
recall = dict()
average_precision = dict()
for idx, label in enumerate(rare_labels):
    precision[label], recall[label], _ = precision_recall_curve(labels[:, LABELS[label]], preds[:, LABELS[label]])
    average_precision[label] = average_precision_score(labels[:, LABELS[label]], preds[:, LABELS[label]])
plt.figure(figsize=(10,5))
for label in rare_labels:
    plt.plot(recall[label], precision[label], lw=2, label='{} (area = {:.3f})'.format(label, average_precision[label]))
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall', fontsize=16)
plt.ylabel('Precision', fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=15)
plt.legend(loc='lower left', fontsize=16)
Out[17]:
Conclusion: The precision-recall curves are consistent with the analysis in Sections 1.1 and 1.2: the rare label classes perform very poorly. I think this curve is a good indicator of the network's performance.
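If a single number is wanted for tracking runs, one option is the macro-averaged average precision, optionally alongside the sample-averaged F2 at some fixed threshold. The sketch below uses 0.2 purely as an example threshold, not a tuned value.

# sketch: macro-averaged AP plus sample-averaged F2 at an arbitrary example threshold (0.2)
macro_ap = np.mean([average_precision_score(labels[:, i], preds[:, i])
                    for i in range(labels.shape[1])])
overall_f2 = fbeta_score(labels, preds > 0.2, beta=2, average='samples')
print('macro AP = {:.3f}, sample-averaged F2 @ 0.2 = {:.3f}'.format(macro_ap, overall_f2))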