Import the training and testing data

Download data.tar.gz from https://tiny.cc/ + the full manuscript ID for part 1 (case sensitive), and untar and ungzip this into the directory "MCS/mcs" and make sure the output folder is "data" and it has a folder named "training_data". Examine the code to see the proper location if you are getting an error (i.e., "../data/training_data/")


In [1]:
import pickle
import pandas as pd
from functools import partial

filename = "../data/training_data/official_test_train_5_class_013018.pkl"

X_train, X_test, y_train, y_test = pickle.load(open(filename, "rb"))

df = pd.concat([X_train, X_test])
df = df.apply(partial(pd.to_numeric, errors='ignore'))
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3659 entries, 1966 to 610
Data columns (total 24 columns):
index                          3659 non-null int64
filename                       3659 non-null object
mean_intensity                 3659 non-null float64
solidity                       3659 non-null float64
intensity_variance             3659 non-null float64
intense_convection_ratio       3659 non-null float64
convection_stratiform_ratio    3659 non-null float64
area                           3659 non-null int64
convection_area                3659 non-null int64
max_intensity                  3659 non-null int64
minor_axis_length              3659 non-null float64
intense_area                   3659 non-null int64
eccentricity                   3659 non-null float64
convex_area                    3659 non-null int64
intense_stratiform_ratio       3659 non-null float64
major_axis_length              3659 non-null float64
datetime                       3659 non-null int64
label                          3659 non-null int64
minor_major_ratio              3659 non-null float64
xmax                           3659 non-null int64
xmin                           3659 non-null int64
ymax                           3659 non-null int64
ymin                           3659 non-null int64
label1                         3659 non-null int64
dtypes: float64(10), int64(13), object(1)
memory usage: 714.6+ KB

In [2]:
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.set_index('datetime')

X_train = df[df.index.year>=2006]
y_train = X_train.label.values

X_test = df[df.index.year<2006]
y_test = X_test.label.values

These vectors are associated with the following features...


In [3]:
feature_list = ['area', 'convex_area', 'eccentricity', 
                'intense_area', 'convection_area',
                'convection_stratiform_ratio', 'intense_stratiform_ratio',
                'intense_convection_ratio', 'mean_intensity', 'max_intensity',
                'intensity_variance', 'major_axis_length', 'minor_axis_length',
                'solidity']

... and the following labels


In [4]:
labels = ['MCS', 'UCC', 'TROP', 'SYNP', 'CLUT']

In [5]:
df.label.values


Out[5]:
array([1, 0, 3, ..., 1, 3, 0], dtype=int64)

In [6]:
df['label_name'] = [labels[x] for x in df.label.values]

df.head(5)


Out[6]:
index filename mean_intensity solidity intensity_variance intense_convection_ratio convection_stratiform_ratio area convection_area max_intensity ... intense_stratiform_ratio major_axis_length label minor_major_ratio xmax xmin ymax ymin label1 label_name
datetime
2003-08-08 00:00:00 1966 extracted/non_mcs_files/non_mcs_BREF_030808_00... 34.501445 0.610768 123.848264 0.378378 0.427746 2768 1184 60 ... 0.161850 148.427694 1 0.214345 2481 2421 1074 1032 0 UCC
2004-06-03 00:00:00 570 extracted/mcs_files/mcs_BREF_040603_0000_000.png 32.766836 0.424754 97.369261 0.299957 0.309585 210204 65076 70 ... 0.092862 1375.858518 0 0.302458 2053 1403 1123 879 1 MCS
2009-06-29 09:15:00 3387 extracted/synoptic_files/synoptic_BREF_090629_... 24.711597 0.159800 25.975323 0.000000 0.000000 39112 488 45 ... 0.000000 772.283050 3 0.390535 2520 2084 507 294 0 SYNP
2008-08-13 04:00:00 343 extracted/clutter_files/clutter_BREF_080813_04... 20.506608 0.735223 2.606785 0.000000 0.000000 3632 0 30 ... 0.000000 87.072821 4 0.748341 1910 1866 1162 1127 0 CLUT
2010-06-01 18:00:00 3407 extracted/synoptic_files/synoptic_BREF_100601_... 25.334432 0.514085 26.203776 0.021197 0.020498 156504 3208 50 ... 0.000434 798.110391 3 0.423256 3422 2969 542 285 0 SYNP

5 rows × 24 columns

Examine feature value distribution for training data


In [7]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams['figure.figsize'] = 20, 20
plt.rcParams['ytick.labelsize'] = 20
plt.rcParams['xtick.labelsize'] = 20

labs = ['MCS', 'UCC', 'Tropical', 'Synoptic', 'Clutter']

intensities = ['mean_intensity', 'max_intensity', 'intensity_variance']
ratios = ['intense_stratiform_ratio', 'convection_stratiform_ratio', 'intense_convection_ratio']
shapes = ['solidity', 'minor_major_ratio', 'eccentricity']
areas = ['area', 'convection_area', 'intense_area']

meanpointprops = dict(marker='.', markeredgecolor='black',
                      markerfacecolor='black')

for pnum, (var_group, group_name) in enumerate(zip([areas, intensities, ratios, shapes], 
                                                   ['area', 'int', 'rat', 'shp'])):

    d = {'var':[], 'label':[], 'val':[]}

    for var in var_group:
    
        grouped = df.groupby('label')

        for gid, group in grouped:

            d['var'].append(var)
            d['label'].append(labs[gid])
            d['val'].append(group[var].values.astype(float))

        dists = pd.DataFrame.from_dict(d)

        ax = plt.subplot(2, 2, pnum+1)

        bplot = ax.boxplot(dists['val'].values, whis=[5, 95], showmeans=True, meanprops=meanpointprops)

        plt.setp(bplot['medians'], color='black')

        [item.set_color('black') for item in bplot['means']] 

        ax.set_xticklabels(dists['label'].values, rotation=45)

        ax.axvspan(5.5, 10.5, color='grey', alpha=.2)

        ax.grid(True)

        ticklines = ax.get_xticklines() + ax.get_yticklines()
        gridlines = ax.get_xgridlines() + ax.get_ygridlines()
        ticklabels = ax.get_xticklabels() + ax.get_yticklabels()


        for line in ticklines:
            line.set_linewidth(.5)

        for line in gridlines:
            line.set_linestyle('--')
            line.set_color('k')
            line.set_linewidth(.5)

        if group_name == 'int':

            ax.set_ylabel('dBZ', fontsize=25)

        elif group_name == 'area':

            ax.set_ylabel("Area (" + r'$km^2$' + ")", fontsize=25)
            ax.set_yscale('symlog')


C:\Users\ahaberlie\Anaconda3\envs\py35\lib\site-packages\matplotlib\cbook\deprecation.py:106: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Examine feature correlation


In [8]:
plt.rcParams['ytick.labelsize'] = 25
plt.rcParams['xtick.labelsize'] = 25

for color, label in zip(['r', 'g', 'b', 'y', 'k'], labels):
    t_df = df[df.label_name==label]
    plt.scatter(t_df.area, t_df.mean_intensity, c=color, label=label)


plt.xscale('symlog')
plt.legend()
plt.xlim(1000, 600000)
plt.title("Area vs Mean Intensity", fontsize=30)
plt.ylabel("Mean Intensity (dBZ)", fontsize=30)
plt.xlabel("Area (" + r'$km^2$' + ")", fontsize=30)


Out[8]:
Text(0.5,0,'Area ($km^2$)')

In [9]:
for color, label in zip(['r', 'g', 'b', 'y', 'k'], labels):
    t_df = df[df.label_name==label]
    plt.scatter(t_df.convection_stratiform_ratio, t_df.intense_convection_ratio, c=color, label=label)


plt.legend()
plt.ylabel("Intense / Convection Ratio", fontsize=30)
plt.xlabel("Convection / Stratiform Ratio", fontsize=30)

plt.title("Convection / Stratiform Ratio vs. Intense / Convection Ratio", fontsize=30)

plt.axes().set_aspect('equal')


C:\Users\ahaberlie\Anaconda3\envs\py35\lib\site-packages\matplotlib\cbook\deprecation.py:106: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance.  In a future version, a new instance will always be created and returned.  Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
  warnings.warn(message, mplDeprecation, stacklevel=1)

Read in "best" classifiers and build ensemble classifier

Examine performance on testing data


In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import pickle
import numpy as np

rf_clf = pickle.load(open("../data/classifiers/best_rfc_020618.pkl", 'rb'))
gb_clf = pickle.load(open("../data/classifiers/best_gbc_020618.pkl", 'rb'))
#xb_clf = pickle.load(open("../data/classifiers/best_xgbc_092717.pkl", 'rb'))

#add , ('xgb', xb_clf) if on linux
vclf = VotingClassifier([('rf', rf_clf), ('gb', gb_clf)], voting='soft')

vclf.fit(X_train[feature_list].values, y_train)

predicted = vclf.predict(X_test[feature_list].values)
expected = y_test

print("Classification report for classifier %s:\n%s\n"
      % (vclf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


Classification report for classifier VotingClassifier(estimators=[('rf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_...timators=100, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None):
             precision    recall  f1-score   support

          0       0.88      0.93      0.91       578
          1       0.93      0.86      0.90       666
          2       0.85      0.64      0.73        90
          3       0.88      0.93      0.91       159
          4       0.77      0.94      0.85       121

avg / total       0.89      0.89      0.89      1614


Confusion matrix:
[[538  37   3   0   0]
 [ 57 576   0   1  32]
 [ 13   0  58  19   0]
 [  2   0   7 148   2]
 [  0   7   0   0 114]]

Read in best binary classifiers and build ensemble classifier


In [11]:
plt.rcParams['ytick.labelsize'] = 30
plt.rcParams['xtick.labelsize'] = 30
plt.rcParams['figure.figsize'] = 20, 20

y_train1 = 1*(y_train==0)
y_test1 = 1*(y_test==0)

rfc = pickle.load(open("../data/classifiers/best_rfc_binary_020618.pkl", 'rb'))
gbc = pickle.load(open("../data/classifiers/best_gbc_binary_020618.pkl", 'rb'))
#xb_clf = pickle.load(open("../classifiers/best_xgbc_binary_092717.pkl", 'rb'))

vclf = VotingClassifier([('rf', rf_clf), ('gb', gb_clf)], voting='soft')

vclf.fit(X_train[feature_list].values, y_train1)

predicted = vclf.predict(X_test[feature_list].values)
expected = y_test1

print("Classification report for classifier %s:\n%s\n"
      % (vclf, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))


Classification report for classifier VotingClassifier(estimators=[('rf', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_...timators=100, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))],
         n_jobs=1, voting='soft', weights=None):
             precision    recall  f1-score   support

          0       0.95      0.96      0.96      1036
          1       0.93      0.91      0.92       578

avg / total       0.94      0.94      0.94      1614


Confusion matrix:
[[999  37]
 [ 53 525]]

In [12]:
dt_clf = tree.DecisionTreeClassifier(max_depth=3)
kn_clf = KNeighborsClassifier()
lr_clf = LogisticRegression()

linetype = ['-', '--', ':', '-.']

vclf = VotingClassifier([('rf', rfc), ('gb', gbc)], voting='soft')

classifier_list = [vclf, lr_clf, kn_clf, dt_clf]
name_list = ['Ensemble', 'Logistic Regression', 'K-Nearest Neighbors', 'Decision Tree']

y_train1 = 1*(y_train==0)
y_test1 = 1*(y_test==0)

for l, (clf, lab) in enumerate(zip(classifier_list, name_list)):

    clf.fit(X_train[feature_list[:-1]].values, y_train1)

    preds = clf.predict_proba(X_test[feature_list[:-1]].values)[:,1]
        
    fpr, tpr, _ = metrics.roc_curve(y_test1, preds)
    
    plt.plot(fpr, tpr, label=lab + ' (AUC = %0.2f)' % metrics.auc(fpr, tpr), lw=5, color='k', linestyle=linetype[l])
    

plt.plot([0, 1], [0, 1], color='grey', lw=5, linestyle='--', label='All MCS (AUC = 0.50)')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Probability of False Detection', fontsize=45)
plt.ylabel('Probability of Detection', fontsize=45)
plt.legend(loc="lower right", prop={'size': 35})
plt.show()



In [13]:
for year in range(2003, 2006):

    X_t = X_test[X_test.index.year==year]
    y_t = X_t.label1.values

    preds = vclf.predict_proba(X_t[feature_list[:-1]].values)[:,1]
        
    fpr, tpr, _ = metrics.roc_curve(y_t, preds)
    
    cval = (year-2000)/5
    
    plt.plot(fpr, tpr, label=str(year) + ' (AUC = %0.2f)' % metrics.auc(fpr, tpr), 
             lw=5, color=plt.cm.Greys(cval), linestyle='-')
    

plt.plot([0, 1], [0, 1], color='grey', lw=5, linestyle='--', label='All MCS (AUC = 0.50)')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Probability of False Detection', fontsize=45)
plt.ylabel('Probability of Detection', fontsize=45)
plt.legend(loc="lower right", prop={'size': 35})
plt.show()



In [14]:
from sklearn.metrics import brier_score_loss
from sklearn.calibration import calibration_curve

fig = plt.figure(0, figsize=(20, 20))
plt.rcParams['ytick.labelsize'] = 35
plt.rcParams['xtick.labelsize'] = 35
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

rfc = pickle.load(open("../data/classifiers/best_rfc_binary_020618.pkl", 'rb'))
gbc = pickle.load(open("../data/classifiers/best_gbc_binary_020618.pkl", 'rb'))
vclf = VotingClassifier([('rf', rfc), ('gb', gbc)], voting='soft')


linetype = ['-', '--', ':', '-.']
for l, (clf, name) in enumerate(zip([vclf, lr_clf, kn_clf, dt_clf], ['Ensemble', 'Logistic Regression', 'K-Nearest Neighbors', 'Decision Tree'])):
    
    
    clf.fit(X_train[feature_list[:-1]].values, y_train1)
    y_pred = clf.predict(X_test[feature_list[:-1]].values)

    prob_pos = clf.predict_proba(X_test[feature_list[:-1]].values)[:, 1]

    clf_score = brier_score_loss(y_test1, prob_pos, pos_label=1)

    fraction_of_positives, mean_predicted_value = \
        calibration_curve(y_test1, prob_pos, n_bins=11)

    ax1.plot(mean_predicted_value, fraction_of_positives, "-", color='k', linestyle=linetype[l],
             label="%s (%1.3f)" % (name, clf_score), lw=5)

    ax2.hist(prob_pos, range=(0, 1), bins=11, color='k', linestyle=linetype[l], label=name,
             histtype="step", lw=5)
    
ax1.plot([0, 1], [0, 1], "--", lw=5, color='grey', label="Perfectly\ncalibrated (0.000)")


ax1.set_ylabel("Fractional Frequency of MCS Label", fontsize=45)
ax1.set_xticks(np.linspace(0, 1, 11))
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right", prop={'size': 30})

ax2.set_xlabel("Predicted Probability of MCS Label", fontsize=45)
ax2.set_xticks(np.linspace(0, 1, 11))
ax2.set_ylabel("Count", fontsize=45)

plt.tight_layout()



In [15]:
fig = plt.figure(0, figsize=(20, 20))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

for year in range(2003, 2006):

    print(year)
    X_t = X_test[X_test.index.year==year]
    y_t = X_t.label1.values

    prob_pos = vclf.predict_proba(X_t[feature_list[:-1]].values)[:,1]

    clf_score = brier_score_loss(y_t, prob_pos, pos_label=1)

    fraction_of_positives, mean_predicted_value = \
        calibration_curve(y_t, prob_pos, n_bins=11)
        
    cval = (year-2000)/5

    ax1.plot(mean_predicted_value, fraction_of_positives, color=plt.cm.Greys(cval), linestyle='-',
             label="%s (%1.3f)" % (str(year), clf_score), lw=5)

    ax2.hist(prob_pos, range=(0, 1), bins=11, color=plt.cm.Greys(cval), linestyle='-', label=name,
             histtype="step", lw=5)
    
ax1.plot([0, 1], [0, 1], "--", lw=5, color='grey', label="Perfectly\nCalibrated (0.000)")


ax1.set_ylabel("Fractional Frequency of MCS Label", fontsize=45)
ax1.set_xticks(np.linspace(0, 1, 11))
ax1.set_ylim([-0.05, 1.05])
ax1.legend(prop={'size': 30})

ax2.set_xlabel("Predicted Probability of MCS Label", fontsize=45)
ax2.set_xticks(np.linspace(0, 1, 11))
ax2.set_ylabel("Count", fontsize=45)

plt.tight_layout()


2003
2004
2005

In [16]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train[feature_list].values, y_train1)

importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (f + 1, feature_list[indices[f]], indices[f], importances[indices[f]]))


Feature ranking:
1. (intense_area) feature 3 (0.154929)
2. (area) feature 0 (0.150800)
3. (convection_area) feature 4 (0.125779)
4. (intense_stratiform_ratio) feature 6 (0.100412)
5. (convex_area) feature 1 (0.093537)
6. (intensity_variance) feature 10 (0.084200)
7. (major_axis_length) feature 11 (0.065341)
8. (minor_axis_length) feature 12 (0.063029)
9. (intense_convection_ratio) feature 7 (0.044103)
10. (convection_stratiform_ratio) feature 5 (0.040995)
11. (solidity) feature 13 (0.025004)
12. (max_intensity) feature 9 (0.024382)
13. (mean_intensity) feature 8 (0.018374)
14. (eccentricity) feature 2 (0.009115)

In [17]:
gbc = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, random_state=42)
gbc.fit(X_train[feature_list].values, y_train1)

importances = gbc.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X_train[feature_list].values.shape[1]):
    print("%d. (%s) feature %d (%f)" % (f + 1, feature_list[indices[f]], indices[f], importances[indices[f]]))


Feature ranking:
1. (area) feature 0 (0.160078)
2. (solidity) feature 13 (0.149533)
3. (minor_axis_length) feature 12 (0.071323)
4. (major_axis_length) feature 11 (0.064826)
5. (mean_intensity) feature 8 (0.063080)
6. (convex_area) feature 1 (0.061495)
7. (convection_stratiform_ratio) feature 5 (0.060388)
8. (intense_area) feature 3 (0.060372)
9. (intense_stratiform_ratio) feature 6 (0.053880)
10. (intensity_variance) feature 10 (0.050063)
11. (eccentricity) feature 2 (0.043743)
12. (convection_area) feature 4 (0.038881)
13. (intense_convection_ratio) feature 7 (0.035277)
14. (max_intensity) feature 9 (0.021062)

In [18]:
df_labeled = pd.read_csv("../data/slice_data/labeled_slices_020618.csv")

In [19]:
df_ = df_labeled.sample(50000)

sp = plt.scatter(df_.area, df_.mean_intensity, c=df_.mcs_proba, s=10)
plt.xscale('symlog')
plt.title("Area vs Mean Intensity", fontsize=30)
plt.ylabel("Mean Intensity (dBZ)", fontsize=30)
plt.xlabel("Area (" + r'$km^2$' + ")", fontsize=30)
plt.colorbar(sp, shrink=0.4, pad=0.1, orientation='horizontal', boundaries=[0, .25, 0.5, 0.9, 0.95, 1], label='MCS Probability')
plt.ylim(20, 45)


Out[19]:
(20, 45)