1. Final Selection of Features in the 50 Percent Best Feature Set


In [ ]:
# Just import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

2. Load the Sorted Training and Test Data Features

- sorted-train-malware-features-asm-50percent.csv / sorted-test-malware-features-asm-50percent.csv
- sorted-train-malware-features-byte.csv / sorted-test-malware-features-byte.csv
- sorted-train-image-features-asm-50percent.csv / sorted-test-image-features-asm-50percent.csv
- all-train-asm-rowstats.csv / all-test-asm-rowstats.csv
- sorted-train-labels.csv

In [2]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-50percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-50percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-50percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-50percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set; these will be combined with the new features.
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
#all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
#all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')

In [3]:
sorted_train_data_asm.head()


Out[3]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 0 0 8 224 49 34 25 0 41 191 52 38 163 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 0 1 4 0 2 6 22 7 1 4 0 3 37 2 4 9 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 ...

5 rows × 504 columns


In [4]:
sorted_train_labels.head()


Out[4]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

5 rows × 2 columns


In [4]:
sorted_train_image_asm.head()


Out[4]:
filename ASM_1 ASM_3 ASM_4 ASM_14 ASM_20 ASM_21 ASM_23 ASM_24 ASM_25 ASM_26 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42
0 01IsoiSMh5gxyDYTl4CB 116 120 116 9 32 32 32 32 32 32 13 10 116 101 120 116 49 48 48 ...
1 01SuzwMJEIXsK7A8dQbl 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...
2 01azqd4InC7m9JpocGv5 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...
3 01jsnpXSAlgw6aPeDxrU 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...
4 01kcPWA9K2BOxQeS5Rju 69 68 69 48 9 9 13 10 72 69 68 69 58 49 48 48 9 9 9 ...

5 rows × 501 columns


In [5]:
sorted_test_image_asm.head()


Out[5]:
filename ASM_1 ASM_3 ASM_4 ASM_14 ASM_20 ASM_21 ASM_23 ASM_24 ASM_25 ASM_26 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42
0 ITSUPtCmh7WdJcsYDwQ5 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...
1 Ig2DB5tSiEy1cJvV0zdw 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...
2 Jmo6eIhLZ4t9r8QsxEg5 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...
3 JtPFl4ewgdD78OzCMa3o 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...
4 K3ZtByPHGSFYNljDUEXp 69 68 69 48 9 9 13 10 72 69 68 69 58 48 48 52 9 9 9 ...

5 rows × 501 columns


In [7]:
sorted_train_data_byte.head()


Out[7]:
filename entropy filesize
0 01IsoiSMh5gxyDYTl4CB 0.614952 6874624
1 01SuzwMJEIXsK7A8dQbl 0.843262 460288
2 01azqd4InC7m9JpocGv5 0.703961 5256192
3 01jsnpXSAlgw6aPeDxrU 0.806035 4825600
4 01kcPWA9K2BOxQeS5Rju 0.871610 712704

5 rows × 3 columns


In [6]:
sorted_test_data_asm.head()


Out[6]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 0 1 0 0 0 9 51 1 0 4 1 3 1 3 0 1 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 0 1 1 0 0 4 52 0 1 0 1 2 2 2 0 1 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 ...

5 rows × 504 columns


In [7]:
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]

In [8]:
X_train_image_asm.shape


Out[8]:
(10868, 500)

In [9]:
X_test_image_asm.shape


Out[9]:
(10873, 500)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following:
- Feature (column) mean, standard deviation, correlation, and covariance
- Row mean, standard deviation, minimum, and maximum
- Derived row totals (max * mean * std) and their natural logarithm

In [10]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()

In [11]:
X_train_image_asm_means.head()


Out[11]:
ASM_1     79.944056
ASM_3     79.556220
ASM_4     79.304564
ASM_14    39.147405
ASM_20    14.223040
dtype: float64

In [12]:
X_train_image_asm_std.head()


Out[12]:
ASM_1     19.950966
ASM_3     21.598677
ASM_4     19.392095
ASM_14    16.344400
ASM_20     9.636302
dtype: float64

In [13]:
X_test_image_asm_means.head()


Out[13]:
ASM_1     79.938471
ASM_3     79.558080
ASM_4     79.319415
ASM_14    39.151936
ASM_20    14.224133
dtype: float64

In [14]:
# The byte image data, by contrast, has very low standard deviation and little variance
# in its feature means, so it is not useful for learning and is excluded from further analysis.
X_test_image_asm_std.head()


Out[14]:
ASM_1     19.925523
ASM_3     21.626103
ASM_4     19.435313
ASM_14    16.335403
ASM_20     9.648070
dtype: float64

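The claim above about the byte image features can be checked directly. Below is a minimal sketch, assuming the byte image CSV from the commented-out load near the top of the notebook (sorted-train-image-features-byte-reduced.csv) is available on disk:

In [ ]:
# Hypothetical check: quantify how little the byte image features vary.
# If most per-feature standard deviations are close to zero, the features carry
# almost no information for a classifier.
byte_img = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')
byte_img_std = byte_img.iloc[:, 1:].std()
print(byte_img_std.describe())
print("features with std < 1:", (byte_img_std < 1.0).sum(), "of", byte_img_std.shape[0])
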
In [ ]:
# Exploratory column statistics: only the last expression in a cell is displayed,
# so run these one at a time (or wrap them in print) to inspect each result.
X_means.head()

X_std.head()

X_cor.head()

X_cov.head()

X_means.min()

X_means.max()

X_std.min()

X_std.max()

X_means[X_means == X_means.min()]

X_means[X_means == X_means.max()]

X_std[X_std == X_std.min()]

X_std[X_std == X_std.max()]

In [15]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'], dtype=np.float64)
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'], dtype=np.float64)

# Per-row statistics are computed with axis=1, which is equivalent to looping
# over the rows but much faster on ~10k samples.
X_train_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_rowstats['mean'] = X.mean(axis=1)
X_train_rowstats['std'] = X.std(axis=1)
X_train_rowstats['min'] = X.min(axis=1)
X_train_rowstats['max'] = X.max(axis=1)

# 'total' is a composite row-magnitude feature; the natural log keeps its scale manageable.
X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_rowstats['e_mean'] = X_test.mean(axis=1)
X_test_rowstats['e_std'] = X_test.std(axis=1)
X_test_rowstats['e_min'] = X_test.min(axis=1)
X_test_rowstats['e_max'] = X_test.max(axis=1)

X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm

X_train_rowstats.head()


Out[15]:
filename mean std min max total logtotal
0 01IsoiSMh5gxyDYTl4CB 648.620278 6811.551280 0 87555 3.868276e+11 26.681245
1 01SuzwMJEIXsK7A8dQbl 68.274354 462.249118 0 5817 1.835831e+08 19.028178
2 01azqd4InC7m9JpocGv5 5550.391650 86112.319736 0 1367070 6.534008e+14 34.113212
3 01jsnpXSAlgw6aPeDxrU 284.071571 4153.720695 0 65928 7.779200e+10 25.077304
4 01kcPWA9K2BOxQeS5Rju 4.930417 26.064759 0 445 5.718701e+04 10.954082

5 rows × 7 columns


In [16]:
X_test_rowstats.head()


Out[16]:
filename e_mean e_std e_min e_max e_total e_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 144.286282 1977.507215 0 31361 8.948145e+09 22.914712
1 Ig2DB5tSiEy1cJvV0zdw 961.739563 14923.803067 0 236923 3.400511e+12 28.854947
2 Jmo6eIhLZ4t9r8QsxEg5 1039.353877 16206.598011 0 257289 4.333876e+12 29.097483
3 JtPFl4ewgdD78OzCMa3o 85.041750 1035.014770 0 16341 1.438326e+09 21.086746
4 K3ZtByPHGSFYNljDUEXp 3133.671968 49409.150011 0 784363 1.214445e+14 32.430479

5 rows × 7 columns


In [17]:
# Image row stats
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'], dtype=np.float64)
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'], dtype=np.float64)

# Same vectorized row statistics as above, this time for the image-derived ASM features.
X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_image_asm_rowstats['tr_mean'] = X_train_image_asm.mean(axis=1)
X_train_image_asm_rowstats['tr_std'] = X_train_image_asm.std(axis=1)
X_train_image_asm_rowstats['tr_min'] = X_train_image_asm.min(axis=1)
X_train_image_asm_rowstats['tr_max'] = X_train_image_asm.max(axis=1)
X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_image_asm_rowstats['te_mean'] = X_test_image_asm.mean(axis=1)
X_test_image_asm_rowstats['te_std'] = X_test_image_asm.std(axis=1)
X_test_image_asm_rowstats['te_min'] = X_test_image_asm.min(axis=1)
X_test_image_asm_rowstats['te_max'] = X_test_image_asm.max(axis=1)
X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm

X_train_image_asm_rowstats.head()


Out[17]:
filename tr_mean tr_std tr_min tr_max tr_total tr_logtotal
0 01IsoiSMh5gxyDYTl4CB 55.516 37.775597 9 124 260046.605955 12.468616
1 01SuzwMJEIXsK7A8dQbl 49.790 33.782605 9 124 208572.454549 12.248042
2 01azqd4InC7m9JpocGv5 49.790 33.782605 9 124 208572.454549 12.248042
3 01jsnpXSAlgw6aPeDxrU 49.790 33.782605 9 124 208572.454549 12.248042
4 01kcPWA9K2BOxQeS5Rju 52.780 36.023534 9 124 235763.942362 12.370586

5 rows × 7 columns


In [18]:
X_test_image_asm_rowstats.head()


Out[18]:
filename te_mean te_std te_min te_max te_total te_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 50.432 34.630506 9 124 216564.221800 12.285642
1 Ig2DB5tSiEy1cJvV0zdw 50.240 34.704686 9 124 216201.864665 12.283968
2 Jmo6eIhLZ4t9r8QsxEg5 52.826 36.020698 9 124 235950.846586 12.371379
3 JtPFl4ewgdD78OzCMa3o 52.826 36.020698 9 124 235950.846586 12.371379
4 K3ZtByPHGSFYNljDUEXp 50.240 34.704686 9 124 216201.864665 12.283968

5 rows × 7 columns


In [19]:
# Write column stats and row stats to file

X_train_rowstats.to_csv('data/train-asm-rowstats-50percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-50percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-50percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-50percent.csv', index=False)

In [20]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/final-combined-train-data-50percent.csv', index=False)

combined_train_data.head()


Out[20]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 0 0 8 224 49 34 25 0 41 191 52 38 163 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 0 1 4 0 2 6 22 7 1 4 0 3 37 2 4 9 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 ...

5 rows × 1024 columns


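Because every frame is merged on filename, it is worth confirming that the combined training rows still line up one-to-one with the sorted labels. A small optional sanity check, assuming all the merged frames contain the same filenames in the same sorted order:

In [ ]:
# Sanity check: the merged training data should align row-for-row with the labels.
assert combined_train_data.shape[0] == sorted_train_labels.shape[0]
assert (combined_train_data['filename'].values == sorted_train_labels['Id'].values).all()
print("Combined training data aligned with labels:", combined_train_data.shape)
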
In [21]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# Result is better without the image rowstats for all image features
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/final-combined-test-data-50percent.csv', index=False)

combined_test_data.head()


Out[21]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 0 1 0 0 0 9 51 1 0 4 1 3 1 3 0 1 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 0 1 1 0 0 4 52 0 1 0 1 2 2 2 0 1 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 ...

5 rows × 1024 columns

4. Perform Some Classification Tests


In [22]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss

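A quick way to see what the function returns is to run it on a tiny made-up example (two samples, three classes, values chosen purely for illustration); a confident correct prediction drives the loss toward zero:

In [ ]:
# Toy example: two samples, three classes, fairly confident correct predictions.
toy_true = np.array([0, 2])
toy_pred = np.array([[0.90, 0.05, 0.05],
                     [0.10, 0.10, 0.80]])
print(multiclass_log_loss(toy_true, toy_pred))   # approximately 0.164
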
In [23]:
def run_cv(X, y, clf):

    # Construct a KFold object: 10 shuffled folds
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through folds, collecting out-of-fold probabilities and predictions
    for train_index, test_index in kf.split(X):
        print(test_index, train_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred

In [24]:
# Set our X,y for the classifiers
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1   # shift labels from 1..9 to 0..8 for multiclass_log_loss

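The confusion matrices below show that the nine malware families are quite unbalanced (class 5 has only around 40 examples), which is worth keeping in mind when reading the accuracy figures. A quick look at the label distribution:

In [ ]:
# Class distribution of the training labels (classes 1-9).
print(pd.Series(y).value_counts().sort_index())
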
In [25]:
# 10-fold cross-validation with an ExtraTrees classifier on the combined feature set.
# The identical runs in the cells below give a feel for the run-to-run variation of the CV shuffle.
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    3,     4,     7, ..., 10843, 10846, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([    0,    19,    22, ..., 10848, 10856, 10867]), array([    1,     2,     3, ..., 10864, 10865, 10866]))
(array([   12,    20,    28, ..., 10803, 10817, 10845]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   15,    71,    75, ..., 10851, 10864, 10866]), array([    0,     1,     2, ..., 10863, 10865, 10867]))
(array([   10,    43,    53, ..., 10852, 10853, 10857]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,     8,    48, ..., 10837, 10860, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   14,    25,    31, ..., 10818, 10836, 10854]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    18,    23, ..., 10840, 10862, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,    16,    17, ..., 10844, 10855, 10859]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
(array([    1,     9,    11, ..., 10832, 10849, 10858]), array([    0,     2,     3, ..., 10865, 10866, 10867]))
logloss = 0.0134
multiclass logloss = 0.0134
score = 0.9973
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2941    0    0    0    1    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   3    0    0    0   38    1    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   2    0    0    0    0    0  396    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]

In [26]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   10,    35,    46, ..., 10852, 10855, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,     8,     9, ..., 10840, 10860, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   26,    27,    34, ..., 10801, 10808, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     5,    14, ..., 10861, 10866, 10867]), array([    0,     2,     3, ..., 10863, 10864, 10865]))
(array([   12,    41,    49, ..., 10845, 10854, 10858]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,    16,    19, ..., 10831, 10857, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   11,    13,    42, ..., 10846, 10847, 10848]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    20,    22, ..., 10836, 10839, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     4,    15, ..., 10833, 10853, 10862]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    2,    28,    31, ..., 10844, 10849, 10851]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0135
multiclass logloss = 0.0135
score = 0.9976
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   3    0    0    0   38    1    0    0    0]
 [   4    0    0    0    0  747    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]

In [25]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   14,    22,    25, ..., 10840, 10859, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,     9,    36, ..., 10826, 10850, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     2,     6, ..., 10856, 10858, 10867]), array([    0,     3,     4, ..., 10864, 10865, 10866]))
(array([   10,    11,    15, ..., 10831, 10841, 10844]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,    16,    23, ..., 10838, 10863, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,    13,    20, ..., 10848, 10853, 10860]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    4,     8,    44, ..., 10847, 10852, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   17,    24,    27, ..., 10855, 10857, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   12,    18,    32, ..., 10830, 10833, 10849]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,    51,    60, ..., 10793, 10798, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
logloss = 0.0133
multiclass logloss = 0.0133
score = 0.9978
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   38    2    0    0    0]
 [   3    0    0    0    0  748    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    8 1005]]

In [24]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    8,    33,    41, ..., 10830, 10838, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     3,    18, ..., 10844, 10858, 10865]), array([    1,     2,     4, ..., 10864, 10866, 10867]))
(array([    9,    14,    27, ..., 10854, 10855, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   26,    39,    40, ..., 10829, 10853, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    12,    31, ..., 10842, 10863, 10867]), array([    0,     2,     3, ..., 10864, 10865, 10866]))
(array([   20,    28,    32, ..., 10823, 10836, 10852]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    11,    19, ..., 10848, 10849, 10851]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10850, 10857, 10859]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   21,    37,    72, ..., 10860, 10861, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    13, ..., 10805, 10825, 10845]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0140
multiclass logloss = 0.0140
score = 0.9977
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   5    0    0    0   37    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    9 1004]]

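Seaborn is imported at the top of the notebook but not otherwise used here; as an optional follow-up (a sketch that assumes cm from the previous cell is still in scope), the confusion matrix is easier to read as a heatmap:

In [ ]:
# Optional: visualize the last confusion matrix as a heatmap (classes 1-9).
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=range(1, 10), yticklabels=range(1, 10))
plt.xlabel("predicted class")
plt.ylabel("true class")
plt.show()
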
5. Test/Experimental Code Only


In [ ]:
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns)

class_stats.loc[1, 'mean'] = 1.0
class_stats.head()

# Example: mean feature values for a single class (class 1 here)
classx = X[combined_train_data['class'] == 1]
classxmean = classx.mean()
classxmean.head()

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i, 'mean'] = classx.mean().sum()
    class_stats.loc[i, 'std'] = classx.std().sum()
    #class_stats.loc[i, 'corr'] = classx.corr().sum()
    #class_stats.loc[i, 'cov'] = classx.cov().sum()

class_stats.head()

plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')