1. Final Selection of Features in the 40 Percent Best Feature Set


In [ ]:
# Import everything we might need up front; it saves time later.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# cross_validation and grid_search were removed from scikit-learn; model_selection replaces both.
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

2. Load the Sorted Training and Test Data Features

- sorted-train-malware-features-asm-40percent.csv
- sorted-train-malware-features-byte.csv
- sorted-test-malware-features-asm-40percent.csv
- sorted-test-malware-features-byte.csv
- sorted-train-labels.csv
- sorted-train-image-features-asm-40percent.csv
- sorted-test-image-features-asm-40percent.csv
- all-train-asm-rowstats.csv and all-test-asm-rowstats.csv

In [3]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-40percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-40percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-40percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-40percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set; these will be combined with the new features.
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
#all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
#all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')

In [4]:
sorted_train_data_asm.head()


Out[4]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 403 columns


In [4]:
sorted_train_labels.head()


Out[4]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

5 rows × 2 columns


In [5]:
sorted_train_image_asm.head()


Out[5]:
filename ASM_14 ASM_20 ASM_21 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ASM_43 ASM_47 ASM_124 ASM_125 ASM_135 ASM_137 ASM_138
0 01IsoiSMh5gxyDYTl4CB 9 32 32 13 10 116 101 120 116 49 48 48 48 9 45 45 10 116 101 ...
1 01SuzwMJEIXsK7A8dQbl 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
2 01azqd4InC7m9JpocGv5 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
3 01jsnpXSAlgw6aPeDxrU 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
4 01kcPWA9K2BOxQeS5Rju 48 9 9 68 69 58 49 48 48 9 9 9 9 59 13 10 48 48 48 ...

5 rows × 401 columns


In [6]:
sorted_test_image_asm.head()


Out[6]:
filename ASM_14 ASM_20 ASM_21 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ASM_43 ASM_47 ASM_124 ASM_125 ASM_135 ASM_137 ASM_138
0 ITSUPtCmh7WdJcsYDwQ5 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
1 Ig2DB5tSiEy1cJvV0zdw 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
2 Jmo6eIhLZ4t9r8QsxEg5 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
3 JtPFl4ewgdD78OzCMa3o 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...
4 K3ZtByPHGSFYNljDUEXp 48 9 9 68 69 58 48 48 52 9 9 9 9 59 13 10 52 48 48 ...

5 rows × 401 columns


In [7]:
sorted_train_data_byte.head()


Out[7]:
filename entropy filesize
0 01IsoiSMh5gxyDYTl4CB 0.614952 6874624
1 01SuzwMJEIXsK7A8dQbl 0.843262 460288
2 01azqd4InC7m9JpocGv5 0.703961 5256192
3 01jsnpXSAlgw6aPeDxrU 0.806035 4825600
4 01kcPWA9K2BOxQeS5Rju 0.871610 712704

5 rows × 3 columns
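
The `entropy` column lies in [0, 1]. One plausible way such a feature is computed upstream (an assumption about the extraction pipeline, not code from this notebook) is Shannon entropy of the byte histogram, normalized by 8 bits:

In [ ]:
def byte_entropy(data):
    # Shannon entropy of the byte-value histogram, scaled to [0, 1] by dividing by 8 bits.
    counts = np.bincount(np.frombuffer(data, dtype=np.uint8), minlength=256)
    p = counts[counts > 0] / float(len(data))
    return -(p * np.log2(p)).sum() / 8.0

byte_entropy(b'hello world')  # toy input; the real features came from the .bytes files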


In [7]:
sorted_test_data_asm.head()


Out[7]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 403 columns


In [8]:
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]

In [9]:
X_train_image_asm.shape


Out[9]:
(10868, 400)

In [10]:
X_test_image_asm.shape


Out[10]:
(10873, 400)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following (a short sketch of the pandas axis convention follows the list):
- Feature and row mean
- Feature and row standard deviation
- Feature and row correlation coefficient
- Feature and row covariance
- Feature and row minimum and maximum
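
All of the feature-level statistics below reduce over rows (pandas `axis=0`, the default), while the row statistics reduce over columns (`axis=1`). A minimal illustration on a toy frame (hypothetical values):

In [ ]:
toy = pd.DataFrame({'eax': [1, 2, 3], 'ebx': [4, 5, 6]})
print(toy.mean())        # feature (column) means: one value per register
print(toy.mean(axis=1))  # row means: one value per sample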

In [11]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()

In [12]:
X_train_image_asm_means.head()


Out[12]:
ASM_14    39.147405
ASM_20    14.223040
ASM_21    14.223040
ASM_28    55.418936
ASM_29    55.515366
dtype: float64

In [13]:
X_train_image_asm_std.head()


Out[13]:
ASM_14    16.344400
ASM_20     9.636302
ASM_21     9.636302
ASM_28    23.048969
ASM_29    24.740569
dtype: float64

In [14]:
X_test_image_asm_means.head()


Out[14]:
ASM_14    39.151936
ASM_20    14.224133
ASM_21    14.220638
ASM_28    55.430608
ASM_29    55.529937
dtype: float64

In [15]:
# The byte image data has low variance in both its means and standard deviations and is not
# very useful for learning, so it is excluded from any further analysis.
X_test_image_asm_std.head()


Out[15]:
ASM_14    16.335403
ASM_20     9.648070
ASM_21     9.634737
ASM_28    23.041988
ASM_29    24.736962
dtype: float64

In [ ]:
# Assorted interactive checks on the column statistics.
X_means.head()
X_std.head()
X_cor.head()
X_cov.head()

# Range of the feature means and standard deviations.
X_means.min(), X_means.max()
X_std.min(), X_std.max()

# Which features attain those extremes.
X_means[X_means == X_means.min()]
X_means[X_means == X_means.max()]
X_std[X_std == X_std.min()]
X_std[X_std == X_std.max()]
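
The boolean-mask lookups above can also be written with `idxmin`/`idxmax`, which return the feature names directly (equivalent, assuming the extremes are unique):

In [ ]:
# Same information as the masks above, one call each.
print(X_means.idxmin(), X_means.idxmax())
print(X_std.idxmin(), X_std.idxmax())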

In [16]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'])
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'])

# Vectorized row reductions (axis=1) replace the per-row loop and avoid chained assignment.
X_train_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_rowstats['mean'] = X.mean(axis=1)
X_train_rowstats['std'] = X.std(axis=1)
X_train_rowstats['min'] = X.min(axis=1)
X_train_rowstats['max'] = X.max(axis=1)
X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_rowstats['e_mean'] = X_test.mean(axis=1)
X_test_rowstats['e_std'] = X_test.std(axis=1)
X_test_rowstats['e_min'] = X_test.min(axis=1)
X_test_rowstats['e_max'] = X_test.max(axis=1)
X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm

X_train_rowstats.head()


Out[16]:
filename mean std min max total logtotal
0 01IsoiSMh5gxyDYTl4CB 811.335821 7612.563931 0 87555 5.407700e+11 27.016260
1 01SuzwMJEIXsK7A8dQbl 85.353234 515.786719 0 5817 2.560880e+08 19.361032
2 01azqd4InC7m9JpocGv5 6944.706468 96298.117166 0 1367070 9.142447e+14 34.449119
3 01jsnpXSAlgw6aPeDxrU 355.380597 4644.741791 0 65928 1.088241e+11 25.412999
4 01kcPWA9K2BOxQeS5Rju 6.146766 29.034778 0 445 7.941914e+04 11.282495

5 rows × 7 columns
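
One caveat: a sample whose features were all zero would give `total = 0` and `logtotal = -inf`. A minimal guard, assuming we wanted a similar scale (not applied here, since no such rows appear above), is `np.log1p`:

In [ ]:
# Hypothetical alternative: log1p maps a total of 0 to 0 instead of -inf.
safe_logtotal = np.log1p(X_train_rowstats['total'])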


In [17]:
X_test_rowstats.head()


Out[17]:
filename e_mean e_std e_min e_max e_total e_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 180.465174 2211.096404 0 31361 1.251385e+10 23.250102
1 Ig2DB5tSiEy1cJvV0zdw 1203.296020 16689.071688 0 236923 4.757862e+12 29.190820
2 Jmo6eIhLZ4t9r8QsxEg5 1300.333333 18123.699199 0 257289 6.063491e+12 29.433307
3 JtPFl4ewgdD78OzCMa3o 106.320896 1157.070503 0 16341 2.010282e+09 21.421541
4 K3ZtByPHGSFYNljDUEXp 3920.910448 55254.429136 0 784363 1.699304e+14 32.766410

5 rows × 7 columns


In [18]:
# Image row stats, computed the same vectorized way.
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'])
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'])

X_train_image_asm_rowstats['filename'] = sorted_train_data_asm['filename']
X_train_image_asm_rowstats['tr_mean'] = X_train_image_asm.mean(axis=1)
X_train_image_asm_rowstats['tr_std'] = X_train_image_asm.std(axis=1)
X_train_image_asm_rowstats['tr_min'] = X_train_image_asm.min(axis=1)
X_train_image_asm_rowstats['tr_max'] = X_train_image_asm.max(axis=1)
X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_data_asm['filename']
X_test_image_asm_rowstats['te_mean'] = X_test_image_asm.mean(axis=1)
X_test_image_asm_rowstats['te_std'] = X_test_image_asm.std(axis=1)
X_test_image_asm_rowstats['te_min'] = X_test_image_asm.min(axis=1)
X_test_image_asm_rowstats['te_max'] = X_test_image_asm.max(axis=1)
X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm

X_train_image_asm_rowstats.head()


Out[18]:
filename tr_mean tr_std tr_min tr_max tr_total tr_logtotal
0 01IsoiSMh5gxyDYTl4CB 56.7100 38.936693 9 124 273804.382950 12.520169
1 01SuzwMJEIXsK7A8dQbl 49.1675 35.480934 9 124 216319.094666 12.284510
2 01azqd4InC7m9JpocGv5 49.1675 35.480934 9 124 216319.094666 12.284510
3 01jsnpXSAlgw6aPeDxrU 49.1675 35.480934 9 124 216319.094666 12.284510
4 01kcPWA9K2BOxQeS5Rju 52.4775 37.763470 9 124 245734.831905 12.412008

5 rows × 7 columns


In [19]:
X_test_image_asm_rowstats.head()


Out[19]:
filename te_mean te_std te_min te_max te_total te_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 48.2375 36.535506 9 124 218535.302616 12.294703
1 Ig2DB5tSiEy1cJvV0zdw 48.1175 36.252162 9 124 216301.061581 12.284427
2 Jmo6eIhLZ4t9r8QsxEg5 52.5225 37.761081 9 124 245929.991034 12.412802
3 JtPFl4ewgdD78OzCMa3o 52.5225 37.761081 9 124 245929.991034 12.412802
4 K3ZtByPHGSFYNljDUEXp 48.1175 36.252162 9 124 216301.061581 12.284427

5 rows × 7 columns


In [20]:
# Write the row stats for the train/test data and image features to file.

X_train_rowstats.to_csv('data/train-asm-rowstats-40percent.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats-40percent.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats-40percent.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats-40percent.csv', index=False)

In [21]:
# Combine all the training features and write to file.
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

# Results were better without the row stats computed over the full image feature set.
#combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/final-combined-train-data-40percent.csv', index=False)

combined_train_data.head()


Out[21]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 823 columns
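
A quick optional check on the merge chain, assuming `filename` is a unique key in every frame:

In [ ]:
# Each merge is one-to-one on 'filename', so the row count and key uniqueness should survive.
assert combined_train_data['filename'].is_unique
print(combined_train_data.shape)  # expect (10868, 823)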


In [22]:
# Combine all the testing features and write to file.
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

# Results were better without the row stats computed over the full image feature set.
#combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/final-combined-test-data-40percent.csv', index=False)

combined_test_data.head()


Out[22]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
0 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 0 4 1 3 1 3 0 1 2 553 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 3 554 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 0 519 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 1 0 1 2 2 2 0 1 2 668 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 402 ...

5 rows × 823 columns

4. Perform Some Classification Tests


In [23]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class labels, integers in [0, n_classes)
    y_pred : array, shape = [n_samples, n_classes]
            predicted class probabilities, one row per sample

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss
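
A minimal sanity check on toy values (hypothetical, not from the dataset): with 0-based labels and rows summing to 1, the custom metric should agree with sklearn's log_loss, as the matching scores in the runs below confirm.

In [ ]:
y_true_toy = np.array([0, 1, 2])
y_prob_toy = np.array([[0.8, 0.1, 0.1],
                       [0.2, 0.7, 0.1],
                       [0.1, 0.2, 0.7]])
print(multiclass_log_loss(y_true_toy, y_prob_toy))         # ~0.3122
print(log_loss(y_true_toy, y_prob_toy, labels=[0, 1, 2]))  # same value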

In [24]:
def run_cv(X, y, clf):

    # Construct a KFold object (model_selection API) for 10-fold CV.
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through the folds, collecting out-of-fold predictions.
    for train_index, test_index in kf.split(X):
        print(test_index, train_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred
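
Note that `run_cv` uses an unstratified split; with classes as small as a few dozen samples (see the confusion matrices below), a stratified variant can give more stable per-class estimates. A sketch of the alternative, not what was run here:

In [ ]:
# Hypothetical variant: preserve class proportions in every fold.
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=10, shuffle=True)
for train_index, test_index in skf.split(X, y):
    pass  # same per-fold fit/predict logic as run_cv above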

In [25]:
# Set our X,y for the classifiers
X = combined_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
yloss = y - 1   # shift the labels from 1-9 to 0-8 for the custom log-loss metric

In [26]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   10,    35,    46, ..., 10852, 10855, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,     8,     9, ..., 10840, 10860, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   26,    27,    34, ..., 10801, 10808, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     5,    14, ..., 10861, 10866, 10867]), array([    0,     2,     3, ..., 10863, 10864, 10865]))
(array([   12,    41,    49, ..., 10845, 10854, 10858]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,    16,    19, ..., 10831, 10857, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   11,    13,    42, ..., 10846, 10847, 10848]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    20,    22, ..., 10836, 10839, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     4,    15, ..., 10833, 10853, 10862]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    2,    28,    31, ..., 10844, 10849, 10851]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0135
multiclass logloss = 0.0135
score = 0.9976
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   3    0    0    0   38    1    0    0    0]
 [   4    0    0    0    0  747    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    9 1004]]

In [25]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([   14,    22,    25, ..., 10840, 10859, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    7,     9,    36, ..., 10826, 10850, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,     2,     6, ..., 10856, 10858, 10867]), array([    0,     3,     4, ..., 10864, 10865, 10866]))
(array([   10,    11,    15, ..., 10831, 10841, 10844]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,    16,    23, ..., 10838, 10863, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,    13,    20, ..., 10848, 10853, 10860]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    4,     8,    44, ..., 10847, 10852, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   17,    24,    27, ..., 10855, 10857, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   12,    18,    32, ..., 10830, 10833, 10849]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    3,    51,    60, ..., 10793, 10798, 10814]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
logloss = 0.0133
multiclass logloss = 0.0133
score = 0.9978
[[1540    0    0    0    0    1    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   2    0    0    0   38    2    0    0    0]
 [   3    0    0    0    0  748    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1225    3]
 [   0    0    0    0    0    0    0    8 1005]]

In [24]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    8,    33,    41, ..., 10830, 10838, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    0,     3,    18, ..., 10844, 10858, 10865]), array([    1,     2,     4, ..., 10864, 10866, 10867]))
(array([    9,    14,    27, ..., 10854, 10855, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([   26,    39,    40, ..., 10829, 10853, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    1,    12,    31, ..., 10842, 10863, 10867]), array([    0,     2,     3, ..., 10864, 10865, 10866]))
(array([   20,    28,    32, ..., 10823, 10836, 10852]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    6,    11,    19, ..., 10848, 10849, 10851]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10850, 10857, 10859]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   21,    37,    72, ..., 10860, 10861, 10864]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,    13, ..., 10805, 10825, 10845]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
logloss = 0.0140
multiclass logloss = 0.0140
score = 0.9977
[[1541    0    0    0    0    0    0    0    0]
 [   1 2475    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  474    0    0    0    0    0]
 [   5    0    0    0   37    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    9 1004]]
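
The three runs above score loglosses of 0.0135, 0.0133, and 0.0140, so there is visible run-to-run variance from the random fold assignment. One way to smooth this, sketched as an assumption rather than something done here, is to average the out-of-fold probabilities across repeats:

In [ ]:
# Hypothetical: average out-of-fold probabilities over several CV repeats.
probs = [run_cv(X, y, clf1)[0] for _ in range(3)]
p_avg = np.mean(probs, axis=0)
print("averaged logloss = {:.4f}".format(log_loss(y, p_avg)))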

5. Test/Experimental Code Only


In [ ]:
# Attach the class labels so we can slice the feature matrix per class.
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

# Aggregate per-class summary statistics (corr and cov left for later).
columns = ['mean','std','corr','cov']
index = [1,2,3,4,5,6,7,8,9]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
    classx = X[combined_train_data['class'] == i]
    class_stats.loc[i,'mean'] = classx.mean().sum()
    class_stats.loc[i,'std'] = classx.std().sum()
    #class_stats.loc[i,'corr'] = classx.corr().sum()
    #class_stats.loc[i,'cov'] = classx.cov().sum()

class_stats.head()

# Scatter the per-class mean/std aggregates, one colour per class.
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[1,2,3,4,5,6,7,8,9], cmap='brg')