1. Analysis and Transormation of Selected Features to Strengthen Weak Learners


In [1]:
# Just import everything we might need, it saves time.
import warnings
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from time import time
from scipy.stats import randint as sp_randint
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score, KFold, train_test_split
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import RidgeClassifierCV
from sklearn.svm import SVC
import seaborn as sns
%pylab inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


Populating the interactive namespace from numpy and matplotlib

2. Load The Sorted Training Data Features

- sorted-train-malware-features-asm-reduced.csv
- sorted-train-malware-features-byte.csv
- sorted-train-labels.csv
- sorted-train-features-combined.csv

In [2]:
# First load the .asm and .byte training/test data and training labels
sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-10percent.csv')
sorted_train_data_byte = pd.read_csv('data/sorted-train-malware-features-byte.csv')
sorted_test_data_asm = pd.read_csv('data/sorted-test-malware-features-asm-10percent.csv')
sorted_test_data_byte = pd.read_csv('data/sorted-test-malware-features-byte.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
#combined_train_data = pd.read_csv('data/sorted-train-features-combined.csv')
#combined_test_data = pd.read_csv('data/sorted-test-features-combined.csv')

# Load the image data for training data
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm-10percent.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm-10percent.csv')
#sorted_train_image_byte = pd.read_csv('data/sorted-train-image-features-byte-reduced.csv')

# Now load the row statistics for the original feature set, these will be combined with the new features
all_train_rowstats = pd.read_csv('data/all-train-asm-rowstats.csv')
all_test_rowstats = pd.read_csv('data/all-test-asm-rowstats.csv')
all_train_image_asm_rowstats = pd.read_csv('data/all-train-image-asm-rowstats.csv')
all_test_image_asm_rowstats = pd.read_csv('data/all-test-image-asm-rowstats.csv')

In [ ]:
sorted_train_labels.head()

In [3]:
sorted_train_image_asm.head()


Out[3]:
filename ASM_141 ASM_144 ASM_150 ASM_155 ASM_163 ASM_172 ASM_185 ASM_187 ASM_189 ASM_214 ASM_221 ASM_222 ASM_223 ASM_225 ASM_226 ASM_243 ASM_244 ASM_245 ASM_246
0 01IsoiSMh5gxyDYTl4CB 58 52 9 9 59 115 101 9 101 118 115 115 101 98 108 116 101 120 116 ...
1 01SuzwMJEIXsK7A8dQbl 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
2 01azqd4InC7m9JpocGv5 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
3 01jsnpXSAlgw6aPeDxrU 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
4 01kcPWA9K2BOxQeS5Rju 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...

5 rows × 101 columns


In [4]:
sorted_test_image_asm.head()


Out[4]:
filename ASM_141 ASM_144 ASM_150 ASM_155 ASM_163 ASM_172 ASM_185 ASM_187 ASM_189 ASM_214 ASM_221 ASM_222 ASM_223 ASM_225 ASM_226 ASM_243 ASM_244 ASM_245 ASM_246
0 ITSUPtCmh7WdJcsYDwQ5 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
1 Ig2DB5tSiEy1cJvV0zdw 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
2 Jmo6eIhLZ4t9r8QsxEg5 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
3 JtPFl4ewgdD78OzCMa3o 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
4 K3ZtByPHGSFYNljDUEXp 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...

5 rows × 101 columns


In [5]:
# Assign asm data to X,y for brevity.
X = sorted_train_data_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]
# X_image_byte = sorted_train_image_byte.iloc[:,1:]

In [6]:
X_train_image_asm.shape


Out[6]:
(10868, 100)

In [7]:
X_test_image_asm.shape


Out[7]:
(10873, 100)

3. Perform Some Basic Statistical Analysis on the Feature Sets

Find the following:
- Feature and row mean
- Feature and row standard deviation
- Feature and row correlation coefficient
- Feature and row covariance
- Feature and row minimum and maximum

In [8]:
# Train feature stats
X_means = X.mean()
X_std = X.std()
X_cor = X.corr()
X_cov = X.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

# Not using byte image features
#X_image_byte_means = X_image_byte.mean()
#X_image_byte_std = X_image_byte.std()
#X_image_byte_cor = X_image_byte.corr()
#X_image_byte_cov = X_image_byte.cov()

In [9]:
X_train_image_asm_means.head()


Out[9]:
ASM_141    20.163508
ASM_144    18.768955
ASM_150    97.806956
ASM_155    82.360324
ASM_163    20.427310
dtype: float64

In [10]:
X_train_image_asm_std.head()


Out[10]:
ASM_141    20.545869
ASM_144    18.019311
ASM_150    48.209797
ASM_155    39.826070
ASM_163    21.071590
dtype: float64

In [11]:
X_test_image_asm_means.head()


Out[11]:
ASM_141    20.168675
ASM_144    18.760324
ASM_150    97.809988
ASM_155    82.354732
ASM_163    20.461510
dtype: float64

In [12]:
# The byte image data has low standard deviation and mean variance and is not very useful for learning
# so this data will not be used for further analysis.
X_test_image_asm_std.head()


Out[12]:
ASM_141    20.544820
ASM_144    18.012769
ASM_150    48.190723
ASM_155    39.821367
ASM_163    21.141740
dtype: float64

In [ ]:
X_means.head()

X_std.head()

X_cor.head()

X_cov.head()

X_means.min()

X_means.max()

X_std.min()

X_std.max()

X_means[X_means == X_means.min()]

X_means[X_means == X_means.max()]

X_std[X_std == X_std.min()]

X_std[X_std == X_std.max()]

In [13]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X.shape[0]), columns=['filename','mean','std','min','max','total','logtotal'], dtype=float64)
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','e_mean','e_std','e_min','e_max','e_total','e_logtotal'], dtype=float64)

X_train_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0,X.shape[0]):
    X_train_rowstats['mean'][i] = X.iloc[i,:].mean()
    X_train_rowstats['std'][i] = X.iloc[i,:].std()
    X_train_rowstats['min'][i] = X.iloc[i,:].min()
    X_train_rowstats['max'][i] = X.iloc[i,:].max()
    

X_train_rowstats['total'] = X_train_rowstats['max'] * X_train_rowstats['mean'] * X_train_rowstats['std']
X_train_rowstats['logtotal'] = np.log(X_train_rowstats['total']) # natural logarithm

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0,X_test.shape[0]):
    X_test_rowstats['e_mean'][i] = X_test.iloc[i,:].mean()
    X_test_rowstats['e_std'][i] = X_test.iloc[i,:].std()
    X_test_rowstats['e_min'][i] = X_test.iloc[i,:].min()
    X_test_rowstats['e_max'][i] = X_test.iloc[i,:].max()
    

X_test_rowstats['e_total'] = X_test_rowstats['e_max'] * X_test_rowstats['e_mean'] * X_test_rowstats['e_std']
X_test_rowstats['e_logtotal'] = np.log(X_test_rowstats['e_total']) # natural logarithm

X_train_rowstats.head()


Out[13]:
filename mean std min max total logtotal
0 01IsoiSMh5gxyDYTl4CB 3220.544554 14985.141846 0 87555 4.225432e+12 29.072143
1 01SuzwMJEIXsK7A8dQbl 337.425743 990.494123 0 5817 1.944147e+09 21.388089
2 01azqd4InC7m9JpocGv5 27635.227723 191333.687686 0 1367070 7.228451e+15 36.516801
3 01jsnpXSAlgw6aPeDxrU 1411.188119 9219.899598 0 65928 8.577900e+11 27.477625
4 01kcPWA9K2BOxQeS5Rju 23.405941 54.519937 0 445 5.678602e+05 13.249631

5 rows × 7 columns


In [14]:
X_test_rowstats.head()


Out[14]:
filename e_mean e_std e_min e_max e_total e_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 714.881188 4383.992211 0 31361 9.828643e+10 25.311152
1 Ig2DB5tSiEy1cJvV0zdw 4785.782178 33159.818362 0 236923 3.759865e+13 31.257989
2 Jmo6eIhLZ4t9r8QsxEg5 5171.564356 36013.098378 0 257289 4.791855e+13 31.500524
3 JtPFl4ewgdD78OzCMa3o 419.376238 2288.297974 0 16341 1.568177e+10 23.475765
4 K3ZtByPHGSFYNljDUEXp 15603.920792 109811.760353 0 784363 1.344001e+15 34.834428

5 rows × 7 columns


In [15]:
# Image row stats
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','tr_mean','tr_std','tr_min','tr_max','tr_total','tr_logtotal'], dtype=float64)
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','te_mean','te_std','te_min','te_max','te_total','te_logtotal'], dtype=float64)

X_train_image_asm_rowstats['filename'] = sorted_train_image_asm['filename']
for i in range(0,X_train_image_asm.shape[0]):
    X_train_image_asm_rowstats['tr_mean'][i] = X_train_image_asm.iloc[i,:].mean()
    X_train_image_asm_rowstats['tr_std'][i] = X_train_image_asm.iloc[i,:].std()
    X_train_image_asm_rowstats['tr_min'][i] = X_train_image_asm.iloc[i,:].min()
    X_train_image_asm_rowstats['tr_max'][i] = X_train_image_asm.iloc[i,:].max()
    

X_train_image_asm_rowstats['tr_total'] = X_train_image_asm_rowstats['tr_max'] * X_train_image_asm_rowstats['tr_mean'] * X_train_image_asm_rowstats['tr_std']
X_train_image_asm_rowstats['tr_logtotal'] = np.log(X_train_image_asm_rowstats['tr_total']) # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_image_asm['filename']
for i in range(0,X_test_image_asm.shape[0]):
    X_test_image_asm_rowstats['te_mean'][i] = X_test_image_asm.iloc[i,:].mean()
    X_test_image_asm_rowstats['te_std'][i] = X_test_image_asm.iloc[i,:].std()
    X_test_image_asm_rowstats['te_min'][i] = X_test_image_asm.iloc[i,:].min()
    X_test_image_asm_rowstats['te_max'][i] = X_test_image_asm.iloc[i,:].max()
    

X_test_image_asm_rowstats['te_total'] = X_test_image_asm_rowstats['te_max'] * X_test_image_asm_rowstats['te_mean'] * X_test_image_asm_rowstats['te_std']
X_test_image_asm_rowstats['te_logtotal'] = np.log(X_test_image_asm_rowstats['te_total']) # natural logarithm

X_train_image_asm_rowstats.head()


Out[15]:
filename tr_mean tr_std tr_min tr_max tr_total tr_logtotal
0 01IsoiSMh5gxyDYTl4CB 73.60 44.988888 9 124 410586.583033 12.925342
1 01SuzwMJEIXsK7A8dQbl 46.72 40.429282 9 124 234218.152663 12.364008
2 01azqd4InC7m9JpocGv5 46.72 40.429282 9 124 234218.152663 12.364008
3 01jsnpXSAlgw6aPeDxrU 46.72 40.429282 9 124 234218.152663 12.364008
4 01kcPWA9K2BOxQeS5Rju 48.40 41.941518 9 124 251716.212779 12.436058

5 rows × 7 columns


In [16]:
X_test_image_asm_rowstats.head()


Out[16]:
filename te_mean te_std te_min te_max te_total te_logtotal
0 ITSUPtCmh7WdJcsYDwQ5 38.39 38.039027 9 124 181079.462610 12.106691
1 Ig2DB5tSiEy1cJvV0zdw 38.13 37.343790 9 124 176565.922610 12.081450
2 Jmo6eIhLZ4t9r8QsxEg5 48.39 41.941492 9 124 251664.053562 12.435850
3 JtPFl4ewgdD78OzCMa3o 48.39 41.941492 9 124 251664.053562 12.435850
4 K3ZtByPHGSFYNljDUEXp 38.13 37.343790 9 124 176565.922610 12.081450

5 rows × 7 columns


In [17]:
# Write column stats and row stats to file

X_train_rowstats.to_csv('data/train-asm-rowstats.csv', index=False)
X_test_rowstats.to_csv('data/test-asm-rowstats.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/train-image-asm-rowstats.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/test-image-asm-rowstats.csv', index=False)

4. Combine All Features into Two Files (TRAIN and TEST)


In [27]:
# Now merge all the training set features and write to file
combined_train_data = sorted_train_data_asm.merge(sorted_train_data_byte, on='filename')

combined_train_data = combined_train_data.merge(X_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(X_train_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_train_data = combined_train_data.merge(sorted_train_image_asm, on='filename')

combined_train_data = combined_train_data.merge(all_train_rowstats, on='filename')

combined_train_data = combined_train_data.merge(all_train_image_asm_rowstats, on='filename')

combined_train_data.to_csv('data/all-combined-train-data-10percent.csv', index=False)

# Now merge all the testing set features and write to file
combined_test_data = sorted_test_data_asm.merge(sorted_test_data_byte, on='filename')

combined_test_data = combined_test_data.merge(X_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(X_test_image_asm_rowstats, on='filename', suffixes=('_A', '_I'))

combined_test_data = combined_test_data.merge(sorted_test_image_asm, on='filename')

combined_test_data = combined_test_data.merge(all_test_rowstats, on='filename')

combined_test_data = combined_test_data.merge(all_test_image_asm_rowstats, on='filename')

combined_test_data.to_csv('data/all-combined-test-data-10percent.csv', index=False)

In [19]:
combined_train_data.head()


Out[19]:
filename edx esi ah al ax bl bx cl cx dl dx eax ebp ebx ecx edi esp add al.1
0 01IsoiSMh5gxyDYTl4CB 750 496 8 224 49 25 0 191 52 163 63 1447 905 260 1093 393 420 323 79 ...
1 01SuzwMJEIXsK7A8dQbl 1121 24 6 22 7 4 0 37 2 9 3 1220 1544 18 1228 24 107 427 8 ...
2 01azqd4InC7m9JpocGv5 1493 1900 1 398 0 47 0 77 4 56 2 4438 591 810 2317 1284 701 622 262 ...
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 1 0 1 2 0 0 942 451 5 547 5 56 32 0 ...
4 01kcPWA9K2BOxQeS5Rju 23 35 0 3 0 1 0 1 0 0 0 137 43 19 66 15 81 11 1 ...

5 rows × 228 columns


In [20]:
combined_test_data.head()


Out[20]:
filename edx esi ah al ax bl bx cl cx dl dx eax ebp ebx ecx edi esp add al.1
0 ITSUPtCmh7WdJcsYDwQ5 245 434 9 51 1 4 1 1 3 1 2 553 302 300 245 223 41 204 15 ...
1 Ig2DB5tSiEy1cJvV0zdw 258 437 11 60 1 2 2 1 2 1 3 554 321 298 218 198 44 175 20 ...
2 Jmo6eIhLZ4t9r8QsxEg5 238 365 8 51 0 4 1 1 2 0 0 519 285 268 194 236 46 173 23 ...
3 JtPFl4ewgdD78OzCMa3o 241 556 4 52 0 0 1 2 2 1 2 668 402 298 208 225 61 214 27 ...
4 K3ZtByPHGSFYNljDUEXp 92 75 0 1 0 0 0 2 0 0 0 402 186 103 139 226 225 119 0 ...

5 rows × 228 columns

5. Some Plots


In [ ]:
# Do some plotting
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel('logtotal')
plt.scatter(Xrowstats['mean'], Xrowstats['logtotal'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("total")
plt.ylabel('class')
plt.scatter(Xrowstats['logtotal'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel("std")
plt.scatter(Xrowstats['mean'], Xrowstats['std'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel('class')
plt.scatter(Xrowstats['mean'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("standard deviation")
plt.ylabel('class')
plt.scatter(Xrowstats['std'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("max")
plt.ylabel('class')
plt.scatter(Xrowstats['max'], y, c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
sns.pairplot(Xrowstats, vars=['mean','std'], kind='reg', hue='class', size=6)

In [ ]:
plt.figure(figsize=(15,15))
sns.pairplot(Xrowstats, vars=['min','max'], kind='reg', hue='class', size=6)

In [ ]:
plt.figure(figsize=(15,15))
sns.pairplot(Xrowstats, vars=['mean','std','min','max'], kind='reg', hue='class', size=4)

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(X_means, X_std, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("mean")
plt.ylabel("malware class")
ya = range(0,101)
plt.scatter(X_means, ya, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("fword")
plt.ylabel("db")
plt.scatter(X['fword'], X['db'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("WinMain")
plt.ylabel("db.1")
plt.scatter(X['WinMain'], X['db.1'], c=y, cmap='brg')

In [ ]:
plt.figure(figsize=(15,15))
plt.xlabel("X_image_asm")
plt.ylabel("Malware Class")
plt.scatter(X_image_asm.iloc[:,5], y, c=y, cmap='brg')

In [ ]:
#sns.set()
#sns.set_context("paper")
plt.figure(figsize=(15,15))
sns.pairplot(combined_train_data, vars=['fword','db'], kind='reg', hue='class', size=6)

In [ ]:
help(np.zeros)

6. Perform Some Classification Tests


In [21]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    Parameters
    ----------
    y_true : array, shape = [n_samples]
            true class, intergers in [0, n_classes - 1)
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    n_samples = actual.shape[0]
    actual[np.arange(n_samples), y_true.astype(int)] = 1
    vectsum = np.sum(actual * np.log(predictions))
    loss = -1.0 / n_samples * vectsum
    return loss

In [22]:
def run_cv(X,y, clf):

    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_prob = np.zeros((len(y),9))
    y_pred = np.zeros(len(y))
    
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]

        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    
    return y_prob, y_pred

In [23]:
X = combined_train_data.iloc[:,1:]

In [24]:
X.head()


Out[24]:
edx esi ah al ax bl bx cl cx dl dx eax ebp ebx ecx edi esp add al.1 call
0 750 496 8 224 49 25 0 191 52 163 63 1447 905 260 1093 393 420 323 79 333 ...
1 1121 24 6 22 7 4 0 37 2 9 3 1220 1544 18 1228 24 107 427 8 194 ...
2 1493 1900 1 398 0 47 0 77 4 56 2 4438 591 810 2317 1284 701 622 262 1533 ...
3 525 4 0 0 0 1 0 1 2 0 0 942 451 5 547 5 56 32 0 125 ...
4 23 35 0 3 0 1 0 1 0 0 0 137 43 19 66 15 81 11 1 53 ...

5 rows × 227 columns


In [25]:
X.shape


Out[25]:
(10868, 227)

In [26]:
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,y,clf1)
print("logloss = {:.4f}".format(log_loss(y, p1)))
yloss = y - 1
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    0,    16,    76, ..., 10858, 10860, 10864]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([    4,     6,    14, ..., 10851, 10855, 10857]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   26,    41,    54, ..., 10844, 10859, 10865]), array([    0,     1,     2, ..., 10864, 10866, 10867]))
(array([   10,    15,    18, ..., 10840, 10854, 10861]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,     7,     8, ..., 10852, 10866, 10867]), array([    0,     1,     3, ..., 10863, 10864, 10865]))
(array([    1,     3,     9, ..., 10785, 10813, 10835]), array([    0,     2,     4, ..., 10865, 10866, 10867]))
(array([   21,    36,    43, ..., 10820, 10825, 10847]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   17,    19,    40, ..., 10838, 10843, 10862]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   13,    22,    32, ..., 10839, 10842, 10853]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    5,    24,    29, ..., 10849, 10856, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
logloss = 0.0177
multiclass logloss = 0.0177
score = 0.9971
[[1538    0    0    0    0    3    0    0    0]
 [   2 2474    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  473    0    1    0    0    0]
 [   3    0    0    0   38    1    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0   10 1003]]

In [48]:
print("score = {:.4f}".format(accuracy_score(y, pred1)))
cm = confusion_matrix(y, pred1)
print(cm)


score = 0.9975
[[1539    0    0    0    0    2    0    0    0]
 [   2 2474    2    0    0    0    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   1    0    0  473    0    1    0    0    0]
 [   3    0    0    0   39    0    0    0    0]
 [   5    0    0    0    0  746    0    0    0]
 [   1    0    0    0    0    0  397    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    8 1005]]

In [49]:
yloss = y - 1
print("multiclass logloss = {:.4f}".format(multiclass_log_loss(yloss,p1)))


multiclass logloss = 0.0174

6. Test/Experimental Code Only


In [ ]:
combined_train_data['class'] = sorted_train_labels.iloc[:,1]

class1 = X[combined_train_data['class'] == 1]
class2 = X[combined_train_data['class'] == 2]
class3 = X[combined_train_data['class'] == 3]
class4 = X[combined_train_data['class'] == 4]
class5 = X[combined_train_data['class'] == 5]
class6 = X[combined_train_data['class'] == 6]
class7 = X[combined_train_data['class'] == 7]
class8 = X[combined_train_data['class'] == 8]
class9 = X[combined_train_data['class'] == 9]

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns)

class_stats['mean'][1] = 1.0
class_stats.head()

classx = X[combined_train_data['class'] == i]
classxmean = classx.mean()
classxmean.head()

columns = ['mean','std','corr','cov']
index = [ 1,2,3,4,5,6,7,8,9 ]
class_stats = pd.DataFrame(index=index, columns=columns, dtype=float)
for i in range(1,10):
  classx = X[combined_train_data['class'] == i]
  class_stats['mean'][i] = classx.mean().sum()
  class_stats['std'][i] = classx.std().sum()
  #class_stats['corr'][i] = classx.corr().sum()
  #class_stats['cov'][i] = classx.cov().sum()

class_stats.head()

plt.figure(figsize=(15,15))
plt.xlabel("means")
plt.ylabel("standard deviation")
plt.scatter(class_stats['mean'], class_stats['std'], c=[ 1,2,3,4,5,6,7,8,9 ], cmap='brg')