In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, KFold
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [2]:
train_data = pd.read_csv('data/train-malware-features-asm.csv')
labels = pd.read_csv('data/trainLabels.csv')
sorted_train_data = train_data.sort_values(by='filename', ascending=True)
sorted_train_labels = labels.sort_values(by='Id', ascending=True)
X = sorted_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
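In [ ]:
# Added sanity-check sketch: the cells below assume that sorting the features by
# 'filename' and the labels by 'Id' puts both frames in the same row order.
# This assertion makes that assumption explicit before X and y are used together.
assert (sorted_train_data['filename'].values == sorted_train_labels['Id'].values).all()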
In [3]:
print(X.shape)
In [4]:
print(y.shape)
In [5]:
sorted_train_data.head()
Out[5]:
In [20]:
sorted_train_labels.head()
Out[20]:
In [21]:
train_data.head()
Out[21]:
In [23]:
y
Out[23]:
In [7]:
# Select the top 10 percent of features by chi-squared score, from 1006 -> 101 features
fsp = SelectPercentile(chi2, percentile=10)
X_new_10 = fsp.fit_transform(X,y)
X_new_10.shape
Out[7]:
In [8]:
X_new_10
Out[8]:
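In [ ]:
# Added exploratory sketch: rank the original columns by their chi-squared scores.
# fsp.scores_ holds one score per column of X, so the highest-scoring features are
# the ones SelectPercentile keeps.
feature_scores = pd.Series(fsp.scores_, index=X.columns)
feature_scores.sort_values(ascending=False).head(10)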
In [9]:
selected_names = fsp.get_support(indices=True)
# Shift the selected indices by one so they index into sorted_train_data,
# whose first column is 'filename'.
selected_names = selected_names + 1
selected_names
Out[9]:
In [10]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()
Out[10]:
In [11]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-10percent.csv', index=False)
sorted_train_labels.to_csv('data/sorted-train-labels.csv', index=False)
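In [ ]:
# The select-trim-save pattern above is repeated for each percentile below, so here
# is a helper sketch that wraps it. The function name is ours, not from the original
# notebook, and it assumes the first column of 'data' is 'filename'.
def select_and_save(data, X, y, percentile, out_path):
    fsp = SelectPercentile(chi2, percentile=percentile)
    fsp.fit(X, y)
    # Shift the selected indices by one to skip the 'filename' column.
    cols = fsp.get_support(indices=True) + 1
    reduced = pd.DataFrame(data['filename']).join(data.iloc[:, cols])
    reduced.to_csv(out_path, index=False)
    return reduced
# Example: select_and_save(sorted_train_data, X, y, 20, 'data/sorted-train-malware-features-asm-20percent.csv')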
In [17]:
# Select the top 20 percent of features by chi-squared score, from 1006 -> 201 features
fsp = SelectPercentile(chi2, percentile=20)
X_new_20 = fsp.fit_transform(X,y)
X_new_20.shape
Out[17]:
In [18]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[18]:
In [19]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()
Out[19]:
In [20]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-20percent.csv', index=False)
In [16]:
# Select the top 30 percent of features by chi-squared score, from 1006 -> 301 features
fsp = SelectPercentile(chi2, percentile=30)
X_new_30 = fsp.fit_transform(X,y)
X_new_30.shape
Out[16]:
In [17]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[17]:
In [5]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()
Out[5]:
In [6]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-30percent.csv', index=False)
In [15]:
# Select the top 40 percent of features by chi-squared score, from 1006 -> 401 features
fsp = SelectPercentile(chi2, percentile=40)
X_new_40 = fsp.fit_transform(X,y)
X_new_40.shape
Out[15]:
In [16]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[16]:
In [18]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()
Out[18]:
In [6]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-40percent.csv', index=False)
In [7]:
# Select the top 50 percent of features by chi-squared score, from 1006 -> 503 features
fsp = SelectPercentile(chi2, percentile=50)
X_new_50 = fsp.fit_transform(X,y)
X_new_50.shape
Out[7]:
In [8]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[8]:
In [9]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()
Out[9]:
In [10]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-50percent.csv', index=False)
In [11]:
test_data = pd.read_csv('data/test-malware-features-asm.csv')
sorted_test_data = test_data.sort_values(by='filename', ascending=True)
sorted_test_data.shape
Out[11]:
In [19]:
sorted_test_data.head()
Out[19]:
In [19]:
# Get the feature names from the reduced train dataframe
column_names = data_reduced.columns
print(column_names)
In [20]:
# Extract the reduced feature set from the full test feature set
sorted_test_data_reduced = sorted_test_data.loc[:,column_names]
sorted_test_data_reduced.head()
Out[20]:
In [16]:
# Note: each of the following cells was run after re-computing data_reduced and
# sorted_test_data_reduced for the corresponding percentile, so the same variable
# is written out under a different filename each time.
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-10percent.csv', index=False)
In [25]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-20percent.csv', index=False)
In [22]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-30percent.csv', index=False)
In [21]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-40percent.csv', index=False)
In [14]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-50percent.csv', index=False)
In [37]:
# First load the .asm training features and training labels
#sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-reduced.csv')
#sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv','r')
# Next load the .byte training features and sort
train_data_byte = pd.read_csv('data/train-malware-features-byte.csv')
sorted_train_data_byte = train_data_byte.sort_values(by='filename', ascending=True)
# Next load the .byte test features and sort
test_data_byte = pd.read_csv('data/test-malware-features-byte.csv')
sorted_test_data_byte = test_data_byte.sort_values(by='filename', ascending=True)
#combined_train_data = pd.DataFrame.merge(sorted_train_data_asm, sorted_train_data_byte, on='filename', how='inner', sort=False)
# Now write all the sorted feature sets to file
#f = open('data/sorted-train-features-combined.csv', 'w')
#combined_train_data.to_csv(f, index=False)
#f.close()
f = open('data/sorted-train-malware-features-byte.csv', 'w')
sorted_train_data_byte.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-malware-features-byte.csv', 'w')
sorted_test_data_byte.to_csv(f, index=False)
f.close()
In [ ]:
# Load and sort asm image data for test and train files
train_image_asm = pd.read_csv('data/train-image-features-asm.csv')
sorted_train_image_asm = train_image_asm.sort_values(by='filename', ascending=True)
test_image_asm = pd.read_csv('data/test-image-features-asm.csv')
sorted_test_image_asm = test_image_asm.sort_values(by='filename', ascending=True)
# NOTE: byte file images have low standard deviation and mean variance, so they are
# not very useful for learning.
# Load and sort byte image data for test and train files
# train_image_byte = pd.read_csv('data/train-image-features-byte.csv')
# sorted_train_image_byte = train_image_byte.sort_values(by='filename', ascending=True)
# test_image_byte = pd.read_csv('data/test-image-features-byte.csv')
# sorted_test_image_byte = test_image_byte.sort_values(by='filename', ascending=True)
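In [ ]:
# Added sketch to back the NOTE above: summarise the per-column standard deviation
# of an image-feature frame. Shown here on the asm frame; the byte frames would be
# checked the same way once loaded.
sorted_train_image_asm.iloc[:, 1:].std().describe()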
In [4]:
# Now write all the sorted image feature sets to file
f = open('data/sorted-train-image-features-asm.csv', 'w')
sorted_train_image_asm.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm.csv', 'w')
sorted_test_image_asm.to_csv(f, index=False)
f.close()
#f = open('data/sorted-train-image-features-byte.csv', 'w')
#sorted_train_image_byte.to_csv(f, index=False)
#f.close()
#f = open('data/sorted-test-image-features-byte.csv', 'w')
#sorted_test_image_byte.to_csv(f, index=False)
#f.close()
In [29]:
sorted_train_image_asm.head()
Out[29]:
In [30]:
# Now select the best 10% of train image asm features by chi-squared score
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, percentile=10)
X_new = fsp.fit_transform(X,y)
X_new.shape
Out[30]:
In [31]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[31]:
In [32]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()
Out[32]:
In [33]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()
Out[33]:
In [34]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-10percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-10percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()
#f = open('data/sorted-train-image-features-byte-reduced.csv', 'w')
#sorted_train_image_byte_reduced.to_csv(f, index=False)
#f.close()
#f = open('data/sorted-test-image-features-byte-reduced.csv', 'w')
#sorted_test_image_byte_reduced.to_csv(f, index=False)
#f.close()
In [35]:
# Now select the best 20% of train image asm features by chi-squared score
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, percentile=20)
X_new = fsp.fit_transform(X,y)
X_new.shape
Out[35]:
In [36]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[36]:
In [37]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()
Out[37]:
In [39]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()
Out[39]:
In [40]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-20percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-20percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()
In [8]:
# Now select the best 30% of train image asm features by chi-squared score
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = sorted_train_labels['Class'].values.tolist()
fsp = SelectPercentile(chi2, percentile=30)
X_new = fsp.fit_transform(X,y)
X_new.shape
Out[8]:
In [9]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[9]:
In [10]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()
Out[10]:
In [11]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()
Out[11]:
In [12]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-30percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-30percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()
In [22]:
# Now select the best 40% of train image asm features by chi-squared score
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, percentile=40)
X_new = fsp.fit_transform(X,y)
X_new.shape
Out[22]:
In [23]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[23]:
In [24]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()
Out[24]:
In [25]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()
Out[25]:
In [26]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-40percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-40percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()
In [27]:
# Now select the best 50% of train image asm features by chi-squared score
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, percentile=50)
X_new = fsp.fit_transform(X,y)
X_new.shape
Out[27]:
In [28]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
Out[28]:
In [29]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()
Out[29]:
In [30]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()
Out[30]:
In [31]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-50percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-50percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()
In [19]:
def run_cv(X, y, clf):
    # Construct a k-folds object (sklearn.model_selection API)
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))
    # Iterate through folds, collecting out-of-fold probabilities and predictions
    for train_index, test_index in kf.split(X):
        print(test_index, train_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
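In [ ]:
# Alternative sketch: sklearn's cross_val_predict collects the same out-of-fold
# predictions without the manual loop. run_cv_alt is our name, not from the
# original notebook; the argmax+1 step assumes the classes are labelled 1..9,
# as in this competition.
from sklearn.model_selection import cross_val_predict
def run_cv_alt(X, y, clf):
    y_prob = cross_val_predict(clf, X, y, cv=10, method='predict_proba')
    y_pred = np.argmax(y_prob, axis=1) + 1
    return y_prob, y_pred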
In [20]:
ytrain = np.array(y)
In [21]:
X = data_reduced.iloc[:,1:]
X.shape
Out[21]:
In [22]:
# At last we can build a hypothesis: train and cross-validate an ExtraTrees classifier
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,ytrain,clf1)
print("logloss = ", log_loss(y, p1))
print("score = ", accuracy_score(ytrain, pred1))
cm = confusion_matrix(y, pred1)
print(cm)
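In [ ]:
# Added visualisation sketch: render the confusion matrix computed above as a
# heatmap with matplotlib (already imported as plt).
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
plt.xlabel('predicted class')
plt.ylabel('true class')
plt.show()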
In [23]:
# Finally shove the test feature set into the classifier
test_X = sorted_test_data_reduced.iloc[:,1:]
test_predictions = clf1.predict(test_X)
test_predictions
Out[23]:
In [30]:
# Write out the predictions to a csv file
out_test_y = pd.DataFrame(columns=['filename', 'class'])
out_test_y['filename'] = sorted_test_data_reduced['filename']
out_test_y['class'] = test_predictions
out_test_y.head()
Out[30]:
In [33]:
out_test_y.to_csv('data/test-label-etc-predictions.csv', index=False)
In [ ]:
# Explore dropping sparse features: compute each column's sum, then try a few thresholds below
colsum = X.sum(axis=0, numeric_only=True)
In [ ]:
zerocols = colsum[colsum == 0]
zerocols
In [ ]:
lowcols = colsum[colsum < 110]
lowcols.shape
In [ ]:
# Copy first so the original frame X is not mutated by the deletions below
reduceX = X.copy()
for col in reduceX.columns:
    if reduceX[col].sum() < 100:
        del reduceX[col]
reduceX.shape
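In [ ]:
# Vectorised alternative sketch: apply the same low-sum filter in one step using
# the column sums computed above, without mutating X.
reduceX = X.loc[:, colsum[colsum >= 100].index]
reduceX.shape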
In [ ]:
skb = SelectKBest(chi2, k=20)
X_kbestnew = skb.fit_transform(X, y)
X_kbestnew.shape
In [ ]:
# Build the label vector by matching each training filename to its Id in the labels frame
y = [0]*labels.shape[0]
fnames = train_data['filename']
for i in range(len(y)):
    fname = train_data.loc[i,'filename']
    row = labels[labels['Id'] == fname]
    y[i] = row.iloc[0,1]
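In [ ]:
# Vectorised alternative sketch: map each filename straight to its class with a
# merge. Assumes every 'filename' in train_data has a matching 'Id' in labels
# and that the label column is named 'Class', as elsewhere in this notebook.
y = train_data.merge(labels, left_on='filename', right_on='Id', how='left')['Class'].values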
In [ ]:
# DO NOT USE BYTE IMAGE DATA
# Now select the best 10% of train image byte features by chi-squared score
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_byte.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, percentile=10)
X_new = fsp.fit_transform(X,y)
X_new.shape
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names
data_trimmed = sorted_train_image_byte.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_byte['filename'])
sorted_train_image_byte_reduced = data_fnames.join(data_trimmed)
sorted_train_image_byte_reduced.head()
data_trimmed = sorted_test_image_byte.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_byte['filename'])
sorted_test_image_byte_reduced = data_fnames.join(data_trimmed)
sorted_test_image_byte_reduced.head()