In [1]:
import sklearn
import numpy as np
import pandas as pd

In [2]:
X=np.load("./lsi_500.npy")

In [3]:
X_train = X[:3086]

In [4]:
X_test = X[3086:]

In [5]:
Y=np.load("../data/features/train_classes.npy")

In [6]:
from sklearn import model_selection
X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(X_train, Y, test_size=0.2, random_state=0)

In [26]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=200)
neigh.fit(X_train, Y_train)


Out[26]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=200, p=2,
           weights='uniform')

In [27]:
neigh.score(X_val, Y_val)


Out[27]:
0.51132686084142398

In [29]:
Y_test = neigh.predict(X_test)

pd.Series(Y_test).value_counts()/Y_test.shape[0]*100

In [34]:
pd.Series(Y).value_counts()/Y.shape[0]*100


Out[34]:
None         52.138691
Swizzor      17.563189
VB           12.184057
Agent         3.694102
Virut         1.911860
Lipler        1.717434
AutoRun       1.620220
Magania       1.328581
Hupigon       1.328581
Zbot          1.296176
Krap          1.263772
FraudLoad     1.198963
FraudPack     1.036941
Tdss          1.036941
Poison        0.680493
dtype: float64

In [37]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, Y_train)


Out[37]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
clf.score(X_val, Y_val)


Out[38]:
0.51132686084142398

In [39]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [45]:
rf = RandomForestClassifier(n_estimators=100)

In [46]:
rf.fit(X_train, Y_train)


Out[46]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [47]:
rf.score(X_val, Y_val)


Out[47]:
0.50161812297734631

In [48]:
Y_test = rf.predict(X_test)

pd.Series(Y_test).value_counts()/Y_test.shape[0]*100


Out[48]:
None         96.509130
Swizzor       2.067669
VB            0.859291
Virut         0.187970
Zbot          0.107411
Magania       0.053706
Lipler        0.053706
Hupigon       0.026853
FraudLoad     0.026853
Poison        0.026853
Krap          0.026853
AutoRun       0.026853
Agent         0.026853
dtype: float64

In [7]:
import keras


Using Theano backend.

In [36]:
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM

model = Sequential()
model.add(LSTM(32, input_length=X_train.shape[0], input_dim=X_train.shape[1]))

In [37]:
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, Y_train, nb_epoch=5, batch_size=32)
#loss_and_metrics = model.evaluate(X_val, Y_val, batch_size=32)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-37-c68989328816> in <module>()
      1 model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
----> 2 model.fit(X_train, Y_train, nb_epoch=5, batch_size=32)
      3 #loss_and_metrics = model.evaluate(X_val, Y_val, batch_size=32)

C:\Users\gr\Anaconda3\envs\py27\lib\site-packages\keras\models.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, **kwargs)
    670                               class_weight=class_weight,
    671                               sample_weight=sample_weight,
--> 672                               initial_epoch=initial_epoch)
    673 
    674     def evaluate(self, x, y, batch_size=32, verbose=1,

C:\Users\gr\Anaconda3\envs\py27\lib\site-packages\keras\engine\training.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch)
   1114             class_weight=class_weight,
   1115             check_batch_axis=False,
-> 1116             batch_size=batch_size)
   1117         # prepare validation data
   1118         if validation_data:

C:\Users\gr\Anaconda3\envs\py27\lib\site-packages\keras\engine\training.pyc in _standardize_user_data(self, x, y, sample_weight, class_weight, check_batch_axis, batch_size)
   1027                                    self.internal_input_shapes,
   1028                                    check_batch_axis=False,
-> 1029                                    exception_prefix='model input')
   1030         y = standardize_input_data(y, self.output_names,
   1031                                    output_shapes,

C:\Users\gr\Anaconda3\envs\py27\lib\site-packages\keras\engine\training.pyc in standardize_input_data(data, names, shapes, check_batch_axis, exception_prefix)
    110                                  ' to have ' + str(len(shapes[i])) +
    111                                  ' dimensions, but got array with shape ' +
--> 112                                  str(array.shape))
    113             for j, (dim, ref_dim) in enumerate(zip(array.shape, shapes[i])):
    114                 if not j and not check_batch_axis:

ValueError: Error when checking model input: expected lstm_input_1 to have 3 dimensions, but got array with shape (2468L, 500L)

In [ ]: