In [1]:
__author__ = 'Saber Shokat Fadaee'

from gensim import corpora, models, similarities
from gensim.models.doc2vec import TaggedDocument, LabeledSentence, Doc2Vec
import gensim
from sklearn import manifold, datasets
import numpy as np
from itertools import chain
import multiprocessing
import csv
import matplotlib as ml
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import re
from matplotlib.backends.backend_pdf import PdfPages

import numpy as np

from sklearn.datasets import make_checkerboard
from sklearn.datasets import samples_generator as sg
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.cluster.bicluster import SpectralCoclustering

from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.cluster import MiniBatchKMeans
from sklearn.externals.six import iteritems
from sklearn.datasets.twenty_newsgroups import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import v_measure_score
from sklearn.utils.extmath import *
from sklearn.metrics import consensus_score

import operator

# --- Notebook-level state and raw input loading ---
# `storage` and `i` are initialized for later cells; note `i` is also
# rebound as a loop index further down in the notebook.
storage = {}
i = 1.0
EID_set = set()
botnet_set = set()
event_set = set()


# One entity ID per line in EID.txt; use `with` so the file handle is
# closed even if a read raises (original opened/closed manually with
# inconsistent indentation).
with open('EID.txt') as file1:
    for line in file1:
        EID_set.add(line.strip())

# One botnet name per line in botnets.txt.
with open("botnets.txt") as file1:
    for line in file1:
        botnet_set.add(line.strip())

# NOTE: sorted() rebinds these names to *lists*, not sets -- downstream
# cells depend on this stable ordering to index the count matrices.
EID_set = sorted(EID_set)
botnet_set = sorted(botnet_set)
event_set = sorted(event_set)

In [2]:
count = np.loadtxt("count.txt")

In [3]:
# Each line of bot_relations.txt lists the botnet names belonging to
# one family (whitespace-separated); botnet_family is a list of those
# per-family name lists.
botnet_family = []
with open("bot_relations.txt") as file1:
    for line in file1:
        botnet_family.append(line.strip().split())

#Plus one for the unidentified classes
num_classes = len(botnet_family) + 1

In [4]:
def in_list(item, L):
    """Return the index of the first sub-list of ``L`` containing ``item``.

    Falls back to ``num_classes - 1`` (the "unidentified" bucket, a
    module-level global) when ``item`` appears in no sub-list.

    Uses ``enumerate`` instead of the original ``L.index(i)`` call,
    which re-scanned ``L`` from the start on every hit.
    """
    for idx, group in enumerate(L):
        if item in group:
            return idx
    return num_classes - 1
def bot_to_vector(bot):
    """One-hot encode the family membership of botnet ``bot``.

    The vector length is ``num_classes`` (previously hard-coded to 23,
    which would silently break if bot_relations.txt changed); the last
    slot is the "unidentified" bucket that ``in_list`` returns on a miss.
    """
    output = [0] * num_classes
    output[in_list(bot, botnet_family)] = 1
    return output

In [ ]:


In [5]:
#Set colors to each category
def sec_to_col(argument):
    """Map an industry-sector name to a matplotlib color name.

    Any sector not in the known list maps to "yellow".
    """
    sector_colors = {
        'Aerospace/Defense': 'aqua',
        'Business Services': 'blueviolet',
        'Consumer Goods': 'brown',
        'Education': 'coral',
        'Energy/Resources': 'crimson',
        'Engineering': 'darkgreen',
        'Finance': 'gold',
        'Food Production': 'green',
        'Government/Politics': 'lime',
        'Healthcare/Wellness': 'magenta',
        'Insurance': 'mintcream',
        'Legal': 'olive',
        'Manufacturing': 'orchid',
        'Media/Entertainment': 'peru',
        'Nonprofit/NGO': 'purple',
        'Real Estate': 'red',
        'Retail': 'skyblue',
        'Technology': 'silver',
        'Telecommunications': 'tomato',
        'Tourism/Hospitality': 'peachpuff',
        'Transportation': 'rosybrown',
        'Unknown': 'dimgray',
        'Utilities': 'royalblue',
    }
    if argument in sector_colors:
        return sector_colors[argument]
    return "yellow"



#Set color to the different sizes
def size_to_col(argument):
    """Map an employee-count range label to a matplotlib color name.

    Unrecognized labels map to "yellow".
    """
    range_colors = {
        '0-100': 'red',
        '100-1000': 'blue',
        '1000-10000': 'brown',
        '10000-50000': 'green',
        '50000+': 'gold',
        'Unknown': 'lime',
    }
    return range_colors[argument] if argument in range_colors else "yellow"

# Assigns the topics to the documents in corpus

col = []
col_size = []

sector = {}        # entity_id_hash -> industry sector name
count_range = {}   # entity_id_hash -> employee-count range label

# Adding extra information from extra.csv.  The file is opened in
# binary mode ('rb') because this notebook runs under Python 2's csv
# module; under Python 3 this would need mode 'r' with newline=''.
# The original also parsed the unnamed index column into an unused
# local (`ind`); that dead code (and the tab indentation) is removed.
with open('extra.csv', 'rb') as theFile:
    reader = csv.DictReader(theFile)
    for line in reader:
        eid = line['entity_id_hash']
        sector[eid] = line['industry_sector']
        count_range[eid] = line['employee_count_range']

#Set numbers to each category
def sec_to_num(argument):
    """Map an industry-sector name to its integer index (0-22).

    Anything not in the known list maps to 23.
    """
    ordered_sectors = (
        'Aerospace/Defense', 'Business Services', 'Consumer Goods',
        'Education', 'Energy/Resources', 'Engineering', 'Finance',
        'Food Production', 'Government/Politics', 'Healthcare/Wellness',
        'Insurance', 'Legal', 'Manufacturing', 'Media/Entertainment',
        'Nonprofit/NGO', 'Real Estate', 'Retail', 'Technology',
        'Telecommunications', 'Tourism/Hospitality', 'Transportation',
        'Unknown', 'Utilities',
    )
    for idx, name in enumerate(ordered_sectors):
        if name == argument:
            return idx
    return 23
#Set numbers to each size
def size_to_num(argument):
    """Map an employee-count range label to its integer index (0-5).

    Unknown labels map to 6.
    """
    ordered_ranges = ('0-100', '100-1000', '1000-10000',
                      '10000-50000', '50000+', 'Unknown')
    for idx, label in enumerate(ordered_ranges):
        if label == argument:
            return idx
    return 6

#Set category to each number
def num_to_sec(argument):
    """Inverse of sec_to_num: map an index 0-22 back to the sector name.

    NOTE(review): out-of-range arguments return the *integer* 23, not a
    string -- current callers only pass 0-22, but this type asymmetry
    is worth confirming before wider reuse.
    """
    ordered_sectors = (
        'Aerospace/Defense', 'Business Services', 'Consumer Goods',
        'Education', 'Energy/Resources', 'Engineering', 'Finance',
        'Food Production', 'Government/Politics', 'Healthcare/Wellness',
        'Insurance', 'Legal', 'Manufacturing', 'Media/Entertainment',
        'Nonprofit/NGO', 'Real Estate', 'Retail', 'Technology',
        'Telecommunications', 'Tourism/Hospitality', 'Transportation',
        'Unknown', 'Utilities',
    )
    lookup = dict(enumerate(ordered_sectors))
    return lookup.get(argument, 23)
#Set size label to each number
def num_to_size(argument):
    """Inverse of size_to_num: map an index 0-5 back to the range label.

    NOTE(review): unknown indices return the integer 6, not a string.
    """
    labels = ('0-100', '100-1000', '1000-10000',
              '10000-50000', '50000+', 'Unknown')
    lookup = dict(enumerate(labels))
    return lookup.get(argument, 6)

In [6]:
def included_entry(entry_name):
    """Return True unless the entity's sector is one of the excluded ones.

    Looks the entity up in the module-level `sector` mapping
    (entity id -> sector name).  Education, Technology,
    Tourism/Hospitality and Telecommunications entities are filtered
    out of the analysis.
    """
    excluded = ('Education', 'Technology',
                'Tourism/Hospitality', 'Telecommunications')
    return sector[entry_name] not in excluded

In [7]:
sum(included_entry(entity) for entity in EID_set)


Out[7]:
3879

In [8]:
count_new1 = np.zeros((207,sum(included_entry(entity) for entity in EID_set)))

In [9]:
#Build a new count matrix excluding the unwanted sectors
# Copies columns of `count` (one per entity, in sorted-EID order) into
# `count_new1`, keeping only entities whose sector passes
# included_entry; EID_set_new records the kept entity ids in the same
# column order, so column j of count_new1 corresponds to EID_set_new[j].
# NOTE(review): this loop rebinds the notebook-level name `i` (set to
# 1.0 in the first cell).
index = 0
EID_set_new = []
for i in range(len(EID_set)):
    if included_entry(EID_set[i]):
        count_new1[:,index] = count[:,i]
        EID_set_new.append(EID_set[i])
        index += 1

In [10]:
count_new = np.zeros((num_classes,sum(included_entry(entity) for entity in EID_set)))

In [11]:
# Collapse the per-botnet rows of count_new1 into per-family rows of
# count_new: each botnet's row is added to the row of the family that
# in_list assigns it to (last row = "unidentified" bucket).
for i in range(len(botnet_set)):
    count_new[in_list(botnet_set[i], botnet_family) ,:] += count_new1[i,:]

In [12]:
count_new.shape


Out[12]:
(23, 3879)

In [13]:
for i in range(num_classes):
    print("Group: ", i+1, sum(count_new[i,:]))


('Group: ', 1, 554588.0)
('Group: ', 2, 2.0)
('Group: ', 3, 0.0)
('Group: ', 4, 3628.0)
('Group: ', 5, 2086.0)
('Group: ', 6, 2539.0)
('Group: ', 7, 156272.0)
('Group: ', 8, 7167.0)
('Group: ', 9, 3.0)
('Group: ', 10, 150.0)
('Group: ', 11, 34789.0)
('Group: ', 12, 1297.0)
('Group: ', 13, 3.0)
('Group: ', 14, 3920.0)
('Group: ', 15, 844704.0)
('Group: ', 16, 6659.0)
('Group: ', 17, 12042.0)
('Group: ', 18, 3408069.0)
('Group: ', 19, 0.0)
('Group: ', 20, 184976.0)
('Group: ', 21, 76927.0)
('Group: ', 22, 4.0)
('Group: ', 23, 398817.0)

In [14]:
# Grand total of all event counts across families and entities; a
# sanity check against the per-group sums printed by the previous cell.
sum_count_new = 0
for i in range(num_classes):
    sum_count_new += sum(count_new[i,:])
print(sum_count_new)


5698642.0

In [15]:
def sectors_count(botnet_group):
    """Aggregate one botnet family's event counts by industry sector.

    Reads row `botnet_group` of the module-level `count_new` matrix and
    returns a dict mapping sector name -> total count (zero entries for
    sectors with no events).
    """
    totals = [0] * 23
    for idx, entity in enumerate(EID_set_new):
        amount = count_new[botnet_group, idx]
        if amount > 0:
            totals[sec_to_num(sector[entity])] += amount
    return {num_to_sec(k): totals[k] for k in range(23)}

def sectors_count_botnet(bot):
    """Aggregate one *individual* botnet's event counts by sector.

    Reads row `bot` of the module-level `count_new1` matrix and returns
    a plain list of 23 per-sector totals (indexable by sec_to_num).
    """
    totals = [0] * 23
    for idx, entity in enumerate(EID_set_new):
        amount = count_new1[bot, idx]
        if amount > 0:
            totals[sec_to_num(sector[entity])] += amount
    return totals

In [16]:
# One saved figure per botnet family: per-sector event counts.
# NOTE(review): range(num_classes-2) skips the last family and the
# unidentified bucket -- confirm that is intentional.
for i in range(num_classes-2):
    # Compute the aggregation once per group; the original called
    # sectors_count(i) twice (for .values() and .keys()), redoing the
    # full per-entity scan and relying on two separate dicts iterating
    # in the same order.
    counts = sectors_count(i)
    x = range(23)
    y = counts.values()
    labels = counts.keys()
    plt.figure(figsize=(16,18))
    plt.plot(x, y, 'r-')
    # NOTE(review): assumes every family row has at least two botnet
    # names; botnet_family[i][1] raises IndexError otherwise.
    plt.title(("Group: %d. Contains botnets like: %s %s")%(i+1,botnet_family[i][0],botnet_family[i][1]))
    plt.xticks(x, labels, rotation='vertical')
    plt.savefig("Group_%d.png"%(i+1))
    plt.close()

In [22]:
# Training data: one row per individual botnet.
# Features (input1): 23-dim per-sector event-count vectors.
# Labels (output): one-hot family-membership vectors from bot_to_vector.
input1 = []
for i in range(len(botnet_set)):
    input1.append(sectors_count_botnet(i))
output = []
for bot in botnet_set:
    output.append(bot_to_vector(bot))

In [23]:
from __future__ import print_function
np.random.seed(1337)  # for reproducibility
# NOTE(review): sklearn.cross_validation is the pre-0.18 module name;
# modern scikit-learn moved train_test_split to sklearn.model_selection.
from sklearn.cross_validation import train_test_split

# NOTE(review): these are legacy Keras (0.x/1.x) import paths
# (keras.layers.core, np_utils); the mnist import appears unused here.
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD, Adam, RMSprop
from keras.utils import np_utils

# Features: per-sector count vectors; labels: one-hot family vectors.
# 80/20 train/test split with a fixed seed for reproducibility.
inp = np.array(input1)
out = np.array(output)
X_train, X_test, Y_train, Y_test = train_test_split(inp, out, test_size=0.2, random_state=42)

# Bottleneck-shaped classifier: 23 -> 16 -> 8 -> 4 -> 16 -> 23,
# tanh activations with light dropout, softmax over the 23 classes.
model = Sequential()
model.add(Dense(16, input_shape=(num_classes,)))  # num_classes == 23 input features
model.add(Activation('tanh'))
model.add(Dropout(0.15))

model.add(Dense(8))
model.add(Activation('tanh'))
model.add(Dropout(0.15))

model.add(Dense(4))
model.add(Activation('tanh'))
model.add(Dropout(0.15))

model.add(Dense(16))
model.add(Activation('tanh'))
model.add(Dropout(0.15))


model.add(Dense(23))
model.add(Activation('softmax'))

model.summary()

batch_size = 4
nb_classes = 23
nb_epoch = 25
#target = open("NN_out.txt", 'w')

# NOTE(review): class_mode and nb_epoch are legacy Keras arguments
# (removed in Keras 2, where nb_epoch became epochs).
model.compile(loss='categorical_crossentropy', optimizer=SGD(), class_mode="categorical")

history = model.fit(X_train, Y_train,  batch_size=batch_size, nb_epoch=nb_epoch,
                    verbose=1, validation_data=(X_test, Y_test))
score = model.evaluate(X_test, Y_test, verbose=0)
#print('Test score:', score)

# Top-1 test accuracy: compare argmax of predicted probabilities
# against argmax of the one-hot labels.
p = model.predict(X_test)
yy = np.argmax(p, axis=1)
yyy = np.argmax(Y_test, axis=1)

a = np.equal(yy, yyy)
test_acc = ( 100.0 * (0.0 + sum(a)) / (len(a) + 0.0 ))

# Same computation on the training split.
p = model.predict(X_train)
yy = np.argmax(p, axis=1)
yyy = np.argmax(Y_train, axis=1)

a = np.equal(yy, yyy)
train_acc = ( 100.0 * (0.0 + sum(a)) / (len(a) + 0.0 ))
print("NB_EPOCH : " , str(nb_epoch) , " Score: " , str(score) , " test accuracy: " , str(test_acc) , " Train accuracy: "  , str(train_acc) + "\n")
#target.close()


--------------------------------------------------------------------------------
Initial input shape: (None, 23)
--------------------------------------------------------------------------------
Layer (name)                  Output Shape                  Param #             
--------------------------------------------------------------------------------
Dense (Unnamed)               (None, 16)                    384                 
Activation (Unnamed)          (None, 16)                    0                   
Dropout (Unnamed)             (None, 16)                    0                   
Dense (Unnamed)               (None, 8)                     136                 
Activation (Unnamed)          (None, 8)                     0                   
Dropout (Unnamed)             (None, 8)                     0                   
Dense (Unnamed)               (None, 4)                     36                  
Activation (Unnamed)          (None, 4)                     0                   
Dropout (Unnamed)             (None, 4)                     0                   
Dense (Unnamed)               (None, 16)                    80                  
Activation (Unnamed)          (None, 16)                    0                   
Dropout (Unnamed)             (None, 16)                    0                   
Dense (Unnamed)               (None, 23)                    391                 
Activation (Unnamed)          (None, 23)                    0                   
--------------------------------------------------------------------------------
Total params: 1027
--------------------------------------------------------------------------------
Train on 164 samples, validate on 42 samples
Epoch 1/25
164/164 [==============================] - 0s - loss: 3.1179 - val_loss: 3.0333
Epoch 2/25
164/164 [==============================] - 0s - loss: 2.9240 - val_loss: 2.8591
Epoch 3/25
164/164 [==============================] - 0s - loss: 2.7318 - val_loss: 2.6831
Epoch 4/25
164/164 [==============================] - 0s - loss: 2.5457 - val_loss: 2.5146
Epoch 5/25
164/164 [==============================] - 0s - loss: 2.4159 - val_loss: 2.3728
Epoch 6/25
164/164 [==============================] - 0s - loss: 2.2729 - val_loss: 2.2728
Epoch 7/25
164/164 [==============================] - 0s - loss: 2.1804 - val_loss: 2.2069
Epoch 8/25
164/164 [==============================] - 0s - loss: 2.1355 - val_loss: 2.1707
Epoch 9/25
164/164 [==============================] - 0s - loss: 2.1285 - val_loss: 2.1506
Epoch 10/25
164/164 [==============================] - 0s - loss: 2.0955 - val_loss: 2.1453
Epoch 11/25
164/164 [==============================] - 0s - loss: 2.0496 - val_loss: 2.1401
Epoch 12/25
164/164 [==============================] - 0s - loss: 2.0636 - val_loss: 2.1347
Epoch 13/25
164/164 [==============================] - 0s - loss: 2.0687 - val_loss: 2.1344
Epoch 14/25
164/164 [==============================] - 0s - loss: 2.0349 - val_loss: 2.1330
Epoch 15/25
164/164 [==============================] - 0s - loss: 2.0750 - val_loss: 2.1330
Epoch 16/25
164/164 [==============================] - 0s - loss: 2.0249 - val_loss: 2.1336
Epoch 17/25
164/164 [==============================] - 0s - loss: 2.0336 - val_loss: 2.1275
Epoch 18/25
164/164 [==============================] - 0s - loss: 2.0522 - val_loss: 2.1311
Epoch 19/25
164/164 [==============================] - 0s - loss: 2.0309 - val_loss: 2.1353
Epoch 20/25
164/164 [==============================] - 0s - loss: 2.0098 - val_loss: 2.1326
Epoch 21/25
164/164 [==============================] - 0s - loss: 2.0298 - val_loss: 2.1309
Epoch 22/25
164/164 [==============================] - 0s - loss: 2.0283 - val_loss: 2.1316
Epoch 23/25
164/164 [==============================] - 0s - loss: 1.9913 - val_loss: 2.1354
Epoch 24/25
164/164 [==============================] - 0s - loss: 2.0548 - val_loss: 2.1325
Epoch 25/25
164/164 [==============================] - 0s - loss: 2.0574 - val_loss: 2.1310
NB_EPOCH :  25  Score:  2.13097429276  test accuracy:  38.0952380952  Train accuracy:  36.5853658537


In [24]:
# Side-by-side look at predicted vs. true class indices on the test
# set (the output below shows the model collapsed to predicting
# class 17 for everything).
p = model.predict(X_test)
yy = np.argmax(p, axis=1)
yyy = np.argmax(Y_test, axis=1)

print(yy)
print(yyy)


[17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17]
[22 17 10 22 22 22 22 17 17 19  5 12 17 17 16  4 22 17 17 17 22 17 21 22  6
 22 17  7 12 20 22 17 17 17 15 22 17 13 22 17 17  0]

In [48]:
# Top-3 accuracy: fraction of test samples whose true class appears
# among the three highest-probability predictions in `p` (computed by
# the cell above).
ans = 0.0
for i in range(len(Y_test)):
    pred = p[i].argsort()[-3:][::-1]
    if np.argmax(Y_test[i]) in pred:
        ans += 1.0
print("%.2f"%(100.0*ans / (len(Y_test) + 0.0)))


66.67

In [30]:
p.shape


Out[30]:
(42, 23)

In [32]:
Y_test.shape


Out[32]:
(42, 23)

In [33]:



Out[33]:
42

In [ ]:


In [ ]:
in_list('Pushdo', botnet_family)

In [ ]:
num_classes

In [ ]:
botnet_family[6]

In [ ]:
botnet_set

In [ ]:
Y_train

In [39]:
28.0/42.0


Out[39]:
0.6666666666666666

In [ ]: