In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Make the graphs a bit prettier, and bigger.
# The pandas option 'display.mpl_style' was deprecated and later removed from
# pandas, so the matplotlib style sheet is selected directly instead. It is
# applied first so that the explicit rcParams below take precedence.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (15, 5)
plt.rcParams['font.family'] = 'sans-serif'
# This is necessary to show lots of columns in pandas 0.12.
# Not necessary in pandas 0.13.
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)
In [2]:
cd md
/home/raisa/md
I downloaded non-overlapping, randomly selected samples from the file auth.txt.gz, so that each sample file contains roughly the same number of successes and failures.
In [54]:
# Read the sample files msample0.csv ... msample<nfiles-1>.csv into a list
# holding one DataFrame per file. The files have no header row, so pandas
# labels the columns with integers starting at 0.
all_df = []
nfiles = 15
for i in range(nfiles):
    filename = 'msample%d.csv' % i
    # Progress indicator: index of the file about to be read. Written as a
    # call with a single argument so it prints the same on Python 2 and 3.
    print(i)
    all_df.append(pd.read_csv(filename, header=None))
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
In [55]:
all_df[0]
Out[55]:
0
1
2
3
4
5
6
7
8
0
2
U26@DOM1
U26@DOM1
C616
U26
?
?
TGS
Fail
1
9
U101@DOM1
U101@DOM1
C1862
C1862
Negotiate
Interactive
LogOn
Success
2
33
C2025$@DOM1
C2025$@DOM1
C467
C467
?
Network
LogOff
Success
3
47
C2653$@DOM1
C2653$@DOM1
C2653
C2653
?
?
TGT
Fail
4
54
C2653$@DOM1
C2653$@DOM1
C2653
C586
NTLM
Network
LogOn
Fail
5
55
C2653$@DOM1
C2653$@DOM1
C2653
C2653
?
?
TGT
Fail
6
95
U66@DOM1
U66@DOM1
C832
C832
?
Network
LogOff
Success
7
128
C1114$@DOM1
C1114$@DOM1
C1115
C1114
?
?
TGS
Fail
8
164
C1114$@DOM1
C1114$@DOM1
C1115
C1114
?
?
TGS
Fail
9
174
C2692$@DOM1
C2692$@DOM1
C528
C528
?
Network
LogOff
Success
10
205
U252@DOM1
U252@DOM1
C2627
C1315
NTLM
Network
LogOn
Fail
11
213
C599$@DOM1
C599$@DOM1
C553
C553
?
Network
LogOff
Success
12
239
C3390$@DOM1
C3390$@DOM1
C3392
C3392
?
?
TGT
Fail
13
243
U22@DOM1
U22@DOM1
C477
U22
?
?
TGS
Fail
14
286
C1607$@DOM1
C1607$@DOM1
C457
C457
?
Network
LogOff
Success
15
308
C2096$@?
C2096$@?
C25240
C25240
?
?
TGT
Fail
16
335
U4@DOM1
U4@DOM1
C229
C229
Kerberos
Network
LogOn
Success
17
355
C1714$@DOM1
C1714$@DOM1
C612
C612
?
Network
LogOff
Success
18
363
C1527$@DOM1
C1527$@DOM1
C1527
C612
NTLM
Network
LogOn
Fail
19
454
C2096$@?
C2096$@?
C457
C457
?
?
TGT
Fail
20
489
C2902$@DOM1
C2902$@DOM1
C2902
C1065
Kerberos
Network
LogOn
Success
21
523
C4334$@DOM1
C4334$@DOM1
C4334
C2106
Kerberos
Network
LogOn
Success
22
554
C2653$@DOM1
C2653$@DOM1
C2653
C2653
?
?
TGT
Fail
23
571
U1825@?
U1825@?
C612
C612
?
?
TGT
Fail
24
623
U22@DOM1
U22@DOM1
C506
U22
?
?
TGS
Fail
25
641
C860$@DOM1
C860$@DOM1
C860
C457
Kerberos
Network
LogOn
Success
26
673
C2043$@DOM1
C2043$@DOM1
C529
C529
?
Network
LogOff
Success
27
677
C2759$@DOM1
C2759$@DOM1
C2759
C2759
?
?
TGS
Success
28
773
LOCAL SERVICE@C3049
LOCAL SERVICE@C3049
C3049
C3049
Negotiate
Service
LogOn
Success
29
834
C2480$@DOM1
C2480$@DOM1
C2479
C2479
MICROSOFT_AUTHENTICATION_PACKAGE_V1_0
Network
LogOn
Fail
...
...
...
...
...
...
...
...
...
...
400682
5010840
U9@DOM1
C586$@DOM1
C586
C586
?
?
AuthMap
Success
400683
5010841
U59@?
U59@?
C1634
C1634
?
?
TGT
Fail
400684
5010861
U8929@?
U8929@?
C19037
C19037
?
?
TGT
Fail
400685
5010873
U59@?
U59@?
C1634
C1634
?
?
TGT
Fail
400686
5010874
U9@?
U9@?
C222
C222
?
?
TGT
Fail
400687
5010879
U22@DOM1
U22@DOM1
C849
U22
?
?
TGS
Fail
400688
5010879
U9@DOM1
U9@DOM1
C222
C222
Negotiate
Interactive
LogOn
Fail
400689
5010884
NETWORK SERVICE@C25102
NETWORK SERVICE@C25102
C25102
C25102
Negotiate
Service
LogOn
Success
400690
5010900
C23484$@DOM1
C23484$@DOM1
C23484
C586
Kerberos
Network
LogOn
Success
400691
5010907
C1692$@DOM1
C1692$@DOM1
C1692
C1692
?
Network
LogOff
Success
400692
5010916
C743$@DOM1
C743$@DOM1
C586
C586
?
Network
LogOff
Success
400693
5010938
U9@?
U9@?
C222
C222
?
?
TGT
Fail
400694
5010963
C2344$@DOM1
C2344$@DOM1
C457
C457
?
Network
LogOff
Success
400695
5010970
U22@DOM1
U22@DOM1
C246
U22
?
?
TGS
Fail
400696
5011005
U59@?
U59@?
C1634
C1634
?
?
TGT
Fail
400697
5011008
C3188$@DOM1
C3188$@DOM1
C3188
C3188
?
Network
LogOff
Success
400698
5011014
U101@?
U101@?
C3415
C3415
?
?
TGT
Fail
400699
5011015
U59@?
U59@?
C589
C589
?
?
TGT
Fail
400700
5011043
U10107@DOM1
U10107@DOM1
C419
C419
?
Network
LogOff
Success
400701
5011067
C27118$@DOM1
C27118$@DOM1
C1369
C1369
?
?
TGT
Fail
400702
5011071
C21596$@DOM1
C21596$@DOM1
C21596
C612
Kerberos
Network
LogOn
Success
400703
5011083
U9@?
U9@?
C222
C222
?
?
TGT
Fail
400704
5011087
U9@DOM1
U9@DOM1
C222
C222
Negotiate
Interactive
LogOn
Fail
400705
5011110
C398$@DOM1
C398$@DOM1
C1767
C1767
Kerberos
Network
LogOn
Fail
400706
5011116
C1791$@DOM1
C1791$@DOM1
C1065
C1065
?
Network
LogOff
Success
400707
5011120
C1617$@DOM1
C1617$@DOM1
C1618
C457
Kerberos
Network
LogOn
Success
400708
5011157
C7780$@DOM1
C7780$@DOM1
C7780
C528
Kerberos
Network
LogOn
Success
400709
5011161
U22@DOM1
U22@DOM1
C965
U22
?
?
TGS
Fail
400710
5011167
U6715@DOM1
U6715@DOM1
C10781
C10781
NTLM
Network
LogOn
Success
400711
5011195
U199@DOM1
U1825@DOM1
C1929
C1929
Negotiate
Batch
LogOn
Fail
400712 rows × 9 columns
In [56]:
# One boolean label Series per sample file: True where column 8 reads
# 'Success', False otherwise.
Y = [all_df[i][8] == 'Success' for i in range(nfiles)]
In [13]:
Y[1]
Out[13]:
0 False
1 False
2 False
3 True
4 True
5 False
6 True
7 True
8 True
9 False
10 False
11 False
12 True
13 False
14 True
15 True
16 False
17 True
18 True
19 True
20 False
21 False
22 True
23 True
24 True
25 True
26 False
27 False
28 True
29 False
...
400276 True
400277 False
400278 True
400279 False
400280 False
400281 True
400282 True
400283 False
400284 True
400285 False
400286 True
400287 True
400288 False
400289 True
400290 False
400291 True
400292 True
400293 False
400294 False
400295 False
400296 True
400297 False
400298 False
400299 True
400300 False
400301 True
400302 False
400303 False
400304 True
400305 False
Name: 8, dtype: bool
Here I repeat my procedure for generating the labels and features for the training/test data.
In [57]:
def map_user(x):
    """Collapse an account name into a coarse class.

    Names that begin with 'C' are mapped to 'C' and names that begin with 'U'
    are mapped to 'U'. Any other name is returned unchanged.

    x must be a string (it is queried with ``startswith``).
    """
    for prefix in ('C', 'U'):
        if x.startswith(prefix):
            return prefix
    return x
In [68]:
# Build one feature DataFrame per sample file and collect them in X.
# NOTE: the helper columns (source_user, source_domain, ...) are added to the
# frames in all_df in place.
X = []
for i in range(nfiles):
    df = all_df[i]

    # Columns 1 and 2 hold "user@domain" strings for the source and the
    # destination account. Split them into user and domain and strip the
    # trailing '$' from the user part.
    df["source_user"], df["source_domain"] = zip(*df[1].str.split('@').tolist())
    df["source_user"] = df["source_user"].str.rstrip('$')
    df["destination_user"], df["destination_domain"] = zip(*df[2].str.split('@').tolist())
    df["destination_user"] = df["destination_user"].str.rstrip('$')

    # Coarse account class: 'C', 'U', or the account name itself (see map_user).
    df['source_class'] = df['source_user'].map(map_user)
    df['destination_class'] = df['destination_user'].map(map_user)

    # Column 0 modulo the number of seconds in a day.
    # The plain constructor replaces DataFrame.from_items, which was
    # deprecated and then removed from pandas; the resulting frame is the same.
    x = pd.DataFrame({'time': (df[0] % (24 * 60 * 60)).astype(int)})

    # Pairwise equality indicators between users, domains and columns 3 / 4.
    x['same_user'] = (df['destination_user'] == df['source_user'])
    x['same_domain'] = (df['destination_domain'] == df['source_domain'])
    x['source_user_comp_same'] = (df[3] == df['source_user'])
    x['destination_user_comp_same'] = (df['destination_user'] == df[4])
    x['same_comp'] = (df[3] == df[4])
    x['source_domain_comp_same'] = (df[3] == df['source_domain'])
    x['destination_domain_comp_same'] = (df['destination_domain'] == df[4])

    # One indicator column per distinct value of columns 5, 6 and 7.
    # The labels come from THIS file only, so the set of feature columns can
    # differ from one sample file to the next.
    for j in [5, 6, 7]:
        for label in sorted(df[j].unique()):
            if label != '?':
                x[label] = (df[j] == label)
            elif j == 5:
                # '?' occurs in several columns, so it gets a per-column name.
                x['?_authentication type'] = (df[j] == label)
            elif j == 6:
                x['?_logon type'] = (df[j] == label)
            # A '?' in column 7 does not produce a feature.

    # One indicator column per account class, prefixed with the side it is on.
    for prefix, cl in [('source_', 'source_class'),
                       ('destination_', 'destination_class')]:
        for label in sorted(df[cl].unique()):
            x[prefix + label] = (df[cl] == label)

    X.append(x)
In [62]:
X[1]
Out[62]:
time
same_user
same_domain
source_user_comp_same
destination_user_comp_same
same_comp
source_domain_comp_same
destination_domain_comp_same
?_authentication type
ACRONIS_RELOGON_AUTHENTICATION_PACKAGE
Kerberos
MICROSOFT_AUTHENTICATION_PA
MICROSOFT_AUTHENTICATION_PAC
MICROSOFT_AUTHENTICATION_PACK
MICROSOFT_AUTHENTICATION_PACKA
MICROSOFT_AUTHENTICATION_PACKAG
MICROSOFT_AUTHENTICATION_PACKAGE
MICROSOFT_AUTHENTICATION_PACKAGE_
MICROSOFT_AUTHENTICATION_PACKAGE_V
MICROSOFT_AUTHENTICATION_PACKAGE_V1
MICROSOFT_AUTHENTICATION_PACKAGE_V1_
MICROSOFT_AUTHENTICATION_PACKAGE_V1_0
NETWARE_AUTHENTICATION_PACKAGE_V1_0
NTLM
Negotiate
Setuid
Wave
?_logon type
Batch
CachedInteractive
Interactive
Network
NetworkCleartext
NewCredentials
RemoteInteractive
Service
Unlock
AuthMap
LogOff
LogOn
ScreenLock
ScreenUnlock
TGS
TGT
source_U
source_C
source_LOCAL SERVICE
source_ANONYMOUS LOGON
source_NETWORK SERVICE
source_SYSTEM
destination_U
destination_C
destination_LOCAL SERVICE
destination_ANONYMOUS LOGON
destination_NETWORK SERVICE
destination_SYSTEM
0
2
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
False
1
3
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
2
11
True
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
3
140
True
True
True
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
4
176
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
5
185
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
6
224
True
True
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
True
False
False
False
False
False
True
False
False
False
False
7
250
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
8
252
True
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
9
333
True
True
True
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
10
348
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
False
11
416
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
False
12
459
True
True
True
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
13
470
True
True
True
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
14
485
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
15
490
True
True
True
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
16
510
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
False
17
542
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
18
551
True
True
True
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
19
570
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
20
588
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
False
21
623
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
False
22
679
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
True
False
False
False
False
False
True
False
False
False
False
23
704
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
24
726
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
25
745
True
True
True
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
26
750
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
27
859
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
28
936
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
29
975
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
400276
86055
True
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400277
86057
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
400278
86061
True
True
True
True
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
400279
86073
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400280
86089
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
400281
86093
True
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400282
86104
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400283
86127
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
400284
86131
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
400285
86151
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400286
86179
True
True
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
True
False
False
False
False
False
True
False
False
False
False
400287
86251
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
400288
86259
True
True
False
False
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400289
86267
True
True
True
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
400290
86267
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400291
86275
True
True
False
False
True
True
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
True
False
False
False
True
False
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
400292
86280
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400293
86281
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400294
86281
True
True
False
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
True
False
False
False
False
False
True
False
False
False
False
False
400295
86287
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
400296
86298
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
400297
86341
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
400298
86347
True
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400299
86354
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
400300
86356
True
True
False
False
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400301
86372
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400302
86373
True
True
False
False
True
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400303
86374
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
True
False
False
False
False
False
True
False
False
False
False
False
400304
86391
True
True
False
False
True
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400305
86393
True
True
False
False
False
True
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
False
False
False
True
False
False
False
False
True
False
False
False
False
False
True
False
False
False
False
False
400306 rows × 56 columns
In [63]:
X[0].columns
Out[63]:
Index([u'time', u'same_user', u'same_domain', u'source_user_comp_same', u'destination_user_comp_same', u'same_comp', u'source_domain_comp_same', u'destination_domain_comp_same', u'?_authentication type', u'Kerberos', u'MICROSOFT_AUTHENTICATION_PAC', u'MICROSOFT_AUTHENTICATION_PACKA', u'MICROSOFT_AUTHENTICATION_PACKAG', u'MICROSOFT_AUTHENTICATION_PACKAGE', u'MICROSOFT_AUTHENTICATION_PACKAGE_', u'MICROSOFT_AUTHENTICATION_PACKAGE_V', u'MICROSOFT_AUTHENTICATION_PACKAGE_V1', u'MICROSOFT_AUTHENTICATION_PACKAGE_V1_', u'MICROSOFT_AUTHENTICATION_PACKAGE_V1_0', u'NETWARE_AUTHENTICATION_PACKAGE_V1_0', u'NTLM', u'Negotiate', u'Setuid', u'Wave', u'?_logon type', u'Batch', u'CachedInteractive', u'Interactive', u'Network', u'NetworkCleartext', u'NewCredentials', u'RemoteInteractive', u'Service', u'Unlock', u'AuthMap', u'LogOff', u'LogOn', u'ScreenLock', u'ScreenUnlock', u'TGS', u'TGT', u'source_U', u'source_C', u'source_LOCAL SERVICE', u'source_ANONYMOUS LOGON', u'source_NETWORK SERVICE', u'source_SYSTEM', u'destination_U', u'destination_C', u'destination_LOCAL SERVICE', u'destination_ANONYMOUS LOGON', u'destination_SYSTEM', u'destination_NETWORK SERVICE'], dtype='object')
In [64]:
[len(entry.columns) for entry in X]
Out[64]:
[53, 56, 53, 54, 54, 52, 56, 56, 57, 55, 55, 54, 55, 54, 54]
I just discovered that my sample sets do not contain the same number of features. Below I check what the source of this difference is.
In [65]:
# Union of the feature names that occur in at least one sample.
all_col = set()
for entry in X:
    all_col.update(entry.columns)
# For every sample, the features that occur in some sample but not in this one.
[all_col - set(entry.columns) for entry in X]
Out[65]:
[{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA',
'MICROSOFT_AUTHENTICATION_PACK'},
{'CygwinLsa', 'MICROSOFT_AUTHENTICA', 'MICROSOFT_AUTHENTICATION_P'},
{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA',
'MICROSOFT_AUTHENTICATION_PAC'},
{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PAC'},
{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PAC'},
{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PAC',
'MICROSOFT_AUTHENTICATION_PACK',
'Setuid'},
{'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA'},
{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_PAC'},
{'CygwinLsa', 'MICROSOFT_AUTHENTICATION_P'},
{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_PACKAGE_V1'},
{'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA',
'MICROSOFT_AUTHENTICATION_PAC'},
{'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA',
'MICROSOFT_AUTHENTICATION_PAC'},
{'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA'},
{'CygwinLsa',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA',
'Setuid'},
{'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE',
'MICROSOFT_AUTHENTICA',
'MICROSOFT_AUTHENTICATION_P',
'MICROSOFT_AUTHENTICATION_PA',
'MICROSOFT_AUTHENTICATION_PAC'}]
This is potentially due to different spellings of the same commands/labels (for example, truncated forms of MICROSOFT_AUTHENTICATION_PACKAGE_V1_0). For now I will just remove all the labels that are not present in all 15 files of data I have just downloaded. If the machine-learning scores change noticeably, I will look into ways to clean and incorporate this data.
In [69]:
# Keep only the features that are present in every sample: compute the
# intersection of all column sets and delete everything else from each frame.
# NOTE: the frames in X are modified in place.
col_set = [set(entry.columns) for entry in X]
common_subset = set.intersection(*col_set)
drop_cols = [e.difference(common_subset) for e in col_set]
for entry, to_drop in zip(X, drop_cols):
    # Single-argument call so the line prints the same on Python 2 and 3.
    print('dropping %s' % (to_drop,))
    for item in to_drop:
        del entry[item]
dropping set(['Setuid', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PAC'])
dropping set(['MICROSOFT_AUTHENTICATION_PA', 'Setuid', 'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PAC', 'MICROSOFT_AUTHENTICATION_PACK'])
dropping set(['Setuid', 'MICROSOFT_AUTHENTICATION_PACK', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1'])
dropping set(['MICROSOFT_AUTHENTICATION_PA', 'Setuid', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PACK'])
dropping set(['MICROSOFT_AUTHENTICATION_PA', 'Setuid', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PACK'])
dropping set(['MICROSOFT_AUTHENTICATION_PA', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1'])
dropping set(['Setuid', 'MICROSOFT_AUTHENTICATION_PACK', 'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE', 'CygwinLsa', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PAC'])
dropping set(['MICROSOFT_AUTHENTICATION_PA', 'Setuid', 'MICROSOFT_AUTHENTICATION_P', 'CygwinLsa', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PACK'])
dropping set(['MICROSOFT_AUTHENTICATION_PA', 'Setuid', 'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PAC', 'MICROSOFT_AUTHENTICA', 'MICROSOFT_AUTHENTICATION_PACK'])
dropping set(['MICROSOFT_AUTHENTICATION_PA', 'Setuid', 'MICROSOFT_AUTHENTICATION_P', 'MICROSOFT_AUTHENTICATION_PAC', 'MICROSOFT_AUTHENTICATION_PACK'])
dropping set(['Setuid', 'CygwinLsa', 'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PACK'])
dropping set(['Setuid', 'MICROSOFT_AUTHENTICATION_PACK', 'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1'])
dropping set(['Setuid', 'MICROSOFT_AUTHENTICATION_PACK', 'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PAC'])
dropping set(['MICROSOFT_AUTHENTICATION_PACK', 'ACRONIS_RELOGON_AUTHENTICATION_PACKAGE', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1', 'MICROSOFT_AUTHENTICATION_PAC'])
dropping set(['Setuid', 'MICROSOFT_AUTHENTICATION_PACK', 'CygwinLsa', 'MICROSOFT_AUTHENTICATION_PACKAGE_V1'])
In [70]:
# Sanity check: every sample must now expose exactly the same feature columns,
# in the same order, as sample 0.
col0 = list(X[0].columns)
for i in range(1, nfiles):
    col_i = list(X[i].columns)
    message = 'mismatch in %r:\n%s\n%s' % (i, col0, col_i)
    assert col_i == col0, message
In [71]:
from sklearn import linear_model

# Fit an L1-penalised logistic regression on sample 0 only, then score it on
# sample 0 (training score) and on each of the remaining samples (test scores).
# solver='liblinear' is spelled out because the L1 penalty is not supported by
# every solver; relying on the library default raises an error on
# scikit-learn versions whose default solver handles only L2.
clf_l1_LR = linear_model.LogisticRegression(
    C=1000, penalty='l1', tol=0.001, solver='liblinear').fit(X[0], Y[0])

scores = [clf_l1_LR.score(X[0], Y[0])]
print('score for training set %s' % scores[0])
for i in range(1, nfiles):
    scores.append(clf_l1_LR.score(X[i], Y[i]))
    print('score for test set %d %s' % (i, scores[i]))
score for training set 0.944072051748
score for test set 1 0.94448247091
score for test set 2 0.943976919929
score for test set 3 0.944386639788
score for test set 4 0.944560448937
score for test set 5 0.943735713999
score for test set 6 0.944166904201
score for test set 7 0.943538001825
score for test set 8 0.944438192553
score for test set 9 0.943566597067
score for test set 10 0.944126539894
score for test set 11 0.944858468573
score for test set 12 0.944788959785
score for test set 13 0.944127431039
score for test set 14 0.943777194073
In [72]:
# Mean and standard deviation over all collected scores; scores[0] is the
# training-set score, the rest are test-set scores.
print('mean %s std %s' % (np.mean(scores), np.std(scores)))
mean 0.944173502288 std 0.00039856965612
Logistic regression with a Lasso (L1) penalty, trained on one subset and scored on all 15 non-overlapping subsets of auth.txt.gz, gave me a score with mean 0.9442 and std 0.0004. I believe I am sampling from a normal distribution, which means I have a very narrow Gaussian. This in turn means that further sampling will not change my results significantly.
Content source: r-karasik/lanl-auth-cybersecurity
Similar notebooks: